Ejemplo n.º 1
0
def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    """Collect, for every year, the topics associated with each classification.

    For each selected year the probability object is loaded, the index of
    the largest entry along axis 1 is taken for every row (presumably the
    dominant topic of each document -- confirm against the producer of the
    probability files), and the result is passed together with the
    documents' classifications to ``topicOfClassification``.

    :param probDir: directory traversed for the per-year probability files
    :param modelDir: directory traversed for the per-year topic files; only
        the number of files found is checked here
    :param classDir: directory traversed for the per-year classification files
    :param clf_dict: classification dictionary forwarded to
        ``classificationOfDocument`` and ``topicOfClassification``
    :param fun: 0 processes every file; 1 skips the first five files
        (acm-class start from 1998)
    :return: dict mapping the year string cut from each probability file
        name to the value returned by ``topicOfClassification``
    :raises ValueError: if ``fun`` is neither 0 nor 1
    :raises SystemExit: if the three directories do not hold the same
        number of files
    """
    # Validate the mode before touching the file system: any other value
    # would otherwise leave the index range undefined and fail much later
    # with an unrelated-looking NameError.
    if fun == 0:
        first = 0
    elif fun == 1:
        # acm-class start from 1998
        first = 5
    else:
        raise ValueError("fun must be 0 or 1, got %r" % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        # Single-argument form prints the same text on Python 2 and 3.
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first, N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])

        # The year is read from fixed positions at the end of the file name
        # (four characters followed by a four-character suffix).
        year = probFiles[i][-8:-4]

        # Only topic indices are recorded, so the topic word lists in
        # topicFiles do not need to be loaded.
        doc_topic = np.squeeze(np.array(prob.argmax(1)))

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(
            unique_clf, all_clf, clf_dict, doc_topic, fun)

    return all_clf_topic
def load_sentences(fname, nb_sentences=None):
    """Read sentences from a file and encode them as lists of word ids.

    :param fname: passed to ``dataFromFile`` to obtain an iterator of lines
    :param nb_sentences: stop after this many sentences (use if all
        sentences are too many); ``None`` reads everything
    :return: tuple ``(ids, index2word)`` where ``ids`` is a list of lists of
        word ids (one inner list per sentence, after subsampling) and
        ``index2word`` is the gensim model's index-to-word list
    """
    from gensim.models import word2vec

    print('building vocab ...')

    tokenized = []
    count = 0
    for count, raw in enumerate(dataFromFile(fname), 1):
        # Strip the trailing newline, then split on single spaces.
        tokenized.append(raw.rstrip('\n').split(' '))
        if nb_sentences is not None and count == nb_sentences:
            break
    print("load %d sentences" % count)

    # The gensim model is used only to build the vocabulary.
    w2v = word2vec.Word2Vec()
    w2v.build_vocab(tokenized)
    vocab = w2v.wv.vocab
    print("vocabulary size is %d" % len(vocab))

    # Encode each sentence as word ids. A word is kept only when it is in
    # the vocabulary and its sample_int exceeds a random draw scaled to
    # 2**32; the draw is made only for in-vocabulary words.
    threshold_scale = 2**32
    ids = []
    for sentence in tokenized:
        encoded = []
        for word in sentence:
            if word not in vocab:
                continue
            entry = vocab[word]
            if entry.sample_int > w2v.random.rand() * threshold_scale:
                encoded.append(entry.index)
        ids.append(encoded)

    return ids, w2v.wv.index2word
Ejemplo n.º 3
0
def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict,
                                    fun):
    """Collect, for every year, the topics associated with each classification.

    For each selected year the probability object is loaded, the index of
    the largest entry along axis 1 is taken for every row (presumably the
    dominant topic of each document -- confirm against the producer of the
    probability files), and the result is passed together with the
    documents' classifications to ``topicOfClassification``.

    :param probDir: directory traversed for the per-year probability files
    :param modelDir: directory traversed for the per-year topic files; only
        the number of files found is checked here
    :param classDir: directory traversed for the per-year classification files
    :param clf_dict: classification dictionary forwarded to
        ``classificationOfDocument`` and ``topicOfClassification``
    :param fun: 0 processes every file; 1 skips the first five files
        (acm-class start from 1998)
    :return: dict mapping the year string cut from each probability file
        name to the value returned by ``topicOfClassification``
    :raises ValueError: if ``fun`` is neither 0 nor 1
    :raises SystemExit: if the three directories do not hold the same
        number of files
    """
    # Validate the mode before touching the file system: any other value
    # would otherwise leave the index range undefined and fail much later
    # with an unrelated-looking NameError.
    if fun == 0:
        first = 0
    elif fun == 1:
        # acm-class start from 1998
        first = 5
    else:
        raise ValueError("fun must be 0 or 1, got %r" % (fun,))

    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)

    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        # Single-argument form prints the same text on Python 2 and 3.
        print("numbers of files are not same")
        sys.exit('System will exit')

    all_clf_topic = {}
    for i in range(first, N):
        prob = ioFile.load_object(probFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])

        # The year is read from fixed positions at the end of the file name
        # (four characters followed by a four-character suffix).
        year = probFiles[i][-8:-4]

        # Only topic indices are recorded, so the topic word lists in
        # topicFiles do not need to be loaded.
        doc_topic = np.squeeze(np.array(prob.argmax(1)))

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        all_clf_topic[year] = topicOfClassification(unique_clf, all_clf,
                                                    clf_dict, doc_topic, fun)

    return all_clf_topic
Ejemplo n.º 4
0
    return cooccurrences

def online_generator(cooccurrences, min_count=None):
    """Yield one sample per stored cell of a co-occurrence matrix.

    :param cooccurrences: object exposing parallel ``rows`` and ``data``
        sequences (the LiL-matrix internals): ``rows[i]`` lists the column
        ids stored for row ``i`` and ``data[i]`` holds the matching values
    :param min_count: when not ``None``, a cell is skipped if the second
        field of the vocab entry for its row word or its column word is
        below this value. The lookup goes through the module-level names
        ``vocab`` and ``id2word``.
        NOTE(review): ``id2word`` is not defined in the visible part of
        this file -- confirm it exists at module level before passing a
        ``min_count``.
    :return: generator of ``([np.array([i]), np.array([j])],
        np.array([value]))`` tuples, in row-major storage order
    """
    # Dig into the LiL-matrix internals to quickly iterate through all
    # nonzero cells. The builtin zip pairs each row's column ids with its
    # values and needs no Python-version-specific import.
    for i, (row, data) in enumerate(zip(cooccurrences.rows,
                                        cooccurrences.data)):
        # Drop the whole row when its own word is too rare.
        if min_count is not None and vocab[id2word[i]][1] < min_count:
            continue
        for data_idx, j in enumerate(row):
            # Drop individual cells whose column word is too rare.
            if min_count is not None and vocab[id2word[j]][1] < min_count:
                continue
            yield ([np.array([i]), np.array([j])], np.array([data[data_idx]]))


# load data
# Open the sentence file through dataFromFile and let build_vocab produce
# both the vocabulary and the sentence collection from it.
data_iterator = dataFromFile('../ganyan_sentence_clean.txt')
vocab, sentences = build_vocab(data_iterator)

# params
nb_epoch = 3  # presumably the number of training epochs -- confirm at the use site
vec_dim = 50  # presumably the embedding dimension -- confirm at the use site
window_size = 5  # passed to build_cooccur below
vocab_size = len(vocab)
samples_per_epoch = len(sentences)

# create input
# NOTE(review): this name is spelled "coocurrences" (single "c" before
# "ur"), unlike the "cooccurrences" parameter of online_generator; code
# using this variable must use the same spelling.
coocurrences = build_cooccur(vocab, sentences, window_size)


# graph definition (pvt: center of window, ctx: context)
# Integer input of shape (1,) for the pivot (center) side of the model.
input_pvt = Input(shape=(1,), dtype='int32')
Ejemplo n.º 5
0
                         default=None)
    optparser.add_option('-t', '--vocabularyFile',
                         dest='vocabulary',
                         help='fileName',
                         default=None)
    optparser.add_option('-o', '--outputFile',
                         dest='output',
                         help='fileName',
                         default=None)
    
    (options, args) = optparser.parse_args()
    
    if options.abstract is None:
            inFile = sys.stdin
    elif options.abstract is not None:
            inFile = ioFile.dataFromFile(options.abstract)
    else:
            print 'No abstract filename specified, system with exit\n'
            sys.exit('System will exit')

    if options.vocabulary is None:
            fname = sys.stdin
    elif options.vocabulary is not None:
            fname = options.vocabulary
    else:
            print 'No vocabulary filename specified, system with exit\n'
            sys.exit('System will exit')
            
    if options.output is None:
            outFile = 'foo-mult.dat'
    elif options.output is not None:
Ejemplo n.º 6
0
                         '--inputFile',
                         dest='input',
                         help='filename containing txt',
                         default=None)
    optparser.add_option('-o',
                         '--outputFile',
                         dest='output',
                         help='fileName',
                         default=None)

    (options, args) = optparser.parse_args()

    if options.input is None:
        inFile = sys.stdin
    elif options.input is not None:
        inFile = ioFile.dataFromFile(options.input)
    else:
        print 'No filename specified, system with exit\n'
        sys.exit('System will exit')

    if options.output is None:
        outFile = "arxiv-category_dict.pkl"
    elif options.output is not None:
        outFile = options.output

    data_iterator = inFile
    category_dict = {}

    for line in data_iterator:
        line = line.rstrip('\n')
        line = line.split('\t')
Ejemplo n.º 7
0
    optparser = OptionParser()
    optparser.add_option('-f', '--inputFile',
                         dest='input',
                         help='filename containing txt',
                         default=None)
    optparser.add_option('-o', '--outputFile',
                         dest='output',
                         help='fileName',
                         default=None)    

    (options, args) = optparser.parse_args()
    
    if options.input is None:
            inFile = sys.stdin
    elif options.input is not None:
            inFile = ioFile.dataFromFile(options.input)
    else:
            print 'No filename specified, system with exit\n'
            sys.exit('System will exit')
            
    if options.output is None:
            outFile = "arxiv-category_dict.pkl"
    elif options.output is not None:
            outFile = options.output
             
    data_iterator = inFile
    category_dict = {}
    
    for line in data_iterator:
        line = line.rstrip('\n')
        line = line.split('\t')
Ejemplo n.º 8
0
    optparser.add_option('-d', '--classDictName',
                         dest='clf_dict',
                         help='fileName',
                         default=None)     
    optparser.add_option('-o', '--outputFile',
                         dest='output',
                         help='fileName',
                         default=None)
        
    (options, args) = optparser.parse_args()
    
    if options.input is None:
            print 'No text filename specified, system with exit\n'
            sys.exit('System will exit')
    elif options.input is not None:
            inFile = ioFile.dataFromFile(options.input)

    if options.reference is None:
            print 'No reference filename specified, system with exit\n'
            sys.exit('System will exit')
    elif options.reference is not None:
            inFile_ref = ioFile.dataFromFile(options.reference)
           
    if options.class_name is None:
            print 'No name of the category specified, system with exit\n'
            sys.exit('System will exit')        
    else:
            if options.class_name == 'arxiv-category':
                fun = 0
            elif options.class_name == 'acm-class':
                fun = 1
Ejemplo n.º 9
0
                         '--vocabularyFile',
                         dest='vocabulary',
                         help='fileName',
                         default=None)
    optparser.add_option('-o',
                         '--outputFile',
                         dest='output',
                         help='fileName',
                         default=None)

    (options, args) = optparser.parse_args()

    if options.abstract is None:
        inFile = sys.stdin
    elif options.abstract is not None:
        inFile = ioFile.dataFromFile(options.abstract)
    else:
        print 'No abstract filename specified, system with exit\n'
        sys.exit('System will exit')

    if options.vocabulary is None:
        fname = sys.stdin
    elif options.vocabulary is not None:
        fname = options.vocabulary
    else:
        print 'No vocabulary filename specified, system with exit\n'
        sys.exit('System will exit')

    if options.output is None:
        outFile = 'foo-mult.dat'
    elif options.output is not None: