def topicOfClassificationForAllYear(probDir, modelDir, classDir, clf_dict, fun):
    probFiles = fileSys.traverseDirectory(probDir)
    topicFiles = fileSys.traverseTopicDirecotry(modelDir, 1)
    classFiles = fileSys.traverseDirectory(classDir)
    N = len(probFiles)
    if len(topicFiles) != N or len(classFiles) != N:
        print "numbers of files are not the same"
        sys.exit('System will exit')

    all_clf_topic = {}
    if fun == 0:
        irange = range(0, N)
    # acm-class starts from 1998
    elif fun == 1:
        irange = range(5, N)

    for i in irange:
        prob = ioFile.load_object(probFiles[i])
        topics = ioFile.load_object(topicFiles[i])
        inFile = ioFile.dataFromFile(classFiles[i])
        year = probFiles[i][-8:-4]

        # index of the highest-probability topic for each document
        topic_index = np.squeeze(np.array(prob.argmax(1)))
        doc_topic = topic_index
        #doc_topic = []
        #[doc_topic.append(' '.join(topics[index])) for index in topic_index]

        all_clf, unique_clf = classificationOfDocument(inFile, clf_dict, fun)
        clf_topic = topicOfClassification(unique_clf, all_clf, clf_dict,
                                          doc_topic, fun)
        all_clf_topic[year] = clf_topic

    return all_clf_topic
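# A hypothetical driver for topicOfClassificationForAllYear above; the
# directory names and the pickled class dictionary are placeholders, not
# files from this repository, and fun=1 selects acm-class as in the
# option-parsing scripts below.
if __name__ == '__main__':
    clf_dict = ioFile.load_object('acm-class_dict.pkl')  # placeholder path
    all_clf_topic = topicOfClassificationForAllYear('prob/', 'model/', 'class/',
                                                    clf_dict, fun=1)
    for year in sorted(all_clf_topic):
        print year, all_clf_topic[year]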
def load_sentences(fname, nb_sentences=None):
    """
    :param fname: file with one sentence per line
    :param nb_sentences: use if all sentences are too many
    :return: ids (list of lists of word ids), index2word (list of string)
    """
    from gensim.models import word2vec

    print 'building vocab ...'

    data_iterator = dataFromFile(fname)
    i = 0
    sents = []
    for line in data_iterator:
        # line = line.encode('utf-8')
        line = line.rstrip('\n')
        sents.append(line.split(' '))
        i += 1
        if nb_sentences is not None:
            if nb_sentences == i:
                break
    print "load", i, "sentences"

    # the gensim model is used only for building the vocabulary
    model = word2vec.Word2Vec()
    model.build_vocab(sents)
    vocab = model.wv.vocab
    print "vocabulary size is", len(vocab)

    # ids: list of (list of word-id); frequent words are randomly dropped
    # according to gensim's subsampling threshold (sample_int)
    ids = [[vocab[w].index for w in sent
            if w in vocab and vocab[w].sample_int > model.random.rand() * 2**32]
           for sent in sents]

    # save_object('sentences.pkl', ids)
    # save_object('index2word.pkl', model.wv.index2word)
    return ids, model.wv.index2word
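# A minimal sketch of the subsampling test used in load_sentences above,
# assuming a pre-4.0 gensim where vocab[w].sample_int stores the word's
# retain probability scaled by 2**32; FakeVocab is a stand-in for gensim's
# Vocab object, not part of the library.
from collections import namedtuple
import numpy as np

FakeVocab = namedtuple('FakeVocab', ['index', 'sample_int'])
rng = np.random.RandomState(0)

# a (hypothetical) frequent word that subsampling keeps ~25% of the time
entry = FakeVocab(index=7, sample_int=int(0.25 * 2**32))
kept = sum(entry.sample_int > rng.rand() * 2**32 for _ in range(10000))
print "kept", kept, "of 10000 draws"  # roughly 2500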
    return cooccurrences


def online_generator(cooccurrences, min_count=None):
    # Now yield our tuple sequence (dig into the LiL-matrix internals to
    # quickly iterate through all nonzero cells)
    for i, (row, data) in enumerate(izip(cooccurrences.rows,
                                         cooccurrences.data)):
        if min_count is not None and vocab[id2word[i]][1] < min_count:
            continue
        for data_idx, j in enumerate(row):
            if min_count is not None and vocab[id2word[j]][1] < min_count:
                continue
            yield ([np.array([i]), np.array([j])],
                   np.array([data[data_idx]]))


# load data
data_iterator = dataFromFile('../ganyan_sentence_clean.txt')
vocab, sentences = build_vocab(data_iterator)

# params
nb_epoch = 3
vec_dim = 50
window_size = 5
vocab_size = len(vocab)
samples_per_epoch = len(sentences)

# create input
coocurrences = build_cooccur(vocab, sentences, window_size)

# graph definition (pvt: center of window, ctx: context)
input_pvt = Input(shape=(1,), dtype='int32')
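# A quick inspection sketch for online_generator above: each yielded item is a
# pair of one-element index arrays (pvt id, ctx id) together with their
# co-occurrence value as the training target. This is only a sanity check and
# not part of the training script itself.
from itertools import islice

for (indices, target) in islice(online_generator(coocurrences), 3):
    print "pvt:", indices[0][0], "ctx:", indices[1][0], "cooccurrence:", target[0]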
                     default=None)
optparser.add_option('-t', '--vocabularyFile',
                     dest='vocabulary',
                     help='fileName',
                     default=None)
optparser.add_option('-o', '--outputFile',
                     dest='output',
                     help='fileName',
                     default=None)

(options, args) = optparser.parse_args()

if options.abstract is None:
    inFile = sys.stdin
elif options.abstract is not None:
    inFile = ioFile.dataFromFile(options.abstract)
else:
    print 'No abstract filename specified, system will exit\n'
    sys.exit('System will exit')

if options.vocabulary is None:
    fname = sys.stdin
elif options.vocabulary is not None:
    fname = options.vocabulary
else:
    print 'No vocabulary filename specified, system will exit\n'
    sys.exit('System will exit')

if options.output is None:
    outFile = 'foo-mult.dat'
elif options.output is not None:
optparser = OptionParser()
optparser.add_option('-f', '--inputFile',
                     dest='input',
                     help='filename containing txt',
                     default=None)
optparser.add_option('-o', '--outputFile',
                     dest='output',
                     help='fileName',
                     default=None)

(options, args) = optparser.parse_args()

if options.input is None:
    inFile = sys.stdin
elif options.input is not None:
    inFile = ioFile.dataFromFile(options.input)
else:
    print 'No filename specified, system will exit\n'
    sys.exit('System will exit')

if options.output is None:
    outFile = "arxiv-category_dict.pkl"
elif options.output is not None:
    outFile = options.output

data_iterator = inFile
category_dict = {}
for line in data_iterator:
    line = line.rstrip('\n')
    line = line.split('\t')
optparser.add_option('-d', '--classDictName',
                     dest='clf_dict',
                     help='fileName',
                     default=None)
optparser.add_option('-o', '--outputFile',
                     dest='output',
                     help='fileName',
                     default=None)

(options, args) = optparser.parse_args()

if options.input is None:
    print 'No text filename specified, system will exit\n'
    sys.exit('System will exit')
elif options.input is not None:
    inFile = ioFile.dataFromFile(options.input)

if options.reference is None:
    print 'No reference filename specified, system will exit\n'
    sys.exit('System will exit')
elif options.reference is not None:
    inFile_ref = ioFile.dataFromFile(options.reference)

if options.class_name is None:
    print 'No name of the category specified, system will exit\n'
    sys.exit('System will exit')
else:
    if options.class_name == 'arxiv-category':
        fun = 0
    elif options.class_name == 'acm-class':
        fun = 1