def get_corpus():
    """Read the 'ldavowpalwabbit' text file and its dictionary.

    Both files are located through ``datapath``. Every line of the text
    file is stripped, split on whitespace and converted to bag-of-words.

    Returns:
        tuple: ``(corpus, dictionary)`` where ``corpus`` is a list holding
        one bag-of-words document per input line.
    """
    text_path = datapath('ldavowpalwabbit.txt')
    dict_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(dict_path)

    corpus = []
    with open(text_path) as text_file:
        for raw_line in text_file:
            tokens = raw_line.strip().split()
            corpus.append(dictionary.doc2bow(tokens))
    return corpus, dictionary
def __init__(self, config, trained_model_path, id2word_path):
    """Load a saved LDA model together with its id->word dictionary.

    Args:
        config: object exposing a ``num_topics`` attribute.
        trained_model_path: path passed to ``LdaModel.load``.
        id2word_path: path passed to ``Dictionary.load_from_text``.

    Raises:
        AssertionError: if the loaded model's topic count differs from
            ``config.num_topics``.
    """
    self.model_path = trained_model_path
    self.id2word_path = id2word_path

    self.model = LdaModel.load(self.model_path)
    self.id2word = Dictionary.load_from_text(self.id2word_path)

    expected_topics = config.num_topics
    self.num_topics = expected_topics
    # The configuration and the stored model must agree on the topic count.
    assert self.model.num_topics == self.num_topics
def apply_tfidf(dictionary_path, mm_corpus_path,
                output_path='/home/andre/Develop/corpora/lsamodel_tfidf.mm'):
    """Fit a TF-IDF model on a Matrix Market corpus and serialize the result.

    Args:
        dictionary_path: text dictionary readable by
            ``Dictionary.load_from_text``.
        mm_corpus_path: path of the bag-of-words corpus in Matrix Market
            format.
        output_path: where the TF-IDF weighted corpus is written. Defaults
            to the location that used to be hard-coded, so existing callers
            are unaffected.
    """
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    # tfidf[mm] is the transformed corpus handed straight to serialize.
    MmCorpus.serialize(output_path, tfidf[mm], progress_cnt=10000)
def __init__(self, model_prefix=None, num_best=None):
    """Load every model file that shares the given prefix.

    Args:
        model_prefix: common path prefix of the dictionary, document map,
            TF-IDF model and similarity index files.
        num_best: stored on the instance as ``self.num_best``.

    Raises:
        ValueError: if ``model_prefix`` is ``None``.
    """
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    logger.info("ESA: Loading word dictionary...")
    wordids_path = model_prefix + '_wordids.txt.bz2'
    self.dictionary = Dictionary.load_from_text(wordids_path)

    logger.info("ESA: Loading document name map...")
    metadata_path = model_prefix + '_bow.mm.metadata.cpickle'
    self.article_dict = utils.unpickle(metadata_path)

    logger.info("ESA: Loading TF-IDF model...")
    tfidf_path = model_prefix + '.tfidf_model'
    self.tfidf = TfidfModel.load(tfidf_path)

    logger.info("ESA: Loading similarity index...")
    index_path = model_prefix + '_similarity.index'
    self.similarity_index = Similarity.load(index_path, mmap='r')

    # Reverse-index preloading is intentionally not performed here.
    logger.info("ESA: Finished loading model files.")
def get_wiki_dictionary():
    '''
    Return a mapping {token: smoothed document frequency ratio} built from
    the dictionary stored at "wikicorpus/wiki_dictionary".

    Each value is (document frequency + 1) / N, where N is the hard-coded
    number of documents in the wiki corpus (21126).
    '''
    # The dictionary file is expected to exist already; this function does
    # not rebuild it from a Wikipedia dump.
    dct = Dictionary.load_from_text("wikicorpus/wiki_dictionary")

    # number of documents in the wiki corpus
    wiki_document_size = 21126

    # Invert token -> id so document frequencies (keyed by id) can be
    # reported by token.
    id2token = {token_id: token for token, token_id in dct.token2id.items()}

    # Add-one smoothing so that no token ends up with a zero estimate.
    return {
        id2token[token_id]: (document_frequency + 1) / wiki_document_size
        for token_id, document_frequency in dct.dfs.items()
    }
def __init__(self, MODEL_PATH, DICT_LOCATION=DEFAULT_DICTIONARY_FILE_LOCATION):
    """Load the LDA model and its id->word dictionary, then print both.

    Args:
        MODEL_PATH: path passed to ``LdaMulticore.load``.
        DICT_LOCATION: path passed to ``Dictionary.load_from_text``.
    """
    self.__model = LdaMulticore.load(MODEL_PATH)
    self.__id2word_dictionary = Dictionary.load_from_text(DICT_LOCATION)
    # Echo what was loaded.
    for loaded in (self.__model, self.__id2word_dictionary):
        print(loaded)
def get_corpus():
    """Read the 'ldavowpalwabbit' text file and its dictionary.

    Returns:
        tuple: ``(corpus, dictionary)``; ``corpus`` holds one bag-of-words
        document for each line of the text file.
    """
    text_path = datapath('ldavowpalwabbit.txt')
    dict_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(dict_path)

    def to_bow(raw_line):
        # Strip the line, split on whitespace, convert to bag-of-words.
        return dictionary.doc2bow(raw_line.strip().split())

    with open(text_path) as text_file:
        corpus = list(map(to_bow, text_file))
    return corpus, dictionary
def main(argv=None):
    """Build the simple-wiki corpus files, then the TF-IDF and LSI models.

    Downloads the dump if it is not on disk, writes the article map,
    dictionary and bag-of-words corpus, and finally trains and saves the
    TF-IDF model, LSI model and similarity index.
    """
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Fetch the raw dump only when it is not already present.
    if not os.path.isfile(WIKIFILE):
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)

    # Map a running index to (article url, second metadata field).
    url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
    article_dict = {}
    for position, text in enumerate(wiki.get_texts(meta=True)):
        article_dict[position] = (url_string.format(text[0]), text[1])
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)

    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI on the TF-IDF weighted corpus.
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)

    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def __init__(self, dict_path, model_path):
    """Initialise the LSA space from files on disk.

    :dict_path: text dictionary read with ``Dictionary.load_from_text``.
    :model_path: model file read with ``LsiModel.load``.
    """
    dictionary = Dictionary.load_from_text(dict_path)
    self._dictionary = dictionary
    lsi_model = LsiModel.load(model_path)
    self._lsi_model = lsi_model
def __init__(self, dict_path, model_path):
    """Initialise the LSA space from files on disk.

    :dict_path: text dictionary read with ``Dictionary.load_from_text``.
    :model_path: model file read with ``LsiModel.load``.
    """
    dictionary = Dictionary.load_from_text(dict_path)
    self._dictionary = dictionary
    lsi_model = LsiModel.load(model_path)
    self._lsi_model = lsi_model
def get_corpus(data, save_path_dict='extracted_data/lda_dictionary'):
    """Convert tokenized documents to bag-of-words with a saved dictionary.

    Args:
        data: iterable of documents accepted by ``Dictionary.doc2bow``.
        save_path_dict: path of the dictionary text file.

    Returns:
        list: one bag-of-words document per entry of ``data``.

    Exits the process with status 1 when the dictionary file is missing.
    """
    if not isfile(save_path_dict):
        print("Didn't find a dictionary.")
        import sys
        sys.exit(1)

    dictionary = Dictionary.load_from_text(save_path_dict)
    corpus = []
    for doc in data:
        corpus.append(dictionary.doc2bow(doc))
    return corpus
def __init__(self, dict_file=None, model_file=None):
    """Set up the dictionary and model, loading them when paths are given.

    Args:
        dict_file: optional dictionary text file; an empty ``Dictionary``
            is created when it is falsy.
        model_file: optional file loaded with ``joblib.load``; the model is
            ``None`` when it is falsy.
    """
    self.dictionary = (
        Dictionary.load_from_text(dict_file) if dict_file else Dictionary()
    )
    self.model = joblib.load(model_file) if model_file else None
def main():
    """Load a trained topic model, print its topics and infer test topics.

    Reads its settings from the module-level ``args``, prints the topic
    words of the loaded model and writes the most probable topic index of
    every test document (or ``None``) to ``inference_result.txt`` inside
    ``args.save_dir``.

    Raises:
        Exception: if ``corpus.mm`` is missing from the task's data folder.
    """
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    n_topic = args.n_topic
    available_cpus = cpu_count()
    n_cpu = available_cpus - 2 if available_cpus > 2 else 2
    use_tfidf = args.use_tfidf
    dist = args.dist
    model_path = args.model_path
    model_name = args.model_name
    save_dir = args.save_dir
    test_path = args.test_path

    device = torch.device('cuda')

    cwd = os.getcwd()
    tmpDir = os.path.join(cwd, 'data', taskname)
    # The dictionary is only loaded when the corpus has been built.
    if not os.path.exists(os.path.join(tmpDir, 'corpus.mm')):
        raise Exception("Build corpus first")
    dictionary = Dictionary.load_from_text(os.path.join(tmpDir, 'dict.txt'))

    testSet = TestData(dictionary=dictionary, txtPath=test_path,
                       no_below=no_below, no_above=no_above,
                       use_tfidf=use_tfidf)
    voc_size = testSet.vocabsize

    # The model class is looked up by name among this module's globals.
    Model = globals()[model_name]
    model = Model(bow_dim=voc_size, n_topic=n_topic, device=device,
                  dist=dist, taskname=taskname)
    model.load_model(model_path)

    topics = model.show_topic_words(dictionary=dictionary)
    for topic_index, topic_words in enumerate(topics):
        print(topic_index, str(topic_words))

    infer_topics = []
    for doc in tqdm(testSet):
        if doc is None:
            infer_topics.append(None)
            continue
        scores = model.inference(doc_tokenized=doc, dictionary=dictionary)
        infer_topics.append(int(np.argmax(scores)))

    with open(save_dir + "/inference_result.txt", "w") as f:
        json.dump(infer_topics, f)
def __load_from_disk(self, path):
    """
    Function that is used internally to load and set-up the class state

    The configuration file is mandatory. Every other artefact is optional:
    a missing file is skipped and the corresponding attributes are left
    untouched.

    :param path: Location from where the class internal state should be loaded
    :return: None, side-effect on the class on which this is called
    """
    # Read config
    with open(os.path.join(path, 'config.json')) as f:
        params = jsonpickle.decode(f.read())

    # Copy the plain configuration values onto the instance.
    for key in ('net_size_in_days', 'min_tok_len',
                'undersample_multiplicity', 'prediction_threshold',
                'use_sim_cs', 'use_sim_j', 'use_sim_d', 'use_social',
                'use_temporal', 'use_file', 'use_pr_only',
                'use_issue_only', 'predictions_between_updates'):
        setattr(self, key, params[key])

    name = params['name']

    try:
        with open(os.path.join(path, name, 'repository_data.json')) as f:
            self.repository_obj = jsonpickle.decode(f.read())
        with open(os.path.join(path, name, 'truth_data.json')) as f:
            self.truth = jsonpickle.decode(f.read())
    except FileNotFoundError:
        pass

    try:
        with open(os.path.join(path, name, 'fingerprint_data.json')) as f:
            self.fingerprint = jsonpickle.decode(f.read())
    except FileNotFoundError:
        pass

    try:
        self.dictionary = Dictionary.load_from_text(
            os.path.join(path, 'tfidf', 'term2id.txt'))
        self.model = TfidfModel.load(
            os.path.join(path, 'tfidf', 'model.tfidf'))
        with open(os.path.join(path, name, 'stopwords_data.json')) as f:
            self.stopwords = jsonpickle.decode(f.read())
    except FileNotFoundError:
        pass

    # NOTE: pickle must only be used on trusted files; these are assumed to
    # have been written by this class.
    # The pickle files are opened in a context manager so the handles are
    # always closed.
    try:
        with open(os.path.join(path, 'clf_model', 'model.p'), 'rb') as f:
            self.clf = pickle.load(f)
    except FileNotFoundError:
        pass

    try:
        with open(os.path.join(path, 'feature_generator', 'gen.p'),
                  'rb') as f:
            self.feature_generator = pickle.load(f)
    except FileNotFoundError:
        pass
def makeDictionary(docList, dictFile="", add=False):
    '''
    Build a dictionary from docList.

    When add is true and dictFile exists, the dictionary stored in dictFile
    is loaded and extended with docList; otherwise a new dictionary is
    created from docList alone. The result is returned, not saved.
    '''
    extend_existing = add and os.path.isfile(dictFile)
    if not extend_existing:
        return Dictionary(docList)

    dictionary = Dictionary.load_from_text(dictFile)
    dictionary.add_documents(docList)
    return dictionary
def test_saveAsText_and_loadFromText(self):
    """A `Dictionary` written with save_as_text round-trips via load_from_text."""
    tmpf = get_tmpfile('dict_test.txt')
    original = Dictionary(self.texts)
    original.save_as_text(tmpf)
    # The file must have been created on disk.
    self.assertTrue(os.path.exists(tmpf))

    reloaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
    self.assertNotEqual(reloaded, None)
    self.assertEqual(reloaded.token2id, original.token2id)
def test_saveAsText_and_loadFromText(self):
    """A `Dictionary` round-trips through its text format for both sort orders."""
    tmpf = get_tmpfile('dict_test.txt')
    for sort_by_word in (True, False):
        original = Dictionary(self.texts)
        original.save_as_text(tmpf, sort_by_word=sort_by_word)
        self.assertTrue(os.path.exists(tmpf))

        reloaded = Dictionary.load_from_text(tmpf)
        self.assertNotEqual(reloaded, None)
        self.assertEqual(reloaded.token2id, original.token2id)
def test_saveAsText_and_loadFromText(self):
    """A `Dictionary` written with save_as_text round-trips via load_from_text."""
    tmpf = get_tmpfile('dict_test.txt')
    original = Dictionary(self.texts)
    original.save_as_text(tmpf)
    # The file must have been created on disk.
    self.assertTrue(os.path.exists(tmpf))

    reloaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
    self.assertNotEqual(reloaded, None)
    self.assertEqual(reloaded.token2id, original.token2id)
def test_saveAsText_and_loadFromText(self):
    """A `Dictionary` round-trips through its text format for both sort orders."""
    tmpf = get_tmpfile('dict_test.txt')
    for sort_by_word in (True, False):
        original = Dictionary(self.texts)
        original.save_as_text(tmpf, sort_by_word=sort_by_word)
        self.assertTrue(os.path.exists(tmpf))

        reloaded = Dictionary.load_from_text(tmpf)
        self.assertNotEqual(reloaded, None)
        self.assertEqual(reloaded.token2id, original.token2id)
def get_dict(self,path):
    """Return the dictionary stored at ``path + '.dict'``, building it if absent.

    When the file does not exist, the dictionary is built from
    ``self.get_texts()`` (also stored in ``self.texts``) and saved as text.
    Otherwise it is loaded from the file. The base name of every entry in
    ``self.inputs`` is appended to ``self.id_to_path``.
    """
    # NOTE(review): the original indentation was lost; the loop below is
    # assumed to run after the if/else rather than inside the else branch.
    # Confirm against the original file.
    path = path + '.dict'
    if not os.path.exists(path):
        self.texts = self.get_texts()
        dct = Dictionary(self.texts)
        dct.save_as_text(path)
    else:
        # An empty Dictionary is created only to call load_from_text on it.
        dct = Dictionary()
        dct = dct.load_from_text(path)
    # The loop variable reuses (and overwrites) the `path` parameter.
    for path in self.inputs:
        self.id_to_path.append(os.path.basename(path))
    return dct
def display_data(self):
    """Prepare the pyLDAvis visualisation, store it and display it.

    The LDA model, bag-of-words corpus and dictionary are loaded from the
    paths held on the instance. The string form of the prepared data is
    written to ``self.LDAvis_data_filepath``, as before.

    Returns:
        The value returned by ``pyLDAvis.display`` for the prepared data.
        The previous version returned ``None``.
    """
    lda = LdaMulticore.load(self.lda_model_filepath)
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
    trigram_dictionary = Dictionary.load_from_text(
        self.trigram_dictionary_filepath)

    LDAvis_prepared = pyLDAvis.gensim.prepare(
        lda, trigram_bow_corpus, trigram_dictionary)

    with open(self.LDAvis_data_filepath, 'w') as f:
        f.write(str(LDAvis_prepared))

    # Display the prepared data itself. The earlier code re-opened the file
    # just written and passed the file handle to pyLDAvis.display, which is
    # not the prepared-data object it expects.
    return pyLDAvis.display(LDAvis_prepared)
def test_loadFromText(self):
    """`Dictionary` can be loaded from a text file that starts with num_docs."""
    tmpf = get_tmpfile('load_dict_test.txt')
    serialized = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
    with open(tmpf, "wb") as out:
        out.write(serialized)

    d = Dictionary.load_from_text(tmpf)
    # token -> id mapping
    self.assertEqual(d.token2id[u"prvé"], 1)
    self.assertEqual(d.token2id[u"slovo"], 2)
    # document frequencies, keyed by token id
    for token_id, expected_df in ((1, 1), (2, 2)):
        self.assertEqual(d.dfs[token_id], expected_df)
    self.assertEqual(d.num_docs, 2)
def test_loadFromText(self):
    """`Dictionary` can be loaded from a text file that starts with num_docs."""
    tmpf = get_tmpfile('load_dict_test.txt')
    serialized = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
    with open(tmpf, "wb") as out:
        out.write(serialized)

    d = Dictionary.load_from_text(tmpf)
    # token -> id mapping
    self.assertEqual(d.token2id[u"prvé"], 1)
    self.assertEqual(d.token2id[u"slovo"], 2)
    # document frequencies, keyed by token id
    for token_id, expected_df in ((1, 1), (2, 2)):
        self.assertEqual(d.dfs[token_id], expected_df)
    self.assertEqual(d.num_docs, 2)
def get_dictionary(self):
    """Return the dictionary stored next to ``self.path``, building it if needed.

    When the file ``self.path + "lda.dictionary"`` is missing, a dictionary
    is built from the documents returned by ``read_ap.get_processed_docs()``
    and saved to that file.
    """
    tmp_fname = self.path + "lda.dictionary"
    if not os.path.exists(tmp_fname):
        print("Creating dictionary.")
        docs_by_id = read_ap.get_processed_docs()
        docs = list(docs_by_id.values())
        dictionary = Dictionary(docs)
        dictionary.save_as_text(tmp_fname)
        return dictionary
    return Dictionary.load_from_text(tmp_fname)
def load_corpus(self, corpus_name):
    '''
    Load the corpus, dictionary, LSI model and similarity index whose file
    names are derived from corpus_name.

    This is where we load the corpus files. This needs to be moved to a
    more general class initialization. (FIXME Freija)
    '''
    self.corpus_name = corpus_name
    # Each artefact lives in a file named after the corpus plus a suffix.
    self.corpus_mm = MmCorpus(corpus_name + '.mm')
    self.corpus_dict = Dictionary.load_from_text(corpus_name + '_wordids.txt')
    self.model = LsiModel.load(corpus_name + '.lsi_model')
    self.index = similarities.MatrixSimilarity.load(corpus_name + '-lsi.index')
def load_corpus(self, corpus_name):
    '''
    Load the corpus, dictionary, LSI model and similarity index whose file
    names are derived from corpus_name.

    This is where we load the corpus files. This needs to be moved to a
    more general class initialization. (FIXME Freija)
    '''
    self.corpus_name = corpus_name
    # Each artefact lives in a file named after the corpus plus a suffix.
    self.corpus_mm = MmCorpus(corpus_name + '.mm')
    self.corpus_dict = Dictionary.load_from_text(corpus_name + '_wordids.txt')
    self.model = LsiModel.load(corpus_name + '.lsi_model')
    self.index = similarities.MatrixSimilarity.load(corpus_name + '-lsi.index')
def load_corpus_and_dict(corpus_path, id2word_path):
    """Load a bz2-compressed Matrix Market corpus and its dictionary.

    Args:
        corpus_path: path of the bz2-compressed corpus file.
        id2word_path: path of the dictionary text file.

    Returns:
        tuple: ``(corpus, dictionary)``.
    """
    # Report the paths actually being loaded. The message previously used
    # `data_path`, which is not a parameter of this function.
    print("[BLOCK] Loading corpus and dictionary files from %s and %s" %
          (corpus_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    # The corpus is read through BZ2File because the TF-IDF output is
    # expected to be compressed; use MmCorpus(corpus_path) for plain files.
    corpus = MmCorpus(bz2.BZ2File(corpus_path))
    return corpus, dictionary
def get_dictionary(self):
    """Return the cached dictionary for this model type, building it if needed.

    When the cache file is missing, a dictionary is built from the documents
    returned by ``read_ap.get_processed_docs()``, filtered with
    ``filter_extremes(no_below=20, no_above=0.5)`` and saved to the file.
    """
    tmp_fname = self.path + self.model_type + "_dictionary"
    if not os.path.exists(tmp_fname):
        print("Creating dictionary.")
        docs_by_id = read_ap.get_processed_docs()
        docs = list(docs_by_id.values())
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        dictionary.save_as_text(tmp_fname)
        return dictionary
    return Dictionary.load_from_text(tmp_fname)
def main():
    """Train the configured models and classifiers and evaluate them.

    The Reuters dictionary is loaded from disk, or rebuilt and saved when
    loading fails. Optional "bow" and "noise" models are added according to
    ``settings``. The first pass over the training-sample generator also
    collects the held-out test samples.

    Returns:
        The value returned by ``run_evaluation``.
    """
    global dictionary
    try:
        dictionary = Dictionary.load_from_text(
            "persist/reuters_dictionary.txt")
    except Exception:
        # Best effort: any loading problem triggers a rebuild. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []

    class generate_train_samples(object):
        # True until the first complete pass over the documents has ended.
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                # todo: maybe try "usa" or "earn"
                sample = document["content"], "acq" in document["topics"]
                if count > num_train_samples:
                    # Held-out documents are collected once, during the
                    # first pass, and never yielded as training data.
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    return classifications
def __init__(self, model_prefix='wiki_en'):
    """Load the word dictionary and the pickled LDA model.

    Args:
        model_prefix: prefix of the ``_wordids.txt`` dictionary file.

    Raises:
        ValueError: if ``model_prefix`` is ``None``.
    """
    logger = logging.getLogger("LDA")

    self.model_prefix = model_prefix
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    self.fname = 'lda_model.p'

    logger.info("LDA: Loading word dictionary...")
    self.dict = Dictionary.load_from_text(model_prefix + '_wordids.txt')

    logger.info("LDA: Loading pretrained model...")
    # Pickles are binary: open in 'rb' (text mode fails on Python 3) and
    # close the handle deterministically.
    # NOTE: only unpickle files from a trusted source.
    with open(self.fname, 'rb') as model_file:
        self.model = pickle.load(model_file)

    logger.info("LDA: Finished loading model files.")
def __init__(self, model_prefix = 'wiki_en'):
    """Load the word dictionary and the pickled LDA model.

    Args:
        model_prefix: prefix of the ``_wordids.txt`` dictionary file.

    Raises:
        ValueError: if ``model_prefix`` is ``None``.
    """
    logger = logging.getLogger("LDA")

    self.model_prefix = model_prefix
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    self.fname = 'lda_model.p'

    logger.info("LDA: Loading word dictionary...")
    self.dict = Dictionary.load_from_text(model_prefix + '_wordids.txt')

    logger.info("LDA: Loading pretrained model...")
    # Pickles are binary: open in 'rb' (text mode fails on Python 3) and
    # close the handle deterministically.
    # NOTE: only unpickle files from a trusted source.
    with open(self.fname, 'rb') as model_file:
        self.model = pickle.load(model_file)

    logger.info("LDA: Finished loading model files.")
def main():
    """Train the configured models and classifiers and evaluate them.

    The Reuters dictionary is loaded from disk, or rebuilt and saved when
    loading fails. Optional "bow" and "noise" models are added according to
    ``settings``. The first pass over the training-sample generator also
    collects the held-out test samples.

    Returns:
        The value returned by ``run_evaluation``.
    """
    global dictionary
    try:
        dictionary = Dictionary.load_from_text("persist/reuters_dictionary.txt")
    except Exception:
        # Best effort: any loading problem triggers a rebuild. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []

    class generate_train_samples(object):
        # True until the first complete pass over the documents has ended.
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                # todo: maybe try "usa" or "earn"
                sample = document["content"], "acq" in document["topics"]
                if count > num_train_samples:
                    # Held-out documents are collected once, during the
                    # first pass, and never yielded as training data.
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    return classifications
def test_loadFromText_legacy(self):
    """
    `Dictionary` can be loaded from a text file in legacy format.
    The legacy format has no num_docs value on its first line.
    """
    tmpf = get_tmpfile('load_dict_test_legacy.txt')
    legacy_payload = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
    with open(tmpf, "wb") as out:
        out.write(legacy_payload)

    d = Dictionary.load_from_text(tmpf)
    # token -> id mapping
    self.assertEqual(d.token2id[u"prvé"], 1)
    self.assertEqual(d.token2id[u"slovo"], 2)
    # document frequencies, keyed by token id
    for token_id, expected_df in ((1, 1), (2, 2)):
        self.assertEqual(d.dfs[token_id], expected_df)
    # Without a header line num_docs is expected to be 0.
    self.assertEqual(d.num_docs, 0)
def test_loadFromText_legacy(self):
    """
    `Dictionary` can be loaded from a text file in legacy format.
    The legacy format has no num_docs value on its first line.
    """
    tmpf = get_tmpfile('load_dict_test_legacy.txt')
    legacy_payload = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
    with open(tmpf, "wb") as out:
        out.write(legacy_payload)

    d = Dictionary.load_from_text(tmpf)
    # token -> id mapping
    self.assertEqual(d.token2id[u"prvé"], 1)
    self.assertEqual(d.token2id[u"slovo"], 2)
    # document frequencies, keyed by token id
    for token_id, expected_df in ((1, 1), (2, 2)):
        self.assertEqual(d.dfs[token_id], expected_df)
    # Without a header line num_docs is expected to be 0.
    self.assertEqual(d.num_docs, 0)
def crawl_new_article(request):
    """Fetch current news, classify it and store the result.

    The dictionary and the pickled classifier are downloaded to /tmp before
    use.

    Args:
        request: not used by this function.

    Returns:
        dict: ``{"status": "ok"}``.
    """
    news_list = get_current_news_article()

    # Decide whether the fetched news should be recommended.
    dictionary_file = '/tmp/all_id2word.txt'
    download_blob('word/all_id2word.txt', dictionary_file)
    dct = Dictionary.load_from_text(dictionary_file)

    classifier_file = '/tmp/model_2.pickle'
    download_blob('model_2.pickle', classifier_file)
    with open(classifier_file, mode='rb') as f:
        classifier = pickle.load(f)

    bow_docs = make_bow(dct)
    result = predict(news_list, dct, classifier, bow_docs)
    upsert_new_articles(result)
    return {"status": "ok"}
def loadDictionary(self, type='offline'):
    '''
    Load the dictionary; build it if it does not exist yet.

    The dictionary file is looked up in self.cachePath under the name
    '<name>_dictionary_<type>.txt'. When it is missing, the dictionary is
    built from document lists chosen according to `type`. The result is
    stored in self.dictionary[type] and returned.
    '''
    # NOTE(review): the original indentation was lost; save_as_text is
    # assumed to belong to the "build" branch only. Confirm against the
    # original file.
    startTime = datetime.now()
    filePath = self.cachePath + '%s_dictionary_%s.txt' % (self.name, type)
    if os.path.isfile(filePath):
        dictionary = Dictionary.load_from_text(filePath)
    else:
        if type == 'offline':
            # Built from the 'train' documents only.
            docList = self.getDocList('train')
            dictionary = makeDictionary(docList)
        elif type == 'all':
            docList = []
            if os.path.isfile(self.cachePath +
                              '%s_dictionary_online.txt' % self.name):
                # Extend the existing "online" dictionary with both test sets.
                logging.warning('dictionary continue')
                docList.extend(self.getDocList('testA'))
                docList.extend(self.getDocList('testB'))
                dictionary = makeDictionary(
                    docList,
                    dictFile=self.cachePath +
                    '%s_dictionary_online.txt' % self.name,
                    add=True)
            else:
                # No "online" dictionary: build from every entry of dfFile.
                for dfName in self.dfFile.keys():
                    docList.extend(self.getDocList(dfName))
                dictionary = makeDictionary(docList)
        elif type == 'online' and os.path.isfile(
                self.cachePath + '%s_dictionary_offline.txt' % self.name):
            # Extend the existing "offline" dictionary with 'valid' documents.
            docList = self.getDocList('valid')
            dictionary = makeDictionary(
                docList,
                dictFile=self.cachePath +
                '%s_dictionary_offline.txt' % self.name,
                add=True)
        else:
            # Fallback: build from the 'train' and 'valid' documents.
            docList = self.getDocList('train')
            docList.extend(self.getDocList('valid'))
            dictionary = makeDictionary(docList)
        dictionary.save_as_text(filePath)
    logging.warning('make dictionary time: %s' % (datetime.now() - startTime))
    self.dictionary[type] = dictionary
    return dictionary
def get_dictionary_corpus(data,
                          save_path_dict='extracted_data/lda_dictionary',
                          save_path_bcorp='extracted_data/lda_bow_corpus'):
    """Return the bag-of-words corpus and dictionary, building them if needed.

    When the dictionary file exists, both the dictionary and the corpus are
    read from disk. Otherwise they are built from ``data``, saved to the
    given paths and returned.

    Args:
        data: tokenized documents used when the files must be built.
        save_path_dict: path of the dictionary text file.
        save_path_bcorp: path of the Matrix Market corpus file.

    Returns:
        tuple: ``(corpus, dictionary)``. ``corpus`` is an ``MmCorpus`` when
        loaded from disk and a list when freshly built.
    """
    if isfile(save_path_dict):
        loaded_dictionary = Dictionary.load_from_text(save_path_dict)
        loaded_corpus = gensim.corpora.MmCorpus(save_path_bcorp)
        return loaded_corpus, loaded_dictionary

    dictionary = gensim.corpora.Dictionary(data)
    dictionary.filter_extremes(no_above=0.5, keep_n=100000)
    corpus = []
    for doc in data:
        corpus.append(dictionary.doc2bow(doc))

    # Persist both artefacts so the next call can load them.
    dictionary.save_as_text(save_path_dict)
    gensim.corpora.MmCorpus.serialize(save_path_bcorp, corpus)
    return corpus, dictionary
def documentFrequencies():
    """Print dictionary statistics and export document frequencies as TSV.

    Loads the wiki2017 dictionary, prints a few values for the token
    "hello", writes every (term, document frequency) pair to
    ``document_frequencies.tsv`` and finally prints the bag-of-words Matrix
    Market corpus for comparison.
    """
    dictionary = Dictionary.load_from_text(
        'C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017_wordids.txt.bz2')
    print(max(dictionary.token2id.values()))

    # id corresponding to the token "hello"
    tokenid = dictionary.token2id["hello"]
    print(tokenid)

    # number of documents of the full corpus in which "hello" appeared
    print(dictionary.dfs[dictionary.token2id["hello"]])

    # total number of features in this corpus
    print(len(dictionary))

    # Build the document frequency output file: term -> df.
    dfdict = {str(dictionary[key]): val
              for key, val in dictionary.dfs.items()}

    # newline="" is what the csv module requires of files it writes to;
    # without it Windows output gets an extra blank line after every row.
    with open("document_frequencies.tsv", "w+", encoding="utf-8",
              newline="") as handle:
        writer = csv.writer(handle, delimiter="\t")
        for key, val in dfdict.items():
            writer.writerow([key, val])

    # Load the bag-of-words matrix market file for comparison; it should
    # have the same number of features as len(dictionary).
    mm_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017_bow.mm"
    wikimodel = MmCorpus(mm_name)
    print(wikimodel)

    # A disabled (`if False:`) manual check used to follow here: it counted
    # the documents of `wikimodel` containing `tokenid` to verify the
    # document frequency printed above. It never ran and has been removed.
def get_lda_topics(transcript_utterances, trained_lda_model_filepath,
                   trained_lda_wordids_filepath):
    '''
    Infer LDA topic probabilities for a transcript and summarise them.

    Parameters
        transcript_utterances: list of lists of strings (words), each row is
            a plaintext utterance in the transcript.
        trained_lda_model_filepath: string, path to trained LDA model
            ('/p/spoclab/models/LDA/lda_model_wiki').
        trained_lda_wordids_filepath: string, path to word IDs of trained
            LDA model ('/p/spoclab/models/LDA/lda_wordids.txt.bz2').

    Returns:
        topic_probabilities: list of floats, probability of each k topic.
        kurtosis: float, kurtosis of all topic probabilities.
        skewness: float, skewness of all topic probabilities.
        entropy: float, entropy of all topic probabilities.
    '''
    # Resolve both inputs through return_file before loading them.
    model_file = return_file(trained_lda_model_filepath)
    wordids_file = return_file(trained_lda_wordids_filepath)

    lda_model = ldamodel.LdaModel.load(model_file)
    id2word = Dictionary.load_from_text(wordids_file)

    # Concatenate the bag-of-words of every utterance into one document.
    document_bow = []
    for utterance in transcript_utterances:
        document_bow.extend(id2word.doc2bow(utterance))

    doc_topics = lda_model.get_document_topics(document_bow,
                                               minimum_probability=0)
    topic_probabilities = [probability for _, probability in doc_topics]

    skewness = stats.skew(topic_probabilities)
    kurtosis = stats.kurtosis(topic_probabilities)

    # Entropy: SUM(-p * log2(p))
    entropy = np.sum([-(p * np.log2(p)) for p in topic_probabilities])

    return topic_probabilities, kurtosis, skewness, entropy
def main():
    """Build the artist-page corpus files and the TF-IDF model.

    Writes, all prefixed with ``OUT_PREFIX``: the pickled document index,
    the bag-of-words corpus, the dictionary, the TF-IDF model and the
    TF-IDF weighted corpus.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute(
        'SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    # Use a context manager so the index file is flushed and closed.
    with open(outp + '_docindex.p', 'wb') as docindex_file:
        pickle.dump(docindex, docindex_file)

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5,
                                    keep_n=keep_words)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format (another long task)
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def main():
    """Fit a TF-IDF model on the Reuters-21578 corpus and save the results.

    All files are read from and written to the ``data`` folder below the
    current working directory.
    """
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    def in_datadir(filename):
        # Build the full path of a file inside the data folder.
        return path.join(datadir, filename)

    # id->word mapping and bag-of-words corpus
    vocabulary = Dictionary.load_from_text(in_datadir("reuters21578.dict.txt"))
    mm = MmCorpus(in_datadir("reuters21578.mm"))

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # persist the model, then the TF-IDF vectors in matrix market format
    tfidf.save(in_datadir("reuters21578.tfidf.model"))
    MmCorpus.serialize(in_datadir("reuters21578.tfidf.mm"), tfidf[mm],
                       progress_cnt=10000)
def main():
    """Serve a pyLDAvis visualisation of the Reuters-21578 LDA model.

    The model, corpus and vocabulary are read from the ``data`` folder below
    the current working directory.
    """
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load the LDA model
    fin = path.join(datadir, "reuters21578.lda.model.bz2")
    lda = LdaModel.load(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm.bz2")
    mm = MmCorpus(fin)

    # load the vocabulary
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # Pass the corpus loaded above. The call previously used `corpus`, a
    # name that is never defined in this function.
    data = pyLDAvis.gensim.prepare(lda, mm, vocabulary)
    pyLDAvis.show(data, ip=socket.gethostname().lower(), local=True,
                  open_browser=True, http_server=None)
def main():
    """Build the artist-page corpus files and the TF-IDF model.

    Writes, all prefixed with ``OUT_PREFIX``: the pickled document index,
    the bag-of-words corpus, the dictionary, the TF-IDF model and the
    TF-IDF weighted corpus.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    # Use a context manager so the index file is flushed and closed.
    with open(outp + '_docindex.p', 'wb') as docindex_file:
        pickle.dump(docindex, docindex_file)

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5,
                                    keep_n=keep_words)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format (another long task)
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def __init__(self, model_prefix = None, num_best = None):
    """Load every model file that shares the given prefix.

    Args:
        model_prefix: common path prefix of the dictionary, document map,
            TF-IDF model and similarity index files.
        num_best: stored on the instance as ``self.num_best``.

    Raises:
        ValueError: if ``model_prefix`` is ``None``.
    """
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    logger.info("ESA: Loading word dictionary...")
    wordids_path = model_prefix + '_wordids.txt.bz2'
    self.dictionary = Dictionary.load_from_text(wordids_path)

    logger.info("ESA: Loading document name map...")
    metadata_path = model_prefix + '_bow.mm.metadata.cpickle'
    self.article_dict = utils.unpickle(metadata_path)

    logger.info("ESA: Loading TF-IDF model...")
    tfidf_path = model_prefix + '.tfidf_model'
    self.tfidf = TfidfModel.load(tfidf_path)

    logger.info("ESA: Loading similarity index...")
    index_path = model_prefix + '_similarity.index'
    self.similarity_index = Similarity.load(index_path, mmap='r')

    # Reverse-index preloading is intentionally not performed here.
    logger.info("ESA: Finished loading model files.")
def main(argv=None):
    """Build the speech corpus files, then the TF-IDF and LSI models.

    Reads the raw speeches and their ids from JSON, writes the article map,
    dictionary and bag-of-words corpus, and finally trains and saves the
    TF-IDF model, LSI model and similarity index.
    """
    if argv is None:
        argv = sys.argv

    print('Creating speech serialized corpus')
    # The raw file is JSON shaped like:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)

    # The article ids are saved in the form the gensimple engine reads:
    # "int": ["url", "title"]
    texts = []
    article_dict = {}
    for counter, (key, value) in enumerate(speech_dict.items()):
        texts.append(list(value['text']))
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)

    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = list(map(dictionary.doc2bow, texts))
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on the TF-IDF weighted corpus.
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)

    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
# with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2') wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) mywiki = myWikiCorpus(inp, lemmatize=lemmatize) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') del wiki # initialize corpus reader and word->id mapping mm = MmCorpus(outp + '_bow.mm') # build tfidf, ~50min tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) # save tfidf vectors in matrix market format # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %s" % program)
# Name the logger after this script's file name and log at INFO level.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)

# check and process input arguments: two positional arguments are required,
# otherwise the module docstring is printed as usage text and the script exits
if len(sys.argv) < 3:
    print(inspect.cleandoc(__doc__) % locals())
    sys.exit(1)
input_file, output_prefix = sys.argv[1:3]

logger.info("running %s" % ' '.join(sys.argv))

# Every model file below shares `output_prefix` and differs only in suffix.
logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
# mmap='r' is forwarded to Similarity.load
similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
# Switch the index to its reverse-index mode and preload it up front.
similarity_index.use_reverse_index = True
similarity_index.preload_reverse_index()

logger.info("Finished loading model files.")
import os import matplotlib.pyplot as plt import multiprocessing as mp import numpy as np import scipy.stats as stats import time from gensim import matutils from gensim.corpora import Dictionary, MmCorpus from gensim.models import LdaMulticore from os import path datadir = path.abspath(path.join(os.getcwd(), "data")) fin = path.join(datadir, "reuters21578.dict.txt") vocabulary = Dictionary.load_from_text(fin) fin = path.join(datadir, "reuters21578.mm.bz2") mm = MmCorpus(fin) def sym_kl(p, q): return np.sum([stats.entropy(p, q), stats.entropy(p, q)]) def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10): l = np.array([sum(cnt for _, cnt in doc) for doc in corpus]) kl = [] for n in range(min_topics, max_topics+step, step): print("starting multicore LDA for num_topics={}".format(n)) st = time.clock() lda = LdaMulticore(corpus=corpus,
def build_model(dictionary_path, mm_corpus_path,
                model_path='/home/andre/Develop/corpora/lsamodel_lsi.model',
                num_topics=400):
    """Build an LSI model from a Matrix Market corpus and save it.

    Args:
        dictionary_path: Path to a dictionary in the text format read by
            ``Dictionary.load_from_text``.
        mm_corpus_path: Path to the Matrix Market corpus the model is built on.
        model_path: Where the model is saved. Defaults to the location that
            was previously hard-coded, so existing callers are unaffected.
        num_topics: Number of LSI topics requested (default 400, the former
            fixed value).
    """
    dictionary = Dictionary.load_from_text(dictionary_path)
    # Use the tf-idf corpus here, not the original one.
    mm = MmCorpus(mm_corpus_path)
    lsi = lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=num_topics)
    lsi.save(model_path)
# Logging setup: root handler/format first, then the script-specific logger
# named after the executable.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.getLogger().setLevel(logging.INFO)
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

# One positional argument (the model prefix) is needed; otherwise print the
# module docstring (interpolated with the current namespace) and exit.
if len(sys.argv) <= 1:
    print(inspect.cleandoc(__doc__) % locals())
    sys.exit(1)
model_prefix = sys.argv[1]

logger.info("running %s", ' '.join(sys.argv))

# Every artefact below is located by appending a fixed suffix to model_prefix.
logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True

logger.info("Finished loading model files.")

# Counter initialised here; it is not modified within this excerpt.
mismatches = 0
else: ## not online # takes about 9h on a macbook pro, for 3.5m articles (june 2011) wiki = WikiCorpus( args.input, lemmatize=args.lemmatize, max_articles=args.max_articles, expect_streamitems=args.expect_streamitems, file_name_pattern=args.file_name_pattern, ) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(args.output + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above dictionary = Dictionary.load_from_text(args.output + '_wordids.txt.bz2') del wiki # initialize corpus reader and word->id mapping mm = MmCorpus(args.output + '_bow.mm') # build tfidf, ~50min tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) # save tfidf vectors in matrix market format # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(args.output + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %r" % args.__dict__)
def apply_tfidf(dictionary_path, mm_corpus_path,
                output_path='/home/andre/Develop/corpora/lsamodel_tfidf.mm'):
    """Apply a TF-IDF transformation to a corpus and serialize the result.

    A ``TfidfModel`` is built from the corpus at *mm_corpus_path* (with
    ``normalize=True``), applied to that same corpus, and the transformed
    corpus is written in Matrix Market format.

    Args:
        dictionary_path: Path to a dictionary in the text format read by
            ``Dictionary.load_from_text``.
        mm_corpus_path: Path to the input Matrix Market corpus.
        output_path: Destination of the transformed corpus. Defaults to the
            location that was previously hard-coded, so existing callers are
            unaffected.
    """
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize(output_path, tfidf[mm], progress_cnt=10000)
def build_corpus(dictionary_path,
                 output_path='/home/andre/Develop/corpora/lsamodel_bow.mm'):
    """Serialize the ``CorpusIterator`` corpus to a Matrix Market file.

    The iterator is created over the module-level ``dir_list`` with
    ``bow=True`` and the loaded dictionary.
    NOTE(review): presumably it yields bag-of-words documents -- confirm
    against ``CorpusIterator``.

    Args:
        dictionary_path: Path to a dictionary in the text format read by
            ``Dictionary.load_from_text``.
        output_path: Destination of the serialized corpus. Defaults to the
            location that was previously hard-coded, so existing callers are
            unaffected.
    """
    dictionary = Dictionary.load_from_text(dictionary_path)
    corpus = CorpusIterator(dir_list=dir_list, bow=True, dictionary=dictionary)
    MmCorpus.serialize(output_path, corpus, progress_cnt=10000)
def load_dict(path='./dict.txt'):
    """Load a dictionary from its text representation.

    Args:
        path: Location of the dictionary text file. Defaults to
            ``'./dict.txt'``, which is resolved against the current working
            directory, exactly as before.

    Returns:
        The object returned by ``Dictionary.load_from_text``.
    """
    return Dictionary.load_from_text(path)
def __init__(self, lda_file, dic_file):
    """Load the LDA model and the dictionary this object works with.

    Args:
        lda_file: Path passed to ``LdaModel.load``.
        dic_file: Path passed to ``Dictionary.load_from_text``.
    """
    # The model is loaded and stored before the dictionary is read.
    loaded_model = LdaModel.load(lda_file)
    self.lda_model = loaded_model

    loaded_dictionary = Dictionary.load_from_text(dic_file)
    self.dictionary = loaded_dictionary
with codecs.open(self.path_, 'r', 'utf-8') as in_f: for line in in_f: doc = [word for word in line.strip().split() if len(word) > 0 and word in tokens] doc = vocab.doc2bow(doc) if len(doc) > 0: yield doc if __name__ == "__main__": logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO ) vocab = Dictionary.load_from_text('./vocab.txt') corpus = UnlabeledCorpus('./rumor_train.csv', vocab) valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab) valid_sentences = [doc for doc in valid_corpus][5000:] # varing number of topics # result = {} # for num_topics in [2, 4, 8, 16, 32, 64]: # best_value = -100 # for i in range(5): # model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics) # likelihood = model.log_perplexity(valid_sentences) # best_value = max(best_value, likelihood) # result[num_topics]= best_value # # for num_topics, likelihood in result.iteritems():
from utils import generate_timestamp

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
# All three paths are used unconditionally below, so let argparse reject a
# missing one with a usage message instead of failing later on a None path.
parser.add_argument("-d", "--dictionary", required=True,
                    help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", required=True,
                    help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", required=True,
                    help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary); the bz2 handle is closed as soon as
# the dictionary has been read
with bz2.BZ2File(args.dictionary) as dictionary_file:
    id2word = Dictionary.load_from_text(dictionary_file)

# load corpus iterator
mm = MmCorpus(args.corpus)
print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

# the model file is named after the timestamp taken at start-up
model.save("%s/%s.model" % (args.model, timestamp))
wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True) wiki.save(outp + '_corpus.pkl.bz2', use_bzip2=True) dictionary.allow_update = False else: wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True) # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2', use_bzip2=True) del wiki # initialize corpus reader and word->id mapping mm = MmCorpus(outp + '_bow.mm') # build tfidf, ~50min tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) # save tfidf vectors in matrix market format # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %s" % program)
def __init__(self):
    """Load the LDA model and word dictionary named in ``settings``.

    Reads ``settings.lda_model_name`` for the model file and
    ``settings.wordids_txt`` for the dictionary text file.
    """
    # The model is loaded and stored before the dictionary is read.
    loaded_model = LdaModel.load(settings.lda_model_name)
    self.model = loaded_model

    loaded_dictionary = Dictionary.load_from_text(settings.wordids_txt)
    self.dictionary = loaded_dictionary
print globals()['__doc__'] % locals() sys.exit(1) inp, outp = sys.argv[1:3] if len(sys.argv) > 3: keep_words = int(sys.argv[3]) else: keep_words = DEFAULT_DICT_SIZE # build dictionary. only keep the most frequent words (out of total ~8.2m # unique tokens) takes about 9h on a macbook pro, for 3.5m articles (june 2011) wiki = WikiCorpus(inp, keep_words=keep_words) # save dictionary and bag-of-words (term-document frequency matrix) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt') MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) del wiki # initialize corpus reader and word->id mapping id2token = Dictionary.load_from_text(outp + '_wordids.txt') mm = MmCorpus(outp + '_bow.mm') # build tfidf, # ~30min tfidf = TfidfModel(mm, id2word=id2token, normalize=True) # save tfidf vectors in matrix market format # ~2h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %s" % program)