def create_gensim_lsa_model(doc_clean, number_of_topics, lsa_training=True):
    """
    Input  : cleaned documents and the number of topics
    Purpose: create (or load) an LSA model using gensim
    Output : the LSA model, the transformed corpus and the dictionary
    """
    if lsa_training:
        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        # generate LSA model
        lsi_model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model
        #coherence_value = CoherenceModel(model=lsi_model, texts=doc_clean, dictionary=dictionary, coherence='c_v').get_coherence()
        #print("Coherence value : ", coherence_value)
        print('Saving lsi_model...')
        lsi_model.save(lsi_model_path)
        print('lsi_model saved!')
        corpus_lsi = lsi_model[doc_term_matrix]
        with open(corpus_lsi_path, 'wb') as handle:
            pickle.dump(corpus_lsi, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Corpus_lsi saved.')
    else:
        dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)
        print('Loading lsi_model...')
        lsi_model = LsiModel.load(lsi_model_path)
        print('lsi_model loaded!')
        corpus_lsi = lsi_model[doc_term_matrix]
    return lsi_model, corpus_lsi, dictionary
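# A minimal coherence check, matching the call commented out above.
# Assumes doc_clean, dictionary and the trained lsi_model returned by
# create_gensim_lsa_model(...) are in scope.
from gensim.models import CoherenceModel

coherence_value = CoherenceModel(
    model=lsi_model,
    texts=doc_clean,        # the tokenized documents
    dictionary=dictionary,
    coherence='c_v',
).get_coherence()
print("Coherence value:", coherence_value)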
def train_lsa(is_tfidf, num_topics):
    # Create corpus
    print('Create corpus')
    corpus = doc_processor.create_corpus(dictionary, doc_list, is_tfidf)

    # Set training parameters.
    chunksize = 20000

    start = time.time()
    temp = dictionary[0]  # only to "load" the dictionary so id2token is populated
    id2word = dictionary.id2token

    print('Start LSI training')
    lsi_model = LsiModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        chunksize=chunksize,
    )
    lsi_model.show_topics()

    ir_method = 'tfidf' if is_tfidf else 'bow'
    lsi_model.save('saved_models/lsi_model_%s_%s' % (ir_method, num_topics))
    print('LSA for %s %s done in %.1f seconds' % (ir_method, num_topics, time.time() - start))
def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(
            WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
            order=(3, 6),
            clean_func=normalize_whitespace,
            n_proc=n_proc_sublex,
            create_dictionary=False,
        )

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")
            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab
        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("Creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)
        model.save(model_fn)
def lsi(dataframe, num_topics=300):
    """Returns an LSI model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated
    then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the LSI model with.

    Returns
    -------
    model : Gensim LsiModel
        LSI model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lsi.model'
    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        tfidf_model = tfidf(dataframe)
        tfidf_corpus = tfidf_model[bow]
        lsi_model = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        lsi_model.save(filename)
    else:
        lsi_model = LsiModel.load(filename)
    return lsi_model
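# Hedged usage sketch: fold an unseen document into the cached LSI space.
# dictionary_corpus() and tfidf() are the helper functions assumed by lsi()
# above; the whitespace tokenizer is only a placeholder.
lsi_model = lsi(dataframe)
dictionary = dictionary_corpus(dataframe)
tfidf_model = tfidf(dataframe)
new_bow = dictionary.doc2bow("some unseen document".split())
new_vec = lsi_model[tfidf_model[new_bow]]  # list of (topic_id, weight) pairs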
class LsiVec(TopicVec):
    def __init__(self, vec_num):
        TopicVec.__init__(self, vec_num)

    def __gen_model(self, corpus):
        # if self.p_corpus == 'onehot':
        #     model_name = 'lsi_one_hot.model'
        # else:
        #     model_name = 'lsi_tfidf.model'
        model_name = 'lsi.model'
        self.model = LsiModel(corpus, id2word=self.dictionary, num_topics=self.vec_num)
        self.model.save(os.path.join(self.out_dir, model_name))

    def __get_model(self):
        model_name = 'lsi.model'
        if os.path.exists(os.path.join(self.out_dir, model_name)):
            self.model = LsiModel.load(os.path.join(self.out_dir, model_name))
        else:
            raise FileNotFoundError('"{}" file not found!'.format(model_name))

    def fit(self, doc, out_dir, use_exist_dictionary=False):
        TopicVec.fit(self, doc, out_dir, use_exist_dictionary)
        self.__gen_model(self.corpus)
class LatentSemanticIndexing():
    """
    This class implements latent semantic indexing using the gensim library.
    """

    def __init__(self, corpus, embedding="bow", num_topics=500, chunksize=20000):
        self.lsi_model_path = "./saved_models/gensim-lsi-{}-model-nt-{}.mm".format(embedding, num_topics)
        self.lsi_corpus_path = "./saved_models/gensim-{}-lsi-nt-{}-corpus.crp".format(embedding, num_topics)
        self.sim_matrix_path = "./saved_models/sim-matrix-{}-{}.mm".format(embedding, num_topics)
        self.sim_matrix_temp_path = "./saved_models/sim_temps/sim_temp-{}-{}.tmp".format(embedding, num_topics)
        self.embedding = embedding
        self.corpus = corpus
        self.num_topics = num_topics

        if os.path.exists(self.lsi_model_path):
            print("LSI {} model already trained, loading from disk.".format(embedding))
            self.model = LsiModel.load(self.lsi_model_path)
        else:
            # Make an index-to-word dictionary.
            temp = corpus.dictionary[0]  # This is only to "load" the dictionary.
            id2word = corpus.dictionary.id2token
            print("Training LSI model.")
            self.model = LsiModel(
                corpus=list(corpus.get_corpus()),
                id2word=id2word,
                chunksize=chunksize,
                num_topics=num_topics
            )
            print("Saving LSI model.")
            self.model.save(self.lsi_model_path)

        self.lsi_corpus = ModelCorpus(corpus.get_corpus(), self.model, path=self.lsi_corpus_path)

        if os.path.exists(self.sim_matrix_path):
            print("Similarities matrix {} model already trained, loading from disk.".format(embedding))
            self.index = similarities.Similarity.load(self.sim_matrix_path)
        else:
            print("Creating similarities index.")
            Path(self.sim_matrix_temp_path).touch(exist_ok=True)
            self.index = similarities.Similarity(self.sim_matrix_temp_path, self.lsi_corpus, num_features=self.num_topics)
            self.index.save(self.sim_matrix_path)

    def search(self, query):
        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)
        if self.embedding == "bow":
            lsi_query = self.model[vec_query]
        elif self.embedding == "tfidf":
            lsi_query = self.model[self.corpus.tfidf_model[vec_query]]
        sims = self.index[lsi_query]
        sims = sorted(zip(self.corpus.doc_ids, sims), key=lambda item: -item[1])
        return sims
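# Hedged usage sketch for the class above; `corpus` is whatever object the
# constructor expects (exposing .dictionary, .get_corpus(), .doc_ids, ...),
# and the query string is a placeholder.
lsi_index = LatentSemanticIndexing(corpus, embedding="tfidf", num_topics=500)
for doc_id, score in lsi_index.search("example query text")[:10]:
    print(doc_id, score)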
def train_models():
    models = dict()

    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
            # the line below is for testing a model I have locally on my machine
            #msda = mSDA.load("persist/mSDA/mSDA_wiki_dim-1000_stem-False_tfidf-False_noise-0.5_num_layers-3")
        except:
            ln.info("Training mSDA...")
            # tuple-parameter lambdas are Python 2 only; index the pair instead
            prototype_ids = [id_ for id_, freq in sorted(dictionary.dfs.items(), key=lambda kv: kv[1], reverse=True)[:dims]]
            msda = mSDA(0.5, 5, len(dictionary), dims, prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(), num_topics=dims, id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
def train_model(self, num_topics):
    corpus = self.get_corpus()
    model = LsiModel(corpus, num_topics=num_topics)
    tmp_fname = self.path + self.model_type + "_model"
    model.save(tmp_fname)
    return model
def train(n_topics=num_topics):
    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # save the dictionary
    with open(os.path.join(folder_path_objects, 'dictionary_lsi_bow'), 'wb') as f:
        pickle.dump(dictionary, f)

    # create binary and regular bow corpus
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]

    # save corpora
    with open(os.path.join(folder_path_objects, 'corpus_binary'), 'wb') as f:
        pickle.dump(corpus_binary, f)

    # create models
    print(f'{time.ctime()} Start training LSA (binary bow)')
    lsi_bin = LsiModel(
        corpus=corpus_binary,
        id2word=dictionary,
        chunksize=1000,
        num_topics=n_topics
    )

    # save models to disk
    os.makedirs(folder_path_models, exist_ok=True)
    lsi_bin.save('./models/lsi_bin_filtered')
def lsi(clean_docs, model_name, topics):
    from gensim import corpora

    # turn all data into a dictionary mapping of normalized words and their integer ids
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document, called text, into bag-of-words representation (list of (token_id, token_count) tuples)
    # in other words, it counts how often each word occurs in each doc of the text and saves that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialized version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LSI model
    from gensim.models import LsiModel
    num_topics = topics  # find this number of topics in the data
    lsimodel = LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
    lsimodel.save('lsi_model_' + model_name + '.gensim')

    shown_topics = lsimodel.print_topics(num_words=5)  # renamed so it does not shadow the `topics` parameter
    for topic in shown_topics:
        print(topic)
def create_lsi_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level
    model_fname += '.lsi.gz'

    if not os.path.exists(model_fname) or force:
        model = LsiModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=project.num_topics,
        )
        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def build_lsi_model(dictionary, corpus, should_rebuild):
    lsi = None

    # DEBUG
    should_rebuild = True

    if not should_rebuild:
        try:
            print('Loading LSI Model backup...')
            # load from the same backup path the model is saved to below
            lsi_file = utils.get_file_path(cfg.LSI_BACKUP)
            print('LSI file = {}'.format(lsi_file))
            lsi = LsiModel.load(lsi_file)
        except Exception as exc:
            utils.print_exception_details('Building LSI Model', exc)
    else:
        print('Building LSI Model...')
        # use the one-pass algorithm only when a single pass is configured
        one_pass = cfg.NUM_PASSES <= 1
        lsi = LsiModel(corpus, id2word=dictionary, num_topics=cfg.NUM_TOPICS, onepass=one_pass)
        print('Done!')

        # Save Model Structures
        LSI_FILE = utils.get_file_path(cfg.LSI_BACKUP)
        lsi.save(LSI_FILE)

    return lsi
def train_LSIModel(tokens, num_top):
    # reuters_text = open("test2.txt", "r")
    dct = corpora.Dictionary(tokens)
    document_matrix = [dct.doc2bow(article) for article in tokens]
    model = LsiModel(document_matrix, num_topics=num_top, id2word=dct)
    model.save("test2.LSIModel")
    return model
def save_lsi_model(corpus_tfidf, dictionary):
    # apply transformation to whole corpus
    print("lsi model")
    lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=3000)  # initialize LSI transformation
    tmp_fname = get_tmpfile("lsi.model")
    print("saving tmp file")
    lsi.save(tmp_fname)
    return tmp_fname
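# Hedged sketch: the temporary filename returned above can be used to reload
# the model later (get_tmpfile comes from gensim.test.utils); corpus_tfidf
# and dictionary are assumed to exist in the caller's scope.
from gensim.models import LsiModel

tmp_fname = save_lsi_model(corpus_tfidf, dictionary)
lsi = LsiModel.load(tmp_fname)
lsi.print_topics(5)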
class LSI():
    @timed
    def init_lsi(self, **kwargs):
        # handle onepass=False
        model_sparse = self.sparse

        class __sparse():
            def __iter__(self):
                # each record stores flat (id, value) pairs: [id0, val0, id1, val1, ...]
                for a in model_sparse:
                    yield [(int(a[i]), a[i + 1]) for i in range(0, len(a), 2)]

        self.lsi = LsiModel(__sparse(), **kwargs)
        self.lsi.save(self.path + 'lsi.pkl')

    def load_lsi(self):
        self.lsi = LsiModel.load(self.path + 'lsi.pkl')

    def load_dense(self, storage='disk'):
        self.dense = sorbet(self.path + 'dense', kind=storage).load()

    def sparse_to_dense(self, sparse):
        dense = self.lsi[sparse]
        dense = sparse2full(dense, self.lsi.num_topics)
        dense = array('f', dense)
        return dense

    @timed
    def init_dense(self, storage=None, workers=None):
        _workers = workers or self.params.get('dense__workers') or self.params.get('workers', 1)
        _storage = storage or self.params.get('dense__storage') or self.params.get('storage', 'disk')
        if _workers > 1:
            self._init_dense_mp(workers=_workers, storage=_storage)
        else:
            self._init_dense_sp(storage=_storage)

    def _init_dense_sp(self, storage='disk'):
        self.dense = sorbet(self.path + 'dense', kind=storage).new()
        for a in self.sparse:
            sparse = [(int(a[i]), a[i + 1]) for i in range(0, len(a), 2)]
            dense = self.sparse_to_dense(sparse)
            self.dense.append(dense)
        self.dense.save()

    def _init_dense_mp(self, workers, storage):
        chunksize = self.params.get('dense__chunksize', 10)
        s = sorbet(self.path + 'dense').new()
        id_iter = range(len(self.meta))
        id_iter = tqdm(id_iter, 'dense', len(self.meta))
        with mp.Pool(workers, init_dense_worker, [self.path]) as pool:
            dense = pool.imap(dense_worker, id_iter, chunksize)
            for d in dense:
                s.append(d)
        self.dense = s.save()
def train(text_corpus_file, dict_file):
    """train lsi model from text corpus"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dictionary = Dictionary.load(dict_file)  # renamed from `dict` to avoid shadowing the builtin
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary, num_topics=400)
    lsi.save(model_file)
    print(lsi.projection.u)
    print(lsi.projection.u.size)
    print(lsi.projection.u[0].size)
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)
    return lsiModel
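# Hedged usage sketch: map every document of the tf-idf corpus into the
# 200-dimensional LSI space. tfidfModel.vectors is assumed to be a scipy
# sparse matrix with one row per document, as Sparse2Corpus with
# documents_columns=False expects.
from gensim.matutils import Sparse2Corpus

lsiModel = getLsiModel(tfidfModel)
corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
docVectors = [lsiModel[doc] for doc in corpus]  # (topic_id, weight) pairs per document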
def train_LSI(corpus, name, num_topics=500):
    tic = time.perf_counter()
    LSI_model = LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    toc = time.perf_counter()
    print(f"Trained LSI {name} in {toc - tic:0.4f} seconds")  # ~4min
    LSI_model.save(f'/LSI_{name}_model_{num_topics}.mm')
    return LSI_model
def train(self, dataset):
    corpus, dictionary = self._prepare(dataset)
    dictionary.save('../models.nosync/lsa/dict')
    print('starting LSA')
    model = LsiModel(corpus=corpus, id2word=dictionary.id2token, num_topics=self.c.lsa_topics)
    path = '../models.nosync/lsa/model'
    model.save(path)
    return model, corpus
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus)  # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print("save dictionary and tfidf model")
def lsimodel(self, corpus_t=None, topic=200, save=False, savename=None):
    """
    :param corpus_t: transformed corpus to train on
    :param topic: number of topics
    :param save: whether to save the trained model
    :param savename: file name to save the model under
    :return: the trained LsiModel
    """
    print('using Lsimodel...')
    lsimodel = LsiModel(corpus=corpus_t, id2word=self.word_dict, num_topics=topic)
    if save:
        print('Writing LSI model to file: {}'.format(savename))
        lsimodel.save(savename)
    return lsimodel
def build_and_save_lsi_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    dct = Dictionary(sentences)

    # Corpus as lists of dictionary ids, in memory.
    # Can be turned into an iterable as done with the others if needed.
    print('Calculating the LSI model...')
    bow_corpus = [dct.doc2bow(s) for s in sentences]
    model = LsiModel(bow_corpus, id2word=dct)
    model.print_debug()
    model.save(LSI_MODEL_FILE)

    for t in range(model.get_topics().shape[0]):
        print(t)
        print(model.print_topic(t))
def main(Tweet=None):
    qs = Tweet.objects.filter(is_strict__gte=13)
    tweets = np.array(qs.values_list('pk', 'text', 'user__screen_name', 'user__is_bot'))
    tweets = pd.DataFrame(tweets, columns='pk text user is_bot'.split())
    tweets = tweets.set_index('pk', drop=True)
    tweets['tokens'] = tweets.text.apply(casual_tokenize)
    vocab = Dictionary(tweets.tokens)
    tfidf = TfidfModel(dictionary=vocab, id2word=vocab)
    bows = pd.Series(vocab.doc2bow(toks) for toks in tweets.tokens)
    lsi = LsiModel(tfidf[bows], num_topics=80, id2word=vocab, extra_samples=100, power_iters=2)
    lsi.save('/home/hobs/src/hackor/twote/data/lsi{}x{}x{}.saved'.format(len(tweets), lsi.num_topics, lsi.num_terms))
    topics = lsi[tfidf[bows]]
    topics = pd.DataFrame([dict(d) for d in topics], index=tweets.index, columns=range(80))
def train(corpus, dictionary):
    print("Training model ...")
    print("Number of topics:", ARGS.num_topics)
    if ARGS.model_type == "LSI":
        print(corpus)
        print(ARGS.num_topics)
        model = LsiModel(corpus, id2word=dictionary, num_topics=ARGS.num_topics)
        model.save(ARGS.save_dir + "/models/" + ARGS.model_type + "_" + ARGS.corpus_type + ".mm")
    elif ARGS.model_type == "LDA":
        model = LdaModel(corpus, id2word=dictionary, num_topics=ARGS.num_topics)
        model.save(ARGS.save_dir + "/models/" + ARGS.model_type + "_" + ARGS.corpus_type + ".mm")
    return model
def train_and_save_gensim_model(model_type_str, corpus, dct, file_name='model_300.model', num_topics=None):
    if model_type_str == "lsi":
        model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dct)
    elif model_type_str == "lda":
        model = LdaModel(corpus=corpus, alpha='auto', num_topics=num_topics, id2word=dct)
    elif model_type_str == "hdp":
        model = HdpModel(corpus=corpus, id2word=dct)
    model.save(file_name)
    return model
def train_lsa(docs: Iterable, outputFolder: str):
    docs = list(docs)
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    corpus = log_entropy_norm(corpus)
    print("Starting training...")
    lsa = LsiModel(corpus=corpus, id2word=id2word, num_topics=300)
    path = outputFolder + "/lsa.model"
    lsa.save(outputFolder + "/lsa.bin")
    matrix = np.transpose(lsa.get_topics())
    with open(path, "wt", encoding='utf-8') as f:
        f.write("{} {}\n".format(np.size(matrix, 0), np.size(matrix, 1)))
        for idx in range(np.size(matrix, 0)):
            f.write(id2word[idx] + " " + " ".join([str(x) for x in matrix[idx]]) + "\n")
    print("Model saved to ", path)
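# Hedged observation: the text file written above ("rows cols" header, then
# one word plus its vector per line) matches the word2vec text format, so it
# can presumably be read back as keyed vectors; the path and query word are
# examples.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format("out/lsa.model", binary=False)
print(vectors.most_similar("example"))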
def train(self):
    print("Reading serializations...")
    sr = SerializationReader(self.series)
    documents, doc2idx, idx2doc = sr.read()

    print("Building dictionary...")
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]

    print("Building model...")
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=self.dimensions)

    print("Building index...")
    index = MatrixSimilarity(lsi[corpus])

    print("Saving...")
    dictionary.save(self.dictionary)
    lsi.save(self.lsi)
    index.save(self.index)
def save_model(docs, file_path):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = Dictionary(docs)
    # dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # model = TfidfModel(corpus)  # fit model
    # corpus = model[corpus]
    CHUNKSIZE = 500
    passes = 10
    temp = dictionary[0]  # only to "load" the dictionary so id2token is populated
    NUM_TOPICS = 15
    model = LsiModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)
    model.save(file_path)
def set_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              model_file_path: str, language_processed_data: list):
    logging.info("---- Create LSI model ")
    tf_idf = TfidfModel(self.essentials.corpus)
    tf_idf_corpus = tf_idf[self.essentials.corpus]
    model = LsiModel(tf_idf_corpus, id2word=self.essentials.dictionary,
                     num_topics=self.number_of_topics)
    model.save(model_file_path)
    self.model = model
    logging.info("---- LSI model is created")
    metrics = self.get_model_evaluation_metrics(language_processed_data)
    parameters = self.get_model_parameters()
    self.write_model_evaluation_metrics(lang, data_version, dictionary_version,
                                        model_version, param_name, param_version,
                                        metrics, parameters)
    return
def train_lsi(corpus, dictionary, num_topics, corpus_type):
    """
    Train the LSI model given the dataset for a given amount of topics.
    """
    # train model and save for later use
    model_filename = 'lsi_' + str(corpus_type) + '_num_topics=' + str(num_topics) + '.model'
    model_path = './tmp/' + model_filename
    if not os.path.exists(model_path):
        print('Starting training {} lsi for num_topics = {}'.format(corpus_type, num_topics))
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, onepass=False)
        lsi.save(model_path)
    else:
        print('{} lsi for num_topics = {} is already created, loading now...'.format(corpus_type, num_topics))
        lsi = LsiModel.load(model_path)

    # construct BOW index for trained lsi model, save for later use
    index_filename = 'index_' + str(corpus_type) + '_num_topics=' + str(num_topics) + '.mm.index'
    index_path = './tmp/' + index_filename
    if not os.path.exists(index_path):
        print('Starting construction {} index for num_topics = {}'.format(corpus_type, num_topics))
        index = similarities.MatrixSimilarity(lsi[corpus])
        index.save(index_path)
    else:
        print('index for {} corpus with num_topics = {} is already created, loading now...'.format(corpus_type, num_topics))
        index = similarities.MatrixSimilarity.load(index_path)

    return lsi, index
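# Hedged usage sketch: rank documents against a free-text query with the
# model and index returned above; `dictionary` and `corpus` are the same
# objects passed to train_lsi, and the query tokens are placeholders.
lsi, index = train_lsi(corpus, dictionary, num_topics=200, corpus_type='bow')
query_bow = dictionary.doc2bow("example query tokens".split())
sims = index[lsi[query_bow]]  # cosine similarity of the query to every document
top10 = sorted(enumerate(sims), key=lambda x: -x[1])[:10]
print(top10)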
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)

    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)

    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
corpus.dictionary.save(f_dict)
corpus.save(f_bow)

# tf-idf model
if os.path.exists(f_tfidf):
    tfidf = TfidfModel.load(f_tfidf)
else:
    tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
    tfidf.save(f_tfidf)

# TRAINING

# lsa model
if not os.path.exists(f_lsa):
    lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
    lsa.save(f_lsa)

# word2vec model
class MyCorpus():
    def __iter__(self):
        for d in corpus.get_texts():
            yield [w for w in d if w in corpus.dictionary.token2id]

if not os.path.exists(f_w2v):
    w2v = Word2Vec(MyCorpus(), size=w2v_dim, min_count=1, window=5)
    w2v.save_word2vec_format(f_w2v, binary=True)

# LANGUAGE MODELS
lm_cache = models.Cache(window=50)
lm_lsa = models.LSA(f_lsa, f_dict, tfidf=f_tfidf, window=50)
lm_w2v = models.Word2Vec(f_w2v, window=50)
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)
print(mm)  # MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
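# Hedged sketch: resume from the file saved above and fold the tf-idf corpus
# into LSI space, reusing the `args`, `timestamp` and `mm` objects from the
# script.
loaded = LsiModel.load("%s/%s.model" % (args.model, timestamp))
lsi_corpus = loaded[mm]  # stream of (topic_id, weight) vectors, one per document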
tweetids = pd.Series(range(6), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
pd.DataFrame([pd.Series([x[1] for x in lsi[bows[i]]], index=topicids, name='tweet')
              for i in tweetids], index=tweetids)

# In[29]:

lsi2 = LsiModel(bows, num_topics=2, id2word=vocab, extra_samples=100, power_iters=2)
lsi2

# In[30]:

lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))

# In[16]:

lsi2.show_topics()

# In[23]:

# for topic in lsi.show_topics():
#     print(topic)
lsi.show_topic(0, 100)
elif not opts.scaling:
    scaling = None
else:
    raise ValueError("Only tfidf scaling is supported")

word_model = opts.word_model

if word_model:
    logging.info("Building word model")
    corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
else:
    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

voc = Dictionary(corpus)
voc.filter_extremes(no_below=cutoff)
voc.compactify()

bow_corpus = (voc.doc2bow(art) for art in corpus)

tfidf = None

if scaling == 'tfidf':
    tfidf = TfidfModel(bow_corpus)
    bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
model.save(model_fn)

if tfidf:
    tfidf.save(model_fn + '.tfidf')
# topic_id = 0
# for topic in lsi_model.show_topics():
#     topic_id += 1
#     print "TOPIC (LSI) " + str(topic_id) + " : " + topic

# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
# corpus_lsi_2 = lsi_model_2[corpus]

print("Done creating models")

lsi_model_2.save("wiki_en_model.lsi")
# lsi_model_2.print_topics(5)

"""
topic_id = 0
for topic in lsi_model_2.show_topics():
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    #group_topic = [doc for doc in corpus_lsi_2 if doc[topic_id] > 0.5]
    group_topic = [doc for doc in corpus_lsi_2]
    print str(group_topic)
    topic_id += 1
"""

print("Docs Processed " + str(lsi_model_2.docs_processed))
def load_model(wordid_txt_file, tfidf_txt_file, model_file):
    id2word = Dictionary.load_from_text(wordid_txt_file)
    mm = MmCorpus(tfidf_txt_file)
    lsi = LsiModel(corpus=mm, id2word=id2word, num_topics=400)
    lsi.save(model_file)
    return lsi
logent_document = logent_transformation[[bow_document]]  # converts a single document to log entropy representation. document must be in the same vector space as corpus.

documents = ["Some iterable", "containing multiple", "documents", "..."]
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)  # use a generator expression because...
logent_documents = logent_transformation[bow_documents]  # ...transformation is done during iteration of documents using generators, so this uses constant memory

### Chained transformations
logent_corpus = MmCorpus(corpus=logent_transformation[bow_corpus], id2word=dictionary)  # builds corpus from iterating over documents of bow_corpus as transformed to log entropy representation. Will also take many hours with Wikipedia corpus.

lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)  # creates LSI transformation model from log entropy corpus representation. Takes several hours with Wikipedia corpus.
lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary, num_topics=400)  # Performs same operation as above, but with implicit chaining

# Can persist transformation models, too.
logent_transformation.save("logent.model")
lsi_transformation.save("lsi.model")

### Similarities (the best part)
from gensim.similarities import Similarity

documents = ["A bear walked in the dark forest.",
             "Tall trees have many more leaves than short bushes.",
             "A starship may someday travel across vast reaches of space to other stars.",
             "Difference is the concept of how two or more entities are not the same."]

# A corpus can be anything, as long as iterating over it produces a representation of the corpus documents as vectors.
corpus = (dictionary.doc2bow(tokenize_func(document)) for document in documents)

index = Similarity(corpus=lsi_transformation[logent_transformation[corpus]], num_features=400, output_prefix="shard")

print("Index corpus:")
    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(review) for review in abstract_vectors]
    corpora.MmCorpus.serialize(corpus_filename, corpus)
else:
    corpus = corpora.MmCorpus(corpus_filename)

# use latent semantic indexing to try to categorize the abstracts
print("lsi")
lsi_filename = 'model.lsi'
if not os.path.isfile(lsi_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation with 5 topics
    # lsi.save(lsi_filename)  # same for tfidf, lda, ...
else:
    lsi = LsiModel.load(lsi_filename)

lsi_topics = 5  # predefined number of topics

def print_topic(lsi, topicno, topn=7):
    """
    Return a single topic as a formatted string. See `show_topic()` for parameters.

    >>> lsimodel.print_topic(topicno, topn)
    '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'
    """
    return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in show_topic(lsi, topicno, topn)])