def run_on_time_period(start, stop):
    """Fetch one document per day in ``[start, stop]`` and print LSI topics.

    Each day is converted to a UTC timestamp and handed to ``get_data``.
    Every entry of ``nicknames`` is then removed from the fetched documents
    before they are preprocessed and used to fit a 10-topic ``LsiModel``.
    The five strongest topics (five words each) are printed.

    :param start: first day to fetch; must support ``replace(tzinfo=...)``
        and ``timedelta`` arithmetic.
    :param stop: last day to fetch (inclusive).
    """
    current = start
    date_list = []
    raw_docs = []

    # Run the data getter once per day.
    while current <= stop:
        timestamp = current.replace(tzinfo=timezone.utc).timestamp()
        raw_docs.append(get_data(timestamp))
        # The recorded date is the day before the queried one.
        # NOTE(review): date_list is collected but never consumed here.
        date_list.append((current - timedelta(days=1)).date())
        current += timedelta(days=1)

    # Strip every nickname from every document.
    for index, doc in enumerate(raw_docs):
        for name in nicknames:
            if name in doc:
                doc = doc.replace(name, '')
        raw_docs[index] = doc

    final_docs = preprocess(raw_docs)
    # Named `dictionary` so the builtin `dict` is not shadowed.
    dictionary, doc_term_matrix = create_corpus(final_docs)
    lsi_model = LsiModel(doc_term_matrix, num_topics=10, id2word=dictionary)
    print(lsi_model.print_topics(num_topics=5, num_words=5))
def build_lsi(docs):
    """Build an LSI model from scratch for the given documents.

    Pipeline: dictionary -> bag-of-words corpus -> TF-IDF -> 10-topic LSI.
    Progress is reported through ``logging.info``. Nothing is returned.

    :param docs: the documents to extract topics from; must provide
        ``count()`` and be accepted by ``Dict.build_dict``.
    """
    log = logging.info
    log('There are {} documents'.format(docs.count()))

    # Dictionary and frozen (fully materialized) corpus.
    log('Building the dictionary...')
    dictionary = Dict.build_dict(docs)
    corpus = list(get_corpus(dictionary))
    log('number of corpus {}'.format(len(corpus)))
    log('Construction Completed.')

    # TF-IDF weighting.
    log('Building the tfidf model...')
    weighting = TfidfModel(corpus, normalize=True)
    corpus_tfidf = weighting[corpus]
    log('Construction Completed.')

    # LSI on top of the TF-IDF corpus.
    log('Building the LSI model...')
    lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi_model[corpus_tfidf]
    log('Construction Complete.')

    lsi_model.show_topics()
    return
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """Fit an LSA model on ``doc_clean`` and return its printed topics.

    :param doc_clean: documents handed to ``prepare_corpus``.
    :param number_of_topics: number of latent topics to fit and to report.
    :param words: number of words to show per topic.
    :return: whatever ``LsiModel.print_topics`` returns for those settings.
    """
    vocabulary, bow_matrix = prepare_corpus(doc_clean)
    model = LsiModel(
        bow_matrix,
        num_topics=number_of_topics,
        id2word=vocabulary,
    )
    topic_summaries = model.print_topics(
        num_topics=number_of_topics,
        num_words=words,
    )
    return topic_summaries
def train(self, data):
    """
    Fit LSA model to the data, or load a previously saved one.

    The model name and storage path are derived from ``data.name``. If a
    saved model can be loaded from that path it is used as is. Otherwise
    the model is trained on a bag-of-words stream of ``data`` and saved.

    :param data: Data to fit model on; must expose a ``name`` attribute.
    :return: None. Does nothing but print a message when no dictionary
        has been assigned yet.
    """
    if self.word_dict is None:
        print(
            "Dictionary must be assigned to model before training. This function call does nothing"
        )
        return

    if self.model is None:
        self.model = LsiModel(num_topics=self.vector_length,
                              id2word=self.word_dict)

    self.name = '%s_%strain' % (self.name, data.name)
    self.path = Path('modelfiles/%s/%s' % (data.name, self.name))
    model_file = str(self.path / '.model')

    try:
        self.model = LsiModel.load(model_file)
    except Exception:
        # Any load failure falls back to training from scratch, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        self.path.mkdir(parents=True, exist_ok=True)
        print("Training model...", end='')
        time.sleep(0.1)
        datastream = GetBow(data, self.remove_stopwords, self.word_dict)
        self.model.add_documents(datastream)
        self.model.save(model_file)
def _fit_lsi(self, lsi_skip_first, lsi_params) -> None:
    """Fit a gensim LSI model block by block and store its loadings.

    The model is initialized on the first data block and then updated with
    every following block from ``self.iter_blocks``. The result is stored in
    ``self._lsiModel`` and the topic matrix (transposed) in ``self.loadings``.

    :param lsi_skip_first: when true, the first LSI dimension is dropped
        from ``self.loadings``.
    :param lsi_params: extra keyword arguments for ``LsiModel``. Keys that
        this method sets itself are removed from the dict in place, with a
        warning.
    """
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        from gensim.models import LsiModel
        from gensim.matutils import Dense2Corpus

    # These arguments are controlled here and may not be overridden.
    reserved = ("corpus", "num_topics", "id2word", "chunksize", "dtype")
    for key in reserved:
        if key not in lsi_params:
            continue
        del lsi_params[key]
        logger.warning(
            f"Provided parameter, {key}, for LSI model will not be used")

    first_block = controlled_compute(self.data.blocks[0], self.nthreads)
    identity_vocab = {x: x for x in range(self.data.shape[1])}
    self._lsiModel = LsiModel(
        corpus=Dense2Corpus(first_block.T),
        num_topics=self.dims + 1,  # +1 because first dim will be discarded
        chunksize=self.data.chunksize[0],
        id2word=identity_vocab,
        **lsi_params,
    )

    # The first block was already consumed by the constructor above.
    for position, block in enumerate(
            self.iter_blocks(msg="Fitting LSI model")):
        if position > 0:
            self._lsiModel.add_documents(Dense2Corpus(block.T))

    topic_matrix = self._lsiModel.get_topics().T
    self.loadings = topic_matrix[:, 1:] if lsi_skip_first else topic_matrix
def train_LSIModel(tokens, num_top):
    """Train an LSI model on tokenized articles and save it to disk.

    :param tokens: iterable of token lists, one per article.
    :param num_top: number of topics to fit.
    :return: the trained ``LsiModel`` (also saved as ``test2.LSIModel``).
    """
    vocabulary = corpora.Dictionary(tokens)
    bags = []
    for article in tokens:
        bags.append(vocabulary.doc2bow(article))

    lsi = LsiModel(bags, num_topics=num_top, id2word=vocabulary)
    lsi.save("test2.LSIModel")
    return lsi
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """Fit an LSA model, print its topics and return the model.

    The corpus builder is obtained from
    ``project2.initialize_terms_and_postings()`` and applied to ``doc_clean``.

    :param doc_clean: documents passed to the corpus builder.
    :param number_of_topics: number of latent topics to fit and print.
    :param words: number of words printed per topic.
    :return: the fitted ``LsiModel``.
    """
    build_corpus = project2.initialize_terms_and_postings()
    vocabulary, bow_matrix = build_corpus(doc_clean)
    model = LsiModel(bow_matrix,
                     id2word=vocabulary,
                     num_topics=number_of_topics)
    summaries = model.print_topics(num_topics=number_of_topics,
                                   num_words=words)
    print(summaries)
    return model
def create_gensim_lsa_model(doc_clean, number_of_topics, lsa_training=True):
    """
    Create (or load) an LSA model with gensim and project the corpus.

    Input  : clean documents, number of topics, and whether to train
    Purpose: in training mode, fit the model, save it to ``lsi_model_path``
             and pickle the projected corpus to ``corupus_lsi_path``;
             otherwise load the saved model and project the corpus with it
    Output : (LSA model, projected corpus, dictionary)
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean, lsa_training)

    if not lsa_training:
        print('Loading lsi_model...')
        lsi_model = LsiModel.load(lsi_model_path)
        print('lsi_model Loaded!')
        corpus_lsi = lsi_model[doc_term_matrix]
        return lsi_model, corpus_lsi, dictionary

    # Training mode: fit, persist the model, then persist the projection.
    lsi_model = LsiModel(doc_term_matrix,
                         id2word=dictionary,
                         num_topics=number_of_topics)
    print('Saving lsi_model...')
    lsi_model.save(lsi_model_path)
    print('lsi_model saved!')

    corpus_lsi = lsi_model[doc_term_matrix]
    with open(corupus_lsi_path, 'wb') as handle:
        pickle.dump(corpus_lsi, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Corpus_lsi saved.')
    return lsi_model, corpus_lsi, dictionary
def train(n_topics=num_topics):
    """Train a binary bag-of-words LSA model and persist all artifacts.

    The filtered dictionary and the binary corpus are pickled into
    ``folder_path_objects``; the model is saved as
    ``./models/lsi_bin_filtered``.

    :param n_topics: number of LSI topics to fit.
    """
    processed = read_ap.get_processed_docs()
    docs = list(processed.values())

    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)

    # Save the dictionary.
    dictionary_file = os.path.join(folder_path_objects, 'dictionary_lsi_bow')
    with open(dictionary_file, 'wb') as f:
        pickle.dump(dictionary, f)

    # Regular bow corpus, then its binary (presence-only) counterpart.
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    corpus_binary = []
    for bag in corpus_bow:
        corpus_binary.append([(token_id, 1) for token_id, _ in bag])

    # Save the corpus.
    corpus_file = os.path.join(folder_path_objects, 'corpus_binary')
    with open(corpus_file, 'wb') as f:
        pickle.dump(corpus_binary, f)

    print(f'{time.ctime()} Start training LSA (binary bow)')
    lsi_bin = LsiModel(
        corpus=corpus_binary,
        id2word=dictionary,
        chunksize=1000,
        num_topics=n_topics,
    )

    # NOTE(review): the directory created is folder_path_models, but the
    # save path below is hard-coded -- confirm both point to the same place.
    os.makedirs(folder_path_models, exist_ok=True)
    lsi_bin.save('./models/lsi_bin_filtered')
def create_lsi_model(project, corpus, id2word, name, use_level=True, force=False):
    """Return an LSI model for ``project``, loading a cached one if present.

    The cache file name is built from the project path, ``name``, the topic
    count and (optionally) the project level, with a ``.lsi.gz`` suffix.

    :param project: object providing ``full_path``, ``num_topics``, ``level``.
    :param corpus: training corpus; the model is only saved when it is truthy.
    :param id2word: id-to-word mapping passed to ``LsiModel``.
    :param name: base name of the model file.
    :param use_level: append ``project.level`` to the file name.
    :param force: retrain even if a cached model file exists.
    :return: ``(model, model_fname)``.
    """
    level_part = project.level if use_level else ''
    model_fname = (project.full_path + name + str(project.num_topics)
                   + level_part + '.lsi.gz')

    if os.path.exists(model_fname) and not force:
        return LsiModel.load(model_fname), model_fname

    model = LsiModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=project.num_topics,
    )
    if corpus:
        model.save(model_fname)
    return model, model_fname
class LsiVec(TopicVec):
    """Topic vectorizer backed by a gensim ``LsiModel``.

    The model is stored as ``lsi.model`` inside ``self.out_dir``.
    """

    def __init__(self, vec_num):
        TopicVec.__init__(self, vec_num)

    def __gen_model(self, corpus):
        """Train the LSI model on ``corpus`` and save it to ``out_dir``."""
        target = os.path.join(self.out_dir, 'lsi.model')
        self.model = LsiModel(corpus,
                              id2word=self.dictionary,
                              num_topics=self.vec_num)
        self.model.save(target)

    def __get_model(self):
        """Load the saved LSI model, or raise if the file is missing."""
        model_name = 'lsi.model'
        if not os.path.exists(os.path.join(self.out_dir, model_name)):
            raise FileNotFoundError('"{}" file not found!'.format(model_name))
        self.model = LsiModel.load(os.path.join(self.out_dir, model_name))

    def fit(self, doc, out_dir, use_exist_dictionary=False):
        """Run the base-class fit, then train the LSI model on the corpus."""
        TopicVec.fit(self, doc, out_dir, use_exist_dictionary)
        self.__gen_model(self.corpus)
def trainModel(self):
    """Train the LSI model and build the similarity index.

    When ``self.toweight`` is set, both the model and the index are built on
    the TF-IDF-weighted corpus; otherwise the raw corpus is used.
    """
    if not self.toweight:
        self.model = LsiModel(self.corpus, num_topics=self.num_topics)
        self.index = MatrixSimilarity(self.model[self.corpus])
        return

    self.model = LsiModel(self.tfidf[self.corpus],
                          num_topics=self.num_topics)
    self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]])
def train_models():
    """Load or train the models enabled in ``settings``.

    For each enabled model type ("msda", "lsi") a saved model with the
    configured dimensionality is loaded if possible; otherwise it is trained
    on ``bow_corpus()`` and saved. ``__out_size`` is set on every model.

    :return: dict mapping model name to model instance.
    """
    models = {}

    if settings["models"]["msda"]:
        dims = settings["dimensionalities"]["msda"]
        try:
            msda = mSDA.load("reuters_msda_%sdims" % dims)
        except Exception:
            ln.info("Training mSDA...")
            # The `dims` most frequent terms (by document frequency) serve as
            # prototypes. A tuple-parameter lambda is a SyntaxError on
            # Python 3, so the pair is indexed instead.
            by_doc_freq = sorted(dictionary.dfs.items(),
                                 key=lambda item: item[1],
                                 reverse=True)
            prototype_ids = [id_ for id_, _ in by_doc_freq[:dims]]
            msda = mSDA(0.5, 5, len(dictionary), dims,
                        prototype_ids=prototype_ids)
            msda.train(bow_corpus())
            msda.save("reuters_msda_%sdims" % dims)
        msda.__out_size = dims
        models["msda"] = msda

    if settings["models"]["lsi"]:
        dims = settings["dimensionalities"]["lsi"]
        try:
            lsi = LsiModel.load("reuters_lsi_%sdims" % dims)
        except Exception:
            ln.info("Training LSI...")
            lsi = LsiModel(corpus=bow_corpus(), num_topics=dims,
                           id2word=dictionary)
            lsi.save("reuters_lsi_%sdims" % dims)
        lsi.__out_size = dims
        models["lsi"] = lsi

    return models
def get_topic(text):
    """Extract candidate topic words from ``text``.

    The text is lemmatized with spaCy (stop words, punctuation, numbers and
    newlines dropped), passed through a gensim bigram model and used to fit
    both an LSI and an LDA model with 10 topics each. Words tagged as verbs,
    adjectives or adverbs are removed, and the words found by both models
    (length >= 2) are returned.

    :param text: raw document text.
    :return: list of topic words common to the LDA and LSI results.
    """
    np.random.seed(100)
    nlp = spacy.load('en')

    my_stop_words = [
        u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'get'
    ]
    for stopword in my_stop_words:
        nlp.vocab[stopword].is_stop = True

    # Keep the lemma of every token that is not a newline, stop word,
    # punctuation mark or number.
    article = [
        w.lemma_ for w in nlp(text)
        if w.text != '\n'
        and not w.is_stop and not w.is_punct and not w.like_num
    ]
    texts = [article]

    # Bigrams via gensim.
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]

    # Bag-of-words corpus.
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(i) for i in texts]

    # LSI and LDA models.
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]

    # Verbs are removed since topics are generally nouns; adjectives and
    # adverbs are removed as well.
    excluded_pos = ('VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'VBG', 'JJ', 'RB')

    # LDA: only the top word of each topic is considered.
    lda_heads = [topic_words[0] for topic_words in ldatopics]
    lda_tagged = nltk.pos_tag(lda_heads)
    ldafinaltopics = list(
        set(word for word, pos in lda_tagged if pos not in excluded_pos))

    # LSI: every word of every topic is considered.
    lsi_words = [word for topic_words in lsitopics for word in topic_words]
    lsi_tagged = nltk.pos_tag(lsi_words)
    lsifinaltopics = [
        word for word, pos in lsi_tagged if pos not in excluded_pos
    ]

    # Intersection of results from both models.
    finaltopics = list(set(ldafinaltopics) & set(lsifinaltopics))
    return [word for word in finaltopics if len(word) >= 2]
def build_lsi_model(dictionary, corpus, should_rebuild):
    """Build the LSI model, or load it from its backup file.

    :param dictionary: id-to-word mapping for the model.
    :param corpus: training corpus.
    :param should_rebuild: when true, train a new model and save it to
        ``cfg.LSI_BACKUP``; when false, load the model from that backup.
    :return: the ``LsiModel``. If loading the backup fails, the error is
        reported via ``utils.print_exception_details`` and an empty list is
        returned.
    """
    lsi = list()

    # The former DEBUG override that forced should_rebuild to True is gone,
    # so the caller's choice is honoured.
    if not should_rebuild:
        try:
            print('Loading LSI Model backup...')
            # Load the LSI backup with LsiModel, matching what the rebuild
            # branch saves. The previous code read the LDA backup with
            # LdaModel.
            lsi_file = utils.get_file_path(cfg.LSI_BACKUP)
            print('LSI file = {}'.format(lsi_file))
            lsi = LsiModel.load(lsi_file)
        except Exception as exc:
            utils.print_exception_details('Building LSI Model', exc)
    else:
        print('Building LSI Model...')
        # NOTE(review): onepass is enabled when NUM_PASSES > 1, which looks
        # inverted -- confirm the intended meaning of cfg.NUM_PASSES.
        one_pass = cfg.NUM_PASSES > 1
        lsi = LsiModel(corpus, id2word=dictionary,
                       num_topics=cfg.NUM_TOPICS, onepass=one_pass)
        print('Done!')

        # Save model structures.
        LSI_FILE = utils.get_file_path(cfg.LSI_BACKUP)
        lsi.save(LSI_FILE)

    return lsi
def train_model(self, num_topics):
    """Train an LSI model on this object's corpus and save it.

    The model file name is ``self.path + self.model_type + "_model"``.

    :param num_topics: number of topics to fit.
    :return: the trained ``LsiModel``.
    """
    lsi = LsiModel(self.get_corpus(), num_topics=num_topics)
    destination = self.path + self.model_type + "_model"
    lsi.save(destination)
    return lsi
def main():
    """Train an LSI model on a sublexicalized Wikipedia dump.

    Command-line options: corpus file (required), model output file
    (required), optional vocabulary and TF-IDF model files (loaded when they
    exist, otherwise created and saved), and process counts for parsing and
    sublexicalization. The script exits silently if a required option is
    missing.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    # The handle itself is not used (WikiCorpus receives the file name);
    # opening it still fails early if the dump cannot be opened.
    with BZ2File(corpus_fn):
        corpus = SublexicalizedCorpus(
            WikiCorpus(corpus_fn, processes=n_proc_parse,
                       dictionary=Dictionary()),
            order=(3, 6),
            clean_func=normalize_whitespace,
            n_proc=n_proc_sublex,
            create_dictionary=False)

    if vocab_fn and os.path.exists(vocab_fn):
        logging.info("Loading vocabulary from %s" % vocab_fn)
        vocab = Dictionary.load(vocab_fn)
    else:
        logging.info("Creating vocabulary")
        # time.clock() was removed in Python 3.8; time.time() measures the
        # elapsed wall-clock time on every version.
        start = time.time()
        vocab = Dictionary(corpus.get_texts())
        end = time.time()
        logging.info("Vocabulary created in %d seconds" % (end - start))
        if vocab_fn:
            logging.info("Saving dictionary to %s" % vocab_fn)
            vocab.save(vocab_fn)

    corpus.dictionary = vocab
    corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
    corpus.dictionary.compactify()

    if tfidf_fn and os.path.exists(tfidf_fn):
        logging.info("Reading TF-IDF model from %s" % tfidf_fn)
        tfidf = TfidfModel.load(tfidf_fn)
    else:
        logging.info("creating TF-IDF model")
        tfidf = TfidfModel(corpus)
        if tfidf_fn:
            logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
            tfidf.save(tfidf_fn)

    # Lazily TF-IDF-weight every article while the LSI model consumes it.
    bow_corpus = (tfidf[art] for art in corpus)
    model = LsiModel(corpus=bow_corpus, num_topics=10,
                     id2word=corpus.dictionary)
    model.save(model_fn)
class LatentSemanticIndexing():
    """
    This class implements Latent semantic indexing using the gensim library.

    The LSI model and the similarity index are cached under
    ``./saved_models`` and loaded from there when the files already exist.
    """

    def __init__(self, corpus, embedding="bow", num_topics=500, chunksize=20000):
        """Load or train the LSI model, then load or build the index.

        :param corpus: corpus object providing ``dictionary`` and
            ``get_corpus()`` (plus ``doc_ids`` and, for tf-idf queries,
            ``tfidf_model``, which ``search`` uses).
        :param embedding: "bow" or "tfidf"; used in file names and to pick
            the query transformation in ``search``.
        :param num_topics: number of LSI topics.
        :param chunksize: training chunk size passed to ``LsiModel``.
        """
        self.lsi_model_path = "./saved_models/gensim-lsi-{}-model-nt-{}.mm".format(embedding, num_topics)
        self.lsi_corpus_path = "./saved_models/gensim-{}-lsi-nt-{}-corpus.crp".format(embedding, num_topics)
        self.sim_matrix_path = "./saved_models/sim-matrix-{}-{}.mm".format(embedding, num_topics)
        self.sim_matrix_temp_path = "./saved_models/sim_temps/sim_temp-{}-{}.tmp".format(embedding, num_topics)
        self.embedding = embedding
        self.corpus = corpus
        self.num_topics = num_topics

        if os.path.exists(self.lsi_model_path):
            print("LSI {} model already trained, loading from disk.".format(embedding))
            self.model = LsiModel.load(self.lsi_model_path)
        else:
            # Accessing an item is only done to "load" the dictionary so
            # that id2token is available; the value itself is not needed.
            corpus.dictionary[0]
            id2word = corpus.dictionary.id2token

            print("Training LSI model.")
            self.model = LsiModel(
                corpus=list(corpus.get_corpus()),
                id2word=id2word,
                chunksize=chunksize,
                num_topics=num_topics
            )
            print("Saving LSI model.")
            self.model.save(self.lsi_model_path)

        self.lsi_corpus = ModelCorpus(corpus.get_corpus(), self.model, path=self.lsi_corpus_path)

        if os.path.exists(self.sim_matrix_path):
            print("Similarities matrix {} model already trained, loading from disk.".format(embedding))
            self.index = similarities.Similarity.load(self.sim_matrix_path)
        else:
            print("Creating similarities index.")
            Path(self.sim_matrix_temp_path).touch(exist_ok=True)
            self.index = similarities.Similarity(self.sim_matrix_temp_path, self.lsi_corpus, num_features=self.num_topics)
            self.index.save(self.sim_matrix_path)

    def search(self, query):
        """Rank all documents by similarity to ``query``.

        :param query: raw query text.
        :return: list of ``(doc_id, similarity)`` pairs, most similar first.
        :raises ValueError: if ``self.embedding`` is neither "bow" nor
            "tfidf".
        """
        query_repr = read_ap.process_text(query)
        vec_query = self.corpus.dictionary.doc2bow(query_repr)

        if self.embedding == "bow":
            lsi_query = self.model[vec_query]
        elif self.embedding == "tfidf":
            lsi_query = self.model[self.corpus.tfidf_model[vec_query]]
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on lsi_query.
            raise ValueError(
                "Unsupported embedding: {!r}".format(self.embedding))

        sims = self.index[lsi_query]
        sims = sorted(zip(self.corpus.doc_ids, sims), key=lambda item: -item[1])
        return sims
def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]

    Fits two LSA models on the document-term matrix of ``filtered_dcm`` (one
    with twice as many topics as terms, one with a tenth as many) and shows
    a heat map of part of the smaller model's topic matrix.

    :param pp_descriptions: descriptions, only used for the verbose info
        output.
    :param filtered_dcm: document-term-matrix object providing ``dtm``,
        ``all_terms``, ``show_info`` and ``add_pseudo_keyworddocs``.
    :param verbose: print information about ``filtered_dcm`` first.
    """
    # TODO options here:
    # * if it should filter AFTER the LSI
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
        if get_setting("DCM_QUANT_MEASURE") != "binary":
            logger.warning("VISR12 say it works best with binary!")

    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])

    print("Start creating the LSA-Model with MORE topics than terms...")
    # Uses filtered_dcm.dtm / filtered_dcm.all_terms like the second model;
    # the bare names doc_term_matrix / all_terms are not defined here.
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) * 2,
                                   id2word=dictionary)

    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm,
                                   num_topics=len(filtered_dcm.all_terms) // 10,
                                   id2word=dictionary)
    print()

    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save plot!
    topic_matrix = lsamodel_lesstopics.get_topics()
    # The colormap is given by name, which avoids the deprecated
    # matplotlib.cm.get_cmap lookup.
    plt.imshow(topic_matrix[:100, :200],
               vmin=topic_matrix.min(),
               vmax=topic_matrix.max(),
               cmap="coolwarm")
    plt.show()
def lsi(clean_docs, model_name, topics):
    """Train an LSI model on ``clean_docs``, persist everything, print topics.

    Writes ``corpus_<model_name>.mm``, ``dictionary_<model_name>.gensim`` and
    ``lsi_model_<model_name>.gensim`` to the working directory.

    :param clean_docs: iterable of token lists.
    :param model_name: tag used in the output file names.
    :param topics: number of topics to find in the data.
    """
    from gensim import corpora
    from gensim.corpora import MmCorpus
    from gensim.models import LsiModel

    # Mapping of normalized words to integer ids.
    dictionary = corpora.Dictionary(clean_docs)

    # Bag-of-words per document: list of (token_id, token_count) tuples.
    corpus = [dictionary.doc2bow(doc) for doc in clean_docs]

    # Serialize corpus and dictionary for future use.
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train and save the LSI model.
    lsimodel = LsiModel(corpus, num_topics=topics, id2word=dictionary)
    lsimodel.save('lsi_model_' + model_name + '.gensim')

    for topic in lsimodel.print_topics(num_words=5):
        print(topic)
def main(argv=None):
    """Build the simple-wiki corpus, TF-IDF model, LSI model and index.

    Downloads the dump if ``WIKIFILE`` is missing, writes the article lookup
    table, the serialized corpora, the dictionary, the TF-IDF model, the LSI
    model and the similarity matrix to their configured file names.

    :param argv: command-line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already.
    if not os.path.isfile(WIKIFILE):
        wget.download(WIKIURL)

    wiki = WikiCorpus(WIKIFILE, lemmatize=False)

    # Position in the corpus -> (article url, second metadata field).
    url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
    article_dict = {
        position: (url_string.format(text[0]), text[1])
        for position, text in enumerate(wiki.get_texts(meta=True))
    }
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)

    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI on the TF-IDF-weighted corpus.
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)

    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def lsi(dataframe, num_topics=300):
    """Return an LSI model for the documents stored in a DataFrame.

    A previously cached model is read from ``caches/models/lsi.model``;
    otherwise the model is trained on the TF-IDF corpus and cached there.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 300)
        The number of topics to train the LSI model with. Only used when
        no cached model exists.

    Returns
    -------
    model : Gensim LsiModel
        LSI model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lsi.model'
    if os.path.isfile(filename):
        return LsiModel.load(filename)

    dictionary = dictionary_corpus(dataframe)
    bow = bow_corpus(dataframe)
    weighting = tfidf(dataframe)
    model = LsiModel(weighting[bow], id2word=dictionary,
                     num_topics=num_topics)
    model.save(filename)
    return model
def lsi(corpus, dictionary):
    """Fit a 100-topic LSI model and return the document similarity index.

    The model's topics are printed before returning.

    :param corpus: indexable bag-of-words corpus (``len`` and ``[]`` used).
    :param dictionary: id-to-word mapping for the model.
    :return: ``MatrixSimilarity`` over the LSI-projected documents.
    """
    model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    projected = [model[corpus[position]] for position in range(len(corpus))]
    similarity_index = MatrixSimilarity(projected)
    print(model.show_topics())
    return similarity_index
class LSI():
    """LSI model handling plus dense (LSI-space) vector storage.

    Rows of ``self.sparse`` are flat sequences of alternating id / value
    entries; they are converted to ``(int(id), value)`` pairs before being
    handed to gensim.
    """

    @timed
    def init_lsi(self, **kwargs):
        """Train the LSI model on ``self.sparse`` and save it."""
        rows = self.sparse

        # A class with __iter__ can be iterated repeatedly, which is needed
        # to handle onepass=False.
        class _PairedRows():
            def __iter__(self):
                for row in rows:
                    yield [(int(row[k]), row[k + 1])
                           for k in range(0, len(row), 2)]

        self.lsi = LsiModel(_PairedRows(), **kwargs)
        self.lsi.save(self.path + 'lsi.pkl')

    def load_lsi(self):
        """Load the saved LSI model."""
        self.lsi = LsiModel.load(self.path + 'lsi.pkl')

    def load_dense(self, storage='disk'):
        """Load the stored dense vectors."""
        self.dense = sorbet(self.path + 'dense', kind=storage).load()

    def sparse_to_dense(self, sparse):
        """Project one sparse document and return it as a float array."""
        projected = self.lsi[sparse]
        full = sparse2full(projected, self.lsi.num_topics)
        return array('f', full)

    @timed
    def init_dense(self, storage=None, workers=None):
        """Build the dense vectors, in parallel when more than one worker.

        Explicit arguments win over ``dense__*`` params, which win over the
        generic ``workers`` / ``storage`` params.
        """
        params = self.params
        _workers = (workers or params.get('dense__workers')
                    or params.get('workers', 1))
        _storage = (storage or params.get('dense__storage')
                    or params.get('storage', 'disk'))
        if _workers > 1:
            self._init_dense_mp(workers=_workers, storage=_storage)
        else:
            self._init_dense_sp(storage=_storage)

    def _init_dense_sp(self, storage='disk'):
        """Single-process variant: convert every sparse row in turn."""
        self.dense = sorbet(self.path + 'dense', kind=storage).new()
        for row in self.sparse:
            pairs = [(int(row[k]), row[k + 1])
                     for k in range(0, len(row), 2)]
            self.dense.append(self.sparse_to_dense(pairs))
        self.dense.save()

    def _init_dense_mp(self, workers, storage):
        """Multi-process variant: workers convert documents by id."""
        chunksize = self.params.get('dense__chunksize', 10)
        store = sorbet(self.path + 'dense').new()
        total = len(self.meta)
        id_iter = tqdm(range(total), 'dense', len(self.meta))
        with mp.Pool(workers, init_dense_worker, [self.path]) as pool:
            for vector in pool.imap(dense_worker, id_iter, chunksize):
                store.append(vector)
        self.dense = store.save()
def train(text_corpus_file, dict_file):
    """Train a 400-topic LSI model from a text corpus and save it.

    The model is written to ``model_file``; the left singular vectors
    (``projection.u``) and their sizes are printed afterwards.

    :param text_corpus_file: path passed to ``TextCorpus``.
    :param dict_file: path of a saved gensim ``Dictionary``.
    """
    gutenberg_corpus = TextCorpus(text_corpus_file)
    # Named `dictionary` so the builtin `dict` is not shadowed.
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary,
                   num_topics=400)
    lsi.save(model_file)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(lsi.projection.u)
    print(lsi.projection.u.size)
    print(lsi.projection.u[0].size)
def save_lsi_model(corpus_tfidf, dictionary):
    """Train a 3000-topic LSI model and save it to a temporary file.

    :param corpus_tfidf: TF-IDF-weighted corpus to train on.
    :param dictionary: id-to-word mapping for the model.
    :return: path of the temporary file holding the saved model.
    """
    print("lsi model")
    model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=3000)

    tmp_fname = get_tmpfile("lsi.model")
    print("saving tmp file")
    model.save(tmp_fname)
    return tmp_fname
def __gen_model(self, corpus):
    """Train the LSI model on ``corpus`` and save it as ``lsi.model``.

    The file is written into ``self.out_dir``; the model uses
    ``self.dictionary`` and ``self.vec_num`` topics.
    """
    destination = os.path.join(self.out_dir, 'lsi.model')
    self.model = LsiModel(corpus,
                          num_topics=self.vec_num,
                          id2word=self.dictionary)
    self.model.save(destination)
def fit_model(self, corpus: List):
    """
    Fit a Gensim Latent Semantic Analysis model on ``corpus``.

    A dictionary is built from the corpus and every document is converted
    to its bag-of-words form. Nothing is returned: the fitted model is
    stored in the ``model`` attribute. ``self.additional_parameters`` is
    forwarded to ``LsiModel`` as keyword arguments.
    """
    vocabulary = Dictionary(corpus)
    bags = []
    for doc in corpus:
        bags.append(vocabulary.doc2bow(doc))
    self.model = LsiModel(bags, id2word=vocabulary,
                          **self.additional_parameters)
def lsi(all_tokens_lists):
    """Fit a 7-topic LSI model on TF-IDF-weighted documents and print it.

    :param all_tokens_lists: iterable of token lists, one per document.
    """
    vocabulary = corpora.Dictionary(all_tokens_lists)
    bags = [vocabulary.doc2bow(tokens) for tokens in all_tokens_lists]

    weighting = models.TfidfModel(bags, smartirs='ntc')
    weighted_corpus = weighting[bags]

    model = LsiModel(
        corpus=weighted_corpus,
        id2word=vocabulary,
        num_topics=7,
        decay=0.5,
    )
    # All topics, ten words each.
    pprint(model.print_topics(-1, 10))
def create_lsi(num_topic, dictionary):
    """Fit a 10-topic LSI model and print its first ``num_topic`` topics.

    :param num_topic: number of topics to print (the model itself is always
        fitted with 10 topics).
    :param dictionary: input passed to ``generate_corpus``.
    :return: the fitted ``LsiModel``.
    """
    corpus, dic = generate_corpus(dictionary)
    print("__________________________Create LSI_________________________")
    lsimodel = LsiModel(corpus=corpus, id2word=dic, num_topics=10)

    # Show the requested number of topics.
    for topic in lsimodel.print_topics(num_topic):
        print(topic)
    return lsimodel
def getLsiModel(tfidfModel) -> LsiModel:
    """Return the cached LSI model, training and caching it if missing.

    :param tfidfModel: object whose ``vectors`` matrix holds one document
        per row; only used when no cached model file is found.
    :return: the 200-topic ``LsiModel``.
    """
    cache_file = os.path.join('.cache', 'lsi.gensim_model')
    try:
        return LsiModel.load(cache_file)
    except FileNotFoundError:
        # Rows are documents, hence documents_columns=False.
        row_corpus = Sparse2Corpus(tfidfModel.vectors,
                                   documents_columns=False)
        model = LsiModel(row_corpus, num_topics=200)
        model.save(cache_file)
        return model
def latent_semantic_indexing(corpus, num_topics, id2word):
    '''
    LATENT SEMANTIC INDEXING

    Advantage of LSI: it ranks topics by itself and outputs them in ranked
    order. It requires a num_topics parameter (200 by default in gensim) to
    determine the number of latent dimensions after the SVD.

    :param corpus: bag-of-words corpus to train on.
    :param num_topics: number of latent dimensions.
    :param id2word: id-to-word mapping for the model.
    :return: the fitted ``LsiModel``.
    '''
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Latent Semantic Indexing')
    # The former show_topics() calls were dropped: their results were
    # discarded or bound to an unused local, and they were called with the
    # default log=False.
    lsi_model = LsiModel(corpus=corpus, num_topics=num_topics,
                         id2word=id2word)
    return lsi_model
def topicsLSI(self, num_topics=10, num_words=10):
    """Fit an LSI model on this object's corpus and return its topics.

    :param num_topics: number of topics to fit.
    :param num_words: number of most significant words per topic.
    :return: result of ``show_topics(num_words=num_words, formatted=False)``,
        i.e. the unformatted topic listing.
    """
    model = LsiModel(
        corpus=self.corpus,
        id2word=self.id2word,
        num_topics=num_topics,
    )
    unformatted_topics = model.show_topics(num_words=num_words,
                                           formatted=False)
    return unformatted_topics
def lsi_model(dictionary, corpus, corpus_tfidf, cluster_keyword_lsi):
    """Use an LSI model to obtain the topic distribution and export keywords.

    A 20-topic model is fitted on ``corpus_tfidf``. For each topic, the
    topic id is printed and one line ``<id>\\t<word,word,...>`` is written to
    ``cluster_keyword_lsi``.

    :param dictionary: id-to-word mapping for the model.
    :param corpus: unused; kept for interface compatibility.
    :param corpus_tfidf: TF-IDF-weighted corpus to train on.
    :param cluster_keyword_lsi: path of the keyword output file.
    :return: the fitted ``LsiModel``.
    """
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)

    # The context manager guarantees the file is flushed and closed; it was
    # previously left open.
    with open(cluster_keyword_lsi, 'w+', encoding='utf-8') as f_keyword:
        for topic in lsi.print_topics(20, 20):
            print(topic[0])
            # topic[1] looks like 'weight*"word" + weight*"word" + ...';
            # keep the part after '*' of every term, without spaces.
            words = [
                term.split('*')[1].replace(' ', '')
                for term in topic[1].split('+')
            ]
            f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')

    return lsi
def make_corpus():
    """Build the corpus, TF-IDF and LSI models and store them next to HERE.

    Writes ``sogou.dict`` (dictionary), ``sogou.model`` (TF-IDF model) and
    ``sogou.lsi`` (400-topic LSI model).
    """
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]

    num_terms = 400  # number of LSI topics
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary,
                         num_topics=num_terms)

    # Store the dictionary and both models for later use.
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("save dictionary and tfidf model")
class LSITransformation:
    """LSI projection of token vectors with a dissimilarity measure.

    The transformation is fitted immediately on construction.
    """

    def __init__(self, input_space_vectors_map):
        """Fit the LSI model on the values of ``input_space_vectors_map``."""
        self.input_space_vectors = input_space_vectors_map.values()
        self.transform()

    def transform(self):
        """Build the LSI space and model; return the fitted model."""
        self.space = LSISpace(self.input_space_vectors)
        # TODO Handle Saner Reduction
        self.reduced_space = 15
        input_BOWs = [self.space.doc2bow(vector)
                      for vector in self.input_space_vectors]
        self.lsi_model = LsiModel(corpus=input_BOWs,
                                  num_topics=self.reduced_space,
                                  id2word=self.space.id2Word())
        return self.lsi_model

    def dissimilarity_score(self, tokens, other_tokens):
        """Return ``1 - cosine similarity`` of the two token lists in LSI space."""
        bows = self.space.doc2bow(tokens)
        other_bows = self.space.doc2bow(other_tokens)
        vector = self.infer_and_vectorize(bows)
        other_vector = self.infer_and_vectorize(other_bows)
        similarity = CosineSimilarity().calculate(vector, other_vector)
        return 1 - similarity

    def infer_and_vectorize(self, bows):
        """Project ``bows`` and return a dense list of length ``reduced_space``.

        Dimensions missing from the projection are filled with ``0.0``.
        """
        transformed_bow = defaultdict(float)
        transformed_bow.update(dict(self.lsi_model[bows]))
        return [transformed_bow[dimension]
                for dimension in range(self.reduced_space)]

    def print_transformation(self):
        """Print every topic as a list of (rounded value, token) pairs."""
        topics = self.lsi_model.show_topics(num_words=self.space.length(),
                                            formatted=False)
        for topic in topics:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print([(round(value, 4), token) for value, token in topic])
def transform(self):
    """Build the LSI space and fit a 15-topic model on the input vectors.

    Sets ``self.space``, ``self.reduced_space`` and ``self.lsi_model``.

    :return: the fitted ``LsiModel``.
    """
    self.space = LSISpace(self.input_space_vectors)
    # TODO Handle Saner Reduction
    self.reduced_space = 15

    bags = []
    for vector in self.input_space_vectors:
        bags.append(self.space.doc2bow(vector))

    self.lsi_model = LsiModel(
        corpus=bags,
        num_topics=self.reduced_space,
        id2word=self.space.id2Word(),
    )
    return self.lsi_model
def __init__(self, dict_path, model_path):
    """Load an LSA space (dictionary plus LSI model) from disk.

    :dict_path: path to the dictionary file (text format).
    :model_path: path to the saved LSI model file.
    """
    dictionary = Dictionary.load_from_text(dict_path)
    self._dictionary = dictionary
    lsi_model = LsiModel.load(model_path)
    self._lsi_model = lsi_model
def build(self):
    """Fit a 3-topic LSI model on the TF-IDF corpus and collect its topics.

    Sets ``self.lsi_model``, ``self.corpus_lsi`` (the projected corpus) and
    ``self.topics`` (unformatted topics, four words each).
    """
    # TODO: find a way to pick a proper number of clusters, maybe based on
    # the number of POSTs.
    model = LsiModel(self.corpus_tfidf,
                     id2word=self.dictionary,
                     num_topics=3)
    self.lsi_model = model
    self.corpus_lsi = model[self.corpus_tfidf]
    self.topics = model.show_topics(num_topics=5,
                                    num_words=4,
                                    log=False,
                                    formatted=False)
def load_corpus():
    """Load the saved dictionary, TF-IDF model, LSI model and classifier.

    All files are read from the ``HERE`` directory. The classifier is
    optional.

    :return: ``(dictionary, tfidf_model, lsi_model, sg_class)`` where
        ``sg_class`` is ``None`` if the classifier could not be loaded.
    """
    dictionary = corpora.Dictionary.load(os.path.join(HERE, "sogou.dict"))
    tfidf_model = tfidfmodel.TfidfModel.load(os.path.join(HERE, "sogou.model"))
    lsi_model = LsiModel.load(os.path.join(HERE, "sogou.lsi"))
    try:
        sg_class = joblib.load(os.path.join(HERE, "sgdc_clf.pkl"))
    except Exception:
        # Best effort: a missing or unreadable classifier is tolerated, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        sg_class = None
    return dictionary, tfidf_model, lsi_model, sg_class
def fit(self, raw_documents, y=None):
    """Prepare the transformer by loading its pre-trained models.

    Nothing is learned from the arguments: the analyzer is built and the
    LSI model is loaded from ``self.model_fn``. A TF-IDF model is loaded
    as well when ``<model_fn>.tfidf`` exists.

    :param raw_documents: ignored.
    :param y: ignored.
    :return: ``self``.
    """
    self.analyzer_func = self.build_analyzer()
    self.model = LsiModel.load(self.model_fn)

    tfidf_file = self.model_fn + '.tfidf'
    if os.path.exists(tfidf_file):
        self.tfidf = TfidfModel.load(tfidf_file)
    return self
def create_lsi_model(project, corpus, id2word, name, use_level=True, force=False):
    """Train an LSI model for ``project`` or load it from its cache file.

    The file name is ``full_path + name + num_topics [+ level] + '.lsi.gz'``.
    A new model is trained when the file is missing or ``force`` is set; it
    is only written to disk when ``corpus`` is truthy.

    :return: tuple of the model and the model file name.
    """
    parts = [project.full_path + name + str(project.num_topics)]
    if use_level:
        parts[0] += project.level
    model_fname = parts[0] + '.lsi.gz'

    must_train = force or not os.path.exists(model_fname)
    if must_train:
        model = LsiModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=project.num_topics)
        if corpus:
            model.save(model_fname)
    else:
        model = LsiModel.load(model_fname)

    return model, model_fname
def algorithm_lsi(self, category_id, objs, goldstandards):
    """Cluster ``objs`` with LSI and report labels, topics and metrics.

    Each object is labelled with the id of its highest-valued LSI topic (or
    a WARNING string if its projection is empty). The labelled names and the
    model topics are printed; metrics are computed when more than one topic
    is used.

    :param category_id: category identifier forwarded to the metrics.
    :param objs: sequence of objects; each needs a ``'name'`` entry.
    :param goldstandards: reference labels forwarded to the metrics.
    """
    numTopics = self.calculate_k_using_firstnames(objs)
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Using k = "+str(numTopics))

    texts = [get_categorizedproduct_content(obj) for obj in objs]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  # bag of words

    print("Create models")
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=numTopics)
    corpus_lsi = lsi_model[corpus]
    print("Done creating models")

    results = []
    labels = []
    for cont, (probabilities, obj) in enumerate(izip(corpus_lsi, objs)):
        if probabilities:
            # Topic id with the largest value for this document.
            max_prop = max(probabilities, key=lambda item: item[1])[0]
        else:
            max_prop = "WARNING "+str(texts[cont])
        labels.append(max_prop)
        results.append(str(max_prop)+" # "+obj['name'].encode('utf8'))

    results.sort()
    for r in results:
        print(r)

    for topic_id, topic in enumerate(lsi_model.show_topics(num_words=5)):
        print("TOPIC (LSI2) " + str(topic_id) + " : " + topic)

    if numTopics > 1:
        self.calculate_metrics(category_id, objs, labels, goldstandards)
    else:
        print("number of clusters equals or lower than 1, ignoring metric")
def load_corpus(self, corpus_name):
    '''
    Load every corpus file that belongs to ``corpus_name``.

    Reads ``<name>.mm``, ``<name>_wordids.txt``, ``<name>.lsi_model`` and
    ``<name>-lsi.index`` into ``corpus_mm``, ``corpus_dict``, ``model`` and
    ``index``. This needs to be moved to a more general class
    initialization. (FIXME Freija)
    '''
    self.corpus_name = corpus_name
    self.corpus_mm = MmCorpus(corpus_name + '.mm')
    self.corpus_dict = Dictionary.load_from_text(corpus_name + '_wordids.txt')
    self.model = LsiModel.load(corpus_name + '.lsi_model')
    self.index = similarities.MatrixSimilarity.load(corpus_name + '-lsi.index')
def main(argv=None):
    """Build the speech corpus, TF-IDF model, LSI model and index.

    Reads the speeches from ``RAWFILE`` and their ids from ``RAWIDS``, then
    writes the article lookup table, the dictionary, the serialized corpora,
    the TF-IDF model, the LSI model and the similarity matrix.

    :param argv: command-line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    print('Creating speech serialized corpus')
    # The raw file is JSON shaped like:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)

    # Article ids are saved in the format the gensimple engine understands:
    # "int": ["url", "title"]
    texts = []
    article_dict = {}
    for position, (key, value) in enumerate(speech_dict.items()):
        texts.append(list(value['text']))
        article_dict[str(position)] = [value['url'], id_dict[key]['title']]

    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)

    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on TF-IDF.
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)

    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def export_model(model_file, out_file):
    """Dump a saved LSI model as text.

    The first line is ``<numTerms>\\t<numTopics>``.  Each term then takes
    two lines: the word with trailing non-alphanumeric characters
    stripped, followed by its tab-separated projection weights.
    The original note says the output is utf-8 encoded; that depends on
    ``FileWriter``, which is defined elsewhere.
    """
    # model = model_mapping[model_type].load(model_file)
    model = LsiModel.load(model_file)
    with FileWriter(out_file, "w").open() as out:
        out.write(u"{0}\t{1}\n".format(model.numTerms, model.numTopics))
        for term in xrange(model.numTerms):
            word = model.id2word.id2token[term].decode("utf-8")
            # Drop trailing punctuation/symbols one character at a time.
            while word and not word[-1].isalnum():
                word = word[:-1]
            out.write(u"{0}\n".format(word))

            weights = numpy.asarray(model.projection.u.T[:, term]).flatten()
            row = u"\t".join(str(f) for f in weights)
            out.write(u"{0}\n".format(row))
elif not opts.scaling:
    # No scaling option given: the LSI model is fed plain bag-of-words vectors.
    scaling = None
else:
    raise ValueError("Only tfidf scaling is supported")

# Pick the document source: whole words or sublexical units, both read from
# the same Wikipedia dump and capped by word_limit.
word_model = opts.word_model
if word_model:
    logging.info("Building word model")
    corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
else:
    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

# Build the vocabulary from the corpus and prune it with the cutoff.
voc = Dictionary(corpus)
voc.filter_extremes(no_below=cutoff)
voc.compactify()

# Generator expression: each bow_corpus can be iterated only once.
bow_corpus = (voc.doc2bow(art) for art in corpus)

tfidf = None
if scaling == 'tfidf':
    tfidf = TfidfModel(bow_corpus)
    # The generator above was handed to TfidfModel, so a fresh one is built
    # here, this time yielding TF-IDF weighted vectors.
    bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
model.save(model_fn)
if tfidf:
    # The TF-IDF model is saved next to the LSI model, with a '.tfidf' suffix.
    tfidf.save(model_fn + '.tfidf')
# Log Entropy weights frequencies of all document features in the corpus
logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)

# The tokenizer used to create the Wikipedia corpus
tokenize_func = wikicorpus.tokenize
document = "Some text to be transformed."
# First, tokenize document using the same tokenization as was used on the
# background corpus, and then convert it to BOW representation using the
# dictionary created when generating the background corpus.
bow_document = dictionary.doc2bow(tokenize_func(document))
# Converts a single document to log entropy representation. The document must
# be in the same vector space as the corpus.
# NOTE(review): the document is wrapped in a list, so gensim presumably treats
# it as a one-document corpus rather than a bare vector - confirm.
logent_document = logent_transformation[[bow_document]]

documents = ["Some iterable", "containing multiple", "documents", "..."]
# Use a generator expression because the transformation is done during
# iteration of documents, so this uses constant memory.
bow_documents = (dictionary.doc2bow(tokenize_func(document)) for document in documents)
logent_documents = logent_transformation[bow_documents]

### Chained transformations
# Builds a corpus by iterating over the documents of bow_corpus as transformed
# to log entropy representation. Will also take many hours with the Wikipedia
# corpus.  MmCorpus only opens an existing Matrix Market file, so the
# transformed corpus is serialized to disk first and then opened.
MmCorpus.serialize("logent_corpus.mm", logent_transformation[bow_corpus], id2word=dictionary)
logent_corpus = MmCorpus("logent_corpus.mm")
# Creates the LSI transformation model from the log entropy corpus. Takes
# several hours with the Wikipedia corpus.  LsiModel's size keyword is
# `num_topics`; it has no `num_features` parameter.
lsi_transformation = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)
# Performs the same operation as above, but with implicit chaining
lsi_transformation = LsiModel(corpus=logent_transformation[bow_corpus], id2word=dictionary, num_topics=400)

# Can persist transformation models, too.
logent_transformation.save("logent.model")
lsi_transformation.save("lsi.model")

### Similarities (the best part)
from gensim.similarities import Similarity

documents = ["A bear walked in the dark forest.",
             "Tall trees have many more leaves than short bushes.",
             "A starship may someday travel across vast reaches of space to other stars.",
             "Difference is the concept of how two or more entities are not the same."]
from gensim.matutils import cossim
from gensim.models import LsiModel

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

# Command line: -m <saved model>  -d <tab-separated data file>
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model")
parser.add_argument("-d", "--data", help="path to training.tsv")
args = parser.parse_args()

# Load model
# Note - model contains dictionary that intentionally omits stopwords
model = LsiModel.load(args.model, mmap='r')

# Load 'training' data
# NOTE(review): the file handle is not closed anywhere in this snippet.
training_data = open(args.data)
training_data.readline()  # advance past header line

correct = 0
total = 0
for line in training_data:
    elements = line.split("\t")
    # The first column is the question id.
    question_id = elements.pop(0)
    # After that pop, index 1 is the second remaining column; whatever is
    # left in `elements` afterwards is converted to vectors below.
    correct_answer = elements.pop(1)
    # Get bag-of-words representation of question and answers
    doc_vectors = [model.id2word.doc2bow(element.split()) for element in elements]
# Cap the vocabulary at voc_size tokens, then persist dictionary and corpus.
corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
corpus.dictionary.save(f_dict)
corpus.save(f_bow)

# tf-idf model (reused from disk when the file already exists)
if os.path.exists(f_tfidf):
    tfidf = TfidfModel.load(f_tfidf)
else:
    tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
    tfidf.save(f_tfidf)

# TRAINING
# lsa model: trained only when no saved model exists; in that case `lsa` is
# never bound here and later code works from the file path f_lsa instead.
if not os.path.exists(f_lsa):
    lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
    lsa.save(f_lsa)

# word2vec model
class MyCorpus():
    """Re-iterable view of the corpus texts restricted to dictionary tokens."""

    def __iter__(self):
        for d in corpus.get_texts():
            # Drop every token that is not in the (filtered) dictionary.
            yield [w for w in d if w in corpus.dictionary.token2id]

if not os.path.exists(f_w2v):
    w2v = Word2Vec(MyCorpus(), size=w2v_dim, min_count=1, window=5)
    w2v.save_word2vec_format(f_w2v, binary=True)

# LANGUAGE MODELS
# All three are built with the same window size of 50; the LSA and word2vec
# wrappers receive the file paths written above.
lm_cache = models.Cache(window=50)
lm_lsa = models.LSA(f_lsa, f_dict, tfidf=f_tfidf, window=50)
lm_w2v = models.Word2Vec(f_w2v, window=50)
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

# The timestamp becomes the file name of the saved model (see the last line).
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
# The file is opened through bz2.BZ2File, so it is read as bzip2 data.
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)
print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

# args.model is used as a directory: the model lands in <args.model>/<timestamp>.model
model.save("%s/%s.model" % (args.model, timestamp))
tfidf_model = TfidfModel(corpus) tfidf_model.save("wiki_en_tfidf.model") # lsi_model = LsiModel(corpus) # topic_id = 0 # for topic in lsi_model.show_topics(): # topic_id+=1 # print "TOPIC (LSI) " + str(topic_id) + " : " + topic # lsi_model.print_topic(20, topn=10) # corpus_lsi = lsi_model[corpus] corpus_tfidf = tfidf_model[corpus] lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300) # corpus_lsi_2 = lsi_model_2[corpus] print "Done creating models" lsi_model_2.save("wiki_en_model.lsi") # lsi_model_2 .print_topics(5) """ topic_id = 0 for topic in lsi_model_2.show_topics(): print "TOPIC (LSI2) " + str(topic_id) + " : " + topic #group_topic = [doc for doc in corpus_lsi_2 if doc[topic_id] > 0.5] group_topic = [doc for doc in corpus_lsi_2] print str(group_topic) topic_id+=1
self.dictionary = gensim.corpora.Dictionary(iter_documents(top_dir)) self.dictionary.filter_extremes(no_below=1, keep_n=30000) # check API docs for pruning params def __iter__(self): for tokens in iter_documents(self.top_dir): yield self.dictionary.doc2bow(tokens) corpus = MyCorpus(test_data_dir) # create a dictionary for vector in corpus: # convert each document to a bag-of-word vector print vector topics = 200 num_clusters = 4 print "Create models" lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics) corpus_lsi = lsi_model[corpus] print "Done creating models" #lsi_model_2 .print_topics(5) topic_id = 0 for topic in lsi_model.show_topics(num_words=5): print "TOPIC (LSI2) " + str(topic_id) + " : " + topic topic_id+=1 #for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly # print "Doc " + str(doc)
# Reuse the serialized corpus when it exists; otherwise build and store it.
corpus_filename = 'deerwester.mm'
if not os.path.isfile(corpus_filename):
    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(review) for review in abstract_vectors]
    corpora.MmCorpus.serialize(corpus_filename, corpus)
else:
    corpus = corpora.MmCorpus(corpus_filename)

# we are going to use latent semantic indexing to try to categorize the abstracts
print("lsi")
lsi_filename = 'model.lsi'
if not os.path.isfile(lsi_filename):
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=5)  # initialize an LSI transformation, 5 topics
    # NOTE(review): saving is disabled, so this code never creates
    # 'model.lsi' itself and the load branch below only runs if the file
    # comes from elsewhere.
    # lsi.save(lsi_filename)  # same for tfidf, lda, ...
else:
    lsi = LsiModel.load(lsi_filename)

lsi_topics = 5  # predefined number of topics


def print_topic(lsi, topicno, topn=7):
    """
    Return a single topic as a formatted string.

    The pairs produced by `show_topic(lsi, topicno, topn)` are rendered as
    `weight*"word"` (weight with three decimals) and joined with ' + '.
    See `show_topic()` for parameters.

    >>> print_topic(lsi, topicno, topn)
    '-0.340*"category" + 0.298*"$M$" + 0.183*"algebra" + -0.174*"functor" + -0.168*"operator"'

    """
    # presumably show_topic yields (word, weight) pairs; verify against its definition
    return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in show_topic(lsi, topicno, topn)])
tfidf[bows[0]] # In[19]: dict([(vocab[i], freq) for i, freq in tfidf[bows[0]]]) # Notice how "you" didn't get as much weight as "enjoy" # Let's look at some other tweets # In[9]: from gensim.models import LsiModel lsi = LsiModel.load('../../data/lsi100') len(lsi.id2word) # This is starting to look a lot like a set of vectors that we could use as features # But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have? # In[ ]: len(vocab) # 100k dimensions isn't a good idea # Even for a masively parallel deep learning project this would be big # Like the cat/dog picture classification on 256x256 images # What about PCA (Principal Component Analysis) like is used on images?
# 将文档(a list of words) 转换成 bag-of-words format = list of `(token_id, token_count)` 2-tuples. # 可通过参数allow_update来设置对模型的更新或只读 corpus = MyCorpus(test_data_dir) # 创建一个字典 for vector in corpus: # 每个文档转换成 a bag-of-word vector后的输出 print (vector) break print ("创建模型") tfidf_model = TfidfModel(corpus)#转换成局部/全局加权TF_IDF矩阵,它可以将一个简单的计数表示成TFIDF空间。 # tfidf = TfidfModel(corpus) # print(tfidf[some_doc])#输出模型 # tfidf.save('/tmp/foo.tfidf_model')#保存模型 lsi_model = LsiModel(corpus) #LSA(latent semantic analysis)潜在语义分析,也被称为LSI(latent semantic index), #是一种新的索引和检索方法。该方法和传统向量空间模型(vector space model)一样使用向量来表示词(terms)和文档(documents), #并通过向量间的关系(如夹角)来判断词及文档间的关系;而不同的是,LSA将词和文档映射到潜在语义空间。 #同义词和多义词如何导致传统向量空间模型检索精确度的下降。 #LSA潜在语义分析的目的,就是要找出词(terms)在文档和查询中真正的含义,也就是潜在语义,从而解决上节所描述的问题。 topic_id = 0 for topic in lsi_model.show_topics(): topic_id+=1 print ("TOPIC (LSI) " + str(topic_id) + " : ", topic) print('#'*50) print(lsi_model.num_topics) for i in range(0, lsi_model.num_topics-1): if lsi_model.print_topic(i):
class LSA(object):
    """Assign segmented posts to topics with a TF-IDF + LSI pipeline.

    Typical use is ``create_result(seg_post)`` followed by ``get_result()``.
    The pipeline stages (``parse_dic_bow``, ``TFIDF``, ``build``) store their
    intermediate results as attributes on the instance.
    """

    def __init__(self, stopwords, ignorechars):
        """Store the configuration.

        ``stopwords`` is accepted but not stored: the stop-word list is
        read from a file by ``createStopwords``.
        """
        # self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.wdict = {}
        self.dcount = 0

    def createStopwords(self, stopword_path):
        """Read whitespace-separated stop words from ``stopword_path``."""
        with open(stopword_path, 'r') as file1:
            temp = file1.read()
        self.stopwords = temp.split()

    def parse_dic_bow(self, seg_post):
        """Build the dictionary and bag-of-words vectors for ``seg_post``.

        ``seg_post`` maps a post id to that post's text segments.  Sets
        ``posts``, ``mergeLineForOnePost``, ``texts``, ``dictionary``,
        ``postIdList`` and ``new_vec``.  Requires ``self.stopwords``
        (see ``createStopwords``).
        """
        self.posts = list(seg_post.values())
        logger.info("BOW process... ")
        print("original post:")
        logger.debug("original post:")
        logger.debug(self.posts)
        # One space-joined string per post,
        # e.g. ['\xe9\xa3\x9f\xe8\xa8\x98 \xe8\xa7\x92\xe9\xa0\xad', ' efffe wedw']
        self.mergeLineForOnePost = [" ".join(post) for post in self.posts]
        # Tokenised, utf8-encoded posts,
        # e.g. [['human', 'interface', 'computer'], ['survey', 'user']]
        self.texts = [[word.encode('utf8') for word in post.split()]
                      for post in self.mergeLineForOnePost]
        print("self.mergeLineForOnePost: ")
        self.dictionary = gensim.corpora.Dictionary(self.texts)
        self.postIdList = [str(postId) for postId in seg_post.keys()]
        logger.debug("original dic and list:")
        # The values must go through a format string: passing them as bare
        # extra arguments makes logging fail while formatting the record.
        logger.debug("%s %s %s", self.dictionary, len(self.dictionary), self.postIdList)
        print("original dic and list:")
        print("%s %s" % (self.dictionary, self.postIdList))

        # Preprocess: remove stop words.  Removing words that occur only
        # once is deliberately disabled - it sometimes leaves too few
        # words and produces an invalid LSA shape.
        stop_ids = [self.dictionary.token2id[stopword]
                    for stopword in self.stopwords
                    if stopword in self.dictionary.token2id]
        self.dictionary.filter_tokens(stop_ids)
        logger.info("removed once-words and stopwords......")
        logger.debug("%s %s", self.dictionary, len(self.dictionary))
        print("removed once-words and stopwords......")
        print(self.dictionary)
        self.dictionary.compactify()
        self.new_vec = [self.dictionary.doc2bow(post) for post in self.texts]

    def store(self):
        """Persist the dictionary (binary and text) and the BOW corpus."""
        logger.info("store process starts")
        self.dictionary.save(testDictionary)
        self.dictionary.save_as_text(testDictionaryString)
        corpora.MmCorpus.serialize(testBOWCorpus, self.new_vec)  # store to disk, for later use
        # corpus = corpora.MmCorpus(testBOWCorpus)  # comes from the store
        # dictionary = corpora.Dictionary.load(testDictionary)  # comes from the store

    def TFIDF(self):
        """Fit a TF-IDF model on ``new_vec`` and transform the corpus."""
        logger.info("TFIDF process starts")
        self.tfidf = TfidfModel(self.new_vec)
        self.corpus_tfidf = self.tfidf[self.new_vec]

    def printInfo(self):
        """Print the dictionary, vectors, per-post assignment and topics."""
        print('show Dic: ')
        print(self.dictionary)
        print('show BOW: ')
        for bow in self.new_vec:
            print(bow)
        print('show corpus_tfidf model: ')
        print(self.tfidf)
        print("show corpus_tfidf: ")
        for i in self.corpus_tfidf:
            print(i)
        print("show LSA assignment of each post: ")
        # Both bow->tfidf and tfidf->lsi transformations are executed here,
        # on the fly.
        for doc, postId in zip(self.corpus_lsi, self.postIdList):
            print('post: {0}'.format(postId))
            print(doc)
            # Topic with the largest absolute weight (empty list if none).
            theLarge = nlargest(1, doc, key=lambda e: abs(e[1]))
            if theLarge:
                print("the largest one with absoule value:  %s" % (theLarge[0][0],))
            else:
                print("cannot find it!!!!")
        print("LSA Topics : ")
        print(self.topics)
        print("Break down : ")
        for i in self.topics:
            print(i)
            print(type(i))

    def build(self):
        """Train the LSI model on the TF-IDF corpus and extract topics."""
        # TODO: find a way to pick a proper number of clusters - maybe based
        # on the number of posts.
        self.lsi_model = LsiModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=3)
        self.corpus_lsi = self.lsi_model[self.corpus_tfidf]
        self.topics = self.lsi_model.show_topics(num_topics=5, num_words=4, log=False, formatted=False)

    def repaserForOutput(self):
        """Return ``(post_assignment, topic_assignment)``.

        ``post_assignment`` maps each post id to the id of the topic with
        the largest absolute weight, or ``"NB"`` when the post's LSI
        vector is empty.  ``topic_assignment`` maps ``str(topic index)``
        to the list of that topic's keywords decoded from utf8.
        """
        self.post_assignment = {}
        self.topic_assignment = {}
        for doc, postId in zip(self.corpus_lsi, self.postIdList):
            theTopic = nlargest(1, doc, key=lambda e: abs(e[1]))
            if theTopic:
                self.post_assignment[postId] = theTopic[0][0]
            else:
                self.post_assignment[postId] = "NB"

        self.num = len(self.topics)
        for num, topic in enumerate(self.topics):
            # Convert the keyword of each topic entry from bytes to unicode.
            self.topic_assignment[str(num)] = [each[1].decode('utf8') for each in topic]
        return (self.post_assignment, self.topic_assignment)

    def create_result(self, seg_post):
        """Run the whole pipeline for ``seg_post`` and store the artefacts."""
        logger.info('LSA main process starts.....')
        self.createStopwords(stopword_path)
        self.parse_dic_bow(seg_post)
        self.TFIDF()
        self.build()
        self.store()

    def get_result(self):
        """Print diagnostics, then return the assignments of the last run."""
        self.printInfo()
        return (self.repaserForOutput())
# Display settings for the notebook session.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 800)
pd.set_option('precision', 2)
get_ipython().magic(u'precision 4')
get_ipython().magic(u'pprint')


# In[3]:

from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR

# In[6]:

# Two previously saved LSI models, both stored under DATA_PATH.
lsi = LsiModel.load(os.path.join(DATA_PATH, 'lsi100'))
lsi2 = LsiModel.load(os.path.join(DATA_PATH, 'lsi2'))

# In[7]:

# Topic vectors are read from a gzip-compressed CSV; missing values become 0.
with gzip.open(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'), 'rb') as f:
    topics = pd.DataFrame.from_csv(f, encoding='utf8')
topics = topics.fillna(0)

# In[8]:

# Both files are read with pandas' Python parsing engine.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')