def build_tfidf_or_lsi(corpus, method='tfidf'):
    """Build a document-ranking model.

    Input: a corpus of texts and a method ("tfidf" or "lsi").
    Output: a tuple of (dictionary of corpus terms, fitted model, similarity index).
    """
    dictionary = Dictionary(corpus)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    model_tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = [model_tfidf[doc] for doc in corpus_bow]
    simil_tfidf = MatrixSimilarity(corpus_tfidf)
    if method == 'tfidf':
        return dictionary, model_tfidf, simil_tfidf
    elif method == 'lsi':
        model_lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
        # project the tf-idf corpus (the space the LSI model was trained on),
        # not the raw bag-of-words vectors
        corpus_lsi = [model_lsi[doc] for doc in corpus_tfidf]
        simil_lsi = MatrixSimilarity(corpus_lsi)
        return dictionary, model_lsi, simil_lsi
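# Usage sketch for `build_tfidf_or_lsi`: the imports mirror what the function
# needs from gensim; the toy corpus is an assumption for illustration only.
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel
from gensim.similarities import MatrixSimilarity

docs = [["cat", "sat", "mat"], ["dog", "barked", "loudly"], ["cat", "and", "dog"]]
dictionary, model, index = build_tfidf_or_lsi(docs, method='tfidf')
query_vec = model[dictionary.doc2bow(["cat", "dog"])]
print(list(index[query_vec]))  # cosine similarity of the query against each document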
def trainModel(self):
    if self.toweight:
        self.model = LsiModel(self.tfidf[self.corpus], num_topics=self.num_topics)
        self.index = MatrixSimilarity(self.model[self.tfidf[self.corpus]])
    else:
        self.model = LsiModel(self.corpus, num_topics=self.num_topics)
        self.index = MatrixSimilarity(self.model[self.corpus])
def train_model(self):
    """
    Read the preprocessed data and generate the corpus dictionary, tf-idf model
    and matrix (cosine) similarity index.
    :return: status of training
    """
    try:
        data = pd.read_csv(self.processed_data)
        del data['Unnamed: 0']
        # creating tokens for the doc column
        corpus = data['doc'].map(break_to_tokens)
        # creating a dictionary of words in the movie dataset
        dictionary = gensim.corpora.Dictionary(corpus)
        dictionary.save(self.corpus_dictionary)
        # creating bag-of-words vectors for the corpus
        vector = [dictionary.doc2bow(d) for d in corpus]
        # creating tf-idf values for the vectors
        tfidf = models.TfidfModel(vector)
        tfidf.save(self.tfidf_model)
        corpus_tfidf = tfidf[vector]
        # compute similarities
        similarity = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
        similarity.save(self.matrix_similarity)
        return "Model Trained Successfully"
    except Exception:  # a bare `except:` would also swallow SystemExit/KeyboardInterrupt
        return "Error While Training Model"
def calAuthorSim():
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()
    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # vec = model.get_author_topics('苏轼')
    index = MatrixSimilarity(model[list(model.id2author.values())], num_best=30)
    index.save(config.author_simMatrix_path)
    # index = MatrixSimilarity.load(config.author_simMatrix_path)
    for name in poets:
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # use a parameterized query rather than string formatting (avoids SQL injection)
        db.execute("UPDATE author SET sims=? WHERE id=?", (toJson(sims), name))
    conn.commit()
def train(self, classdict, nb_topics, *args, **kwargs):
    """ Train the topic modeler.

    :param classdict: training data
    :param nb_topics: number of latent topics
    :param args: arguments to pass to the `train` method for gensim topic models
    :param kwargs: arguments to pass to the `train` method for gensim topic models
    :return: None
    :type classdict: dict
    :type nb_topics: int
    """
    self.nb_topics = nb_topics
    self.generate_corpus(classdict)
    if self.toweigh:
        self.tfidf = TfidfModel(self.corpus)
        normcorpus = self.tfidf[self.corpus]
    else:
        self.tfidf = None
        normcorpus = self.corpus
    self.topicmodel = gensim_topic_model_dict[self.algorithm](
        normcorpus, num_topics=self.nb_topics, *args, **kwargs)
    self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])
    # change the flag
    self.trained = True
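# `gensim_topic_model_dict` is defined elsewhere in this module; presumably it
# maps algorithm names to gensim model classes, along these lines (a sketch,
# not necessarily the project's actual definition):
from gensim.models import LdaModel, LsiModel, RpModel

gensim_topic_model_dict = {'lda': LdaModel, 'lsi': LsiModel, 'rp': RpModel}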
def compute(self):
    vec_texts = [text.split() for text in self.texts]
    if self.debug:
        write("\n " + "-> Computing the dictionary".ljust(50, '.'))
    dictionary = Dictionary(vec_texts)
    if self.debug:
        write("[OK]")
        write("\n " + "-> Creating the bag-of-words space".ljust(50, '.'))
    corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
    if self.debug:
        write("[OK]")
        write("\n " + ("-> Creating the %s space" % self.method).ljust(50, '.'))
    tfidf_space = TfidfModel(corpus)
    tfidf_corpus = tfidf_space[corpus]
    if self.method == 'TFIDF':
        self.space = tfidf_space
        self.index = MatrixSimilarity(tfidf_corpus)
    elif self.method == 'LSI':
        self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'RP':
        self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'LDA':
        self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    self.dictionary = dictionary
    if self.debug:
        write("[OK]\n")
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
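# A minimal round trip through `create_model_tfidf_model`; the document list
# and file names here are illustrative assumptions:
docs = [["deep", "learning"], ["machine", "learning"], ["gensim", "topic", "models"]]
model, index, dictionary = create_model_tfidf_model(
    docs, 'tfidf.model', 'sim.index', 'corpus.dict')
sims = index[model[dictionary.doc2bow(["machine", "learning"])]]
print(sims)  # one cosine score per indexed document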
def main(self):
    print("Recommendation using TF_IDF")
    # Loading preprocessed data
    vagas_ti = pd.read_csv(self.dataPrepFile)
    vagas_ids = pickle.load(open(self.out + "preprocessing/vagas_ids.array", "rb"))
    vagas_words = pickle.load(open(self.out + "preprocessing/vagas_words.list", "rb"))
    cvs_words = pickle.load(open(self.out + "preprocessing/cvs_words.series", "rb"))
    cvs = pd.read_csv(self.dataCvsFile)
    cvs = cvs.fillna("")
    cvs.isnull().any()
    # print("Loading cvs done!")

    # Creating a dictionary
    dictionary = gcorp.Dictionary(vagas_words)
    # store the dictionary, for future reference
    dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict')
    # compile the corpus (vectors counting how many times each element appears)
    raw_corpus = [dictionary.doc2bow(v) for v in vagas_words]
    gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm', raw_corpus)  # store to disk
    print("Dictionary size: " + str(len(dictionary)))

    # STEP 2: similarity between corpora
    dictionary = gcorp.Dictionary.load(self.out + 'preprocessing/tf_idf/vagas.dict')
    corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm')
    # Transform text with TF-IDF
    tfidf = gsm.TfidfModel(corpus)  # step 1 -- initialize a model
    # corpus tf-idf
    corpus_tfidf = tfidf[corpus]

    # STEP 3: create the similarity matrix of all files
    index = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary), num_best=10)
    index.save(self.out + 'preprocessing/tf_idf/vagas.index')
    index = MatrixSimilarity.load(self.out + 'preprocessing/tf_idf/vagas.index')

    self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words, dictionary, tfidf, index)
    print("Recommendation using TF_IDF done!")
def custom_queries(corpus, dictionary, paragraphs):
    # tfidf query:
    tfidf_model = TfidfModel(corpus, dictionary=dictionary)
    query = process_query("What is the function of money?", dictionary)
    tfidf_query = tfidf_model[query]
    tfidf_corpus = [tfidf_model[doc] for doc in corpus]
    tfidf_index = MatrixSimilarity(tfidf_corpus)
    print("tfidf query:")
    doc2similarity_tfidf = enumerate(tfidf_index[tfidf_query])
    # use a distinct loop variable so the similarity index is not shadowed
    for doc_idx, similarity in sorted(doc2similarity_tfidf, key=lambda kv: -kv[1])[:3]:
        paragraph = paragraphs[doc_idx].split("\n")
        print("[paragraph: " + str(doc_idx + 1) + "]")
        for i in range(5):
            print(paragraph[i])
            if (i + 1) == len(paragraph):
                break
        print("\n")

    # lsi query: train LSI on the tf-idf corpus so that the indexed documents
    # and the (tf-idf weighted) query live in the same space
    lsi_model = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=100)
    lsi_query = lsi_model[tfidf_query]
    lsi_corpus = [lsi_model[doc] for doc in tfidf_corpus]
    lsi_index = MatrixSimilarity(lsi_corpus)
    doc2similarity_lsi = enumerate(lsi_index[lsi_query])
    print("lsi query:")
    for doc_idx, similarity in sorted(doc2similarity_lsi, key=lambda kv: -kv[1])[:3]:
        paragraph = paragraphs[doc_idx].split("\n")
        print("[paragraph: " + str(doc_idx + 1) + "]")
        for i in range(5):
            print(paragraph[i])
            if (i + 1) == len(paragraph):
                break
        print("\n")
def calculate_tfidf_cos_sim(text1, text2, dictionary_tfidf, corpus):
    tfidf1 = to_tfidf(text1, dictionary_tfidf, corpus)
    tfidf2 = to_tfidf(text2, dictionary_tfidf, corpus)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary_tfidf))
    sim = index[tfidf2]
    return float(sim[0])
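# `to_tfidf` is not shown here. A plausible sketch of what it does, assuming
# `text` is a raw string and `corpus` is the reference bag-of-words corpus
# (both assumptions; refitting TfidfModel on every call is for illustration only):
from gensim.models import TfidfModel

def to_tfidf(text, dictionary_tfidf, corpus):
    # map the raw text to a bag-of-words vector, then weight it with TF-IDF
    bow = dictionary_tfidf.doc2bow(text.lower().split())
    return TfidfModel(corpus)[bow]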
def find_similarity_scores(self, topics):
    # Create similarities container
    similarities = {'Resumes': {}}
    # Gensim requires a corpora data structure for transformations and analysis
    dictionary = corpora.Dictionary(self.corpus)
    # Convert text to BoW. It already is, but let's be sure.
    corpus_gensim = [dictionary.doc2bow(doc) for doc in self.corpus]
    # Term Frequency-Inverse Document Frequency (TF-IDF) weighting downweights
    # terms that appear in many documents of the collection.
    self.tfidf = TfidfModel(corpus_gensim)
    self.tfidf = self.tfidf[corpus_gensim]  # replace the model with the transformed corpus
    # Find similarity via pair-wise cosine similarity in the vector space of
    # Latent Semantic Indexing (LSI):
    # https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
    lsi = LsiModel(self.tfidf, id2word=dictionary, num_topics=topics)
    lsi_index = MatrixSimilarity(lsi[self.tfidf])
    similarities['Resumes']["LSI_Similarity"] = np.array(
        [lsi_index[lsi[self.tfidf[i]]] for i in range(len(self.corpus))])
    for doc in self.tfidf:
        for word_id, value in doc:
            word = dictionary.get(word_id)
            self.ind_word_scores[word] = value
    # Convert to numpy arrays
    self.f_list = np.array(self.f_list)
    self.data = np.array(self.data)
    # Return results to object
    self.sim_matrix = similarities
def get_recommendation(self, movie_title: str):
    """
    Accepts a movie name and fetches the list of recommended movie names
    using matrix (cosine) similarity.
    :param movie_title:
    :return: array of movie names
    """
    print("movie : ", movie_title)
    dictionary = gensim.corpora.Dictionary.load(self.corpus_dictionary)
    tfidf_model = gensim.models.TfidfModel.load(self.tfidf_model)
    similarity = MatrixSimilarity.load(self.matrix_similarity)
    data = pd.read_csv(self.processed_data)
    del data['Unnamed: 0']
    data["original_title"] = data["original_title"].str.lower()
    movie = data.loc[data.original_title == movie_title]
    print(movie)
    if movie.shape[0] == 0:
        status = ["Failed to Recommend Movies with existing movie data."]
        return status
    else:
        movie_doc_bow = dictionary.doc2bow(movie['doc'].map(break_to_tokens)[0])
        movie_tfidf = tfidf_model[movie_doc_bow]
        movie_recommendations = pd.DataFrame({
            'Cosine_sim_values': similarity[movie_tfidf],
            'title': data.original_title.values
        }).sort_values(by="Cosine_sim_values", ascending=False)
        # head(11): the top hit is typically the query movie itself,
        # leaving 10 actual recommendations
        top_recommendations = movie_recommendations['title'].head(11)
        return top_recommendations.to_numpy()
def __init__(self, model_prefix=None, num_best=None):
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    logger.info("ESA: Loading word dictionary...")
    self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

    logger.info("ESA: Loading document name map...")
    self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("ESA: Loading TF-IDF model...")
    self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("ESA: Loading similarity index...")
    sim_fname = "%s.cluster.%d.centroids" % (model_prefix, 2000)
    self.similarity_index = MatrixSimilarity.load(sim_fname, mmap='r')

    # logger.info("ESA: Preloading reverse indexes...")
    # self.similarity_index.preload_reverse_index()

    logger.info("ESA: Finished loading model files.")
def load(lsi_path=None, id2word_path=None, index_path=None):
    """
    If specified, attempts to load a gensim LsiModel from `lsi_path`,
    a gensim Dictionary from `id2word_path`, or a gensim MatrixSimilarity
    index from `index_path`.

    Parameters
    ----------
    lsi_path: str
        File-path from which self.model should be loaded.
    id2word_path: str
        File-path from which self.dictionary should be loaded.
    index_path: str
        File-path from which self.index should be loaded.
    """
    if lsi_path is not None:
        from gensim.models import LsiModel
        if not os.path.exists(lsi_path):
            raise IOError(
                'The provided file path to the LsiModel was not found. '
                'Please ensure that the argument is the correct path.')
        return LsiModel.load(lsi_path)
    if id2word_path is not None:
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(id2word_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        return Dictionary.load(id2word_path)
    if index_path is not None:
        from gensim.similarities import MatrixSimilarity
        if not os.path.exists(index_path):
            raise IOError(
                'The provided file path to the MatrixSimilarity index was not found. '
                'Please ensure that the argument is the correct path.')
        return MatrixSimilarity.load(index_path)
def recommended_projects(self, request):
    projects = ProjectRequest.objects.all()
    project_keywords_dict = {}
    projects_dict = {}
    tags_list = []
    for project in projects:
        description = project.description
        description_keywords = get_keywords(description.replace('"', ''))
        tags = project.tags.replace(' ', ',').lower()
        for keyword in description_keywords:
            tags += ',' + keyword[0].lower()
        tags_list.append(tags)
    df = read_frame(projects, fieldnames=['id', 'tags'], index_col=['id'])
    df['tags'] = tags_list
    keywords = df['tags'].tolist()
    keywords = [word_tokenize(keyword.lower()) for keyword in keywords]
    keywords = [no_commas(kw) for kw in keywords]
    processed_keywords = keywords
    dictionary = Dictionary(processed_keywords)
    corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
    tfidf = TfidfModel(corpus)
    sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    top_3 = keywords_recommendation(all_projects=df,
                                    keywords=['uvg', 'gasolina', 'potente',
                                              'mcdonald', 'mecanico', 'gg', 'carros'],
                                    number_of_hits=3,
                                    data=[dictionary, tfidf, sims])
    projects = []
    for id in top_3:
        projects.append(ProjectRequestSerializer(ProjectRequest.objects.get(pk=id)).data)
    return Response(projects)
def loadmodel(self, nameprefix):
    """ Load the topic model with the given prefix of the file paths.

    Given the prefix of the file paths, load the corresponding topic model. The files
    include a JSON (.json) file that specifies various parameters, a gensim dictionary
    (.gensimdict), and a topic model (.gensimmodel). If weighing is applied, load also
    the tf-idf model (.gensimtfidf).

    :param nameprefix: prefix of the file paths
    :return: None
    :type nameprefix: str
    """
    # load the JSON file (parameters)
    parameters = json.load(open(nameprefix + '.json', 'rb'))
    self.nb_topics = parameters['nb_topics']
    self.toweigh = parameters['toweigh']
    self.algorithm = parameters['algorithm']
    self.classlabels = parameters['classlabels']
    # load the dictionary
    self.dictionary = Dictionary.load(nameprefix + '.gensimdict')
    # load the topic model
    self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel')
    # load the similarity matrix
    self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')
    # load the tf-idf model
    if self.toweigh:
        self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')
    # flag
    self.trained = True
def train_model_get_cosine_matrix(statements, num):
    statements = [statement.split() for statement in statements]
    dictionary = corpora.Dictionary(statements)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in statements]
    # tfidf model
    # https://stackoverflow.com/questions/50521304/why-i-get-different-length-of-vectors-using-gensim-lsi-model
    tfidf = models.TfidfModel(doc_term_matrix, normalize=True)
    corpus_tfidf = tfidf[doc_term_matrix]
    lsi = models.LsiModel(corpus_tfidf, num_topics=num, id2word=dictionary)
    # turn each vocabulary word into a single-word bag-of-words vector
    words = [dictionary.doc2bow([word])
             for word in sorted(list(dictionary.token2id.keys()))]
    vectorized_corpus = lsi[words]
    index = MatrixSimilarity(vectorized_corpus)
    out = pd.DataFrame(index[vectorized_corpus])
    out.columns = sorted(list(dictionary.token2id.keys()))
    out.index = sorted(list(dictionary.token2id.keys()))
    return out
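# Example call (toy sentences assumed): the returned DataFrame is a symmetric
# word-by-word LSI cosine-similarity matrix indexed by vocabulary terms.
statements = ["the cat sat", "the dog barked", "cat and dog play"]
word_sims = train_model_get_cosine_matrix(statements, num=2)
print(word_sims.loc["cat", "dog"])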
def main(argv):
    if len(sys.argv) != 2:
        print('usage: text_exp sentence')
        sys.exit(2)

    # encode this sentence into semantic space
    # text = "Rice wheat and barley are all important feed crops."
    # text = "Brazil issues with industrial pollution"
    text = sys.argv[1]

    # first load the basic models
    MODELS_DIR = "models"
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.ERROR)
    dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, "mtsamples.dict"))
    corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "mtsamples.mm"))

    # now, transform the text
    bow_text = dictionary.doc2bow(gensim.utils.tokenize(text))
    # show me the transformed text
    # print([(dictionary[id], count) for id, count in bow_text])

    # generate a tfidf model from the set of all articles
    tfidf = gensim.models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    # then generate an LSI model from the set of all articles
    lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # now, create a dense index from the set of all articles
    index_dense = MatrixSimilarity(lsi[corpus])
    # finally, translate the input query into the LSI space
    vec_lsi = lsi[bow_text]
    # compute the similarity index
    sims = index_dense[vec_lsi]
    # print the raw vector numbers
    # print(list(enumerate(sims)))
    # now, sort by similarity number and print the articles most similar to the query
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # print(sims)

    # load the file list
    file_list = pickle.load(open('models/file_list.p', 'rb'))
    # use it to show the article names
    dictSimilars = {}
    for i in range(len(sims)):
        ind = sims[i][0]
        dictSimilars[str(sims[i][1])] = file_list[ind]
    js = json.dumps(dictSimilars)
    return js
def cs_lda(sp, df, feature, session, update_model):
    print(f"Starting LDA ...")
    if update_model:
        session = session + "lda-" + feature
        if not os.path.exists(session):
            print(f"New directory: {session}")
            os.mkdir(session)
        session = session + "/"
        create_dictionary(session, df, feature)
        create_model(session, df, feature)
    else:
        session = session + "lda-" + feature + "/"
    print(f"Computing Cosine Similarity on feature {feature}")
    dct = get_dict(feature, session)
    if not feature == "title":
        corpus = common.remove_stopwords(df[feature]).tolist()
    else:
        corpus = df[feature].tolist()
    corpus = [doc.split() for doc in corpus]
    corpus = [dct.doc2bow(text) for text in corpus]
    lda = LdaMulticore.load(session + "LDA-model-" + feature)
    res = lda[corpus]
    index = MatrixSimilarity(res)
    # index.save("simIndex.index")

    def compute(text):
        vec_bow = dct.doc2bow(text.split())
        vec_lda = lda[vec_bow]
        sims = index[vec_lda]
        return sims

    results = for_pivot(df[feature], df, compute)
    common.save_as_pivot(results, sp=sp)
def get_similarity_index(self, bow_corpus, lsa: LsiModel, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_index(lsa.num_topics)
    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No similarity index file exists but from_scratch is False')
        print('Building index...')
        index = MatrixSimilarity(lsa[bow_corpus])
        index.save(filepath)
    else:
        print('Loading index...')
        index = MatrixSimilarity.load(filepath)
    return index
def cossim(query, documents):
    # Compute cosine similarity between the query and the documents.
    query = tfidf[dictionary.doc2bow(query)]
    index = MatrixSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in documents]],
        num_features=len(dictionary))
    similarities = index[query]
    return similarities
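# `cossim` relies on module-level `dictionary` and `tfidf` objects; a minimal
# setup sketch with assumed toy documents:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

documents = [["cat", "sat", "mat"], ["dog", "barked"]]
dictionary = Dictionary(documents)
tfidf = TfidfModel([dictionary.doc2bow(doc) for doc in documents])
print(cossim(["cat"], documents))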
def cos_sim(text1, text2):
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1], num_features=len(dictionary))
    sim = index[tfidf2]
    # `sim` is an array, but we only need its single entry,
    # so cast it directly to a float
    return float(sim[0])
def __init__(self, num_topics=100,
             wiki_tokens_path='data/token_sents.pkl',
             wiki_sents_path='data/sents.pkl',
             student_tokens_path='data/children_data.json'):
    super().__init__(wiki_tokens_path, wiki_sents_path, student_tokens_path)
    self.lsi = self.compute_lsi(num_topics)
    self.lsi_index = MatrixSimilarity(self.lsi[self.wiki_tfidf_corpus])
def lsi(corpus, dictionary):
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = [lsi_model[doc] for doc in corpus]
    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
def build_models(self):
    # Create tfidf model
    self.tfidf_model = TfidfModel(self.corpus)
    # Map each bag of words to (word-index, word-weight) pairs
    self.tfidf_corpus = list(map(lambda c: self.tfidf_model[c], self.corpus))
    self.tfidf_similarity = MatrixSimilarity(self.tfidf_corpus)
    self.lsi_model = LsiModel(self.tfidf_corpus, id2word=self.dictionary, num_topics=100)
    self.lsi_corpus = list(map(lambda c: self.lsi_model[c], self.tfidf_corpus))
    self.lsi_similarity = MatrixSimilarity(self.lsi_corpus)
def generate_embeddings(documents, dictionary):
    doc_word_index = MatrixSimilarity(
        [dictionary.doc2bow(document) for document in documents],
        num_features=len(dictionary))
    doc_doc_index = np.array(
        [doc_word_index[dictionary.doc2bow(doc)] for doc in documents])
    # use one consistent directory name (the original mixed 'Word2vec' and 'Word2Vec')
    np.save('models/Word2Vec/doc_word_index', doc_word_index.index)
    np.save('models/Word2Vec/doc_doc_index', doc_doc_index)
    return doc_word_index.index, doc_doc_index
def tf_idf(corpus):
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = [tfidf_model[doc] for doc in corpus]
    tfidf_similarity_matrix = MatrixSimilarity(tfidf_corpus)
    return tfidf_similarity_matrix
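# Usage sketch for `tf_idf` (and, analogously, `lsi` above). Note that the
# fitted TfidfModel itself is not returned, so a query has to refit it on the
# same corpus to land in the same space; toy data assumed:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["cat", "sat"], ["dog", "barked"], ["cat", "dog"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
sim_matrix = tf_idf(corpus)
print(sim_matrix[TfidfModel(corpus)[corpus[0]]])  # doc 0 vs. all documents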
def load(self):
    """ Load the corpora created by `make_corpus.py`. """
    self.corpus = MmCorpus(self.corpus_file)
    self.dictionary = Dictionary.load_from_text(self.dict_file)
    self.titles = load_titles(self.title_file)
    self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
    self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
@classmethod
def load(cls, fname):
    """ Load a previously saved object from file (also see `save`). """
    logger.info("loading %s object from %s and %s" % (cls.__name__, fname, fname + ".index"))
    result = utils.unpickle(fname)
    result.similarity_index = MatrixSimilarity.load(fname + ".index")
    return result
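# A plausible `save` counterpart implied by the `load` above (a sketch, not
# the project's actual code): persist the similarity index separately, then
# pickle the rest of the object without it.
def save(self, fname):
    logger.info("saving %s object to %s and %s" % (self.__class__.__name__, fname, fname + ".index"))
    self.similarity_index.save(fname + ".index")
    index, self.similarity_index = self.similarity_index, None
    try:
        utils.pickle(self, fname)  # gensim.utils.pickle
    finally:
        self.similarity_index = index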
def __init__(self, filename):
    self.docs = loads(open(filename, "r").read())
    self.docmap = hoist_dict(self.docs, "id")

    if isfile("data.dict"):
        self.dictionary = Dictionary.load("data.dict")
    else:
        self.dictionary = Dictionary(iterate_summaries(self.docs))
        self.dictionary.save("data.dict")

    if isfile("data.mm"):
        self.corpus = MmCorpus("data.mm")
    else:
        corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
        MmCorpus.serialize("data.mm", corpus)
        self.corpus = MmCorpus("data.mm")

    self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

    if isfile("data.sim"):
        self.sim = MatrixSimilarity.load("data.sim")
    else:
        self.sim = MatrixSimilarity(self.lsi[self.corpus])
        self.sim.save("data.sim")

    # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100,
    #                     update_every=1, chunksize=10000, passes=1)

    self.sentiment_model = Doc2Vec.load("imdb.d2v")
    self.sentiment = LogisticRegression()
    self.sentiment.fit(
        [self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
        [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
        asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

    if isfile("arxiv.d2v"):
        self.doc_model = Doc2Vec.load("arxiv.d2v")
    else:
        tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")])
                  for doc in self.docs]
        doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4,
                            negative=5, workers=7)
        doc_model.build_vocab(tagged)
        shuffle(tagged)  # Replace with functional stuff
        for epoch in range(10):
            doc_model.train(tagged, total_examples=doc_model.corpus_count,
                            epochs=doc_model.iter)
        doc_model.save("arxiv.d2v")
def find_similarity(vec_lsi, df, k=30):
    with open('models/LSI_model/corpus_lsi.pickle', 'rb') as handle:
        corpus_lsi = pickle.load(handle)
    index = MatrixSimilarity(corpus_lsi, num_features=72)
    sims = index[vec_lsi]
    index = sims[0].argsort()[-k:][::-1]
    for i in index:
        print(i, "------->", df[i])
    return index
# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
language = sys.argv[1]
method = sys.argv[2].strip().lower()

logging.info("loading corpus mappings")
config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
                             resultDir=gensim_build.RESULT_DIR,
                             acceptLangs=[language])

logging.info("loading word id mapping from %s", config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids", len(id2word))

corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
input = MmCorpus(config.resultFile('_%s.mm' % method))
assert len(input) == len(corpus), \
    "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus))

# initialize structure for similarity queries
if method == 'lsi' or method == 'rp':  # for these methods, use dense vectors
    index = MatrixSimilarity(input, num_best=MAX_SIMILAR + 1, num_features=input.numTerms)
else:
    index = SparseMatrixSimilarity(input, num_best=MAX_SIMILAR + 1)

index.normalize = False
generateSimilar(corpus, index, method)

logging.info("finished running %s", program)
logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(
    os.path.join(settings.PERSIST_DIR, 'lsi_model{}-200'.format(fname_suffix)))

fnames = [line.strip() for line in
          open(os.path.join(settings.PERSIST_DIR, 'document_index{}'.format(fname_suffix)))]
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames), dtype=object)

matrix_sim_loc = os.path.join(
    settings.PERSIST_DIR,
    'tfidf_corpus_lsi{}-200_matrix_similarity'.format(fname_suffix))
if not os.path.exists(matrix_sim_loc):
    logger.info('building matrix similarity')
    doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)
    logger.info('persisting matrix similarity index')
    doc_topic.save(matrix_sim_loc)
else:
    logger.info('matrix similarity already available. using that')
    doc_topic = MatrixSimilarity.load(matrix_sim_loc)


def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
# load models
print("\n Loading models, etc..\n")
id2word_pgfin = gensim.corpora.Dictionary.load('./data/pgfin.dictionary')
tfidf_model = gensim.models.TfidfModel.load('./data/tfidf_pgfin.model')
lsi_model = gensim.models.LsiModel.load('./data/lsi_pgfin.model')
indexfile = './data/ta_index.txt'
queryfile = './queryfiles/queryfile.txt'  # text in corpus
# queryfile = './queryfiles/45vuotta.txt'  # Film review
# queryfile = './queryfiles/tktjohdessee2.txt'  # Ancient essay

# check similarity
print("\n Load similarity indices.\n")
index = Similarity.load('./data/pgfin_index.index')
index_dense = MatrixSimilarity.load('./data/pgfin_matrixindex.index')

with open(queryfile, 'r') as datafile:
    query = datafile.read()

# vectorize the query text into bag-of-words and tfidf
query_bow = id2word_pgfin.doc2bow(tokenize(query))
query_tfidf = tfidf_model[query_bow]
query_lsi = lsi_model[query_tfidf]

index_dense.num_best = 5


class BookHitValue(object):

    def __init__(self, indexfile, author_title, hit_percent):
tfidf_corpus_lsi = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200'))

logger.info('loading lsi model')
lsi_model = lsimodel.LsiModel.load(os.path.join(settings.PERSIST_DIR, 'lsi_model-200'))

fnames = [line.strip() for line in open(os.path.join(settings.PERSIST_DIR, 'document_index'))]
doc_ids = pd.Series(map(lambda x: os.path.basename(x).split('.')[0], fnames), dtype=object)

# logger.info('building matrix similarity')
# doc_topic = MatrixSimilarity(tfidf_corpus_lsi, num_features=tfidf_corpus_lsi.num_terms)
# logger.info('persisting matrix similarity index')
# doc_topic.save(os.path.join(settings.PERSIST_DIR, 'tfidf_corpus_lsi-200_matrix_similarity'))
doc_topic = MatrixSimilarity.load(os.path.join(settings.PERSIST_DIR,
                                               'tfidf_corpus_lsi-200_matrix_similarity'))


def cluster(group, level, nbranches):
    if len(group) < min_nodes:
        logger.info("......less than {min_nodes} nodes ({n})".format(
            min_nodes=min_nodes, n=len(group)))
        return
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=nbranches, n_init=1,
                          init_size=1000, batch_size=1000)
    mbk.fit(doc_topic.index[group['original_id']])
    return mbk


def index_freq_above(na, minval):
    l = pd.Series(na)
# per-document preprocessing (run once per document inside the tokenization loop):
# remove stop words from tokens
stopped_tokens = [i for i in tokens if i not in en_stop]
# stem tokens
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
# add tokens to list
texts.append(stemmed_tokens)

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=20)

index = MatrixSimilarity(ldamodel[corpus])
index.save("simIndex.index")

print(ldamodel.print_topics(num_topics=30, num_words=2))

doc = stories['cast56']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lda = ldamodel[vec_bow]
sims = index[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
import gensim
from gensim.similarities import Similarity, MatrixSimilarity
# from pgfin_timing import Timer
from pgfin_helpers import tokenize

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

# load the corpora
print("\n Loading corpora.\n")
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print("\n Start similarity index.\n")
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print(index)
index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print(index_dense)
def k_cluster_wiki(input_prefix, output_prefix):
    k = 2000
    delta = 0.001
    max_iters = 10
    error = float('nan')
    old_error = float('nan')
    relative_error_change = float('nan')

    logger.info("Starting k-means clustering with k=%d, max iters=%d, delta=%f",
                k, max_iters, delta)
    m = ESAModel(input_prefix)
    similarity_index = m.similarity_index
    dictionary = m.dictionary
    num_topics = len(similarity_index)
    num_terms = len(dictionary)

    # Create initial cluster centroids.
    # L2-normalize them so we can calculate cosine similarity with a simple dot product.
    cluster_centroids = normalize(np.random.uniform(size=(k, num_terms)))

    # The cluster that each document belongs to.
    cluster_assignments = None

    logger.info("Preloading memory-mapped shards...")
    for i, shard in enumerate(similarity_index.shards):
        shard.get_index()

    iter = 0
    while iter < max_iters:
        # Calculate cosine similarities between each centroid and each topic.
        # To save time, we also calculate the error for the previous assignment
        # during this step.
        logger.info("Calculating cosine similarity of each cluster with each document...")
        previous_cluster_assignments = np.copy(cluster_assignments)
        previous_cluster_centroids = np.copy(cluster_centroids)
        cluster_counts = np.ones(k)  # Use ones instead of zeros to avoid divide by zero.
        cluster_centroids = np.zeros((k, num_terms))
        previous_centroid_distances = np.zeros(k)
        cluster_assignments = []
        docid = 0
        num_shards = len(similarity_index.shards)
        for i, shard in enumerate(similarity_index.shards):
            logger.info("Processing shard %d/%d ...", i, num_shards)

            # Calculate a (Cluster x Document) cosine similarity matrix for the
            # current shard: (C x T) . (T x D) = (C x D)
            logger.info("  Calculating similarities...")
            cluster_shard_similarities = previous_cluster_centroids * shard.get_index().index.transpose()

            # Select the most similar cluster for each document.
            logger.info("  Calculating argmax...")
            cluster_selections = np.argmax(cluster_shard_similarities, axis=0)
            cluster_assignments = np.hstack([cluster_assignments, cluster_selections])
            shard_first_docid = docid

            # Calculate errors for the previous assignment. We skip this on the
            # first iteration since we don't have an assignment yet.
            if previous_cluster_assignments.size != 1:  # np.copy() of None has size 1
                logger.info("  Calculating error...")
                for doc_cluster_sims in cluster_shard_similarities.transpose():
                    cluster = previous_cluster_assignments[docid]
                    previous_centroid_distances[cluster] += (1 - doc_cluster_sims[cluster])
                    docid += 1

            # Iteratively recalculate the centroid of each cluster, so we don't
            # have to swap each shard out and back in.
            docid = shard_first_docid  # Reset docid counter to before the error calcs.
            logger.info("  Computing new cluster centroids...")
            for topic_vec in shard.get_index().index:
                cluster = cluster_assignments[docid]
                cluster_centroids[cluster] += topic_vec
                cluster_counts[cluster] += 1
                docid += 1

        # print("Cluster assignments:", cluster_assignments)
        cluster_centroids /= cluster_counts[:, None]  # Take the average (off by one to avoid /0)
        cluster_centroids = normalize(cluster_centroids)  # And normalize.

        # We use the sum of all cosine distances as our error metric.
        old_error = error
        error = np.sum(previous_centroid_distances)
        relative_error_change = abs(1 - error / old_error)
        logger.info("> Iteration: %d, previous error: %f, old error: %f, rel change: %f",
                    iter, error, old_error, relative_error_change)

        # TODO: Drop clusters with zero members assigned and merge clusters that
        # have converged to the same centroid.
        # Checkpoint the clusterings in every iteration so we can test them
        # before they converge.
        # Save centroids.
        centroids_fname = "%s.cluster.%d.centroids" % (output_prefix, k)
        logger.info("Saving clusters to file: %s", centroids_fname)
        s = MatrixSimilarity(None, dtype=np.float64, num_features=num_terms)
        s.index = cluster_centroids
        s.save(centroids_fname)
        del s  # Free any RAM the similarity index might use.

        # Save assignments.
        assignments_fname = "%s.cluster.%d.assignments" % (output_prefix, k)
        logger.info("Saving cluster assignments to file: %s", assignments_fname)
        np.save(open(assignments_fname, 'wb'), cluster_assignments)

        if relative_error_change < delta:
            logger.info("Converged.")
            break

        iter += 1

    logger.info("Done.")
# query = 'oil and gas'
# from src.engine.preprocess import preprocess_body_lda
# query = preprocess_body_lda(query)
# corpus_query = [dictionary.doc2bow(query.split(" "))]
# transformed = tfidf[corpus_query]
#
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)
# logentropy_query = logentropy[transformed]

# note: despite the name, `lsi` holds an LdaModel here
lsi = models.LdaModel(corpus, id2word=dictionary, num_topics=30, passes=3,
                      alpha='auto', chunksize=4000)
lsi.save(settings.LDA_MODEL)
lsi = models.LdaModel.load(settings.LDA_MODEL)

from gensim.similarities import MatrixSimilarity

# num_features matches num_topics (the LDA vectors have at most 30 dimensions)
similarity_matrix = MatrixSimilarity(lsi[corpus], num_features=30)
similarity_matrix.save(settings.SIMILARITY_MATRIX)
# similarities = similarity_matrix.get_similarities(lsi[logentropy_query])
#
# lsi_query = lsi[logentropy_query]

from gensim import matutils
# matutils.cossim(lsi.)
# passes = 1, per = 11000; alpha='auto', per=9200
# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
language = sys.argv[1]
method = sys.argv[2].strip().lower()

logging.info("loading corpus mappings")
config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
                             resultDir=gensim_build.RESULT_DIR,
                             acceptLangs=[language])

logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids" % len(id2word))

corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
input = MmCorpus(config.resultFile('_%s.mm' % method))
assert len(input) == len(corpus), \
    "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus))

# initialize structure for similarity queries
if method == 'lsi' or method == 'rp':  # for these methods, use dense vectors
    index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms)
else:
    index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)

# do not normalize query vectors during similarity queries (the index is
# already built normalized, so it would be a no-op)
index.normalize = False

# for each document, print MAX_SIMILAR nearest documents to an xml file,
# in dml-cz specific format
generateSimilar(corpus, index, method)

logging.info("finished running %s" % program)