def compute(self):
    """Build the similarity index for ``self.texts`` using ``self.method``.

    Tokenizes the texts, builds a gensim dictionary and bag-of-words
    corpus, always derives a TF-IDF corpus, then fits the model and
    MatrixSimilarity index for the configured method ('TFIDF', 'LSI',
    'RP' or 'LDA').

    Side effects: sets ``self.space``, ``self.index`` and
    ``self.dictionary``.
    """
    def log(msg):
        # Progress output only when debugging is enabled; replaces the
        # original `write(...) if self.debug else ''` expression-statements
        # that abused a conditional expression for control flow.
        if self.debug:
            write(msg)

    vec_texts = [text.split() for text in self.texts]
    log("\n " + "-> Computing the dictionary".ljust(50, '.'))
    dictionary = Dictionary(vec_texts)
    log("[OK]")
    log("\n " + "-> Creating the bag-of-words space".ljust(50, '.'))
    corpus = [dictionary.doc2bow(vec) for vec in vec_texts]
    log("[OK]")
    log("\n " + ("-> Creating the %s space" % self.method).ljust(50, '.'))
    # TF-IDF is always computed: it is either the target space itself or
    # the input corpus for the LSI/RP/LDA models below.
    tfidf_space = TfidfModel(corpus)
    tfidf_corpus = tfidf_space[corpus]
    if self.method == 'TFIDF':
        self.space = tfidf_space
        self.index = MatrixSimilarity(tfidf_corpus)
    elif self.method == 'LSI':
        self.space = LsiModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'RP':
        self.space = RpModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    elif self.method == 'LDA':
        self.space = ldamodel.LdaModel(tfidf_corpus, id2word=dictionary, num_topics=self.num_t)
        self.index = MatrixSimilarity(self.space[tfidf_corpus])
    self.dictionary = dictionary
    log("[OK]\n")
def build_tfidf_or_lsi(corpus, method='tfidf'):
    """Build a document-ranking model over *corpus*.

    (Docstring translated from Russian.)

    :param corpus: iterable of tokenized documents (lists of terms).
    :param method: 'tfidf' or 'lsi'.
    :return: tuple of (term dictionary, fitted model, similarity matrix).
    :raises ValueError: if *method* is unknown — the original silently
        returned None in that case, which is easy to miss at call sites.
    """
    dictionary = Dictionary(corpus)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    model_tfidf = TfidfModel(corpus_bow)
    corpus_tfidf = [model_tfidf[doc] for doc in corpus_bow]
    if method == 'tfidf':
        # PERF: the TF-IDF similarity matrix is only needed on this path;
        # the original always built it, even when returning the LSI model.
        simil_tfidf = MatrixSimilarity(corpus_tfidf)
        return dictionary, model_tfidf, simil_tfidf
    if method == 'lsi':
        model_lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
        corpus_lsi = [model_lsi[doc] for doc in corpus_bow]
        simil_lsi = MatrixSimilarity(corpus_lsi)
        return dictionary, model_lsi, simil_lsi
    raise ValueError("method must be 'tfidf' or 'lsi', got %r" % (method,))
def trainModel(self):
    """Fit the LSI model and build its MatrixSimilarity index.

    When ``self.toweight`` is set, the corpus is passed through the
    TF-IDF transform first; otherwise the raw corpus is used directly.
    """
    # Choose the input space once, instead of duplicating the two calls
    # in each branch.
    source = self.tfidf[self.corpus] if self.toweight else self.corpus
    self.model = LsiModel(source, num_topics=self.num_topics)
    self.index = MatrixSimilarity(self.model[source])
def custom_queries(corpus, dictionary, paragraphs):
    """Run a sample query against the corpus in TF-IDF and LSI space and
    print the three most similar paragraphs for each model.

    :param corpus: list of bag-of-words documents.
    :param dictionary: gensim Dictionary used to encode the query.
    :param paragraphs: raw paragraph texts, parallel to *corpus*.
    """
    def show_top_matches(doc2similarity):
        # Print the 3 best-matching paragraphs (first 5 lines of each);
        # extracted because the tfidf and lsi report loops were identical.
        for doc_idx, similarity in sorted(doc2similarity, key=lambda kv: -kv[1])[:3]:
            paragraph = paragraphs[doc_idx].split("\n")
            number = doc_idx + 1
            print("[paragraph: " + str(number) + "]")
            for i in range(5):
                print(paragraph[i])
                if (i + 1) == len(paragraph):
                    break
            print("\n")

    # tfidf query:
    tfidf_model = TfidfModel(corpus, dictionary=dictionary)
    query = process_query("What is the function of money?", dictionary)
    tfidf_query = tfidf_model[query]
    tfidf_corpus = [tfidf_model[doc] for doc in corpus]
    # BUG FIX: the original reused the names `tfidf_index` / `lsi_index`
    # both for the MatrixSimilarity objects and as loop variables,
    # clobbering the index objects after the first loop iteration.
    tfidf_index = MatrixSimilarity(tfidf_corpus)
    print("tfidf query:")
    show_top_matches(enumerate(tfidf_index[tfidf_query]))

    # lsi query:
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    # BUG FIX: the LSI model is trained on the bag-of-words corpus and the
    # documents are projected from bag-of-words, yet the original projected
    # the *TF-IDF* query vector — mixing vector spaces. Project the BoW
    # query for consistency.
    lsi_query = lsi_model[query]
    lsi_corpus = [lsi_model[doc] for doc in corpus]
    lsi_index = MatrixSimilarity(lsi_corpus)
    print("lsi query:")
    show_top_matches(enumerate(lsi_index[lsi_query]))
def recomended_projects(self, request):
    """Return the top-3 recommended projects, serialized.

    Builds a tag bag per project (declared tags plus keywords extracted
    from the description), fits a TF-IDF model over them, and asks
    ``keywords_recommendation`` for the 3 projects closest to a fixed
    keyword list.

    NOTE(review): the query keywords are hard-coded ('uvg', 'gasolina',
    ...) and look like leftover test data — confirm whether they should
    come from *request* instead.
    """
    projects = ProjectRequest.objects.all()
    # (Removed two unused dict locals from the original.)
    tags_list = []
    for project in projects:
        description = project.description
        description_keywords = get_keywords(description.replace('"', ''))
        tags = project.tags.replace(' ', ',').lower()
        for keyword in description_keywords:
            tags += ',' + keyword[0].lower()
        tags_list.append(tags)
    df = read_frame(projects, fieldnames=['id', 'tags'], index_col=['id'])
    df['tags'] = tags_list
    keywords = df['tags'].tolist()
    keywords = [word_tokenize(keyword.lower()) for keyword in keywords]
    keywords = [no_commas(kw) for kw in keywords]
    processed_keywords = keywords
    dictionary = Dictionary(processed_keywords)
    corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
    tfidf = TfidfModel(corpus)
    sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    top_3 = keywords_recommendation(
        all_projects=df,
        keywords=['uvg', 'gasolina', 'potente', 'mcdonald', 'mecanico', 'gg', 'carros'],
        number_of_hits=3,
        data=[dictionary, tfidf, sims])
    # BUG FIX: the original loop variable shadowed the builtin `id`.
    recommended = [
        ProjectRequestSerializer(ProjectRequest.objects.get(pk=project_id)).data
        for project_id in top_3
    ]
    return Response(recommended)
def train_model_get_cosine_matrix(statements, num):
    """Train TF-IDF + LSI on *statements* and return the word-by-word
    cosine-similarity matrix as a DataFrame.

    :param statements: iterable of whitespace-separated strings.
    :param num: number of LSI topics.
    :return: pandas DataFrame whose index and columns are the sorted
        vocabulary of the corpus.
    """
    statements = [statement.split() for statement in statements]
    dictionary = corpora.Dictionary(statements)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in statements]
    # tfidf model
    # https://stackoverflow.com/questions/50521304/why-i-get-different-length-of-vectors-using-gensim-lsi-model
    tfidf = models.TfidfModel(doc_term_matrix, normalize=True)
    corpus_tfidf = tfidf[doc_term_matrix]
    lsi = models.LsiModel(corpus_tfidf, num_topics=num, id2word=dictionary)
    # Project every vocabulary word (as a one-word document) into LSI space.
    vocabulary = sorted(dictionary.token2id.keys())  # hoisted: was computed 3x
    words = [dictionary.doc2bow([word]) for word in vocabulary]
    vectorized_corpus = lsi[words]
    index = MatrixSimilarity(vectorized_corpus)
    # FIX: dropped a bare `index[vectorized_corpus]` statement whose result
    # was discarded — it recomputed the full similarity matrix for nothing.
    out = pd.DataFrame(index[vectorized_corpus])
    out.columns = vocabulary
    out.index = vocabulary
    return out
def main(argv):
    """Encode a sentence into the LSI semantic space built from the
    mtsamples corpus and return a JSON object mapping similarity score
    (as string) to article file name.

    :param argv: unused; ``sys.argv`` is read directly (kept for
        backward compatibility — NOTE(review): consider using *argv*).
    :return: JSON string of {score: filename}.
    """
    if len(sys.argv) != 2:
        # BUG FIX: this was a Python 2 `print` statement — a SyntaxError
        # under Python 3.
        print('usage: text_exp sentence')
        sys.exit(2)
    text = sys.argv[1]
    # first load the basic models
    MODELS_DIR = "models"
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.ERROR)
    dictionary = gensim.corpora.Dictionary.load(
        os.path.join(MODELS_DIR, "mtsamples.dict"))
    corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "mtsamples.mm"))
    # now, transform the text
    bow_text = dictionary.doc2bow(gensim.utils.tokenize(text))
    # generate a tfidf model from the set of all articles
    tfidf = gensim.models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    # then generate a LSI model from the set of all articles
    lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # now, create a dense index from the set of all articles
    index_dense = MatrixSimilarity(lsi[corpus])
    # translate the input query into the lsi space
    vec_lsi = lsi[bow_text]
    # rank every article by similarity to the query, best first
    sims = sorted(enumerate(index_dense[vec_lsi]), key=lambda item: -item[1])
    # FIX: close the pickle file (the original leaked the handle)
    with open('models/file_list.p', 'rb') as fh:
        file_list = pickle.load(fh)
    # map each similarity score to the matching article name
    dictSimilars = {str(score): file_list[doc_idx] for doc_idx, score in sims}
    return json.dumps(dictSimilars)
def calculate_tfidf_cos_sim(text1, text2, dictionary_tfidf, corpus):
    """Cosine similarity between two texts in TF-IDF space."""
    vec_a = to_tfidf(text1, dictionary_tfidf, corpus)
    vec_b = to_tfidf(text2, dictionary_tfidf, corpus)
    # A one-document index: querying it yields a single-element array.
    similarity_index = MatrixSimilarity([vec_a],
                                        num_features=len(dictionary_tfidf))
    return float(similarity_index[vec_b][0])
def calAuthorSim():
    """Compute the 30 most similar authors for every poet in the
    author-topic model and persist them into the `author` table.
    """
    conn = sqlite3.connect(config.db_path)
    db = conn.cursor()
    model = AuthorTopicModel.load(config.author_model128_path)
    poets = list(model.id2author.values())
    print(len(poets))
    # num_best=30 keeps only the 30 nearest neighbours per query.
    index = MatrixSimilarity(model[poets], num_best=30)
    index.save(config.author_simMatrix_path)
    for name in poets:
        sims = index[model[name]]
        sims = sorted(sims, key=lambda item: -item[1])
        sims = [[poets[sim[0]], sim[1]] for sim in sims]
        # SECURITY FIX: the original interpolated values straight into the
        # SQL string with .format(); use a parameterized query instead
        # (the commented-out original code already hinted at this form).
        db.execute("UPDATE author SET sims=? WHERE id=?", (toJson(sims), name))
    conn.commit()
def find_similarity_scores(self, topics):
    """Compute pairwise LSI similarity scores between the documents in
    ``self.corpus`` and store the results on the object.

    :param topics: number of latent topics for the LSI model.

    Side effects: overwrites ``self.tfidf`` twice (first the model, then
    the transformed corpus — NOTE(review): the same attribute holds two
    different things over time; callers relying on it should be checked),
    fills ``self.ind_word_scores``, converts ``self.f_list`` and
    ``self.data`` to numpy arrays, and sets ``self.sim_matrix``.
    """
    # Create similarities container
    similarities = {'Resumes': {}}
    # Gensim requires a corpora data structure for transformations and analysis
    dictionary = corpora.Dictionary(self.corpus)
    # Convert text to BoW. It already is but lets be sure.
    corpus_gensim = [dictionary.doc2bow(doc) for doc in self.corpus]
    # Term Frequency-Inverse Document Frequency (TF-IDF) transformation sets weights small
    # when they appear more often in the text.
    self.tfidf = TfidfModel(corpus_gensim)
    print(self.tfidf)  # NOTE(review): debug print left in place
    # From here on self.tfidf is the *transformed corpus*, not the model.
    self.tfidf = self.tfidf[corpus_gensim]
    print(self.tfidf)  # NOTE(review): debug print left in place
    # Find similarity via vector-space pair-wise cosine angle absolute value via Latent Semantic Indexing (LSI)
    # https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
    lsi = LsiModel(self.tfidf, id2word=dictionary, num_topics=topics)
    lsi_index = MatrixSimilarity(lsi[self.tfidf])
    similarities['Resumes']["LSI_Similarity"] = np.array(
        [lsi_index[lsi[self.tfidf[i]]] for i in range(len(self.corpus))])
    # Record the TF-IDF weight of every word; later documents overwrite
    # earlier ones for words they share.
    for doc in self.tfidf:
        for word_id, value in doc:
            word = dictionary.get(word_id)
            self.ind_word_scores[word] = value
    # Convert to numpy arrays
    self.f_list = np.array(self.f_list)
    self.data = np.array(self.data)
    # Return results to object
    self.sim_matrix = similarities
def train_model(self):
    """
    Read the preprocessed data and generate the corpus dictionary, tfidf
    model and matrix (cosine) similarity, saving each artifact to disk.

    :return: status message describing success or failure
    """
    try:
        data = pd.read_csv(self.processed_data)
        del data['Unnamed: 0']
        # creating tokens for the doc column
        corpus = data['doc'].map(break_to_tokens)
        # creating dictionary of words in the movie dataset
        dictionary = gensim.corpora.Dictionary(corpus)
        dictionary.save(self.corpus_dictionary)
        # creating vector with bag of words for the corpus
        vector = [dictionary.doc2bow(d) for d in corpus]
        # creating tfidf values for the vector
        tfidf = models.TfidfModel(vector)
        tfidf.save(self.tfidf_model)
        corpus_tfidf = tfidf[vector]
        # Compute Similarities
        similarity = MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
        similarity.save(self.matrix_similarity)
        return "Model Trained Successfully"
    except Exception:
        # BUG FIX: the original used a bare `except:`, which also swallows
        # SystemExit/KeyboardInterrupt; narrowed to Exception.
        return "Error While Training Model"
def train(self, classdict, nb_topics, *args, **kwargs):
    """ Train the topic modeler.

    :param classdict: training data
    :param nb_topics: number of latent topics
    :param args: arguments to pass to the `train` method for gensim topic models
    :param kwargs: arguments to pass to the `train` method for gensim topic models
    :return: None
    :type classdict: dict
    :type nb_topics: int
    """
    self.nb_topics = nb_topics
    self.generate_corpus(classdict)
    # Optionally re-weigh the corpus with TF-IDF before topic modelling.
    self.tfidf = TfidfModel(self.corpus) if self.toweigh else None
    normcorpus = self.tfidf[self.corpus] if self.toweigh else self.corpus
    model_cls = gensim_topic_model_dict[self.algorithm]
    self.topicmodel = model_cls(normcorpus, num_topics=self.nb_topics,
                                *args, **kwargs)
    self.matsim = MatrixSimilarity(self.topicmodel[normcorpus])
    # change the flag
    self.trained = True
def cs_lda(sp, df, feature, session, update_model):
    """Compute pairwise cosine similarity on *feature* using a trained
    LDA model, and save the result as a pivot table.

    When *update_model* is true, the session directory, dictionary and
    model are (re)created first.
    """
    print(f"Starting LDA ...")
    if update_model:
        session = session + "lda-" + feature
        if not os.path.exists(session):
            print(f"New directory: {session}")
            os.mkdir(session)
        session = session + "/"
        create_dictionary(session, df, feature)
        create_model(session, df, feature)
    else:
        session = session + "lda-" + feature + "/"
    print(f"Computing Cosine Similarity on feature {feature}")
    dct = get_dict(feature, session)
    # Titles skip stop-word removal; everything else is filtered first.
    if feature != "title":
        texts = common.remove_stopwords(df[feature]).tolist()
    else:
        texts = df[feature].tolist()
    bow_corpus = [dct.doc2bow(doc.split()) for doc in texts]
    lda = LdaMulticore.load(session + "LDA-model-" + feature)
    index = MatrixSimilarity(lda[bow_corpus])
    # index.save("simIndex.index")

    def compute(text):
        # Project one text into LDA space and score it against all docs.
        return index[lda[dct.doc2bow(text.split())]]

    results = for_pivot(df[feature], df, compute)
    common.save_as_pivot(results, sp=sp)
def cossim(query, documents):
    """Cosine similarity between *query* and each of *documents* in
    TF-IDF space.

    Relies on the module-level `tfidf` model and `dictionary`.
    """
    query_vec = tfidf[dictionary.doc2bow(query)]
    doc_bows = [dictionary.doc2bow(document) for document in documents]
    index = MatrixSimilarity(tfidf[doc_bows], num_features=len(dictionary))
    return index[query_vec]
def cos_sim(text1, text2):
    """Cosine similarity between two texts in TF-IDF space.

    Relies on the module-level `to_tfidf` helper and `dictionary`.
    """
    vec_a = to_tfidf(text1)
    vec_b = to_tfidf(text2)
    sim_index = MatrixSimilarity([vec_a], num_features=len(dictionary))
    # The query result is a one-element array; unwrap it to a plain float.
    return float(sim_index[vec_b][0])
def __init__(self, num_topics=100, wiki_tokens_path='data/token_sents.pkl',
             wiki_sents_path='data/sents.pkl',
             student_tokens_path='data/children_data.json'):
    """Initialize the base-class data, fit an LSI model with *num_topics*
    topics, and build a similarity index over the wiki TF-IDF corpus.

    :param num_topics: number of latent LSI topics.
    :param wiki_tokens_path: pickle of tokenized wiki sentences.
    :param wiki_sents_path: pickle of raw wiki sentences.
    :param student_tokens_path: JSON of student token data.
    """
    super().__init__(wiki_tokens_path, wiki_sents_path, student_tokens_path)
    # compute_lsi and wiki_tfidf_corpus are provided by the superclass.
    self.lsi = self.compute_lsi(num_topics)
    self.lsi_index = MatrixSimilarity(self.lsi[self.wiki_tfidf_corpus])
def generate_embeddings(documents, dictionary):
    """Build a document/word similarity index and the document-document
    similarity matrix, saving both to disk.

    :param documents: iterable of tokenized documents.
    :param dictionary: gensim Dictionary mapping tokens to ids.
    :return: (doc_word_index matrix, doc_doc_index matrix)
    """
    doc_word_index = MatrixSimilarity(
        [dictionary.doc2bow(document) for document in documents],
        num_features=len(dictionary))
    # NOTE(review): each document is re-encoded with doc2bow here, doubling
    # the work done just above.
    doc_doc_index = np.array(
        [doc_word_index[dictionary.doc2bow(doc)] for doc in documents])
    # NOTE(review): inconsistent directory casing — 'Word2vec' vs 'Word2Vec'.
    # One of these paths is almost certainly a typo; confirm which directory
    # actually exists before changing either string.
    np.save('models/Word2vec/doc_word_index', doc_word_index.index)
    np.save('models/Word2Vec/doc_doc_index', doc_doc_index)
    return doc_word_index.index, doc_doc_index
def build_models(self):
    """Fit the TF-IDF and LSI models over ``self.corpus`` and build a
    MatrixSimilarity index for each space.
    """
    # TF-IDF: map each bag-of-words to (word-index, word-weight) pairs.
    self.tfidf_model = TfidfModel(self.corpus)
    self.tfidf_corpus = [self.tfidf_model[bow] for bow in self.corpus]
    self.tfidf_similarity = MatrixSimilarity(self.tfidf_corpus)
    # LSI on top of the TF-IDF space, 100 latent topics.
    self.lsi_model = LsiModel(self.tfidf_corpus, id2word=self.dictionary,
                              num_topics=100)
    self.lsi_corpus = [self.lsi_model[vec] for vec in self.tfidf_corpus]
    self.lsi_similarity = MatrixSimilarity(self.lsi_corpus)
def lsi(corpus, dictionary):
    """Fit a 100-topic LSI model on *corpus* and return its similarity
    matrix. Also prints the learned topics.

    :param corpus: bag-of-words corpus.
    :param dictionary: gensim Dictionary for id2word mapping.
    :return: MatrixSimilarity over the LSI-projected corpus.
    """
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    # IDIOM FIX: replaced the `for i in range(len(corpus))` append loop
    # with a comprehension.
    lsi_corpus = [lsi_model[doc] for doc in corpus]
    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    """Build the dictionary, TF-IDF model and similarity index for
    *documents*, persist all three under the given file names, and
    return them.

    :return: (tfidf model, similarity index, dictionary)
    """
    dictionary = Dictionary(documents)
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(bow_corpus)
    index = MatrixSimilarity(tfidfmodel[bow_corpus],
                             num_features=len(dictionary))
    # Persist every artifact so it can be reloaded without retraining.
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
def tf_idf(corpus):
    """Fit a TF-IDF model on *corpus* and return the cosine-similarity
    index over the transformed corpus.

    :param corpus: bag-of-words corpus.
    :return: MatrixSimilarity over the TF-IDF-projected corpus.
    """
    tfidf_model = TfidfModel(corpus)
    # IDIOM FIX: replaced the `for i in range(len(corpus))` append loop
    # with a comprehension.
    tfidf_corpus = [tfidf_model[doc] for doc in corpus]
    tfidf_similarity_matrix = MatrixSimilarity(tfidf_corpus)
    return tfidf_similarity_matrix
def load(self):
    """ load the corpora created by `make_corpus.py` """
    self.corpus = MmCorpus(self.corpus_file)
    self.dictionary = Dictionary.load_from_text(self.dict_file)
    self.titles = load_titles(self.title_file)
    self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
    # The similarity index is rebuilt from the loaded model and corpus
    # rather than loaded from disk.
    self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
def find_similarity(vec_lsi, df, k=30, num_features=72):
    """Print and return the indices of the *k* documents most similar to
    *vec_lsi* in the pre-computed LSI corpus.

    :param vec_lsi: query vector(s) in LSI space.
    :param df: indexable collection of the original documents (printed).
    :param k: number of top matches to return.
    :param num_features: dimensionality of the LSI space (default 72,
        matching the stored corpus — parameterized from the original
        hard-coded constant, backward-compatible).
    :return: array of document indices, best match first.
    """
    with open('models/LSI_model/corpus_lsi.pickle', 'rb') as handle:
        corpus_lsi = pickle.load(handle)
    sim_index = MatrixSimilarity(corpus_lsi, num_features=num_features)
    sims = sim_index[vec_lsi]
    # BUG FIX: the original reused the name `index` for both the
    # MatrixSimilarity object and the result array; renamed for clarity.
    top = sims[0].argsort()[-k:][::-1]
    for i in top:
        print(i, "------->", df[i])
    return top
def soft_cosine_similarity(text_1, text_2, corpus):
    """Soft cosine similarity between two tokenized texts, using a term
    similarity matrix derived from a Word2Vec model trained on *corpus*.

    :param text_1: tokenized document (list of tokens).
    :param text_2: tokenized document (list of tokens).
    :param corpus: tokenized corpus used both to build the dictionary and
        to train the Word2Vec model.
    :return: soft cosine similarity score.
    """
    dictionary = Dictionary(corpus)
    text_1 = dictionary.doc2bow(text_1)
    text_2 = dictionary.doc2bow(text_2)
    # NOTE(review): training with workers=cpu_count() is not fully
    # deterministic even with a fixed seed — confirm reproducibility needs.
    w2v_model = Word2Vec(corpus, workers=cpu_count(), min_count=1, size=300,
                         seed=12345)
    # NOTE(review): `wv.syn0`, the `size` parameter and `softcossim` are
    # deprecated/removed in newer gensim (use `wv.vectors` /
    # `vector_size` / SparseTermSimilarityMatrix); left as-is to match the
    # installed version.
    similarity_matrix = sparse.csr_matrix(
        MatrixSimilarity(Dense2Corpus(w2v_model.wv.syn0.T)))
    return softcossim(text_1, text_2, similarity_matrix)
def cos_sim(text1, text2):
    """Calculate the TF-IDF cosine similarity between two input strings.

    Relies on the module-level `to_tfidf` helper and `dictionary`.

    :param text1: input string
    :param text2: input string
    :return: cosine similarity as a plain float
    """
    vec_a = to_tfidf(text1)
    vec_b = to_tfidf(text2)
    sim_index = MatrixSimilarity([vec_a], num_features=len(dictionary))
    return float(sim_index[vec_b][0])
def news_recommend_keywords(keywords, num=10):
    """Recommend news articles matching a keyword query.

    :param keywords: whitespace-separated query keywords.
    :param num: number of results to return.
    :return: (titles, scores) of the *num* most similar news items.
    """
    keywords = [word for word in keywords.split()]
    path_df = "Pickles/News_central_rec2.pickle"
    with open(path_df, 'rb') as data:
        df = pickle.load(data)
    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = []
        for col in columns:
            if col == 'Content':
                words += row[col].split()
        words = list(set(words))
        # BUG FIX: the original assigned into the `row` copy yielded by
        # iterrows(), which does NOT write back to the DataFrame — every
        # bag_of_words stayed ''. Write through df.at instead.
        df.at[index, 'bag_of_words'] = words
    processed_keywords = df.bag_of_words.to_list()
    # create a dictionary of words from our keywords
    dictionary = Dictionary(processed_keywords)
    # corpus: one bag of words per document
    corpus = [dictionary.doc2bow(doc) for doc in processed_keywords]
    tfidf = TfidfModel(corpus)  # tfidf model of the corpus
    # Similarity structure over all news items.
    sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    # Encode the query and convert it to tf-idf space.
    query_doc_bow = dictionary.doc2bow(keywords)
    query_doc_tfidf = tfidf[query_doc_bow]
    # Similarity of the query against every other news item.
    similarity_array = sims[query_doc_tfidf]
    similarity_series = pd.Series(similarity_array.tolist(),
                                  index=df.Title.values)
    # Top matching results, i.e. most similar news.
    top_hits = similarity_series.sort_values(ascending=False)[:num]
    titles = []
    scores = []
    for title, score in zip(top_hits.index, top_hits):
        # (Removed the unused `enumerate` index from the original.)
        titles.append(title)
        scores.append(score)
    return titles, scores
def cossim(query, documents, dictionary, num_best=30):
    """Return the indices of the *num_best* documents most similar to
    *query* under bag-of-words cosine similarity.

    The raw query string is preprocessed and tokenized before scoring.
    """
    preprocessed = word_tokenizer(
        remove_null_sentence(text_preprocessing(sentences=[query])))
    query_bow = dictionary.doc2bow(preprocessed[0])
    doc_index = MatrixSimilarity(
        [dictionary.doc2bow(document) for document in documents],
        num_features=len(dictionary))
    scores = doc_index[query_bow]
    # Highest-scoring document indices first.
    return scores.argsort()[-num_best:][::-1]
def index(self, corpus, mode="MatrixSimilarity"):
    """Build the similarity index over ``self.corpus`` using the
    requested *mode* and score *corpus* against it.

    :raises TypeError: if *mode* names an unknown index type.
    """
    if mode == "MatrixSimilarity":
        builder = MatrixSimilarity
    elif mode == "SparseMatrixSimilarity":
        builder = SparseMatrixSimilarity
    else:
        raise TypeError(
            "mode has to be either MatrixSimilarity or SparseMatrixSimilarity"
        )
    self._index = builder(self.corpus, num_features=self.num_features)
    return self._index[corpus]
def semantic_vector_similarity(dataset, test_data):
    """Predict essay scores from cosine similarity to other essays'
    LSI semantic vectors.

    :param dataset: list of samples with 'essay_token' and 'domain1_score'.
    :param test_data: samples to score; each gets a
        'semantic_vector_similarity' key set as a side effect.
    :return: column vector (n, 1) of predicted scores.

    NOTE(review): *test_data* is zipped with the corpus built from
    *dataset*; if their lengths differ the shorter wins — confirm this
    element-wise pairing is intended.
    """
    print("semantic_vector_similarity")
    # Stop words plus punctuation tokens to drop before modelling.
    cacheStopWords = pw.words("english")
    punc = ['.', ',', '?', '!', '@', '"', 'n\'t']
    cacheStopWords.extend(punc)
    token_sets = []
    for data in dataset:
        essay_token = [word for word in data['essay_token']
                       if word.lower() not in cacheStopWords]
        token_sets.append(essay_token)
    dictionary = corpora.Dictionary(token_sets)
    corpus = [dictionary.doc2bow(tokens) for tokens in token_sets]
    lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=20)
    index = MatrixSimilarity(lsi_model[corpus])
    scores = numpy.array([sample['domain1_score'] for sample in dataset])
    # FIX: removed dead code from the original — Min/Max, the show_topics
    # result, the unused top-20 idxs/_sim/_scores lists and score_list —
    # none of which affected the returned predictions.
    predict_score_list = []
    for sample, essay in zip(test_data, corpus):
        sim = index[lsi_model[essay]]
        # All essays' scores, weighted by similarity, averaged over the set.
        predict_score = numpy.sum(numpy.multiply(scores, sim)) / len(dataset)
        sample['semantic_vector_similarity'] = predict_score
        predict_score_list.append(predict_score)
    return numpy.array(predict_score_list).reshape(-1, 1)
def main(self): print("Recommendation using TF_IDF") # Loading preprocessed data vagas_ti = pd.read_csv(self.dataPrepFile) vagas_ids = pickle.load( open(self.out + "preprocessing/vagas_ids.array", "rb")) vagas_words = pickle.load( open(self.out + "preprocessing/vagas_words.list", "rb")) cvs_words = pickle.load( open(self.out + "preprocessing/cvs_words.series", "rb")) cvs = pd.read_csv(self.dataCvsFile) cvs = cvs.fillna("") cvs.isnull().any() #print("Loading cvs done!") # Creating a dictionary dictionary = gcorp.Dictionary(vagas_words) dictionary.save(self.out + 'preprocessing/tf_idf/vagas.dict' ) # store the dictionary, for future reference # compile corpus (vectors number of times each elements appears) raw_corpus = [dictionary.doc2bow(v) for v in vagas_words] gcorp.MmCorpus.serialize(self.out + 'preprocessing/tf_idf/vagas.mm', raw_corpus) # store to disk print("Tamanho do dicionário: " + str(len(dictionary))) # STEP 2 : similarity between corpuses dictionary = gcorp.Dictionary.load(self.out + 'preprocessing/tf_idf/vagas.dict') corpus = gcorp.MmCorpus(self.out + 'preprocessing/tf_idf/vagas.mm') # Transform Text with TF-IDF tfidf = gsm.TfidfModel(corpus) # step 1 -- initialize a model # corpus tf-idf corpus_tfidf = tfidf[corpus] # STEP 3 : Create similarity matrix of all files index = MatrixSimilarity(corpus_tfidf, num_features=len(dictionary), num_best=10) index.save(self.out + 'preprocessing/tf_idf/vagas.index') index = MatrixSimilarity.load(self.out + 'preprocessing/tf_idf/vagas.index') self.recommendationTf_idf(cvs, vagas_ti, vagas_ids, cvs_words, dictionary, tfidf, index) print("Recommendation using TF_IDF done!")