def search_SkipGram(model, query, id2corpus=ID2CORPUS, result_len=MAX_NUMBER_OF_RESULTS):
    """
    Rank every document against the query by cosine similarity.

    Args:
        model: object exposing ``tok2idx`` (token -> index mapping) and
            ``doc2vec`` (document -> tensor).
        query: raw query string; tokenised with ``read_ap.process_text``.
        id2corpus: mapping of document id -> document, in whatever form
            ``model.doc2vec`` accepts.
        result_len: maximum number of results to return.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score and
        truncated to ``result_len`` entries.
    """
    # Create a vector representation of the query, keeping only the tokens
    # the model has an index for.
    query_repr = [
        model.tok2idx[q_tok]
        for q_tok in read_ap.process_text(query)
        if q_tok in model.tok2idx
    ]
    q_vec = model.doc2vec(query_repr).unsqueeze(dim=0)
    q_sq_norm = torch.mm(q_vec, q_vec.T)

    print('Comparing all document vectors to the query vector...')
    results = []
    for doc_id, doc in id2corpus.items():
        vec = model.doc2vec(doc).unsqueeze(dim=0)
        sq_norm = torch.mm(vec, vec.T)
        # v.v and q.q are *squared* norms, so the cosine denominator is the
        # square root of their product. The clamp keeps an all-zero vector
        # from producing a NaN score.
        denom = torch.clamp(torch.sqrt(sq_norm * q_sq_norm), min=1e-12)
        score = torch.mm(vec, q_vec.T) / denom
        results.append((doc_id, float(score)))

    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:result_len]
def similar_words(vocab_embs, word, n_of_similar_words, word2id, id2word):
    """
    Find the words whose embeddings are closest to the embedding of ``word``.

    The word is first run through ``ra.process_text`` and the first resulting
    token is looked up in ``word2id``. Its embedding row is then compared by
    cosine similarity against every row of ``vocab_embs``.

    Returns:
        list with the ``n_of_similar_words`` words of highest similarity,
        best first.
    """
    token = ra.process_text(word)[0]
    query_emb = vocab_embs[word2id[token], :].reshape(1, -1)

    # One row of similarities: the query embedding against the whole vocabulary.
    sim_matrix = sklearn.metrics.pairwise.cosine_similarity(
        query_emb, vocab_embs, dense_output=True)
    sims = list(sim_matrix[0])

    # Indices of the n highest-scoring vocabulary entries.
    top_ids = heapq.nlargest(
        n_of_similar_words, range(len(sims)), key=lambda idx: sims[idx])
    return [id2word[idx] for idx in top_ids]
def rank_documents(model, model_name, type, query):
    """
    Rank all documents for a query with an LSI or LDA model.

    Relies on the module-level ``dictionary``, ``corpus``, ``corpus_tfidf``
    and ``new_docs`` objects.

    Args:
        model: the trained model, applied via ``model[...]``.
        model_name: ``"LSI"`` selects similarity ranking; any other value
            selects the KL-divergence (LDA) ranking.
        type: for LSI, ``"bow"`` or ``"tfidf"`` (which corpus is indexed).
        query: raw query string.

    Returns:
        A list of ``(doc_id, np.float64 score)`` sorted by descending score,
        or ``None`` for an LSI call with an unrecognised ``type``.
    """
    processed_query = read_ap.process_text(query)
    print(processed_query)

    if model_name == "LSI":
        if type == "bow":
            lsi_corpus = model[corpus]
        elif type == "tfidf":
            # NOTE(review): the corpus is tf-idf weighted here but the query
            # below is projected from its raw bag-of-words vector; confirm
            # whether the query should pass through the tf-idf model first.
            lsi_corpus = model[corpus_tfidf]
        else:
            # Unrecognised LSI variant: nothing is ranked (unchanged behaviour).
            return None

        # Index the corpus in LSI space and score the query against it.
        index = gensim.similarities.MatrixSimilarity(lsi_corpus)
        vec_lsi = model[dictionary.doc2bow(processed_query)]
        ranked = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
    else:
        # LDA: score each document by the *negative* KL divergence so that the
        # closest topic distribution ranks first.
        vec_bow = dictionary.doc2bow(processed_query)
        # presumably the model was built with per_word_topics=True, so [0]
        # selects the topic distribution -- TODO confirm.
        vec_lda_query = model[vec_bow][0]
        neg_kl = [
            # num_features keeps gensim from sizing the dense vectors by the
            # number of sparse entries instead of the number of topics.
            -kullback_leibler(vec_lda_query, model[text][0],
                              num_features=model.num_topics)
            for text in corpus
        ]
        ranked = sorted(enumerate(neg_kl), key=lambda item: -item[1])

    # Store the scores with the associated doc ids for retrieval evaluation.
    doc_ids = list(new_docs.keys())
    return [(doc_ids[i], np.float64(s)) for i, s in ranked]
def rank(model, docs, query_raw):
    """
    Rank all document vectors of ``model`` against a raw query string.

    ``docs`` is accepted but not used. The query is tokenised with
    ``process_text``, embedded with ``model.infer_vector`` and compared
    against every entry of ``model.docvecs``.

    Returns:
        Whatever ``model.docvecs.most_similar`` returns for the full
        collection (``topn`` equals the number of document vectors).
    """
    tokens = process_text(query_raw)
    inferred = model.infer_vector(tokens)
    n_docs = len(model.docvecs)
    return model.docvecs.most_similar([inferred], topn=n_docs)
def doc2vec_search(model, query):
    """
    Rank all documents for ``query`` using a doc2vec model.

    The ids returned by ``most_similar`` are translated through the
    module-level ``keys_tags_dict`` and scores are cast to ``np.float64``.

    Returns:
        list of ``(mapped_doc_id, score)`` in the order given by
        ``model.docvecs.most_similar``.
    """
    tokens = read_ap.process_text(query)
    query_vec = model.infer_vector(tokens)
    neighbours = model.docvecs.most_similar([query_vec], topn=len(model.docvecs))

    ranked = []
    for doc_id, score in neighbours:
        ranked.append((keys_tags_dict[doc_id], np.float64(score)))
    return ranked
def rank_query_given_document(query_text, doc2vec_model):
    """
    Rank the documents of ``doc2vec_model`` for a query.

    The query text is tokenised, turned into a vector with
    ``infer_vector`` and compared against all stored document vectors.

    Returns:
        The result of ``docvecs.most_similar`` over the whole collection.
    """
    docvecs = doc2vec_model.docvecs
    tokens = read_ap.process_text(query_text)
    inferred = doc2vec_model.infer_vector(tokens)
    return docvecs.most_similar([inferred], topn=len(docvecs))
def match_query_against_docs(self, query, doc_ids, doc_embeddings):
    """
    Score a query against pre-computed document embeddings.

    The query tokens are embedded with ``self.model.inference_on_words``,
    aggregated using the method in ``self.ARGS.aggr`` and compared with
    ``calc_cosine_similarity``.

    Returns:
        list of ``(doc_id, similarity)`` in the order of the sorted indices
        returned by ``calc_cosine_similarity``.
    """
    tokens = read_ap.process_text(query)
    word_embs = self.model.inference_on_words(tokens)
    q_embedding = aggregate_embeddings(word_embs, method=self.ARGS.aggr)
    similarities, sorted_doc_idx = calc_cosine_similarity(q_embedding, doc_embeddings)

    results = []
    for idx in sorted_doc_idx:
        pos = idx.item()
        results.append((doc_ids[pos], similarities[pos].item()))
    return results
def rank_documents(model, model_name, type, query):
    """
    Rank all documents for a query with an LSI or LDA model.

    Relies on the module-level ``dictionary``, ``corpus``, ``corpus_tfidf``,
    ``new_docs`` and ``NUM_TOPICS`` objects.

    Args:
        model: the trained model. For the LDA branch an ``LdaModel`` instance
            is used directly; anything else falls back to training a fresh
            LDA model on ``corpus`` (the previous behaviour).
        model_name: ``"LSI"`` selects similarity ranking; any other value
            selects the negative-KL (LDA) ranking.
        type: for LSI, ``"bow"`` or ``"tfidf"`` (which corpus is indexed).
        query: raw query string.

    Returns:
        A list of ``(doc_id, np.float64 score)`` sorted by descending score,
        or ``None`` for an LSI call with an unrecognised ``type``.
    """
    processed_query = read_ap.process_text(query)
    print(processed_query)
    vec_bow = dictionary.doc2bow(processed_query)

    if model_name == "LSI":
        if type == "bow":
            print(query.lower().split())
            lsi_corpus = model[corpus]
        elif type == "tfidf":
            # NOTE(review): the corpus is tf-idf weighted here but the query
            # is projected from its raw bag-of-words vector; confirm whether
            # the query should pass through the tf-idf model first.
            lsi_corpus = model[corpus_tfidf]
        else:
            # Unrecognised LSI variant: nothing is ranked (unchanged behaviour).
            return None

        index = gensim.similarities.MatrixSimilarity(lsi_corpus)
        ranked = sorted(enumerate(index[model[vec_bow]]), key=lambda item: -item[1])
    else:
        # Use the supplied LDA model instead of retraining one per query.
        if isinstance(model, gensim.models.LdaModel):
            lda = model
        else:
            lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
        num_topics = lda.num_topics

        def topic_distribution(bow):
            # Dense topic vector; minimum_probability=0.0 asks gensim not to
            # drop low-probability topics.
            sparse = lda.get_document_topics(bow, minimum_probability=0.0)
            return gensim.matutils.sparse2full(sparse, num_topics)

        query_dist = topic_distribution(vec_bow)
        # Negative KL(query || doc): the closest topic distribution ranks
        # first. The small constant keeps the divergence finite if a topic is
        # still missing from a document.
        neg_kl = [
            -gensim.matutils.kullback_leibler(query_dist, topic_distribution(doc) + 1e-12)
            for doc in corpus
        ]
        ranked = sorted(enumerate(neg_kl), key=lambda item: -item[1])

    # Build the id list once rather than once per ranked document.
    doc_ids = list(new_docs.keys())
    return [(doc_ids[i], np.float64(s)) for i, s in ranked]
def compute_metrics(docs, vocab_embs, word2id, id2word):
    """
    For a trained model, compute MAP and NDCG over a set of queries and all
    documents in the corpus.

    Document embeddings are built with ``create_doc_emb`` and cached in
    ``./pickles/word2vec_doc_embs.pkl``; later calls reuse that cache. For
    every query id outside 76-99, the TREC rows returned by ``get_ranking``
    are appended to ``./results/word2vec_trec.csv``.

    Args:
        docs: mapping of document id -> document tokens.
        vocab_embs: embedding matrix passed to ``create_doc_emb`` and
            ``get_ranking``.
        word2id: token -> row index mapping.
        id2word: row index -> token mapping.

    Returns:
        metrics: a nested dict of queries and their MAP and NDCG scores.
    """
    cache_path = "./pickles/word2vec_doc_embs.pkl"

    # Create document embeddings (or reuse the cached ones).
    if not os.path.exists(cache_path):
        print("constructing document embeddings")
        doc_embs = {}
        for d in tqdm(list(docs.keys())):
            doc_embs[d] = create_doc_emb(vocab_embs, docs[d], word2id, id2word)
        with open(cache_path, "wb") as writer:
            pkl.dump(doc_embs, writer)
    else:
        # NOTE: unpickling can execute code; only load caches this function wrote.
        with open(cache_path, "rb") as reader:
            doc_embs = pkl.load(reader)

    # Create a query embedding and compare it to every document embedding.
    qrels, queries = ra.read_qrels()
    overall_ser = {}  # ranking per query
    for qid in tqdm(qrels):
        query = ra.process_text(queries[qid])
        query_emb = create_doc_emb(vocab_embs, query, word2id, id2word)
        ranking, trec_results = get_ranking(qid, query_emb, doc_embs, vocab_embs)
        overall_ser[qid] = ranking

        if int(qid) not in range(76, 100):
            with open("./results/word2vec_trec.csv", "a+") as f:
                f.write("\n".join("{},{},{},{},{},{}".format(
                    x[0], x[1], x[2], x[3], x[4], x[5]) for x in trec_results))
                f.write("\n")

    # Compute the MAP and NDCG per query.
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
    metrics = evaluator.evaluate(overall_ser)

    # Average over the queries that were actually evaluated; dividing by
    # len(queries) skews the mean when the two sets differ in size.
    average = {'map': 0, 'ndcg': 0}
    for q in metrics.values():
        average['map'] += q['map']
        average['ndcg'] += q['ndcg']
    n_evaluated = len(metrics)
    if n_evaluated:
        average['map'] = average['map'] / n_evaluated
        average['ndcg'] = average['ndcg'] / n_evaluated
    print(
        'average model evaluation scores over all queries {}'.format(average))

    return metrics
def search(self, query, max_docs=1000):
    """
    Return the best ``max_docs`` documents for ``query``.

    The query is converted to a bag-of-words vector with
    ``self.dictionary``, projected through ``self.model`` and scored
    against ``self.index``.

    Returns:
        list of ``(doc_id, score)`` sorted by descending score, where the
        ids come from ``self.doc_index_map``.
    """
    tokens = read_ap.process_text(query)
    vec_lsi = self.model[self.dictionary.doc2bow(tokens)]
    ranked = sorted(
        enumerate(self.index[vec_lsi]), key=lambda pair: pair[1], reverse=True)
    return [
        (self.doc_index_map[position], similarity.item())
        for position, similarity in ranked[:max_docs]
    ]
def match_query_against_words(self, query):
    """
    Return the vocabulary tokens closest to a (possibly multi-word) query.

    The query's word embeddings are aggregated into one vector (method from
    ``self.ARGS.aggr``), which lets a whole sentence be compared against the
    word-embedding table ``self.model.w_embeddings.weight``.

    Returns:
        list with the first ``self.ARGS.top_n`` tokens in the order given
        by ``calc_cosine_similarity``.
    """
    tokens = read_ap.process_text(query)
    word_embs = self.model.inference_on_words(tokens)
    agg_embeddings = aggregate_embeddings(word_embs, method=self.ARGS.aggr)
    _, sorted_w_idx = calc_cosine_similarity(
        agg_embeddings, self.model.w_embeddings.weight)

    results = []
    for idx in sorted_w_idx[:self.ARGS.top_n]:
        results.append(self.model.vocab["id2token"][idx.item()])
    return results
def search(self, query):
    """
    Rank all documents by cosine similarity to the query vector.

    The query vector from ``self.get_doc_vec`` is repeated once per entry
    of ``self.docs`` so it can be compared column-wise (``dim=0``) with
    ``self.doc_vecs``.

    Returns:
        list of ``(doc_id, score)`` sorted by descending score, with ids
        taken from ``self.idx2docid``.
    """
    tokens = read_ap.process_text(query)
    query_vec = self.get_doc_vec(tokens)
    tiled_query = query_vec.unsqueeze(1).repeat(1, len(self.docs))

    similarity = nn.CosineSimilarity(dim=0, eps=1e-6)
    scores = similarity(tiled_query, self.doc_vecs).numpy()

    print('sorting results')
    # Negating the scores turns the ascending argsort into a descending one.
    order = (-scores).argsort()
    return [(self.idx2docid[index], float(scores[index])) for index in order]
def search(self, query):
    """
    Rank all documents for ``query`` in LSI space.

    Depending on ``self.embedding`` the bag-of-words query vector is fed to
    the model directly (``"bow"``) or first passed through
    ``self.corpus.tfidf_model`` (``"tfidf"``).

    Returns:
        list of ``(doc_id, similarity)`` sorted by descending similarity.

    Raises:
        ValueError: if ``self.embedding`` is neither ``"bow"`` nor
            ``"tfidf"`` (this used to fail later with an unbound local).
    """
    query_repr = read_ap.process_text(query)
    vec_query = self.corpus.dictionary.doc2bow(query_repr)

    if self.embedding == "bow":
        lsi_query = self.model[vec_query]
    elif self.embedding == "tfidf":
        lsi_query = self.model[self.corpus.tfidf_model[vec_query]]
    else:
        raise ValueError(
            "unsupported embedding {!r}; expected 'bow' or 'tfidf'".format(self.embedding))

    sims = self.index[lsi_query]
    return sorted(zip(self.corpus.doc_ids, sims), key=lambda item: -item[1])
def search(self, query):
    """
    Rank documents by negative ``kl_divergence`` to the query's topic vector.

    The query is converted to a dense vector of length
    ``self.model.num_topics``; topics the model does not report stay at 0.

    Returns:
        list of ``(doc, score)`` sorted by descending score, where the score
        is ``-kl_divergence(self.docvecs[doc], query_vector)``.
    """
    bow = self.dictionary.doc2bow(read_ap.process_text(query))

    # Scatter the sparse (topic, fraction) pairs into a dense vector.
    qvec = np.zeros(self.model.num_topics)
    for topic, frac in self.model[bow]:
        qvec[topic] = frac

    scores = {
        doc: -kl_divergence(self.docvecs[doc], qvec) for doc in self.docvecs
    }
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
def rank_docs(model, query, qid, run_id):
    """
    Use a trained model to find the documents most similar to a query.

    The query vector is inferred with 200 epochs and compared against every
    document vector in ``model.docvecs``.

    Returns:
        sims: list of tuples (doc_id, score)
        trec_results: list of tuples
            ``(qid, "", doc_id, rank, score, run_id)``, one per entry of
            ``sims``, with ``rank`` starting at 0
    """
    tokens = ra.process_text(query)
    query_vec = model.infer_vector(tokens, epochs=200)
    sims = model.docvecs.most_similar([query_vec], topn=len(model.docvecs))

    trec_results = []
    for position, hit in enumerate(sims):
        trec_results.append((qid, "", hit[0], position, hit[1], run_id))
    return sims, trec_results
def search(self, query):
    """
    Score documents for ``query`` using the inverted index ``self.ii``.

    Each query term found in the index adds ``log(1 + tf) / df(term)`` to
    every document in its posting list; terms not in the index are skipped.

    Returns:
        list of ``(doc_id, score)`` sorted by descending score.
    """
    scores = defaultdict(float)
    for term in read_ap.process_text(query):
        if term in self.ii:
            for doc_id, tf in self.ii[term]:
                scores[doc_id] += np.log(1 + tf) / self.df[term]

    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
def search(config):
    """
    Load the doc2vec model named in ``config`` and run one search.

    Reads ``config.model_file``, ``config.search`` (the query text) and
    ``config.top_n``. The hits are passed to ``display_result`` before
    being returned.

    Returns:
        The ``most_similar`` result for the inferred query vector.

    Raises:
        ValueError: if ``config.model_file`` does not exist.
    """
    model_exists = os.path.exists(config.model_file)
    if not model_exists:
        raise ValueError("no model available for search, try setting '-t' to true to train model first")

    model = gensim.models.doc2vec.Doc2Vec.load(config.model_file)
    query_vec = model.infer_vector(read_ap.process_text(config.search))
    hits = model.docvecs.most_similar([query_vec], topn=config.top_n)
    display_result(hits)
    return hits
def search(self, query):
    """
    Rank documents for ``query`` from the inverted index ``self.ii``.

    For each indexed query term, every posting ``(doc_id, tf)`` contributes
    ``log(1 + tf) / df(term)`` to that document's score.

    Returns:
        list of ``(doc_id, score)`` sorted by descending score.
    """
    accumulated = defaultdict(float)
    indexed_terms = (
        term for term in read_ap.process_text(query) if term in self.ii
    )
    for term in indexed_terms:
        for posting in self.ii[term]:
            doc_id, tf = posting
            # Dividing by the raw document frequency stands in for an inverse
            # document frequency weight; no log is applied to it.
            accumulated[doc_id] += np.log(1 + tf) / self.df[term]

    ranked = list(accumulated.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def query(self, q):
    """
    Score all indexed documents against the query ``q``.

    The text is tokenised, turned into a bag-of-words vector with
    ``self.dictionary``, projected through ``self.model`` and looked up in
    ``self.index``.

    Returns:
        list of ``(position, similarity)`` sorted by descending similarity,
        where ``position`` is the document's position in the index.
    """
    bow = self.dictionary.doc2bow(read_ap.process_text(q))
    similarities = self.index[self.model[bow]]
    return sorted(enumerate(similarities), key=lambda item: item[1], reverse=True)
def embed_query(self, word_to_vec, query, aggregation='mean'):
    """
    Embed a query from its word vectors.

    Query terms missing from ``word_to_vec`` are ignored.

    Returns:
        With ``aggregation='mean'``, the element-wise mean of the term
        vectors; for any other value, the list of term vectors itself.
    """
    vectors = [
        word_to_vec[term] for term in process_text(query) if term in word_to_vec
    ]
    if aggregation != 'mean':
        return vectors
    return np.mean(vectors, axis=0)
def find_similar_words(self, query, n=11):
    """
    Print the ``n`` words whose embeddings are closest to the query word.

    Embeddings are loaded from ``word2vec_embedding.pkl`` in the working
    directory. Only the first token produced by ``process_text(query)`` is
    used, and it must be present in the loaded mapping.

    Args:
        query: text whose first processed token is looked up.
        n: number of words to print (presumably 11 so that ten neighbours
            remain after the query word itself -- TODO confirm).
    """
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute code; only load a trusted embedding file.
    with open("word2vec_embedding.pkl", "rb") as handle:
        word_to_vec = pkl.load(handle)

    query = process_text(query)[0]
    word_vec = word_to_vec[query]
    # The query norm does not change across the loop, so compute it once.
    word_norm = np.linalg.norm(word_vec)

    distances = []
    for index, word_key in enumerate(word_to_vec):
        other_vec = word_to_vec[word_key]
        cos_sim = np.dot(word_vec, other_vec) / (word_norm * np.linalg.norm(other_vec))
        distances.append((index, word_key, cos_sim))

    sorted_by_distance = sorted(distances, reverse=True, key=lambda tup: tup[2])
    for matching_word in sorted_by_distance[:n]:
        print(matching_word[1])
def search(self, query):
    """
    Rank documents by how close their topic distribution is to the query's.

    The query is converted to a dense topic vector of length
    ``self.num_topics`` and compared with each entry of
    ``self.lda_corpus_pers``.

    Returns:
        list of ``(doc_id, score)`` sorted by descending score, where the
        score is the *negative* KL divergence between query and document.
        Negating puts the lowest-divergence (best matching) document first;
        sorting the raw divergence in descending order ranked the worst
        matches on top.
    """
    query_repr = read_ap.process_text(query)
    vec_query = self.corpus.dictionary.doc2bow(query_repr)
    lda_query = sparse2full(self.model[vec_query], self.num_topics)

    results = {}
    for doc_id, lda_doc_repr in zip(self.corpus.doc_ids, self.lda_corpus_pers):
        results[doc_id] = -kullback_leibler(lda_query, lda_doc_repr)

    return sorted(results.items(), key=lambda item: item[1], reverse=True)
def query_similarity(query, dictionary, model, index, doc_ids):
    """
    Rank the indexed documents for a query.

    The query is tokenised, converted to a bag-of-words vector, projected
    through ``model`` and scored against ``index``; position ``i`` of the
    result is attributed to ``doc_ids[i]``.

    Returns:
        dict mapping doc id -> score, in descending score order.
    """
    tokens = ra.process_text(query)
    similarities = index[model[dictionary.doc2bow(tokens)]]

    scores = {
        doc_ids[position]: value.item()
        for position, value in enumerate(similarities)
    }
    ordered = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return dict(ordered)
def rank(self, query, first_query=True):
    """
    Rank all documents for ``query`` in LSI space.

    When ``first_query`` is true the similarity index is built from the
    corpus (tf-idf or bag-of-words, depending on ``self.tfidf``) and saved
    under ``self.model_path``; otherwise the saved index is loaded.

    Returns:
        list of ``(doc_id, np.float64 score)`` sorted by descending score,
        with ids taken from ``self.index2docid``.
    """
    tokens = read_ap.process_text(query)
    vec_bow = self.index.doc2bow(tokens)
    if self.tfidf:
        vec_bow = bow2tfidf(vec_bow, self.index)
    # Convert the query to LSI space.
    vec_lsi = self.model[vec_bow]

    index_path = os.path.join(self.model_path, 'lsi_index_rank.index')
    if not first_query:
        index = similarities.Similarity.load(index_path)
    else:
        # Transform the corpus to LSI space, index it and persist the index.
        used_corpus = self.corpus_tfidf if self.tfidf else self.corpus_bow
        shard_prefix = os.path.join(self.model_path, "shard")
        index = similarities.Similarity(
            shard_prefix, self.model[used_corpus], self.num_topics)
        index.save(index_path)

    ranked = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
    return [(self.index2docid[idx], np.float64(value)) for idx, value in ranked]
def ranking_LDA(query, model, model_docs, num_topics=10):
    """
    Rank documents by negative KL divergence to the query's topic vector.

    Uses the module-level ``dictionary``, ``corpus`` and ``i2str`` objects:
    one score is produced for each index in ``range(len(corpus))``, reading
    the document vector from ``model_docs`` and the id from ``i2str``.

    Returns:
        list of ``(doc_id, negative_kl)`` sorted by descending score.
    """
    # Bring the query into the dense form the KL computation is given.
    tokens = read_ap.process_text(query)
    query_topics = model[dictionary.doc2bow(tokens)]
    query_dense = gensim.matutils.sparse2full(query_topics, num_topics)

    scores = [
        (i2str[i], float(-1 * kullback_leibler(query_dense, model_docs[i])))
        for i in range(len(corpus))
    ]
    # Sort on the second tuple value.
    scores.sort(key=lambda entry: entry[1], reverse=True)
    return scores
def create_doc_emb(matrix, doc, word2id, id2word):
    """
    Build a document representation by averaging its word embeddings.

    Each entry of ``doc`` is run through ``ra.process_text``; entries that
    do not reduce to exactly one token, or whose token is not in
    ``word2id``, are skipped. ``id2word`` is accepted but not used.

    Args:
        matrix: 2-D embedding matrix indexed as ``matrix[word_id, :]``
            (assumed to be a NumPy array -- TODO confirm).
        doc: iterable of words.
        word2id: token -> row index mapping.
        id2word: row index -> token mapping (unused).

    Returns:
        doc_emb: array of shape [1, emb_dim] with the mean embedding, or an
        all-zero array of that shape when no word could be embedded (the
        mean of an empty list would be a NaN scalar).
    """
    embeddings = []
    for word in doc:
        tokens = ra.process_text(word)
        if len(tokens) != 1:
            continue
        word_id = word2id.get(tokens[0])
        if word_id is not None:
            embeddings.append(matrix[word_id, :].reshape(1, -1))

    if not embeddings:
        return np.zeros((1, matrix.shape[1]))

    return np.asarray(embeddings).mean(axis=0)
def search_doc2vec(model, query, docs_by_id=None, result_len=MAX_NUMBER_OF_RESULTS):
    """
    Rank documents by cosine similarity to the query using a doc2vec model.

    A vector is inferred for the query and for every document, and each
    document is scored by its cosine similarity to the query.

    Args:
        model: trained doc2vec model.
        query: raw query string.
        docs_by_id: mapping of doc id -> token list; loaded with
            ``read_ap.get_processed_docs()`` when omitted.
        result_len: maximum number of results to return.

    Returns:
        list of ``(doc_id, score)`` sorted by descending score (best match
        first), truncated to ``result_len`` entries.
    """
    if docs_by_id is None:
        docs_by_id = read_ap.get_processed_docs()

    # Deleting training data is advised by the official gensim website.
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    print("Comparing the query embedding with all document embeddings...")

    # Get cosine similarity for the query compared to the documents.
    q_vec = model.infer_vector(list(read_ap.process_text(query)))
    q_vec = torch.FloatTensor(q_vec).unsqueeze(dim=0)
    cos = torch.nn.CosineSimilarity()

    results = {}
    for doc_id, doc in docs_by_id.items():
        vec = torch.FloatTensor(model.infer_vector(doc)).unsqueeze(dim=0)
        results[doc_id] = float(cos(vec, q_vec))

    # Highest similarity first: an ascending sort followed by truncation
    # returned the least similar documents.
    ranked = sorted(results.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:result_len]
def get_sims(model, query, corpus_full, dictionary, n_topics):
    """
    Get the ranking for a single query by negative KL divergence.

    Args:
        model: model applied to the query's bag-of-words vector.
        query: raw query string.
        corpus_full: iterable of dense per-document topic vectors; it is
            not modified.
        dictionary: dictionary used to build the bag-of-words vector.
        n_topics: length of the dense topic vectors.

    Returns:
        list of ``(position, score)`` sorted by descending score, where
        ``position`` is the document's position in ``corpus_full``.
    """
    # Avoid division by 0 inside the divergence.
    eps = 1e-8

    query_processed = read_ap.process_text(query)
    query_bow = dictionary.doc2bow(query_processed)
    q_lda = sparse2full(model[query_bow], n_topics) + eps

    # Smooth a copy of each document vector: an in-place ``doc += eps``
    # would change the caller's corpus and pile up with every query.
    sims = [-1 * kullback_leibler(q_lda, doc + eps) for doc in corpus_full]

    return sorted(enumerate(sims), key=lambda item: -1 * item[1])
def evaluate(config, qrels, queries):
    """
    Rank every document for each query and append the rankings to a file.

    For each query id in ``qrels`` the query text is embedded with the
    doc2vec model loaded from ``config.model_file`` and compared against all
    document vectors. One tab-separated line per document is appended to
    ``config.write_file``.

    Args:
        config: object with ``model_file`` and ``write_file`` attributes.
        qrels: iterable of query ids to evaluate.
        queries: mapping of query id -> query text.

    Raises:
        ValueError: if ``config.model_file`` does not exist.
    """
    if not os.path.exists(config.model_file):
        raise ValueError("no model available for search, try setting '-t' to true to train model first")

    model = gensim.models.doc2vec.Doc2Vec.load(config.model_file)
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

    print("Running Doc2Vec Benchmark")

    # Rank the whole collection, whatever its size, rather than relying on a
    # hard-coded document count.
    n_docs = len(model.docvecs)

    for qid in tqdm(qrels):
        vector = model.infer_vector(read_ap.process_text(queries[qid]))
        results = model.docvecs.most_similar([vector], topn=n_docs)
        to_write = [
            f"{qid}\tQO\t{doc_id}\t0\t{score}\tSTANDARD\n"
            for doc_id, score in results
        ]
        with smart_open.open(config.write_file, 'a') as f:
            f.writelines(to_write)
def rank_docs(query, model, doc_ids, dictionary, corpus_modelspace, tfidf_model=None, index=None):
    """
    Rank documents for a query with an LSI or LDA model.

    Args:
        query: raw query string.
        model: ``LsiModel`` or ``LdaModel`` instance.
        doc_ids: document ids, aligned with ``index`` (LSI) or
            ``corpus_modelspace`` (LDA).
        dictionary: dictionary used to build the bag-of-words vector.
        corpus_modelspace: documents in model space; read for LDA only.
        tfidf_model: optional model applied to the query before ``model``.
        index: similarity index queried for LSI.

    Returns:
        For LSI, a list of ``(doc_id, score)`` sorted by descending score.
        For LDA, a dict of doc id -> negative KL divergence, in descending
        score order.

    Raises:
        TypeError: if ``model`` is neither an ``LsiModel`` nor an
            ``LdaModel`` (this used to fail with an unbound local).
    """
    query_prepro = read_ap.process_text(query)

    # Transform the query to bow vector space, then optionally to tf-idf.
    q_cspace = dictionary.doc2bow(query_prepro)
    if tfidf_model is not None:
        q_cspace = tfidf_model[q_cspace]
    q_modelspace = model[q_cspace]

    if isinstance(model, LsiModel):
        scores = index[q_modelspace]
        # Going through a dict keeps one score per doc id (last one wins).
        by_doc = {}
        for doc_id, score in zip(doc_ids, scores):
            by_doc[doc_id] = score
        return sorted(by_doc.items(), key=lambda pair: pair[1], reverse=True)

    if isinstance(model, LdaModel):
        doc_ids = list(doc_ids)
        # kullback_leibler is applied one document at a time.
        scores = [float(-kullback_leibler(q_modelspace, d)) for d in corpus_modelspace]
        order = torch.Tensor(scores).argsort(descending=True).numpy()
        return dict((doc_ids[i], scores[i]) for i in order)

    raise TypeError(
        "unsupported model type {}; expected LsiModel or LdaModel".format(
            model.__class__.__name__))