            entityv_weights.append(
                np.concatenate((model.trainables.syn1neg[vocab.index],
                                model.trainables.syn1neg[vocab.index])))
    elif not entity.startswith('<'):
        wordv_entities.append(entity)
        if args.words == 'in':
            wordv_weights.append(model.wv.vectors[vocab.index])
        elif args.words == 'out':
            wordv_weights.append(model.trainables.syn1neg[vocab.index])
        elif args.words == 'inout':
            wordv_weights.append(
                np.concatenate((model.wv.vectors[vocab.index],
                                model.trainables.syn1neg[vocab.index])))

entityv.add(entityv_entities, entityv_weights)
wordv.add(wordv_entities, wordv_weights)
entityv.init_sims()
# wordv.init_sims()
# print(entityv.most_similar(positive=[wordv['detroit']]))

with open(args.outfile, 'w') as out_file:
    for qid, qtokens in queries.items():
        positive = []
        for token in qtokens:
            token = token.lower()
            if token in wordv.vocab:
                weight = args.a / (args.a + word_probs[token])
                positive.append(
                    wordv.word_vec(token, use_norm=args.norm) * weight)
            else:
                print('token {} not in vocab'.format(token))
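# The (truncated) query loop above computes SIF-style token weights, weight = a / (a + p(w)),
# and collects the weighted word vectors in `positive`. A minimal sketch of how such a weighted
# query is typically scored against the entity space is shown below; the helper name
# `rank_entities_for_query` is illustrative only and not part of the original script, which may
# aggregate and write out its results differently.
import numpy as np


def rank_entities_for_query(entityv, positive, topn=100):
    """Average the weighted token vectors into one query vector and rank entities by cosine."""
    if not positive:
        return []
    query_vec = np.mean(positive, axis=0)  # centroid of the SIF-weighted token vectors
    return entityv.similar_by_vector(query_vec, topn=topn)  # list of (entity, score) pairs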
        entityv_entities.append(entity[7:])
        entityv_weights.append(embedding)
    elif not (entity.startswith('entity:')
              or entity.startswith('relation:')):
        wordv_entities.append(entity)
        wordv_weights.append(embedding)

print('entities:', entityv_entities[:4])
print('words:', wordv_entities[:4])

entityv = KeyedVectors(entityv_weights[0].shape[0])
entityv.add(entityv_entities, entityv_weights)
wordv = KeyedVectors(wordv_weights[0].shape[0])
wordv.add(wordv_entities, wordv_weights)
wordv.init_sims()

print(entityv.most_similar(positive=[wordv['detroit']]))

with open(args.outfile, 'w') as out_file:
    for qid, qtokens in queries.items():
        if args.el:
            if args.elremove:
                for entity in qid_entities[qid]['entities']:
                    if '<{}>'.format(entity) in entityv:
                        qtokens = list(
                            set(qtokens) -
                            set(qid_entities[qid]['surface_tokens'][entity]))
                    else:
                        print(
                            'not removing tokens for entity {} because it doesn\'t have an embedding'
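# Both embedding-loading fragments above use the gensim 3.x KeyedVectors API (`add`, `vocab`,
# `init_sims`, `word_vec(..., use_norm=True)`). Under gensim >= 4.0 these calls were renamed
# (`add_vectors`, `key_to_index`, `get_vector(key, norm=True)`), so a small version-agnostic
# helper is sketched below. The function name `build_keyed_vectors` is illustrative and not
# part of the original scripts.
from gensim.models import KeyedVectors as _KV  # aliased to avoid clashing with the name used above


def build_keyed_vectors(keys, weights):
    """Create a KeyedVectors instance from parallel lists of keys and numpy vectors."""
    kv = _KV(weights[0].shape[0])
    if hasattr(kv, 'add_vectors'):  # gensim >= 4.0
        kv.add_vectors(keys, weights)
    else:                           # gensim 3.x
        kv.add(keys, weights)
    return kv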
def evaluate_vectors(hf_dataset: str, aspect: str, input_path: str, name: str,
                     folds: Union[str, list], top_ks: Union[str, list],
                     output_path: str):
    """
    Run with:
    $ ./eval_cli.py evaluate_vectors paperswithcode_aspects task ./output/pwc_doc_id2st.txt \
        --name=sentence_transformers --folds=1,2,3,4 --top_ks=5,10,25,50 --output_path=./output/eval.csv

    :param aspect: Aspect used to select the test split (e.g. task)
    :param folds: Folds to evaluate, as list or comma-separated string
    :param top_ks: Top-k cutoffs, as list or comma-separated string
    :param name: Run name written to the output rows
    :param hf_dataset: Name of the HuggingFace dataset (e.g. paperswithcode_aspects)
    :param input_path: Path to document vectors in word2vec format
    :param output_path: CSV file the evaluation results are written to
    :return: None (results are written to output_path)
    """
    if isinstance(folds, str):
        folds = folds.split(',')
    elif isinstance(folds, int):
        folds = [folds]

    if isinstance(top_ks, str):
        # Cast to int so that max() and list slicing below behave as expected
        top_ks = [int(k) for k in top_ks.split(',')]
    elif isinstance(top_ks, int):
        top_ks = [top_ks]

    logger.info(f'Folds: {folds}')
    logger.info(f'Top-Ks: {top_ks}')

    if len(folds) < 1:
        logger.error('No folds provided')
        return

    if len(top_ks) < 1:
        logger.error('No top-k values provided')
        return

    # Load documents
    doc_model = KeyedVectors.load_word2vec_format(input_path)
    logger.info(f'Document vectors: {doc_model.vectors.shape}')

    # Normalize vectors
    doc_model.init_sims(replace=True)

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank'
    ]
    df = pd.DataFrame([], columns=['name', 'fold', 'top_k'] + metrics)

    # Iterate over folds
    for fold in folds:
        logger.info(f'Current fold: {fold}')

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))
        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))
        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )
                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document ID count does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks}  # top_k => metric_name => list of values

        for seed_id in tqdm(test_paper_ids, desc=f'Evaluation (fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d for d, score in test_doc_model.most_similar(
                        seed_id, topn=max_top_k)
                ]

                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)

                        # # NDCG@k
                        # predicted_relevance = [1 if ret_doc_id in rel_docs else 0 for ret_doc_id in ret_docs]
                        # true_relevances = [1] * len(rel_docs)
                        # ndcg_value = self.compute_dcg_at_k(predicted_relevance, top_k) / self.compute_dcg_at_k(true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(reciprocal_rank)
            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)
                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, fold, top_k]
                for metric in metrics:
                    # Mean over all metric values
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)
                df.loc[len(df)] = row
            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

        # df = pd.DataFrame(k2eval_rows[top_k],
        #                   columns=['seed_id', 'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        #                            'precision', 'recall', 'avg_p', 'reciprocal_rank'])
        # print(df.mean())
        # print(df.mean().to_frame().transpose().iloc[0])

    logger.info(f'Writing {len(df)} rows to {output_path}')

    if os.path.exists(output_path):
        # Append new rows to existing evaluation file
        df.to_csv(output_path, mode='a', header=False, index=False)
    else:
        # Write a new file
        df.to_csv(output_path, header=True, index=False)

    logger.info('Done')
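# `evaluate_vectors` calls `get_avg_precision` and `get_reciprocal_rank`, which are defined
# elsewhere in this repository. The sketches below show the standard definitions matching the
# call sites above (ranked list of retrieved IDs, set of relevant IDs); the original
# implementations may differ in detail (e.g. the normalisation used for average precision).


def get_avg_precision(ret_docs, rel_docs):
    """Average precision@k: precision at each rank that holds a relevant document,
    normalised by min(k, number of relevant documents)."""
    hits = 0
    sum_precisions = 0.0
    for rank, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            hits += 1
            sum_precisions += hits / rank
    denom = min(len(ret_docs), len(rel_docs))
    return sum_precisions / denom if denom > 0 else 0.0


def get_reciprocal_rank(ret_docs, rel_docs):
    """Reciprocal rank: 1 / rank of the first retrieved relevant document, 0 if none."""
    for rank, doc_id in enumerate(ret_docs, start=1):
        if doc_id in rel_docs:
            return 1.0 / rank
    return 0.0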
def get_evaluation_df(name, doc_model, hf_dataset, aspect, fold,
                      top_ks) -> Tuple[DataFrame, Dict]:
    # NOTE: `top_ks` was referenced but never defined in this scope; it is added here as a
    # parameter (list of top-k cutoffs, e.g. [5, 10, 25, 50]).

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
    ]
    df = pd.DataFrame([], columns=['name', 'aspect', 'fold', 'top_k'] + metrics)

    # Dataset
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir='./data/nlp_cache',
                           split=get_test_split(aspect, fold))
    logger.info(f'Test samples: {len(test_ds):,}')

    # Unique paper IDs in test set
    test_paper_ids = set(test_ds['from_paper_id']).union(
        set(test_ds['to_paper_id']))
    logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
    logger.info(f'Examples: {list(test_paper_ids)[:10]}')

    # Relevance mapping
    doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
    for row in test_ds:
        if row['label'] == 'y':
            a = row['from_paper_id']
            b = row['to_paper_id']
            doc_id2related_ids[a].add(b)
            doc_id2related_ids[b].add(a)

    # Filter for documents in test set
    test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
    test_doc_ids = []
    test_doc_vectors = []
    missed_doc_ids = 0

    for doc_id in doc_model.vocab:
        if doc_id in test_paper_ids:
            vec = doc_model.get_vector(doc_id)
            if len(vec) != doc_model.vector_size:
                raise ValueError(
                    f'Test document has invalid shape: {doc_id} => {vec.shape}'
                )
            test_doc_ids.append(doc_id)
            test_doc_vectors.append(vec)
        else:
            missed_doc_ids += 1
            # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

    if len(test_doc_ids) != len(test_doc_vectors):
        raise ValueError(
            f'Test document ID count does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
        )

    logger.info(
        f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
    )
    logger.info(f'Test document vectors: {len(test_doc_vectors)}')

    test_doc_model.add(test_doc_ids, test_doc_vectors)
    test_doc_model.init_sims(replace=True)

    logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

    # Actual evaluation
    # k2eval_rows = defaultdict(list)
    seed_ids_without_recommendations = []
    max_top_k = max(top_ks)
    eval_rows = {top_k: defaultdict(list)
                 for top_k in top_ks}  # top_k => metric_name => list of values
    seed_id2ret_docs = {}

    for seed_id in tqdm(
            test_paper_ids,
            desc=f'Evaluation ({name},aspect={aspect},fold={fold})'):
        try:
            rel_docs = doc_id2related_ids[seed_id]
            max_ret_docs = [
                d for d, score in test_doc_model.most_similar(seed_id,
                                                              topn=max_top_k)
            ]
            seed_id2ret_docs[seed_id] = max_ret_docs

            for top_k in top_ks:
                ret_docs = max_ret_docs[:top_k]
                rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                if ret_docs and rel_docs:
                    # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                    precision = rel_ret_docs_count / len(ret_docs)

                    # Recall = No. of relevant documents retrieved / No. of total relevant documents
                    recall = rel_ret_docs_count / len(rel_docs)

                    # Avg. precision (for MAP)
                    avg_p = get_avg_precision(ret_docs, rel_docs)

                    # Reciprocal rank (for MRR)
                    reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)

                    # NDCG@k
                    predicted_relevance = [
                        1 if ret_doc_id in rel_docs else 0
                        for ret_doc_id in ret_docs
                    ]
                    true_relevances = [1] * len(rel_docs)
                    ndcg_value = compute_dcg_at_k(
                        predicted_relevance, top_k) / compute_dcg_at_k(
                            true_relevances, top_k)

                    # Save metrics
                    eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                    eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                    eval_rows[top_k]['relevant_retrieved_docs'].append(
                        rel_ret_docs_count)
                    eval_rows[top_k]['precision'].append(precision)
                    eval_rows[top_k]['recall'].append(recall)
                    eval_rows[top_k]['avg_p'].append(avg_p)
                    eval_rows[top_k]['reciprocal_rank'].append(reciprocal_rank)
                    eval_rows[top_k]['ndcg'].append(ndcg_value)
        except (IndexError, ValueError, KeyError) as e:
            seed_ids_without_recommendations.append(seed_id)
            logger.warning(
                f'Cannot retrieve recommendations for #{seed_id}: {e}')

    logger.info(
        f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
    )

    # Summarize evaluation
    for top_k in top_ks:
        try:
            row = [name, aspect, fold, top_k]
            for metric in metrics:
                # Mean over all metric values
                values = eval_rows[top_k][metric]
                if len(values) > 0:
                    row.append(np.mean(values))
                else:
                    row.append(None)
            df.loc[len(df)] = row
        except ValueError as e:
            logger.error(
                f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

    return df, seed_id2ret_docs
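# `get_evaluation_df` above and the evaluator `__call__` below normalise the DCG of the
# predicted relevance list by the DCG of an ideal list to obtain NDCG@k. `compute_dcg_at_k`
# itself is defined elsewhere in the repository; the sketch below uses the standard DCG
# formulation (the same one used by sentence-transformers' InformationRetrievalEvaluator),
# which should match what the call sites expect, though the original code is not shown here.
import math


def compute_dcg_at_k(relevances, k):
    """Discounted cumulative gain over the first k relevance judgements (ranks are 1-based)."""
    dcg = 0.0
    for i, rel in enumerate(relevances[:k]):
        dcg += rel / math.log2(i + 2)  # rank i+1 is discounted by log2(rank + 1)
    return dcg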
def __call__(self,
             model,
             output_path: str = None,
             epoch: int = -1,
             steps: int = -1) -> float:
    """
    This is called during training to evaluate the model. It returns a score, with a higher
    score indicating a better result.

    :param model: the model to evaluate
    :param output_path: path where predictions and metrics are written to
    :param epoch: the epoch in which the evaluation takes place. Used for the file prefixes.
        If this is -1, evaluation on test data is assumed.
    :param steps: the steps in the current epoch at the time of evaluation. Used for the file
        prefixes. If this is -1, evaluation at the end of the epoch is assumed.
    :return: a score for the evaluation with a higher score indicating a better result
    """
    # idx2paper_id = {}
    # paper_id2idx = {}
    # texts = []
    # paper_ids = []
    #
    # # get document texts
    # for idx, paper_id in enumerate(self.test_paper_ids):
    #     idx2paper_id[idx] = paper_id
    #     paper_id2idx[paper_id] = idx
    #
    #     doc = self.doc_id2doc[paper_id]
    #     texts.append(doc['title'] + ': ' + doc['abstract'])
    #     paper_ids.append(paper_id)

    logger.info('Encode test documents...')
    embeddings = model.encode(self.tokenized_texts,
                              is_pretokenized=True,
                              batch_size=self.batch_size,
                              show_progress_bar=self.show_progress_bar,
                              convert_to_numpy=True)

    # Filter for documents in test set
    test_doc_model = KeyedVectors(
        vector_size=model.get_sentence_embedding_dimension())

    # for idx, embedding in enumerate(embeddings):
    #     test_doc_model.add([idx2paper_id[idx]], [embedding])
    test_doc_model.add(self.paper_ids, embeddings.tolist())
    test_doc_model.init_sims(replace=True)

    logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
    ]
    df = pd.DataFrame([], columns=['epoch', 'steps', 'top_k'] + metrics)

    max_top_k = max(self.top_ks)
    eval_rows = {top_k: defaultdict(list)
                 for top_k in self.top_ks}  # top_k => metric_name => list of values
    seed_ids_without_recommendations = []

    for seed_id in tqdm(self.test_paper_ids, desc='Evaluation'):
        try:
            rel_docs = self.doc_id2related_ids[seed_id]
            max_ret_docs = [
                d for d, score in test_doc_model.most_similar(seed_id,
                                                              topn=max_top_k)
            ]

            for top_k in self.top_ks:
                ret_docs = max_ret_docs[:top_k]
                rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                if ret_docs and rel_docs:
                    # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                    precision = rel_ret_docs_count / len(ret_docs)

                    # Recall = No. of relevant documents retrieved / No. of total relevant documents
                    recall = rel_ret_docs_count / len(rel_docs)

                    # Avg. precision (for MAP)
                    avg_p = get_avg_precision(ret_docs, rel_docs)

                    # Reciprocal rank (for MRR)
                    reciprocal_rank = get_reciprocal_rank(ret_docs, rel_docs)

                    # NDCG@k
                    predicted_relevance = [
                        1 if ret_doc_id in rel_docs else 0
                        for ret_doc_id in ret_docs
                    ]
                    true_relevances = [1] * len(rel_docs)
                    ndcg_value = self.compute_dcg_at_k(
                        predicted_relevance, top_k) / self.compute_dcg_at_k(
                            true_relevances, top_k)

                    # Save metrics
                    eval_rows[top_k]['retrieved_docs'].append(len(ret_docs))
                    eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                    eval_rows[top_k]['relevant_retrieved_docs'].append(
                        rel_ret_docs_count)
                    eval_rows[top_k]['precision'].append(precision)
                    eval_rows[top_k]['recall'].append(recall)
                    eval_rows[top_k]['avg_p'].append(avg_p)
                    eval_rows[top_k]['reciprocal_rank'].append(reciprocal_rank)
                    eval_rows[top_k]['ndcg'].append(ndcg_value)
        except (IndexError, ValueError, KeyError) as e:
            seed_ids_without_recommendations.append(seed_id)
            logger.warning(
                f'Cannot retrieve recommendations for #{seed_id}: {e}')

    logger.info(
        f'Completed with {len(eval_rows[self.top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
    )

    # Summarize evaluation
    for top_k in self.top_ks:
        try:
            row = [epoch, steps, top_k]
            for metric in metrics:
                # Mean over all metric values
                values = eval_rows[top_k][metric]
                if len(values) > 0:
                    row.append(np.mean(values))
                else:
                    row.append(None)
            df.loc[len(df)] = row
        except ValueError as e:
            logger.error(f'Cannot summarize row: {top_k} {metrics} {e}')

    output_csv_path = os.path.join(output_path, self.csv_file)

    logger.info(f'Writing {len(df)} rows to {output_csv_path}')
    logger.info(f'Results:\n{df.to_markdown()}')

    if os.path.exists(output_csv_path):
        # Append new rows to existing evaluation file
        df.to_csv(output_csv_path, mode='a', header=False, index=False)
    else:
        # Write a new file
        df.to_csv(output_csv_path, header=True, index=False)

    # Return score from main metric
    if len(df) > 0:
        main_score = df.iloc[0][self.main_metric]
        logger.info(
            f'Evaluation completed: {self.main_metric} = {main_score}')
        return main_score
    else:
        logger.warning('No evaluation rows available... score = 0')
        return 0
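# The `__call__` above follows the sentence-transformers `SentenceEvaluator` protocol
# (`__call__(model, output_path, epoch, steps) -> float`), so an instance of the surrounding
# evaluator class can be passed directly to `SentenceTransformer.fit`. A minimal wiring sketch
# is given below; the function name `train_with_evaluation` and the concrete epoch/step values
# are illustrative, and the dataloader, loss, and evaluator are assumed to be built elsewhere.
from sentence_transformers import SentenceTransformer


def train_with_evaluation(model: SentenceTransformer, train_dataloader, train_loss,
                          evaluator, output_path: str):
    """Train with periodic retrieval evaluation; the evaluator's score drives checkpointing."""
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,        # the __call__ above is invoked every `evaluation_steps`
        epochs=3,
        evaluation_steps=1000,
        output_path=output_path,    # fit forwards this path to the evaluator's `output_path`
    )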