import sys

import numpy as np


def calc_aggregate_reviewer_score(rdb, all_scores, operator='max'):
    """Calculate the aggregate reviewer scores, one per paper

    :param rdb: Reviewer DB. NP matrix of DB papers by reviewers
    :param all_scores: NP matrix of similarity scores between the current papers (rows) and the DB papers (columns)
    :param operator: Which operator to apply (max, weighted_topK)
    :return: NP matrix of papers by reviewers indicating the aggregate score for each reviewer
    """
    agg = np.zeros((all_scores.shape[0], rdb.shape[1]))
    print(f'Calculating aggregate scores for {all_scores.shape[0]} examples (.=10 examples)',
          file=sys.stderr)
    # Score assigned to (paper, reviewer) pairs where the reviewer did not author the paper
    INVALID_SCORE = 0
    for i in range(all_scores.shape[0]):
        scores = all_scores[i]
        # Mask similarity by authorship: scored_rdb[p, r] is the similarity of
        # submission i to DB paper p if reviewer r authored p, else INVALID_SCORE.
        # slow -- 2-3 secs
        scored_rdb = rdb * scores.reshape((len(scores), 1)) + (1 - rdb) * INVALID_SCORE
        if operator == 'max':
            agg[i] = np.amax(scored_rdb, axis=0)
        elif operator.startswith('weighted_top'):
            k = int(operator[12:])
            weighting = np.reshape(1 / np.array(range(1, k + 1)), (k, 1))
            # slow -- 2-3 secs
            scored_rdb.sort(axis=0)
            # Take the top k scores in descending order, so the best-matching
            # paper receives the largest weight (1, 1/2, ..., 1/k)
            topk = scored_rdb[-k:, :][::-1]
            agg[i] = (topk * weighting).sum(axis=0)
        else:
            raise ValueError(f'Unknown operator {operator}')
        # print_progress is the progress-dot helper defined alongside these utilities
        print_progress(i, mod_size=10)
    print('', file=sys.stderr)
    return agg
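As a quick illustration of the expected shapes, here is a minimal usage sketch with toy, hypothetical values (a reviewer DB of 4 papers by 3 reviewers, and similarity scores for 2 submissions); it assumes calc_aggregate_reviewer_score and its dependencies are importable from this module.

import numpy as np

# Toy, hypothetical data: rdb[p, r] = 1 if reviewer r authored DB paper p
rdb = np.array([[1, 0, 0],
                [1, 1, 0],
                [0, 1, 0],
                [0, 0, 1]])
# all_scores[s, p] = similarity of submission s to DB paper p
all_scores = np.array([[0.9, 0.2, 0.4, 0.1],
                       [0.1, 0.8, 0.7, 0.3]])

# One row per submission, one column per reviewer
agg_max = calc_aggregate_reviewer_score(rdb, all_scores, operator='max')
# 'weighted_top2' combines each reviewer's two best paper scores with weights 1 and 1/2
agg_top2 = calc_aggregate_reviewer_score(rdb, all_scores, operator='weighted_top2')
print(agg_max.shape)  # (2, 3)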
def create_embeddings(model, examps):
    """Embed textual examples

    :param model: The similarity model used to embed the text
    :param examps: A list of text to embed
    :return: A len(examps) by embedding size numpy matrix of embeddings
    """
    # Preprocess examples: tokenize, optionally apply sentencepiece, and look up vocab IDs.
    # entok, Example, unk_string, BATCH_SIZE, and print_progress are module-level helpers.
    print(f'Preprocessing {len(examps)} examples (.={BATCH_SIZE} examples)',
          file=sys.stderr)
    data = []
    for i, line in enumerate(examps):
        p1 = " ".join(entok.tokenize(line, escape=False)).lower()
        if model.sp is not None:
            p1 = model.sp.EncodeAsPieces(p1)
            p1 = " ".join(p1)
        wp1 = Example(p1)
        wp1.populate_embeddings(model.vocab, model.zero_unk, model.args.ngrams)
        if len(wp1.embeddings) == 0:
            # Fall back to the unknown-word embedding for otherwise empty examples
            wp1.embeddings.append(model.vocab[unk_string])
        data.append(wp1)
        print_progress(i, BATCH_SIZE)
    print("", file=sys.stderr)
    # Create embeddings in batches
    print(f'Embedding {len(examps)} examples (.={BATCH_SIZE} examples)', file=sys.stderr)
    embeddings = np.zeros((len(examps), model.args.dim))
    for i in range(0, len(data), BATCH_SIZE):
        max_idx = min(i + BATCH_SIZE, len(data))
        curr_batch = data[i:max_idx]
        wx1, wl1 = model.torchify_batch(curr_batch)
        vecs = model.encode(wx1, wl1)
        vecs = vecs.detach().cpu().numpy()
        # Normalize each row to unit length for nearest-neighbor search
        vecs = vecs / np.sqrt((vecs * vecs).sum(axis=1))[:, None]
        embeddings[i:max_idx] = vecs
        print_progress(i, BATCH_SIZE)
    print("", file=sys.stderr)
    return embeddings
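Because the returned rows are unit-normalized, the similarity matrix that calc_aggregate_reviewer_score consumes can be computed as a plain dot product. A minimal sketch, assuming a `model` object loaded elsewhere with the interface used above (sp, vocab, args, torchify_batch, encode); the texts are hypothetical.

# Hypothetical texts; `model` is assumed to be loaded elsewhere
submission_texts = ["We propose a method for ..."]
db_texts = ["A paper about machine translation ...", "A paper about parsing ..."]

sub_vecs = create_embeddings(model, submission_texts)  # (1, dim), unit-norm rows
db_vecs = create_embeddings(model, db_texts)           # (2, dim), unit-norm rows

# Rows are L2-normalized, so cosine similarity reduces to a dot product, yielding
# the (submissions x DB papers) matrix expected by calc_aggregate_reviewer_score
all_scores = sub_vecs @ db_vecs.T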
import argparse
import json

from sacremoses import MosesTokenizer

import suggest_utils

parser = argparse.ArgumentParser()
parser.add_argument("--infile", help="input json file")
parser.add_argument("--outfile", help="output file of text, 1 per line")
args = parser.parse_args()

with open(args.infile, "r") as f:
    data = [json.loads(x) for x in f]

entok = MosesTokenizer(lang='en')

abstracts = [entry['paperAbstract'] for entry in data]

print('Tokenizing abstracts (.=100 abstracts)')
with open(args.outfile, 'w') as outfile:
    for i, abstract in enumerate(abstracts):
        # Tokenize and lowercase each abstract, writing one per output line
        text = " ".join(entok.tokenize(abstract.strip(), escape=False)).lower()
        outfile.write(text + "\n")
        suggest_utils.print_progress(i, 100)
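For reference, a minimal end-to-end sketch of running this script. The script filename tokenize_abstracts.py and the file paths are hypothetical; the 'paperAbstract' field and the --infile/--outfile flags match the script above.

import json
import subprocess

# Toy input: one JSON object per line, each with a 'paperAbstract' field
with open("papers.json", "w") as f:
    for abstract in ["We propose a method for ...", "This paper studies ..."]:
        f.write(json.dumps({"paperAbstract": abstract}) + "\n")

# Script filename is hypothetical; output gets one tokenized, lowercased abstract per line
subprocess.run(["python", "tokenize_abstracts.py",
                "--infile", "papers.json", "--outfile", "abstracts.txt"], check=True)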