def query():
    """Inspect a learned term-weight vector against corpus statistics.

    Loads a pickled weight vector `w` and the parsed Robust queries, prints
    the tokens with the 30 largest / smallest weights, then prints Pearson
    correlations between the weights and tf / df / idf / GloVe-norm features,
    both raw and log-transformed.
    """
    # NOTE(review): pickle.load on local artifacts — fine for a research
    # script, but never use pickle on untrusted input.
    w = pickle.load(open('weights_from_query.pkl', 'rb')).squeeze()
    topk_vals, topk_idxs = torch.topk(w, 30)
    # topk on -w yields the 30 smallest entries of w.
    bottomk_vals, bottomk_idxs = torch.topk(-w, 30)
    docs, lookup = pickle.load(open('parsed_robust_queries.pkl', 'rb'))
    tf, df, idf = count_me(docs)
    inv_lookup = _.invert(lookup)  # token index -> token string
    print('Top30: ', [inv_lookup[idx] for idx in topk_idxs.tolist()])
    print('Bottom30: ', [inv_lookup[idx] for idx in bottomk_idxs.tolist()])
    glove = get_glove_lookup()
    # Re-key GloVe vectors by vocabulary index; OOV tokens collapse onto <unk>.
    glove_by_idx = _.map_keys(
        glove,
        lambda vec, token: lookup[token] if token in lookup else lookup['<unk>'])
    norms_by_idx = _.map_values(glove_by_idx, torch.norm)
    idxs_in_order = list(norms_by_idx.keys())
    idfs_in_order = torch.tensor([idf[idx] for idx in idxs_in_order])
    dfs_in_order = torch.tensor([df[idx] for idx in idxs_in_order])
    tfs_in_order = torch.tensor([tf[idx] for idx in idxs_in_order])
    norms_in_order = torch.tensor([norms_by_idx[idx] for idx in idxs_in_order])
    # Restrict w to the indices that have GloVe vectors, in the same order.
    w_subset = w[torch.tensor(idxs_in_order)]
    # Pearson correlations: raw statistics first, then log-transformed.
    print(np.corrcoef(w_subset, tfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, dfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, idfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, norms_in_order)[0, 1])
    print(np.corrcoef(w_subset, np.log(tfs_in_order + 1))[0, 1])
    print(np.corrcoef(w_subset, np.log(dfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(idfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(norms_in_order + 1))[0, 1])
def main():
    """Write an Indri query-parameter XML file for the pairwise-train queries."""
    path = './indri/robust_train_query_params_without_unks.xml'
    # Start from a clean file; OSError covers "file does not exist".
    try:
        os.remove(path)
    except OSError:
        pass
    with open('./caches/pairwise_train_ranking_106756.json') as fh:
        query_ranking_pairs = json.load(fh)
    queries_by_tok_id, qml = zip(*query_ranking_pairs)
    parsed_queries, query_token_lookup = read_cache(
        './parsed_robust_queries_dict.json', lambda: print('failed'))
    inv = _.invert(query_token_lookup)  # token id -> token string
    queries = [
        ' '.join([inv[q] for q in query]) for query in queries_by_tok_id
    ]
    with open(path, 'a+') as fh:
        fh.write('<parameters>\n')
        for query_name, query_text in enumerate(queries):
            # Indri query numbers are 1-based; numbering still advances for
            # skipped (empty) queries so numbers stay aligned with indices.
            query_name = str(query_name + 1)
            if len(query_text) == 0:
                continue
            fh.write('<query>\n')
            fh.write('<number>' + query_name + '</number>\n')
            fh.write('<text>\n')
            fh.write('#combine( ' + query_text + ' )\n')
            fh.write('</text>\n')
            fh.write('</query>\n')
        # NOTE(review): reconstructed from flattened source — the closing tag
        # is assumed to be written once, after the loop; confirm against VCS.
        fh.write('</parameters>\n')
def get_other_results(queries, qml_rankings, num_ranks=None):
    """Compute BM25, GloVe, and RM3 rankings for `queries`.

    Args:
      queries: list of raw query strings.
      qml_rankings: per-query candidate rankings (doc ids) used to restrict
        each baseline's scoring.
      num_ranks: unused; kept for interface compatibility with callers.

    Returns:
      Tuple of (bm25_rankings, glove_rankings, rm3_rankings), each a list
      parallel to `queries`.
    """
    document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [
        document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache('tok_docs.json',
                                     lambda: tokenizer.process_all(documents))
    tokenized_queries = tokenizer.process_all(queries)
    bm25 = BM25(tokenized_documents)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    bm25_rankings = []
    glove_rankings = []
    rm3_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    docs_lms = _calc_docs_lms(bm25.df, bm25.f)
    # zip() has no __len__, so pass max_value explicitly or the progress bar
    # cannot show a total (matches the baselines_eval call style).
    for q, qml_ranking in progressbar(zip(tokenized_queries, qml_rankings),
                                      max_value=len(tokenized_queries)):
        bm25_rankings.append(
            _get_bm25_ranking(bm25, qml_ranking, q, average_idf=average_idf))
        glove_rankings.append(
            _get_glove_ranking(glove, tokenized_documents, qml_ranking, q))
        rm3_rankings.append(_get_rm3_ranking(docs_lms, bm25.f, qml_ranking, q))
    return bm25_rankings, glove_rankings, rm3_rankings
def get_doc_encoder_and_embeddings(document_token_lookup,
                                   only_use_last_out=False):
    """Build a WT103-pretrained LSTM document encoder and its embedding layer.

    Returns a (SequentialRNN(encoder, pooler), embedding) pair whose weights
    are converted from the pretrained vocabulary to `document_token_lookup`.
    """
    # Hyperparameters matching the pretrained lstm_wt103 checkpoint.
    embedding_size = 400
    hidden_size = 1150
    num_layers = 3
    padding_idx = 1
    vocab_size = len(document_token_lookup)
    model = get_language_model(vocab_size, embedding_size, hidden_size,
                               num_layers, padding_idx)
    checkpoint = torch.load('lstm_wt103.pth',
                            map_location=lambda storage, loc: storage)
    with open('./itos_wt103.pkl', 'rb') as fh:
        pretrained_itos = pickle.load(fh)
    pretrained_stoi = _.invert(pretrained_itos)
    idx_to_token = _.invert(document_token_lookup)
    # Remap pretrained weights onto our vocabulary ordering.
    target_itos = [idx_to_token[i] for i in range(vocab_size)]
    model.load_state_dict(
        convert_weights(checkpoint, pretrained_stoi, target_itos))
    encoder_rnn = model[0]
    return (SequentialRNN(encoder_rnn, OutPooler(only_use_last_out)),
            encoder_rnn.encoder)
def main():
    """Dump per-term BM25 idf for the Robust document collection to JSON."""
    doc_lookup = read_cache('./doc_lookup.json', get_robust_documents)
    title_to_id = create_id_lookup(doc_lookup.keys())
    id_to_title = _.invert(title_to_id)
    # Order documents by their integer id so rows line up with other caches.
    docs = [doc_lookup[id_to_title[doc_id]]
            for doc_id in range(len(id_to_title))]
    tokenizer = Tokenizer()
    tok_docs = read_cache('tok_docs.json',
                          lambda: tokenizer.process_all(docs))
    bm25 = BM25(tok_docs)
    with open('./doc_word_idf.json', 'w+') as fh:
        json.dump(bm25.idf, fh)
def baselines_eval():
    """Evaluate Indri (QML) and GloVe rankings against Robust qrels at depth k.

    k defaults to 10 and can be overridden via argv[1]; passing --rerank
    restricts GloVe scoring to Indri's top-k candidates per query.
    """
    rankings_to_eval = read_query_test_rankings()
    qrels = parse_qrels()
    query_ids = list(qrels.keys())
    query_lookup = get_robust_eval_queries()
    queries = [query_lookup[query_id] for query_id in query_ids]
    k = 10 if len(sys.argv) == 1 else int(sys.argv[1])
    document_lookup = read_cache(name('./doc_lookup.json', ['with_titles']),
                                 get_robust_documents_with_titles)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    # Map titles to integer ids so rankings and qrels compare numerically.
    ordered_rankings_to_eval = [[
        document_title_to_id[title] for title in rankings_to_eval[query]
    ] for query in query_ids]
    ordered_qrels = [[document_title_to_id[title] for title in qrels[query]]
                     for query in query_ids]
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [
        document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache(
        'tok_docs.json',
        lambda: tokenizer.process_all(clean_documents(documents)))
    tokenized_queries = tokenizer.process_all(clean_documents(queries))
    bm25 = gensim_bm25.BM25(tokenized_documents)
    # Disabled LSI baseline, kept for reference:
    # with open('./caches/106756_most_common_doc.json', 'r') as fh:
    #   doc_token_set = set(json.load(fh))
    # corpus, token_lookup = tokens_to_indexes(tokenized_documents,
    #                                          None,
    #                                          token_set=doc_token_set)
    # corpus = [[[token_lookup[term], f] for term, f in doc_fs.items()] for doc_fs in bm25.f]
    # tfidf = TfidfModel(corpus)
    # lsi = LsiModel(tfidf, id2word=_.invert(token_lookup), num_topics=300)
    glove_rankings = []
    # lsi_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    # Encode each document as an idf-weighted GloVe combination, L2-normalized.
    encoded_docs = torch.stack(
        [encode_glove_fs(glove, bm25.idf, doc_fs) for doc_fs in bm25.f])
    encoded_docs = encoded_docs / torch.norm(encoded_docs, dim=1).unsqueeze(1)
    for q, qml_ranking in progressbar(zip(tokenized_queries,
                                          ordered_rankings_to_eval),
                                      max_value=len(tokenized_queries)):
        # --rerank: score only Indri's top-k docs; otherwise rank the full set.
        doc_ids = qml_ranking[:k] if '--rerank' in sys.argv else None
        glove_rankings.append(
            rank_glove(glove, bm25.idf, encoded_docs, q, doc_ids=doc_ids))
        # lsi_rankings.append(rank_lsi(lsi, tfidf, [token_lookup[term] if term in token_lookup else 0 for term in q], doc_ids=doc_ids))
    print('indri:', metrics_at_k(ordered_rankings_to_eval, ordered_qrels, k))
    print('glove:', metrics_at_k(glove_rankings, ordered_qrels, k))
def prepare(lookup,
            title_to_id,
            token_lookup=None,
            num_tokens=None,
            token_set=None,
            drop_if_any_unk=False):
    """Numericalize the documents in `lookup`, ordered by integer id.

    Args:
      lookup: title -> raw document text.
      title_to_id: title -> integer document id.
      token_lookup, num_tokens, token_set, drop_if_any_unk: forwarded to
        `preprocess_texts` unchanged.

    Returns:
      (numericalized, token_lookup) as produced by `preprocess_texts`.
    """
    id_to_title_lookup = _.invert(title_to_id)
    # `doc_id`, not `id`, to avoid shadowing the builtin.
    doc_ids = range(len(id_to_title_lookup))
    contents = [lookup[id_to_title_lookup[doc_id]] for doc_id in doc_ids]
    numericalized, token_lookup = preprocess_texts(
        contents,
        token_lookup=token_lookup,
        num_tokens=num_tokens,
        token_set=token_set,
        drop_if_any_unk=drop_if_any_unk)
    return numericalized, token_lookup
def __init__(self, cursor, lookups_path, train_size):
    """Load entity-candidate priors and initialize per-batch state to None."""
    self.cursor = cursor
    lookups = load_entity_candidate_ids_and_label_lookup(
        lookups_path, train_size)
    to_entity_id = _.invert(lookups['entity_labels'])  # label -> entity id
    # Re-key each prior from training labels back to entity ids.
    priors = {}
    for entity_text, prior in lookups['entity_candidates_prior'].items():
        priors[entity_text] = {
            to_entity_id[label]: candidates
            for label, candidates in prior.items()
        }
    self.entity_candidates_prior = priors
    self.prior_approx_mapping = u.get_prior_approx_mapping(
        self.entity_candidates_prior)
    # Populated later by the data-loading path.
    self.mentions = None
    self.labels = None
    self.mention_doc_id = None
def prepare_fs(lookup,
               title_to_id,
               token_lookup=None,
               token_set=None,
               num_tokens=None,
               drop_if_any_unk=False):
    """Numericalize documents and reduce each to a token-frequency Counter.

    Args:
      lookup: title -> raw document text.
      title_to_id: title -> integer document id.
      num_tokens: -1 is normalized to None (no truncation).
      token_lookup, token_set, drop_if_any_unk: forwarded to
        `preprocess_texts` unchanged.

    Returns:
      (numericalized_fs, token_lookup) where numericalized_fs is a list of
      Counters, one per document, ordered by document id.
    """
    id_to_title_lookup = _.invert(title_to_id)
    # `doc_id`, not `id`, to avoid shadowing the builtin.
    doc_ids = range(len(id_to_title_lookup))
    contents = [lookup[id_to_title_lookup[doc_id]] for doc_id in doc_ids]
    if num_tokens == -1:
        num_tokens = None
    numericalized, token_lookup = preprocess_texts(
        contents,
        token_lookup=token_lookup,
        token_set=token_set,
        num_tokens=num_tokens,
        drop_if_any_unk=drop_if_any_unk)
    numericalized_fs = [Counter(doc) for doc in numericalized]
    return numericalized_fs, token_lookup
def _get_tester(self, cursor, model):
    """Assemble a Tester over the test split, wired from self's config."""
    # Keep the original call order: logits/softmax first, then dataset,
    # then sampler.
    logits_and_softmax = self._get_logits_and_softmax()
    dataset = self._get_dataset(cursor, is_test=True)
    self._dataset = dataset
    sampler = self._get_sampler(cursor, is_test=True)
    params = self.model_params
    lookups = self.lookups
    return Tester(dataset=dataset,
                  batch_sampler=sampler,
                  model=model,
                  logits_and_softmax=logits_and_softmax,
                  embedding=lookups.embedding,
                  token_idx_lookup=lookups.token_idx_lookup,
                  device=self.device,
                  experiment=self.experiment,
                  ablation=params.ablation,
                  use_adaptive_softmax=params.use_adaptive_softmax,
                  use_wiki2vec=params.use_wiki2vec,
                  use_sum_encoder=params.use_sum_encoder,
                  label_to_entity_id=_.invert(lookups.entity_labels),
                  use_stacker=params.use_stacker)
def main():
    """Report ranking overlap between QML and the BM25/GloVe/RM3 baselines."""
    with open('./caches/pairwise_train_ranking_106756.json') as fh:
        pairs = json.load(fh)
    queries_by_tok_id, qml = zip(*pairs)
    parsed_queries, query_token_lookup = read_cache(
        './parsed_robust_queries_dict.json', lambda: print('failed'))
    idx_to_tok = _.invert(query_token_lookup)  # token id -> token string
    queries = [' '.join([idx_to_tok[tok] for tok in query])
               for query in queries_by_tok_id]
    # Optional argv[1] limits how many queries are evaluated.
    lim = int(sys.argv[1]) if len(sys.argv) > 1 else None
    bm25_rankings, glove_rankings, rm3_rankings = get_other_results(
        queries[:lim], qml[:lim])
    for rankings in (bm25_rankings, glove_rankings, rm3_rankings):
        agree_ctr, num_combos = check_overlap(qml[:lim], rankings)
        print(agree_ctr, num_combos, agree_ctr / num_combos)
def test_invert(case, expected):
    """_.invert swaps keys and values of `case`."""
    inverted = _.invert(case)
    assert inverted == expected
def __init__(self,
             cursor,
             token_idx_lookup,
             full_token_idx_lookup,
             lookups_path,
             idf_path,
             train_size,
             txt_dataset_path,
             pkl_dataset_prefix=None):
    """Initialize the dataset, preferring cached forms when available.

    Three early-exit modes, checked in order:
      1. `pkl_dataset_prefix` set: only records the prefix for part-wise
         loading later; nothing else is initialized.
      2. `txt_dataset_path` set: loads the whole dataset cache from a pickle
         (path contains '.pkl') or from a text file of literal-eval lines.
      3. Otherwise: full initialization from the DB cursor and lookup files.
    """
    self.txt_dataset_path = txt_dataset_path
    self.pkl_dataset_prefix = pkl_dataset_prefix
    if self.pkl_dataset_prefix is not None:
        self.current_part = None
        return
    if self.txt_dataset_path is not None:
        if '.pkl' in self.txt_dataset_path:
            with open(self.txt_dataset_path, 'rb') as fh:
                self.dataset_cache = pickle.load(fh)
            return
        # One literal-eval'd record per line.
        with open(self.txt_dataset_path) as fh:
            self.dataset_cache = [
                ast.literal_eval(line) for line in fh.readlines()
            ]
        return
    with open(idf_path) as fh:
        self.idf = json.load(fh)
    self.cursor = cursor
    with open('./entity_to_row_id.pkl', 'rb') as fh:
        entity_id_to_row = pickle.load(fh)
    # Sparse description term-frequency lookups: stemmed and unstemmed.
    self.desc_fs = DocLookup('./desc_fs.npz',
                             entity_id_to_row,
                             token_idx_mapping=_.invert(token_idx_lookup),
                             default_value={},
                             use_default=True)
    self.desc_fs_unstemmed = DocLookup(
        './desc_unstemmed_fs.npz',
        entity_id_to_row,
        token_idx_mapping=_.invert(full_token_idx_lookup),
        default_value={'<PAD>': 1},
        use_default=True)
    self.embedding_dict = get_embedding_dict('./glove.6B.300d.txt',
                                             embedding_dim=300)
    self.stemmer = SnowballStemmer('english')
    lookups = load_entity_candidate_ids_and_label_lookup(
        lookups_path, train_size)
    label_to_entity_id = _.invert(lookups['entity_labels'])
    # Re-key each candidate prior from training labels back to entity ids.
    self.entity_candidates_prior = {
        entity_text: {
            label_to_entity_id[label]: candidates
            for label, candidates in prior.items()
        }
        for entity_text, prior in lookups['entity_candidates_prior'].items()
    }
    self.prior_approx_mapping = u.get_prior_approx_mapping(
        self.entity_candidates_prior)
    # Per-batch state, populated later by the data-loading path.
    self.mentions = None
    self.labels = None
    self.mention_doc_id = None
    self.mention_sentences = None
    self.mention_fs = None
    self.mention_fs_unstemmed = None
    self.page_f_lookup = None
    self.with_labels = None
    self._candidate_strs_lookup = read_cache(
        './candidate_strs_lookup.pkl', lambda: get_str_lookup(cursor))
    self.stopwords = set(nltk_stopwords.words('english'))
def main():
    """Build and save a sparse entity-compatibility matrix.

    For every candidate set produced by the CoNLL and Wiki mention datasets,
    scores all candidate pairs by tf-idf description similarity and
    accumulates the scores into a sparse matrix saved as compats_*.npz.
    """
    p = get_cli_args(args)
    with open('./tokens.pkl', 'rb') as fh:
        token_idx_lookup = pickle.load(fh)
    load_dotenv(dotenv_path=p.run.env_path)
    EL_DATABASE_NAME = os.getenv("DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    with open(p.train.page_id_order_path, 'rb') as fh:
        page_id_order = pickle.load(fh)
    page_ids = page_id_order[:p.train.num_pages_to_use]
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        cursor.execute("SET NAMES utf8mb4;")
        cursor.execute("SET CHARACTER SET utf8mb4;")
        cursor.execute("SET character_set_connection=utf8mb4;")
        datasets = [
            MentionCoNLLDataset(cursor, './AIDA-YAGO2-dataset.tsv',
                                p.run.lookups_path, p.train.train_size),
            MentionWikiDataset(cursor, page_ids, p.run.lookups_path,
                               p.train.train_size)
        ]
        with open('./entity_to_row_id.pkl', 'rb') as fh:
            entity_id_to_row = pickle.load(fh)
        idf = get_idf(token_idx_lookup, p.run.idf_path)
        desc_fs_sparse = csr_matrix(load_npz('./desc_fs.npz'))
        desc_vs = csr_matrix(sparse_to_tfidf_vs(idf, desc_fs_sparse))
        # Row-wise squared L2 norms (n x 1), used to normalize dot products.
        norm = (desc_vs.multiply(desc_vs)).sum(1)
        all_e_id_pairs = set()
        data = []
        i = []
        j = []
        row_to_entity_id = _.invert(entity_id_to_row)
        for dataset in datasets:
            for cands in progressbar(iter(dataset)):
                if cands is None:
                    continue
                cand_rows = [
                    entity_id_to_row[e_id] for e_id in cands
                    if (e_id in entity_id_to_row)
                ]
                cand_mat = desc_vs[cand_rows]
                # NOTE: normalized by the *row* doc's norm only, so
                # scores[a, b] != scores[b, a] in general.
                scores = cand_mat.dot(cand_mat.T) / norm[cand_rows]
                # Flattened (row, col) index pairs for the dense score block.
                new_i = cand_rows * len(cand_rows)
                new_j = [
                    row_num for row_num in cand_rows
                    for __ in range(len(cand_rows))
                ]
                list_scores = np.array(scores).ravel().tolist()
                for res_i in range(len(list_scores)):
                    pair = (row_to_entity_id[min(new_i[res_i], new_j[res_i])],
                            row_to_entity_id[max(new_i[res_i], new_j[res_i])])
                    # NOTE(review): `all_e_id_pairs` is never added to, so
                    # this check never filters anything and duplicate entries
                    # are summed by coo_matrix below. Either add
                    # `all_e_id_pairs.add(pair)` or drop the check — but note
                    # scores are asymmetric, so deduping by unordered pair
                    # would discard one orientation. Confirm intent.
                    if pair not in all_e_id_pairs:
                        data.append(list_scores[res_i])
                        i.append(new_i[res_i])
                        j.append(new_j[res_i])
    # coo_matrix sums duplicate (i, j) entries during conversion to CSR.
    mat = csr_matrix(coo_matrix((data, (i, j))))
    train_str = 'wiki+conll_' + '_'.join([str(p.train.num_pages_to_use)])
    save_npz('compats_{}.npz'.format(train_str), mat)
def test_invert_multivalue(case, expected):
    """Multivalue invert groups keys by value; grouping order is ignored."""
    inverted = _.invert(case, multivalue=True)
    for key, grouped in inverted.items():
        # Compare as sets: only membership matters, not ordering.
        assert set(grouped) == set(expected[key])