Example #1
def query():
    # Correlate learned per-token query weights with corpus statistics
    # (tf, df, idf) and GloVe vector norms, and print the most positively
    # and negatively weighted tokens.
    w = pickle.load(open('weights_from_query.pkl', 'rb')).squeeze()
    topk_vals, topk_idxs = torch.topk(w, 30)
    bottomk_vals, bottomk_idxs = torch.topk(-w, 30)
    docs, lookup = pickle.load(open('parsed_robust_queries.pkl', 'rb'))
    tf, df, idf = count_me(docs)
    inv_lookup = _.invert(lookup)
    print('Top30: ', [inv_lookup[idx] for idx in topk_idxs.tolist()])
    print('Bottom30: ', [inv_lookup[idx] for idx in bottomk_idxs.tolist()])
    glove = get_glove_lookup()
    glove_by_idx = _.map_keys(
        glove, lambda vec, token: lookup[token]
        if token in lookup else lookup['<unk>'])
    norms_by_idx = _.map_values(glove_by_idx, torch.norm)
    idxs_in_order = list(norms_by_idx.keys())
    idfs_in_order = torch.tensor([idf[idx] for idx in idxs_in_order])
    dfs_in_order = torch.tensor([df[idx] for idx in idxs_in_order])
    tfs_in_order = torch.tensor([tf[idx] for idx in idxs_in_order])
    norms_in_order = torch.tensor([norms_by_idx[idx] for idx in idxs_in_order])
    w_subset = w[torch.tensor(idxs_in_order)]
    print(np.corrcoef(w_subset, tfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, dfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, idfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, norms_in_order)[0, 1])
    print(np.corrcoef(w_subset, np.log(tfs_in_order + 1))[0, 1])
    print(np.corrcoef(w_subset, np.log(dfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(idfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(norms_in_order + 1))[0, 1])
Example #2
def main():
    path = './indri/robust_train_query_params_without_unks.xml'
    try:
        os.remove(path)
    except OSError:
        pass
    with open('./caches/pairwise_train_ranking_106756.json') as fh:
        query_ranking_pairs = json.load(fh)
        queries_by_tok_id, qml = zip(*query_ranking_pairs)
    parsed_queries, query_token_lookup = read_cache(
        './parsed_robust_queries_dict.json', lambda: print('failed'))
    inv = _.invert(query_token_lookup)
    queries = [
        ' '.join([inv[q] for q in query]) for query in queries_by_tok_id
    ]
    with open(path, 'a+') as fh:
        fh.write('<parameters>\n')
        for query_name, query_text in enumerate(queries):
            query_name = str(query_name + 1)
            if len(query_text) == 0: continue
            fh.write('<query>\n')
            fh.write('<number>' + query_name + '</number>\n')
            fh.write('<text>\n')
            fh.write('#combine( ' + query_text + ' )\n')
            fh.write('</text>\n')
            fh.write('</query>\n')
        fh.write('</parameters>\n')
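Examples #2 and #11 both use the inverted token lookup to turn id-encoded queries back into text before further processing. A minimal sketch of just that detokenization step, using hypothetical tokens rather than the cached Robust04 queries above (pydash imported as _, as in the examples):

import pydash as _

# Hypothetical token -> id lookup and id-encoded queries.
query_token_lookup = {'hubble': 0, 'telescope': 1, 'repair': 2}
queries_by_tok_id = [[0, 1], [1, 2]]

inv = _.invert(query_token_lookup)  # id -> token
queries = [' '.join(inv[tok_id] for tok_id in query) for query in queries_by_tok_id]
assert queries == ['hubble telescope', 'telescope repair']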
Example #3
def get_other_results(queries, qml_rankings, num_ranks=None):
    document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [
        document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache('tok_docs.json',
                                     lambda: tokenizer.process_all(documents))
    tokenized_queries = tokenizer.process_all(queries)
    bm25 = BM25(tokenized_documents)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    bm25_rankings = []
    glove_rankings = []
    rm3_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    docs_lms = _calc_docs_lms(bm25.df, bm25.f)
    for q, qml_ranking in progressbar(zip(tokenized_queries, qml_rankings)):
        bm25_rankings.append(
            _get_bm25_ranking(bm25, qml_ranking, q, average_idf=average_idf))
        glove_rankings.append(
            _get_glove_ranking(glove, tokenized_documents, qml_ranking, q))
        rm3_rankings.append(_get_rm3_ranking(docs_lms, bm25.f, qml_ranking, q))
    return bm25_rankings, glove_rankings, rm3_rankings
Example #4
def get_doc_encoder_and_embeddings(document_token_lookup,
                                   only_use_last_out=False):
    emb_sz = 400
    n_hid = 1150
    n_layers = 3
    pad_token = 1
    model = get_language_model(len(document_token_lookup), emb_sz, n_hid,
                               n_layers, pad_token)
    wgts = torch.load('lstm_wt103.pth',
                      map_location=lambda storage, loc: storage)
    with open('./itos_wt103.pkl', 'rb') as fh:
        old_itos = pickle.load(fh)
    old_stoi = _.invert(old_itos)
    string_lookup = _.invert(document_token_lookup)
    wgts = convert_weights(
        wgts, old_stoi,
        [string_lookup[i] for i in range(len(document_token_lookup))])
    model.load_state_dict(wgts)
    rnn_enc = model[0]
    embedding = rnn_enc.encoder
    return SequentialRNN(rnn_enc, OutPooler(only_use_last_out)), embedding
Example #5
def main():
  document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
  document_title_to_id = create_id_lookup(document_lookup.keys())
  document_id_to_title = _.invert(document_title_to_id)
  doc_ids = range(len(document_id_to_title))
  documents = [document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids]
  tokenizer = Tokenizer()
  tokenized_documents = read_cache('tok_docs.json',
                                   lambda: tokenizer.process_all(documents))
  bm25 = BM25(tokenized_documents)
  with open('./doc_word_idf.json', 'w+') as fh:
    json.dump(bm25.idf, fh)
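Several of the examples here (#3, #5, #6, #7, #9) share one pattern: invert a title-to-id lookup with _.invert, then index the result with consecutive ids to materialize documents in id order. A minimal, self-contained sketch of that pattern with hypothetical titles and texts (not the Robust04 data used in these examples):

import pydash as _

document_lookup = {'doc_a': 'text a', 'doc_b': 'text b', 'doc_c': 'text c'}
document_title_to_id = {'doc_a': 0, 'doc_b': 1, 'doc_c': 2}

# id -> title, then walk the ids in order to get an id-aligned document list.
document_id_to_title = _.invert(document_title_to_id)
documents = [document_lookup[document_id_to_title[doc_id]]
             for doc_id in range(len(document_id_to_title))]
assert documents == ['text a', 'text b', 'text c']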
Example #6
def baselines_eval():
    rankings_to_eval = read_query_test_rankings()
    qrels = parse_qrels()
    query_ids = list(qrels.keys())
    query_lookup = get_robust_eval_queries()
    queries = [query_lookup[query_id] for query_id in query_ids]
    k = 10 if len(sys.argv) == 1 else int(sys.argv[1])
    document_lookup = read_cache(name('./doc_lookup.json', ['with_titles']),
                                 get_robust_documents_with_titles)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    ordered_rankings_to_eval = [[
        document_title_to_id[title] for title in rankings_to_eval[query]
    ] for query in query_ids]
    ordered_qrels = [[document_title_to_id[title] for title in qrels[query]]
                     for query in query_ids]
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [
        document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache(
        'tok_docs.json',
        lambda: tokenizer.process_all(clean_documents(documents)))
    tokenized_queries = tokenizer.process_all(clean_documents(queries))
    bm25 = gensim_bm25.BM25(tokenized_documents)
    # with open('./caches/106756_most_common_doc.json', 'r') as fh:
    #   doc_token_set = set(json.load(fh))
    # corpus, token_lookup = tokens_to_indexes(tokenized_documents,
    #                                          None,
    #                                          token_set=doc_token_set)
    # corpus = [[[token_lookup[term], f] for term, f in doc_fs.items()] for doc_fs in bm25.f]
    # tfidf = TfidfModel(corpus)
    # lsi = LsiModel(tfidf, id2word=_.invert(token_lookup), num_topics=300)
    glove_rankings = []
    # lsi_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    encoded_docs = torch.stack(
        [encode_glove_fs(glove, bm25.idf, doc_fs) for doc_fs in bm25.f])
    encoded_docs = encoded_docs / torch.norm(encoded_docs, dim=1).unsqueeze(1)
    for q, qml_ranking in progressbar(zip(tokenized_queries,
                                          ordered_rankings_to_eval),
                                      max_value=len(tokenized_queries)):
        doc_ids = qml_ranking[:k] if '--rerank' in sys.argv else None
        glove_rankings.append(
            rank_glove(glove, bm25.idf, encoded_docs, q, doc_ids=doc_ids))
        # lsi_rankings.append(rank_lsi(lsi, tfidf, [token_lookup[term] if term in token_lookup else 0 for term in q], doc_ids=doc_ids))
    print('indri:', metrics_at_k(ordered_rankings_to_eval, ordered_qrels, k))
    print('glove:', metrics_at_k(glove_rankings, ordered_qrels, k))
Example #7
def prepare(lookup,
            title_to_id,
            token_lookup=None,
            num_tokens=None,
            token_set=None,
            drop_if_any_unk=False):
    id_to_title_lookup = _.invert(title_to_id)
    ids = range(len(id_to_title_lookup))
    contents = [lookup[id_to_title_lookup[id]] for id in ids]
    numericalized, token_lookup = preprocess_texts(
        contents,
        token_lookup=token_lookup,
        num_tokens=num_tokens,
        token_set=token_set,
        drop_if_any_unk=drop_if_any_unk)
    return numericalized, token_lookup
Example #8
def __init__(self, cursor, lookups_path, train_size):
    self.cursor = cursor
    lookups = load_entity_candidate_ids_and_label_lookup(
        lookups_path, train_size)
    label_to_entity_id = _.invert(lookups['entity_labels'])
    self.entity_candidates_prior = {
        entity_text: {
            label_to_entity_id[label]: candidates
            for label, candidates in prior.items()
        }
        for entity_text, prior in
        lookups['entity_candidates_prior'].items()
    }
    self.prior_approx_mapping = u.get_prior_approx_mapping(
        self.entity_candidates_prior)
    self.mentions = None
    self.labels = None
    self.mention_doc_id = None
Example #9
def prepare_fs(lookup,
               title_to_id,
               token_lookup=None,
               token_set=None,
               num_tokens=None,
               drop_if_any_unk=False):
    id_to_title_lookup = _.invert(title_to_id)
    ids = range(len(id_to_title_lookup))
    contents = [lookup[id_to_title_lookup[id]] for id in ids]
    if num_tokens == -1: num_tokens = None
    numericalized, token_lookup = preprocess_texts(
        contents,
        token_lookup=token_lookup,
        token_set=token_set,
        num_tokens=num_tokens,
        drop_if_any_unk=drop_if_any_unk)
    numericalized_fs = [Counter(doc) for doc in numericalized]
    return numericalized_fs, token_lookup
Example #10
def _get_tester(self, cursor, model):
    logits_and_softmax = self._get_logits_and_softmax()
    test_dataset = self._get_dataset(cursor, is_test=True)
    self._dataset = test_dataset
    batch_sampler = self._get_sampler(cursor, is_test=True)
    return Tester(
        dataset=test_dataset,
        batch_sampler=batch_sampler,
        model=model,
        logits_and_softmax=logits_and_softmax,
        embedding=self.lookups.embedding,
        token_idx_lookup=self.lookups.token_idx_lookup,
        device=self.device,
        experiment=self.experiment,
        ablation=self.model_params.ablation,
        use_adaptive_softmax=self.model_params.use_adaptive_softmax,
        use_wiki2vec=self.model_params.use_wiki2vec,
        use_sum_encoder=self.model_params.use_sum_encoder,
        label_to_entity_id=_.invert(self.lookups.entity_labels),
        use_stacker=self.model_params.use_stacker)
Example #11
def main():
    with open('./caches/pairwise_train_ranking_106756.json') as fh:
        query_ranking_pairs = json.load(fh)
    queries_by_tok_id, qml = zip(*query_ranking_pairs)
    parsed_queries, query_token_lookup = read_cache(
        './parsed_robust_queries_dict.json', lambda: print('failed'))
    inv = _.invert(query_token_lookup)
    queries = [
        ' '.join([inv[q] for q in query]) for query in queries_by_tok_id
    ]
    if len(sys.argv) > 1:
        lim = int(sys.argv[1])
    else:
        lim = None
    bm25_rankings, glove_rankings, rm3_rankings = get_other_results(
        queries[:lim], qml[:lim])
    agree_ctr, num_combos = check_overlap(qml[:lim], bm25_rankings)
    print(agree_ctr, num_combos, agree_ctr / num_combos)
    agree_ctr, num_combos = check_overlap(qml[:lim], glove_rankings)
    print(agree_ctr, num_combos, agree_ctr / num_combos)
    agree_ctr, num_combos = check_overlap(qml[:lim], rm3_rankings)
    print(agree_ctr, num_combos, agree_ctr / num_combos)
Example #12
def test_invert(case, expected):
    assert _.invert(case) == expected
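The test above is presumably driven by parametrized (case, expected) pairs that are not shown here; the contract it checks is simply that _.invert swaps keys and values. A small illustration of that contract:

import pydash as _

assert _.invert({'a': 1, 'b': 2}) == {1: 'a', 2: 'b'}
# Values must be hashable, since they become the new keys.
assert _.invert({0: 'x', 1: 'y'}) == {'x': 0, 'y': 1}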
Example #13
def __init__(self,
             cursor,
             token_idx_lookup,
             full_token_idx_lookup,
             lookups_path,
             idf_path,
             train_size,
             txt_dataset_path,
             pkl_dataset_prefix=None):
    self.txt_dataset_path = txt_dataset_path
    self.pkl_dataset_prefix = pkl_dataset_prefix
    if self.pkl_dataset_prefix is not None:
        self.current_part = None
        return
    if self.txt_dataset_path is not None:
        if '.pkl' in self.txt_dataset_path:
            with open(self.txt_dataset_path, 'rb') as fh:
                self.dataset_cache = pickle.load(fh)
                return
        with open(self.txt_dataset_path) as fh:
            self.dataset_cache = [
                ast.literal_eval(line) for line in fh.readlines()
            ]
            return
    with open(idf_path) as fh:
        self.idf = json.load(fh)
    self.cursor = cursor
    with open('./entity_to_row_id.pkl', 'rb') as fh:
        entity_id_to_row = pickle.load(fh)
    self.desc_fs = DocLookup('./desc_fs.npz',
                             entity_id_to_row,
                             token_idx_mapping=_.invert(token_idx_lookup),
                             default_value={},
                             use_default=True)
    self.desc_fs_unstemmed = DocLookup(
        './desc_unstemmed_fs.npz',
        entity_id_to_row,
        token_idx_mapping=_.invert(full_token_idx_lookup),
        default_value={'<PAD>': 1},
        use_default=True)
    self.embedding_dict = get_embedding_dict('./glove.6B.300d.txt',
                                             embedding_dim=300)
    self.stemmer = SnowballStemmer('english')
    lookups = load_entity_candidate_ids_and_label_lookup(
        lookups_path, train_size)
    label_to_entity_id = _.invert(lookups['entity_labels'])
    self.entity_candidates_prior = {
        entity_text: {
            label_to_entity_id[label]: candidates
            for label, candidates in prior.items()
        }
        for entity_text, prior in
        lookups['entity_candidates_prior'].items()
    }
    self.prior_approx_mapping = u.get_prior_approx_mapping(
        self.entity_candidates_prior)
    self.mentions = None
    self.labels = None
    self.mention_doc_id = None
    self.mention_sentences = None
    self.mention_fs = None
    self.mention_fs_unstemmed = None
    self.page_f_lookup = None
    self.with_labels = None
    self._candidate_strs_lookup = read_cache(
        './candidate_strs_lookup.pkl', lambda: get_str_lookup(cursor))
    self.stopwords = set(nltk_stopwords.words('english'))
Example #14
def main():
    p = get_cli_args(args)
    with open('./tokens.pkl', 'rb') as fh:
        token_idx_lookup = pickle.load(fh)
    load_dotenv(dotenv_path=p.run.env_path)
    EL_DATABASE_NAME = os.getenv("DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    with open(p.train.page_id_order_path, 'rb') as fh:
        page_id_order = pickle.load(fh)
    page_ids = page_id_order[:p.train.num_pages_to_use]
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        cursor.execute("SET NAMES utf8mb4;")
        cursor.execute("SET CHARACTER SET utf8mb4;")
        cursor.execute("SET character_set_connection=utf8mb4;")

        datasets = [
            MentionCoNLLDataset(cursor, './AIDA-YAGO2-dataset.tsv',
                                p.run.lookups_path, p.train.train_size),
            MentionWikiDataset(cursor, page_ids, p.run.lookups_path,
                               p.train.train_size)
        ]
        with open('./entity_to_row_id.pkl', 'rb') as fh:
            entity_id_to_row = pickle.load(fh)
        idf = get_idf(token_idx_lookup, p.run.idf_path)
        desc_fs_sparse = csr_matrix(load_npz('./desc_fs.npz'))
        desc_vs = csr_matrix(sparse_to_tfidf_vs(idf, desc_fs_sparse))
        norm = (desc_vs.multiply(desc_vs)).sum(1)
        all_e_id_pairs = set()
        data = []
        i = []
        j = []
        row_to_entity_id = _.invert(entity_id_to_row)
        for dataset in datasets:
            for cands in progressbar(iter(dataset)):
                if cands is None: continue
                cand_rows = [
                    entity_id_to_row[e_id] for e_id in cands
                    if (e_id in entity_id_to_row)
                ]
                cand_mat = desc_vs[cand_rows]
                scores = cand_mat.dot(cand_mat.T) / norm[cand_rows]
                new_i = cand_rows * len(cand_rows)
                new_j = [
                    row_num for row_num in cand_rows
                    for __ in range(len(cand_rows))
                ]
                list_scores = np.array(scores).ravel().tolist()
                for res_i in range(len(list_scores)):
                    pair = (row_to_entity_id[min(new_i[res_i], new_j[res_i])],
                            row_to_entity_id[max(new_i[res_i], new_j[res_i])])
                    if pair not in all_e_id_pairs:
                        # Record the pair so its symmetric duplicate is skipped.
                        all_e_id_pairs.add(pair)
                        data.append(list_scores[res_i])
                        i.append(new_i[res_i])
                        j.append(new_j[res_i])
        mat = csr_matrix(coo_matrix((data, (i, j))))
        train_str = 'wiki+conll_' + '_'.join([str(p.train.num_pages_to_use)])
        save_npz('compats_{}.npz'.format(train_str), mat)
Example #15
def test_invert_multivalue(case, expected):
    result = _.invert(case, multivalue=True)
    for key in result:
        assert set(result[key]) == set(expected[key])
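For completeness: with multivalue=True (as in the test above; newer pydash releases expose similar grouping behavior as _.invert_by), every key that shares a value is collected into a list, which is why the test compares sets rather than relying on list order. A minimal sketch:

import pydash as _

result = _.invert({'a': 1, 'b': 1, 'c': 2}, multivalue=True)
assert {value: set(keys) for value, keys in result.items()} == {1: {'a', 'b'}, 2: {'c'}}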