Example #1
def evaluate_treccar(model_path, test_art_qrels, test_top_qrels,
                     test_hier_qrels, test_paratext, level):
    test_page_paras, test_rev_para_top, test_rev_para_hier = get_trec_dat(
        test_art_qrels, test_top_qrels, test_hier_qrels)
    test_len_paras = np.array(
        [len(test_page_paras[page]) for page in test_page_paras.keys()])
    print('test mean paras: %.2f, std: %.2f, max paras: %.2f' %
          (np.mean(test_len_paras), np.std(test_len_paras),
           np.max(test_len_paras)))
    test_ptext_dict = get_paratext_dict(test_paratext)
    test_top_cluster_data = []
    test_hier_cluster_data = []
    max_num_doc_test = max(
        [len(test_page_paras[p]) for p in test_page_paras.keys()])
    test_pages = list(test_page_paras.keys())
    for i in trange(len(test_pages)):
        page = test_pages[i]
        paras = test_page_paras[page]
        paratexts = [test_ptext_dict[p] for p in paras]
        top_sections = list(set([test_rev_para_top[p] for p in paras]))
        top_labels = [top_sections.index(test_rev_para_top[p]) for p in paras]
        hier_sections = list(set([test_rev_para_hier[p] for p in paras]))
        hier_labels = [
            hier_sections.index(test_rev_para_hier[p]) for p in paras
        ]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        n = len(paras)
        paras = paras + ['dummy'] * (max_num_doc_test - n)
        paratexts = paratexts + [''] * (max_num_doc_test - n)
        top_labels = top_labels + [-1] * (max_num_doc_test - n)
        hier_labels = hier_labels + [-1] * (max_num_doc_test - n)
        test_top_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(top_labels)))
        test_hier_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(hier_labels)))
    print("Top-level datasets")
    print("Test instances: %5d" % len(test_top_cluster_data))

    model = SentenceTransformer(model_path)
    if level == 'h':
        print('Evaluating hierarchical clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(
            test_hier_cluster_data)
        model.evaluate(test_evaluator)
    else:
        print('Evaluating toplevel clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(
            test_top_cluster_data)
        model.evaluate(test_evaluator)
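
A minimal invocation sketch for the evaluator above; the model directory and the test-split qrels/paratext file names are hypothetical placeholders, not paths taken from the original code.

# All paths below are placeholders; point them at your own fine-tuned model and TREC CAR test split.
evaluate_treccar(model_path='output/treccar_model',
                 test_art_qrels='test.article.qrels',
                 test_top_qrels='test.toplevel.qrels',
                 test_hier_qrels='test.hierarchical.qrels',
                 test_paratext='test_paratext.tsv',
                 level='h')  # 'h' evaluates hierarchical clusters, anything else top-level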
Example #2
def save_squt_dataset(train_pages_file, art_qrels, top_qrels, paratext, val_samples, outdir):
    page_paras, rev_para_top, _ = get_trec_dat(art_qrels, top_qrels, None)
    ptext_dict = get_paratext_dict(paratext)
    train_cluster_data = []
    test_cluster_data = []
    pages = []
    with open(train_pages_file, 'r') as f:
        for l in f:
            pages.append(l.rstrip('\n'))
    for i in trange(len(pages)):
        page = pages[i]
        paras = page_paras[page]
        page_sec_para_dict = {}
        for p in paras:
            sec = rev_para_top[p]
            if sec not in page_sec_para_dict.keys():
                page_sec_para_dict[sec] = [p]
            else:
                page_sec_para_dict[sec].append(p)
        sections = list(set([rev_para_top[p] for p in paras]))
        random.shuffle(sections)
        test_sections, train_sections = sections[:len(sections)//2], sections[len(sections)//2:]
        train_paras = []
        test_paras = []
        for s in test_sections:
            test_paras += page_sec_para_dict[s]
        for s in train_sections:
            train_paras += page_sec_para_dict[s]
        test_labels = [sections.index(rev_para_top[p]) for p in test_paras]
        train_labels = [sections.index(rev_para_top[p]) for p in train_paras]
        test_paratexts = [ptext_dict[p] for p in test_paras]
        train_paratexts = [ptext_dict[p] for p in train_paras]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        test_cluster_data.append(InputTRECCARExample(qid=page, q_context=query_text, pids=test_paras,
                                                     texts=test_paratexts, label=np.array(test_labels)))
        train_cluster_data.append(InputTRECCARExample(qid=page, q_context=query_text, pids=train_paras,
                                                      texts=train_paratexts, label=np.array(train_labels)))
    random.shuffle(test_cluster_data)
    val_cluster_data = test_cluster_data[:val_samples]
    test_cluster_data = test_cluster_data[val_samples:]
    with open(outdir + '/squt_treccar_train.pkl', 'wb') as f:
        pickle.dump(train_cluster_data, f)
    with open(outdir + '/squt_treccar_val.pkl', 'wb') as f:
        pickle.dump(val_cluster_data, f)
    with open(outdir + '/squt_treccar_test.pkl', 'wb') as f:
        pickle.dump(test_cluster_data, f)
    print(
        'No. of data instances - Train: %5d, Val: %5d, Test: %5d' % (len(train_cluster_data), len(val_cluster_data),
                                                                     len(test_cluster_data)))
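
A hedged usage sketch for the function above, plus reading one of the saved splits back; the input file names are placeholders, while the pickle names are the ones the function itself writes.

import pickle

# Placeholder inputs; train_pages.txt is assumed to list one 'enwiki:...' page id per line.
save_squt_dataset('train_pages.txt', 'train.article.qrels', 'train.toplevel.qrels',
                  'train_paratext.tsv', val_samples=50, outdir='data')

# The splits can later be reloaded from the pickles written by the function.
with open('data/squt_treccar_train.pkl', 'rb') as f:
    train_cluster_data = pickle.load(f)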
Example #3
def prepare_cluster_data_for_eval(art_qrels, top_qrels, paratext, do_filter, val_samples):
    page_paras, rev_para_top, _ = get_trec_dat(art_qrels, top_qrels, None)
    len_paras = np.array([len(page_paras[page]) for page in page_paras.keys()])
    print('mean paras: %.2f, std: %.2f, max paras: %.2f' % (np.mean(len_paras), np.std(len_paras), np.max(len_paras)))
    ptext_dict = get_paratext_dict(paratext)
    top_cluster_data = []
    pages = list(page_paras.keys())
    skipped_pages = 0
    max_num_doc = max([len(page_paras[p]) for p in page_paras.keys()])
    for i in trange(len(pages)):
        page = pages[i]
        paras = page_paras[page]
        paratexts = [ptext_dict[p] for p in paras]
        top_sections = list(set([rev_para_top[p] for p in paras]))
        top_labels = [top_sections.index(rev_para_top[p]) for p in paras]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        n = len(paras)
        if do_filter:
            if n < 20 or n > 200:
                skipped_pages += 1
                continue
        paras = paras[:max_num_doc] if n >= max_num_doc else paras + ['dummy'] * (max_num_doc - n)
        paratexts = paratexts[:max_num_doc] if n >= max_num_doc else paratexts + [''] * (max_num_doc - n)
        top_labels = top_labels[:max_num_doc] if n >= max_num_doc else top_labels + [-1] * (max_num_doc - n)
        if do_filter:
            if len(set(top_labels)) < 2 or n / len(set(top_labels)) < 2.5:
                ## the page should have at least 2 top level sections and n/k should be at least 2.5
                skipped_pages += 1
                continue
        top_cluster_data.append(InputTRECCARExample(qid=page, q_context=query_text, pids=paras, texts=paratexts,
                                                    label=np.array(top_labels)))
    if val_samples > 0:
        top_cluster_data = top_cluster_data[:val_samples]
    print('Total data instances: %5d' % len(top_cluster_data))
    print('Skipped pages: %5d' % skipped_pages)
    return top_cluster_data
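
A short usage sketch with placeholder file names; do_filter=True applies the 20-200 passage and section-count filters, and val_samples=0 keeps every surviving page instead of a validation slice.

# Placeholder paths; with do_filter=False every page is kept and only padded.
eval_data = prepare_cluster_data_for_eval('test.article.qrels', 'test.toplevel.qrels',
                                          'test_paratext.tsv', do_filter=True,
                                          val_samples=0)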
Example #4
def prepare_cluster_data_train_only(art_qrels, top_qrels, hier_qrels, paratext,
                                    num_doc):
    page_paras, rev_para_top, rev_para_hier = get_trec_dat(
        art_qrels, top_qrels, hier_qrels)
    ptext_dict = get_paratext_dict(paratext)
    top_cluster_data = []
    hier_cluster_data = []
    pages = list(page_paras.keys())
    for i in trange(len(pages)):
        page = pages[i]
        paras = page_paras[page]
        if len(paras) != num_doc:
            continue
        paratexts = [ptext_dict[p] for p in paras]
        top_sections = list(set([rev_para_top[p] for p in paras]))
        if len(top_sections) < 2:
            continue
        top_labels = [top_sections.index(rev_para_top[p]) for p in paras]
        hier_sections = list(set([rev_para_hier[p] for p in paras]))
        hier_labels = [hier_sections.index(rev_para_hier[p]) for p in paras]
        assert len(paratexts) == num_doc
        assert len(top_labels) == num_doc
        assert len(hier_labels) == num_doc
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        top_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(top_labels)))
        hier_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(hier_labels)))
    print('Total data instances: %5d' % len(top_cluster_data))
    return top_cluster_data, hier_cluster_data
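
A usage sketch under the same placeholder-path assumption; num_doc keeps only pages with exactly that many passages, so the returned examples need no 'dummy' padding.

# Placeholder paths; num_doc=30 is a hypothetical choice, not a value from the original code.
top_data, hier_data = prepare_cluster_data_train_only('train.article.qrels',
                                                      'train.toplevel.qrels',
                                                      'train.hierarchical.qrels',
                                                      'train_paratext.tsv',
                                                      num_doc=30)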
Example #5
def prepare_cluster_data_train(pages_file, art_qrels, top_qrels, paratext):
    page_paras, rev_para_top, _ = get_trec_dat(art_qrels, top_qrels, None)
    ptext_dict = get_paratext_dict(paratext)
    top_cluster_data = []
    pages = []
    with open(pages_file, 'r') as f:
        for l in f:
            pages.append(l.rstrip('\n'))
    for i in trange(len(pages)):
        page = pages[i]
        paras = page_paras[page]
        paratexts = [ptext_dict[p] for p in paras]
        top_sections = list(set([rev_para_top[p] for p in paras]))
        if len(top_sections) < 2:
            continue
        top_labels = [top_sections.index(rev_para_top[p]) for p in paras]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        top_cluster_data.append(InputTRECCARExample(qid=page, q_context=query_text, pids=paras, texts=paratexts,
                                                    label=np.array(top_labels)))
    print('Total data instances: %5d' % len(top_cluster_data))
    return top_cluster_data
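
A sketch of calling the function above and inspecting one result, with placeholder paths; it assumes, as a hypothetical, that InputTRECCARExample exposes its constructor arguments (qid, pids, label) as attributes of the same name.

# Placeholder paths; train_pages.txt lists one 'enwiki:...' page id per line.
train_data = prepare_cluster_data_train('train_pages.txt', 'train.article.qrels',
                                        'train.toplevel.qrels', 'train_paratext.tsv')
ex = train_data[0]
print(ex.qid, len(ex.pids), ex.label)  # page id, number of passages, per-passage cluster labels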
Example #6
def prepare_cluster_data(train_art_qrels,
                         train_top_qrels,
                         train_hier_qrels,
                         train_paratext,
                         test_art_qrels,
                         test_top_qrels,
                         test_hier_qrels,
                         test_paratext,
                         max_num_doc=75,
                         val_samples=50):
    train_page_paras, train_rev_para_top, train_rev_para_hier = get_trec_dat(
        train_art_qrels, train_top_qrels, train_hier_qrels)
    test_page_paras, test_rev_para_top, test_rev_para_hier = get_trec_dat(
        test_art_qrels, test_top_qrels, test_hier_qrels)
    train_len_paras = np.array(
        [len(train_page_paras[page]) for page in train_page_paras.keys()])
    test_len_paras = np.array(
        [len(test_page_paras[page]) for page in test_page_paras.keys()])
    print(
        'train mean paras: %.2f, std: %.2f, max paras: %.2f, test mean paras: %.2f, std: %.2f, max paras: %.2f'
        % (np.mean(train_len_paras), np.std(train_len_paras),
           np.max(train_len_paras), np.mean(test_len_paras),
           np.std(test_len_paras), np.max(test_len_paras)))
    train_ptext_dict = get_paratext_dict(train_paratext)
    test_ptext_dict = get_paratext_dict(test_paratext)
    train_top_cluster_data = []
    train_hier_cluster_data = []
    train_pages = list(train_page_paras.keys())
    skipped_pages = 0
    for i in trange(len(train_pages)):
        page = train_pages[i]
        paras = train_page_paras[page]
        paratexts = [train_ptext_dict[p] for p in paras]
        top_sections = list(set([train_rev_para_top[p] for p in paras]))
        top_labels = [top_sections.index(train_rev_para_top[p]) for p in paras]
        hier_sections = list(set([train_rev_para_hier[p] for p in paras]))
        hier_labels = [
            hier_sections.index(train_rev_para_hier[p]) for p in paras
        ]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        n = len(paras)
        if n < 20 or n > 200:  ## the page should contain at least 20 and at most 200 passages
            skipped_pages += 1
            continue
        paras = paras[:max_num_doc] if n >= max_num_doc else paras + ['dummy'] * (max_num_doc - n)
        paratexts = paratexts[:max_num_doc] if n >= max_num_doc else paratexts + [''] * (max_num_doc - n)
        top_labels = top_labels[:max_num_doc] if n >= max_num_doc else top_labels + [-1] * (max_num_doc - n)
        hier_labels = hier_labels[:max_num_doc] if n >= max_num_doc else hier_labels + [-1] * (max_num_doc - n)
        if len(set(top_labels)) < 2 or n / len(set(top_labels)) < 2.5:
            ## the page should have at least 2 top level sections and n/k should be at least 2.5
            skipped_pages += 1
            continue
        train_top_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(top_labels)))
        train_hier_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(hier_labels)))
    val_top_cluster_data = train_top_cluster_data[-val_samples:]
    train_top_cluster_data = train_top_cluster_data[:-val_samples]
    val_hier_cluster_data = train_hier_cluster_data[-val_samples:]
    train_hier_cluster_data = train_hier_cluster_data[:-val_samples]
    test_top_cluster_data = []
    test_hier_cluster_data = []
    max_num_doc_test = max(
        [len(test_page_paras[p]) for p in test_page_paras.keys()])
    test_pages = list(test_page_paras.keys())
    for i in trange(len(test_pages)):
        page = test_pages[i]
        paras = test_page_paras[page]
        paratexts = [test_ptext_dict[p] for p in paras]
        top_sections = list(set([test_rev_para_top[p] for p in paras]))
        top_labels = [top_sections.index(test_rev_para_top[p]) for p in paras]
        hier_sections = list(set([test_rev_para_hier[p] for p in paras]))
        hier_labels = [
            hier_sections.index(test_rev_para_hier[p]) for p in paras
        ]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        n = len(paras)
        paras = paras + ['dummy'] * (max_num_doc_test - n)
        paratexts = paratexts + [''] * (max_num_doc_test - n)
        top_labels = top_labels + [-1] * (max_num_doc_test - n)
        hier_labels = hier_labels + [-1] * (max_num_doc_test - n)
        test_top_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(top_labels)))
        test_hier_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(hier_labels)))
    print("Top-level datasets")
    print("Train instances: %5d" % len(train_top_cluster_data))
    print("Val instances: %5d" % len(val_top_cluster_data))
    print("Test instances: %5d" % len(test_top_cluster_data))
    print("Skipped pages: %5d" % skipped_pages)

    return train_top_cluster_data, train_hier_cluster_data, val_top_cluster_data, val_hier_cluster_data, \
           test_top_cluster_data, test_hier_cluster_data
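
A sketch of how the full pipeline above might be driven, with placeholder paths; the six returned lists are the top-level and hierarchical variants of the train, validation, and test splits.

# Placeholder paths for the train and test qrels/paratext files.
(train_top, train_hier,
 val_top, val_hier,
 test_top, test_hier) = prepare_cluster_data('train.article.qrels',
                                              'train.toplevel.qrels',
                                              'train.hierarchical.qrels',
                                              'train_paratext.tsv',
                                              'test.article.qrels',
                                              'test.toplevel.qrels',
                                              'test.hierarchical.qrels',
                                              'test_paratext.tsv',
                                              max_num_doc=75,
                                              val_samples=50)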