Example #1
def main():
    print("using second score as judgments")
    input_path = at_data_dir("clueweb", "2009.prels.1-50")

    raw_entries = []
    with open(input_path, "r") as in_f:
        for line in in_f:
            # prels columns: query_id, doc_id, score1, score2, score3
            query_id, doc_id, s1, s2, s3 = line.split()
            # s1 (int) and s3 (float) are present but unused;
            # the second score is taken as the relevance judgment
            relevance = int(s2)
            e = TrecRelevanceJudgementEntry(query_id, doc_id, relevance)
            raw_entries.append(e)

    save_path = at_data_dir("clueweb", "2009.qrel_test.2.txt")
    write_trec_relevance_judgement(raw_entries, save_path)
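write_trec_relevance_judgement is defined elsewhere in the repository. A minimal sketch of what it is assumed to do, if the target is the standard four-column TREC qrels layout (query_id, iteration, doc_id, relevance); the attribute names are guesses based on the constructor call above:

def write_trec_relevance_judgement_sketch(entries, save_path):
    # Hypothetical: assumes each entry carries query_id, doc_id, relevance
    # and that the usual "query_id 0 doc_id relevance" line format is wanted.
    with open(save_path, "w") as out_f:
        for e in entries:
            out_f.write("{} 0 {} {}\n".format(e.query_id, e.doc_id, e.relevance))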
Example #2
def main():
    todo: List[Tuple[QueryID, MSMarcoDoc]] = get_todo()
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)

    try:
        passage_dict = load_from_pickle("msmarco_passage_doc_analyze_passage_dict")
    except FileNotFoundError:
        passage_dict = load_passage_dict(todo, passage_qrels)
    doc_queries = dict(load_train_queries())

    itr: Iterable[Tuple[str, MSMarcoDoc, JoinedPassage]] = join_doc_passage(todo, passage_qrels, passage_dict)
    # Iterate over (query_id, doc, matched passage) triples and print each
    # document split into the text before, inside, and after the passage.
    for qid, doc, passage in itr:
        query_text = doc_queries[QueryID(qid)]
        print('query', qid, query_text)
        prev = doc.body[:passage.loc]
        passage_text = passage.text
        tail = doc.body[passage.loc + len(passage_text):]
        print("-----")
        print(prev)
        print(">>>")
        print(passage_text)
        print("<<<")
        print(tail)
        print("-----")
Example #3
def main():
    pc_data: List[Dict] = load_claim_perspective_pair()

    # Sort claims so that those with the most perspectives come first
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()

    with open(at_data_dir("perspective", "claims_and_perspective.txt"), "w", encoding="utf-8") as out_f:
        for e in pc_data:
            cid = e['cId']

            # Skip claims that have no gold perspective clusters
            if not gold_d.get(cid):
                continue
            text = e['text']
            rows = [[str(cid), text]]

            for pc in gold_d[cid]:
                rows.append([pc.stance_label_3, pc.stance_label_5])
                for pid in pc.perspective_ids:
                    rows.append([perspective_getter(pid)])
                rows.append([])

            for row in rows:
                out_f.write("\t".join(row) + "\n")
            out_f.write("\n\n\n")
Example #4
def get_passage_dict(passage_ids_to_find):
    msmarco_passage_corpus_path = at_data_dir("msmarco", "collection.tsv")
    passage_dict = {}
    # collection.tsv: one passage per line as "passage_id<TAB>text".
    # passage_ids_to_find should be a set so the membership test stays O(1)
    # while streaming the whole corpus.
    with open(msmarco_passage_corpus_path, 'r', encoding='utf8') as f:
        for line in f:
            passage_id, text = line.split("\t")
            if passage_id in passage_ids_to_find:
                passage_dict[passage_id] = text
    return passage_dict
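A usage sketch with made-up passage ids:

wanted = {"7067032", "2912791"}  # hypothetical ids
passages = get_passage_dict(wanted)
for pid, text in passages.items():
    print(pid, text.strip()[:80])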
Example #5
def main():
    pc_data: List[Dict] = load_claim_perspective_pair()

    # Write one claim per line as "<claim id>\t<claim text>"
    with open(at_data_dir("perspective", "claims.txt"), "w", encoding="utf-8") as out_f:
        for e in pc_data:
            cid = e['cId']
            text = e['text']
            row = [str(cid), text]
            out_f.write("\t".join(row) + "\n")
Example #6
def measure_msmarco_passage():
    msmarco_passage_corpus_path = at_data_dir("msmarco", "collection.tsv")
    # Average the whitespace-token length over the first ~10,000 passages
    l_list = []
    with open(msmarco_passage_corpus_path, 'r', encoding='utf8') as f:
        for line in f:
            passage_id, text = line.split("\t")
            tokens = text.split()
            l_list.append(len(tokens))

            if len(l_list) > 10000:
                break

    print(average(l_list))
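average() is not defined in this snippet; under the obvious assumption that it is the arithmetic mean:

def average(values):
    # Minimal sketch of the assumed helper
    return sum(values) / len(values) if values else 0.0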
Example #7
def load_queries(year_list: Iterable[int]) -> List[TrecQuery]:
    all_queries = []
    for year in year_list:
        query_path = at_data_dir("clueweb", "{}.topics.xml".format(year))
        xml = load_xml(query_path)
        root_tag = xml.tag
        assert str(year) in root_tag
        for topic in xml:
            qid = topic.attrib['number']
            query_type = topic.attrib['type']
            keyword_query = topic.find('query').text
            desc_query = topic.find('description').text
            query = TrecQuery(qid, query_type, keyword_query, desc_query)
            all_queries.append(query)

    return all_queries
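load_xml and TrecQuery come from elsewhere; the parser above expects a topics file shaped roughly like the following (illustrative values; the root tag only needs to contain the year string):

<webtrack2009>
  <topic number="1" type="faceted">
    <query>example keyword query</query>
    <description>A longer natural-language statement of the same information need.</description>
  </topic>
  ...
</webtrack2009>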
Example #8
def main():
    data_name = sys.argv[1]
    gold = load(at_data_dir("genex", "{}_gold.txt".format(data_name)), 999)
    run1 = load(sys.argv[2], 3)

    def common(pred, gold):
        return [t for t in pred if t in gold]

    d1 = NamedAverager()

    for idx, (t1, t_gold) in enumerate(zip(run1, gold)):
        c1 = common(t1, t_gold)
        p1 = len(c1) / len(t1)
        r1 = len(c1) / len(t_gold)
        f1 = get_f1(p1, r1)
        d1['prec'].append(p1)
        d1['recall'].append(r1)
        d1['f1'].append(f1)

    print(d1.get_average_dict())
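get_f1 and NamedAverager are imported from elsewhere in the repository. Assuming get_f1 is the usual harmonic mean of precision and recall, a minimal sketch would be:

def get_f1(precision, recall):
    # Standard F1; guard against the all-zero case
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)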
Example #9
def main():
    split = "train"
    resource = ProcessedResource10docMulti(split)

    query_group: List[List[QueryID]] = load_query_group(split)
    msmarco_passage_qrel_path = at_data_dir("msmarco", "qrels.train.tsv")
    passage_qrels: QRelsDict = load_qrels_structured(msmarco_passage_qrel_path)

    qids = query_group[0]
    qids = qids[:100]
    pickle_name = "msmarco_passage_doc_analyze_passage_dict_evidence_loc"
    try:
        passage_dict = load_from_pickle(pickle_name)
    except FileNotFoundError:
        print("Reading passages...")
        passage_dict = get_passages(qids, passage_qrels)
        save_to_pickle(passage_dict, pickle_name)

    def get_rel_doc_id(qid):
        if qid not in resource.get_doc_for_query_d():
            raise KeyError
        for doc_id in resource.get_doc_for_query_d()[qid]:
            label = resource.get_label(qid, doc_id)
            if label:
                return doc_id
        raise KeyError

    def translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body):
        acc = 0
        for idx, tokens in enumerate(stemmed_body_tokens_list):
            acc += len(tokens)
            if loc_in_body < acc:
                return idx
        return -1

    pc_tokenize = PCTokenizer()
    bert_tokenizer = get_tokenizer()

    for qid in qids:
        try:
            doc_id = get_rel_doc_id(qid)
            stemmed_tokens_d = resource.get_stemmed_tokens_d(qid)
            stemmed_title_tokens, stemmed_body_tokens_list = stemmed_tokens_d[doc_id]
            rel_passages = list([passage_id for passage_id, score in passage_qrels[qid].items() if score])
            success = False
            found_idx = -1
            passage_tokens = []  # avoids a NameError in the log below when rel_passages is empty
            for rel_passage_id in rel_passages:
                passage_text = passage_dict[rel_passage_id].strip()
                passage_tokens = pc_tokenize.tokenize_stem(passage_text)
                stemmed_body_tokens_flat = lflatten(stemmed_body_tokens_list)
                n, log = lcs(passage_tokens, stemmed_body_tokens_flat, True)
                if len(passage_tokens) > 4 and n > len(passage_tokens) * 0.7 and n > 0:
                    success = True
                    _, loc_in_body = log[0]

                    sent_idx = translate_token_idx_to_sent_idx(stemmed_body_tokens_list, loc_in_body)
                    prev = stemmed_body_tokens_flat[:loc_in_body]

                    loc_by_bert_tokenize = len(bert_tokenizer.tokenize(" ".join(prev)))
                    print(sent_idx, loc_in_body, loc_by_bert_tokenize, len(stemmed_body_tokens_list))
                    found_idx = sent_idx
            if not success:
                print("Not found. doc_lines={} passage_len={}".format(len(stemmed_body_tokens_list), len(passage_tokens)))

        except KeyError:
            pass
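lcs() is defined elsewhere; from the call above it appears to return the LCS length together with the matched (index-in-passage, index-in-body) pairs when the third argument is truthy, so log[0] locates where the passage first aligns inside the flattened body. A minimal sketch consistent with that usage (not the repository's implementation):

def lcs(a, b, return_indices=False):
    # Standard O(len(a) * len(b)) longest-common-subsequence table
    n, m = len(a), len(b)
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n - 1, -1, -1):
        for j in range(m - 1, -1, -1):
            if a[i] == b[j]:
                dp[i][j] = dp[i + 1][j + 1] + 1
            else:
                dp[i][j] = max(dp[i + 1][j], dp[i][j + 1])
    if not return_indices:
        return dp[0][0]
    # Recover one optimal alignment as (index_in_a, index_in_b) pairs
    pairs = []
    i = j = 0
    while i < n and j < m:
        if a[i] == b[j]:
            pairs.append((i, j))
            i += 1
            j += 1
        elif dp[i + 1][j] >= dp[i][j + 1]:
            i += 1
        else:
            j += 1
    return dp[0][0], pairs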
Example #10
def load_qrels_for(year) -> QRelsDict:
    qrel_path = at_data_dir("clueweb", "{}.qrels.txt".format(year))
    return load_qrels_structured(qrel_path)
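From how the loaded structure is used in the other examples (passage_qrels[qid].items() yields (doc_id, score) pairs), QRelsDict is assumed to be a nested dict of {query_id: {doc_id: relevance_label}}. A small usage sketch under that assumption:

qrels = load_qrels_for(2009)
for qid, doc_labels in list(qrels.items())[:3]:
    rel_docs = [doc_id for doc_id, label in doc_labels.items() if label > 0]
    print(qid, len(rel_docs), "documents judged relevant")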
Example #11
def main():
    # run1 = load(at_output_dir("genex", "textrank.txt"), 3)
    # run2 = load(at_output_dir("genex", "textrank-ts.txt"), 3)
    problems = load_as_tokens("tdlt")
    run1 = load(at_output_dir("genex", "1"), 3)
    run2 = load(at_output_dir("genex", "2_ts"), 3)
    gold = load(at_data_dir("genex", "tdlt_gold.txt"), 999)

    def common(pred, gold):
        return [t for t in pred if t in gold]

    n_correct_1 = 0
    n_correct_2 = 0

    d1 = NamedAverager()
    d2 = NamedAverager()

    for idx, (t1, t2, t_gold,
              problem) in enumerate(zip(run1, run2, gold, problems)):
        c1 = common(t1, t_gold)
        c2 = common(t2, t_gold)

        p1 = len(c1) / len(t1)
        r1 = len(c1) / len(t_gold)
        f1 = get_f1(p1, r1)
        d1['prec'].append(p1)
        d1['recall'].append(r1)
        d1['f1'].append(f1)

        p2 = len(c2) / len(t2)
        r2 = len(c2) / len(t_gold)
        f2 = get_f1(p2, r2)
        d2['prec'].append(p2)
        d2['recall'].append(r2)
        d2['f1'].append(f2)

        n_correct_1 += len(c1)
        n_correct_2 += len(c2)

        if len(c1) != len(c2):
            print()
            print(">> Problem ", idx)
            print("textrank :", c1)
            print("textrank-ts :", c2)

            q_match = len(common(problem.query, problem.doc))
            n_q = len(problem.query)
            if len(c1) < len(c2):
                d2['q_match_rate'].append(q_match / n_q)
            else:
                d1['q_match_rate'].append(q_match / n_q)
            print('query: ', problem.query)
            print("matching query terms: ", common(problem.query, problem.doc))
            print('doc: ', " ".join(problem.doc))

    print("{} vs {}".format(n_correct_1, n_correct_2))

    print(d1.get_average_dict())
    print(d2.get_average_dict())

    print(d1.avg_dict['q_match_rate'].history)
    print(d2.avg_dict['q_match_rate'].history)
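NamedAverager is defined elsewhere in the repository; a minimal sketch consistent with how it is used in Examples #8 and #11 (per-key value lists exposed through avg_dict/history, plus a per-key average), purely illustrative:

from collections import defaultdict

class Averager:
    def __init__(self):
        self.history = []

    def append(self, value):
        self.history.append(value)

    def get_average(self):
        return sum(self.history) / len(self.history) if self.history else 0.0

class NamedAverager:
    def __init__(self):
        self.avg_dict = defaultdict(Averager)

    def __getitem__(self, key):
        return self.avg_dict[key]

    def get_average_dict(self):
        return {name: avg.get_average() for name, avg in self.avg_dict.items()}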