Example #1
0
def make_doc_query(claim_id, perspective_id, claim_text, p_text) -> DocQuery:
    """Build a BM25 query for one (claim, perspective) pair.

    The query id is ``"<claim_id>_<perspective_id>"``. The query terms are
    the tokens produced by ``clean_tokenize_str_to_tokens`` from the claim
    text and the perspective text joined with a single space.

    :param claim_id: identifier of the claim (used in the query id).
    :param perspective_id: identifier of the perspective (used in the query id).
    :param claim_text: text of the claim.
    :param p_text: text of the perspective.
    :return: whatever ``format_query_bm25`` returns for the id and tokens.
    """
    pair_id = "{}_{}".format(claim_id, perspective_id)
    tokens: List[str] = clean_tokenize_str_to_tokens(claim_text + " " + p_text)
    return format_query_bm25(pair_id, tokens)
Example #2
0
def make_query(split) -> Iterable[DocQuery]:
    """Yield one BM25 query per problem loaded for ``split``.

    For every problem, the text of ``problem.text1`` is passed through
    ``sent_tokenize_newline`` and scored with ``get_term_importance``; the
    top 15 entries of its ``most_common`` result become the query terms.
    Each term is repeated ``int(score / lowest_selected_score)`` times so
    that higher-scoring terms carry more weight in the query. The query id
    is ``problem.text1.id.id`` and ``k = 0.7`` is forwarded to
    ``format_query_bm25``.

    Notes:
        * Ids and term lists for *all* problems are computed before the
          first query is yielded.
        * A problem that produces no terms raises ``IndexError``; a lowest
          selected score of zero raises ``ZeroDivisionError``.
        * NOTE(review): ``importance`` is presumably a ``collections.Counter``
          (so ``most_common`` is ordered by descending score) -- confirm
          against ``get_term_importance``.

    :param split: dataset split name passed to ``load_problems`` and
        ``get_bm25_module``.
    :return: a generator of ``format_query_bm25`` results.
    """
    top_n_terms = 15
    bm25_k = 0.7
    problems: List[ArguDataPoint] = load_problems(split)
    # The BM25 module needs the split only for the total number of documents.
    bm25 = get_bm25_module(split)

    def weighted_terms(problem: ArguDataPoint) -> List[str]:
        """Return the problem's top terms, each repeated by its relative score."""
        sentences = sent_tokenize_newline(problem.text1.text)
        importance = get_term_importance(bm25, sentences)
        ranked: List[Tuple[str, float]] = [
            (term, score) for term, score in importance.most_common(top_n_terms)
        ]
        # The last selected entry serves as the normalisation unit.
        _, floor_score = ranked[-1]

        repeated: List[str] = []
        for term, score in ranked:
            repeated.extend([term] * int(score / floor_score))
        return repeated

    def problem_id_of(problem: ArguDataPoint) -> str:
        """Return the identifier used as the query id."""
        return problem.text1.id.id

    ids: List[str] = lmap(problem_id_of, problems)
    term_lists: List[List[str]] = lmap(weighted_terms, problems)
    for query_id, query_terms in zip(ids, term_lists):
        yield format_query_bm25(query_id, query_terms, bm25_k)
Example #3
0
def get_claims_query(claims, drop_stopwords=False) -> List[DocQuery]:
    """Build one BM25 query per claim.

    Each claim is a mapping with a ``"cId"`` key (stringified to form the
    query id) and a ``"text"`` key (tokenized with
    ``clean_tokenize_str_to_tokens``). When ``drop_stopwords`` is truthy,
    tokens found in ``load_stopwords()`` are removed. Every ``"."`` character
    is then stripped from each remaining token.

    The token list is printed to stdout twice per claim: once right after
    tokenization and once after all filtering.

    :param claims: iterable of claim mappings with ``"cId"`` and ``"text"``.
    :param drop_stopwords: whether to filter out stopwords.
    :return: list of ``format_query_bm25`` results, in input order.
    """
    # Only load the stopword collection when it is actually needed.
    stopword = load_stopwords() if drop_stopwords else None

    def to_query(claim) -> DocQuery:
        claim_id = str(claim["cId"])
        tokens: List[str] = clean_tokenize_str_to_tokens(claim["text"])
        print(tokens)
        if drop_stopwords:
            tokens = [tok for tok in tokens if tok not in stopword]
        tokens = [tok.replace(".", "") for tok in tokens]
        print(tokens)
        return format_query_bm25(claim_id, tokens)

    return [to_query(claim) for claim in claims]
Example #4
0
 def transform(q: Query) -> Dict:
     """Convert a ``Query`` into a BM25 query entry.

     The query text is split with ``word_tokenize``, the tokens are passed
     through ``clean_query``, and the result is handed to
     ``format_query_bm25`` together with ``q.qid``.
     """
     cleaned_tokens = clean_query(word_tokenize(q.text))
     return format_query_bm25(q.qid, cleaned_tokens)
Example #5
0
 def get_query_entry_from_data_point(x: PerspectiveCandidate) -> DocQuery:
     """Build a BM25 query for one perspective candidate.

     The terms come from tokenizing the candidate's claim text and
     perspective text joined with a single space; the query id is
     ``"<cid>_<pid>"``.

     NOTE(review): ``k`` is not defined in this function; it is resolved from
     an enclosing or module scope that is not visible here -- confirm.
     """
     combined_text = x.claim_text + " " + x.p_text
     terms = clean_tokenize_str_to_tokens(combined_text)
     return format_query_bm25("{}_{}".format(x.cid, x.pid), terms, k)