def make_doc_query(claim_id, perspective_id, claim_text, p_text) -> DocQuery:
    """Build a BM25 DocQuery for a (claim, perspective) pair.

    The query id is "<claim_id>_<perspective_id>" and the query terms are the
    cleaned tokens of the concatenated claim and perspective texts.
    """
    combined_text = " ".join([claim_text, p_text])
    terms: List[str] = clean_tokenize_str_to_tokens(combined_text)
    qid = "{}_{}".format(claim_id, perspective_id)
    return format_query_bm25(qid, terms)
def make_query(split) -> Iterable[DocQuery]:
    """Yield one weighted BM25 query per problem in the given data split.

    Each problem's text is scored term-by-term; terms are repeated in the
    query proportionally to their importance relative to the weakest of the
    top `max_q_terms` terms.
    """
    max_q_terms = 15  # keep only the top-15 scoring terms per problem
    k = 0.7  # BM25 k parameter forwarded to format_query_bm25
    problems: List[ArguDataPoint] = load_problems(split)
    # split is only used for total number of documents
    bm25 = get_bm25_module(split)

    def get_query_for_problem(problem: ArguDataPoint) -> List[str]:
        # Repeat each term int(score / last_term_score) times so that
        # higher-scoring terms carry more weight in the bag-of-words query.
        terms = list(get_term_score_for_problem(problem))
        # NOTE(review): assumes at least one term and a non-zero last score —
        # empty `terms` raises IndexError and a zero score ZeroDivisionError;
        # confirm upstream guarantees.
        _, last_term_score = terms[-1]
        q_terms = []
        for term, score in terms:
            norm_score = int(score / last_term_score)
            for _ in range(norm_score):
                q_terms.append(term)
        return q_terms

    def get_term_score_for_problem(
            problem: ArguDataPoint) -> Iterable[Tuple[str, float]]:
        # Score terms of the problem's first text against the BM25 model,
        # yielding at most max_q_terms (term, score) pairs, best first.
        text = problem.text1.text
        sents = sent_tokenize_newline(text)
        importance = get_term_importance(bm25, sents)
        for term, score in importance.most_common(max_q_terms):
            yield term, score

    def get_problem_id(p: ArguDataPoint) -> str:
        # Query id is taken directly from the problem's first text id.
        return p.text1.id.id

    problem_ids: List[str] = lmap(get_problem_id, problems)
    q_terms_list: List[List[str]] = lmap(get_query_for_problem, problems)
    for q_id, q_terms in zip(problem_ids, q_terms_list):
        yield format_query_bm25(q_id, q_terms, k)
def get_claims_query(claims, drop_stopwords=False) -> List[DocQuery]:
    """Build one BM25 DocQuery per claim.

    Args:
        claims: iterable of claim dicts with "cId" and "text" keys.
        drop_stopwords: when True, stopwords are removed and periods are
            stripped from the remaining tokens.

    Returns:
        A list of DocQuery entries, one per claim, in input order.
    """
    if drop_stopwords:
        stopword = load_stopwords()
    queries: List[DocQuery] = []
    for c in claims:
        cid = str(c["cId"])
        claim_text = c["text"]
        q_terms: List[str] = clean_tokenize_str_to_tokens(claim_text)
        if drop_stopwords:
            # Filter stopwords, then strip periods from surviving tokens.
            q_terms = [t for t in q_terms if t not in stopword]
            q_terms = [t.replace(".", "") for t in q_terms]
        q_entry: DocQuery = format_query_bm25(cid, q_terms)
        queries.append(q_entry)
    return queries
def transform(q: Query) -> Dict:
    """Convert a Query into a BM25 query dict by tokenizing and cleaning its text."""
    cleaned = clean_query(word_tokenize(q.text))
    return format_query_bm25(q.qid, cleaned)
def get_query_entry_from_data_point(x: PerspectiveCandidate, k: float = 0.7) -> DocQuery:
    """Build a BM25 DocQuery from a perspective candidate.

    Bug fix: the original referenced a free variable `k` that is not defined
    in this scope (it was a local of `make_query`), raising NameError at
    runtime. `k` is now an explicit keyword parameter defaulting to 0.7,
    matching the value used by `make_query`.

    Args:
        x: the candidate providing claim/perspective texts and ids.
        k: BM25 k parameter forwarded to format_query_bm25.
    """
    tokens = clean_tokenize_str_to_tokens(x.claim_text + " " + x.p_text)
    qid = "{}_{}".format(x.cid, x.pid)
    return format_query_bm25(qid, tokens, k)