# Standard-library imports used across these snippets; repo-internal helpers
# (get_perspective_dict, lmap, es_helper, ...) are imported elsewhere in the codebase.
import os
from collections import Counter
from typing import Dict, Iterable, List

import nltk


def main():
    pc_clusters: Iterable[PerspectiveCluster] = enum_perspective_clusters()
    tokenizer = TokenizerForGalago()

    def get_terms(text: str) -> Counter:
        terms = tokenizer.tokenize(text)
        return Counter(terms)

    # Query = [claim :: avg(perspective)]
    claim_text_d: Dict[int, str] = get_all_claim_d()
    perspective_text_d: Dict[int, str] = get_perspective_dict()

    def cluster_to_query(cluster: PerspectiveCluster) -> DocQuery:
        claim_text = claim_text_d[cluster.claim_id]
        perspective_text_list = [perspective_text_d[pid] for pid in cluster.perspective_ids]
        query_id = get_pc_cluster_query_id(cluster)

        claim_tf: Counter = get_terms(claim_text)
        pers_tf: Counter = average_counters(lmap(get_terms, perspective_text_list))
        tf = sum_counters([claim_tf, pers_tf])
        query: DocQuery = counter_to_galago_query(query_id, tf)
        return query

    query_list: List[DocQuery] = lmap(cluster_to_query, pc_clusters)
    print(len(query_list))
    out_path = os.path.join(output_path, "perspective_query", "pc_query_for_evidence.json")
    save_queries_to_file(query_list, out_path)
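# Hedged sketch: average_counters and sum_counters come from elsewhere in the
# repo; the versions below only illustrate the semantics assumed by
# cluster_to_query above -- an element-wise mean and an element-wise sum over
# term-frequency Counters. The _sketch names are hypothetical.
def average_counters_sketch(counters: List[Counter]) -> Counter:
    total = sum(counters, Counter())
    n = len(counters)
    return Counter({term: count / n for term, count in total.items()})


def sum_counters_sketch(counters: List[Counter]) -> Counter:
    return sum(counters, Counter())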
def get_valid_terms():
    perspective = get_perspective_dict()
    tokenizer = PCTokenizer()
    voca = set()
    for text in perspective.values():
        voca.update(tokenizer.tokenize_stem(text))
    return voca
def __init__(self):
    claim_and_perspective = load_claim_perspective_pair()
    perspective = get_perspective_dict()

    all_sents = []
    for e in claim_and_perspective:
        claim_text = e['text']
        all_sents.append(claim_text)
    for text in perspective.values():
        all_sents.append(text)

    print("tokenizing {} docs".format(len(all_sents)))
    token_docs = []
    for s in all_sents:
        # word_tokenize, not sent_tokenize: the tf-idf below operates on terms.
        tokens = nltk.word_tokenize(s)
        token_docs.append(tokens)

    print("get_idf")
    idf = inverse_document_frequencies(token_docs)

    tfidf_documents = []
    print("sublinear tf")
    for document in token_docs:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)

    self.d = {}
    for sent, tfidf_val in zip(all_sents, tfidf_documents):
        self.d[sent] = tfidf_val
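# Hedged sketch: inverse_document_frequencies and sublinear_term_frequency are
# assumed to follow the textbook tf-idf definitions below; the repo's own
# implementations may smooth differently. The _sketch names are hypothetical.
import math


def inverse_document_frequencies_sketch(token_docs: List[List[str]]) -> Dict[str, float]:
    n_docs = len(token_docs)
    df: Dict[str, int] = {}
    for doc in token_docs:
        for term in set(doc):
            df[term] = df.get(term, 0) + 1
    # idf(t) = log(N / df(t))
    return {term: math.log(n_docs / count) for term, count in df.items()}


def sublinear_term_frequency_sketch(term: str, document: List[str]) -> float:
    # tf(t, d) = 1 + log(count) when the term occurs, else 0.
    count = document.count(term)
    return 1.0 + math.log(count) if count > 0 else 0.0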
def main():
    d: Dict[str, str] = get_perspective_dict()
    save_path = os.path.join(output_path, "perspective", "corpus.xml")
    with open(save_path, "w") as f:
        for pid, text in d.items():
            lines = trec_writer(pid, text)
            f.writelines(lines)
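# Hedged sketch: trec_writer is imported from elsewhere in the repo; this
# hypothetical version shows the TREC-style <DOC> block format that
# corpus.xml presumably contains, one block per perspective.
def trec_writer_sketch(doc_id, text: str) -> List[str]:
    return [
        "<DOC>\n",
        "<DOCNO>{}</DOCNO>\n".format(doc_id),
        "<TEXT>\n",
        text + "\n",
        "</TEXT>\n",
        "</DOC>\n",
    ]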
def test_es():
    claim_and_perspective = load_claim_perspective_pair()
    perspective = get_perspective_dict()
    for e in claim_and_perspective:
        claim_text = e['text']
        for perspective_cluster in e['perspectives']:
            pids = perspective_cluster['pids']
            for pid in pids:
                query = claim_text + " " + perspective[pid]
                # Smoke test: results are discarded; we only check the call succeeds.
                es_helper.get_perspective_from_pool(query, 50)
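# Hedged note: get_candidates below unpacks the return value of
# es_helper.get_perspective_from_pool as (text, pid, score) tuples, so the top
# hit can be inspected as follows (show_top_hit is a hypothetical helper):
def show_top_hit(claim_text: str) -> None:
    results = es_helper.get_perspective_from_pool(claim_text, 50)
    text, pid, score = results[0]
    print(pid, score, text)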
def show_num_mention():
    train, val = load_feature_and_split()
    p_dict = get_perspective_dict()
    claims = get_claims_from_ids(lmap(lambda x: x['cid'], train))
    claim_d = claims_to_dict(claims)
    grouped = group_by(train, lambda x: x['cid'])
    for cid in grouped:
        print("Claim:", claim_d[cid])
        for dp in grouped[cid]:
            p_text = p_dict[dp['pid']]
            print(dp['label'], get_num_mention(dp), p_text)
def get_candidates(claims, balance) -> List[PerspectiveCandidate]:
    related_p_map = get_claim_perspective_id_dict()
    related_p_map = {key: flatten(value) for key, value in related_p_map.items()}
    p_map = get_perspective_dict()

    all_data_points = []
    for c in claims:
        cid = c["cId"]
        claim_text = c["text"]
        lucene_results = es_helper.get_perspective_from_pool(claim_text, 50)
        rp = related_p_map[cid]
        pid_set = [_pid for _text, _pid, _score in lucene_results]

        data_point_list = []
        for pid in pid_set:
            p_text = p_map[pid]
            label = 1 if pid in rp else 0
            data_point = PerspectiveCandidate(
                label=str(label),
                cid=cid,
                pid=pid,
                claim_text=claim_text,
                p_text=p_text,
            )
            data_point_list.append(data_point)

        # If training, we balance positive and negative examples.
        if balance:
            pos_insts = [e for e in data_point_list if e.label == "1"]
            neg_insts = [e for e in data_point_list if e.label == "0"]
            neg_insts = neg_insts[:len(pos_insts)]
            data_point_list = pos_insts + neg_insts

        all_data_points.extend(data_point_list)
    return all_data_points
# Module-level cache; must be initialized so the None check below works.
perspective = None


def perspective_getter(pid):
    # Lazily load the id -> text mapping on first use, then reuse the cached dict.
    global perspective
    if perspective is None:
        perspective = get_perspective_dict()
    return perspective[pid]