def main():
    """Re-clean one sampled ClueWeb document and print it beside the stored text.

    Loads the pickled ``clean_clueweb_doc_sample`` mapping (doc id ->
    ``(title, cleaned_text)``) and the records parsed with ``json.loads`` from
    ``clueweb/doc_content_samples.json``. The first record is skipped and only
    the next one is processed: its raw HTML is written to
    ``visualize/text.html``, and the ``wordpunct_tokenize`` tokens of the
    stored cleaned text and of the freshly cleaned text are printed, each
    joined by single spaces.
    """
    clean_doc_sample: Dict[str, Tuple] = load_from_pickle(
        "clean_clueweb_doc_sample")
    sample_path = at_output_dir("clueweb", "doc_content_samples.json")
    # Close the sample file as soon as every record has been parsed.
    with open(sample_path, "r") as sample_file:
        doc_json_list = lmap(json.loads, sample_file)

    differ = difflib.Differ()
    clean_fn = text_from_html
    for doc_json in doc_json_list[1:]:
        html = doc_json['content']
        html_path = at_output_dir("visualize", "text.html")
        with open(html_path, "w", encoding="utf-8") as html_file:
            html_file.write(html)

        doc_id = doc_json['id']
        _title, cleaned_text_ref = clean_doc_sample[doc_id]
        cleaned_text = clean_fn(html)

        tokens_ref = nltk.tokenize.wordpunct_tokenize(cleaned_text_ref)
        tokens = nltk.tokenize.wordpunct_tokenize(cleaned_text)
        print(" ".join(tokens_ref))
        print(" ".join(tokens))
        # Differ.compare() is lazy; nothing consumes the result yet.
        diff = differ.compare(tokens_ref, tokens)
        # Only a single document is inspected per run.
        break
def main():
    """Resolve the Alamri pilot paths and pass them to ``generate``.

    The arguments handed over are, in order: the pilot pair CSV, the
    annotation HTML template, the HTML output directory and the CSV that
    receives the links.
    """
    pilot_dir = "alamri_pilot"
    template_path = os.path.join(
        data_path, "med_contradiction", "annotation",
        "annotation_template.html")
    pairs_csv = at_output_dir(pilot_dir, "pilot_pairs.csv")
    html_dir = at_output_dir(pilot_dir, "pilot_pairs_html")
    links_csv = at_output_dir(pilot_dir, "pilot_links.csv")
    generate(pairs_csv, template_path, html_dir, links_csv)
def main():
    """Run ``work`` for each (year range, query type, output file) combination.

    Outputs are written under the ``clueweb`` output directory; the jobs run
    in the order listed below.
    """
    jobs = [
        (range(2009, 2013), KEYWORD_QUERY, "keyword_09b_query.json"),
        (range(2009, 2013), DESC_QUERY, "desc_09b_query.json"),
        (range(2013, 2015), KEYWORD_QUERY, "keyword_12b_query.json"),
        (range(2013, 2015), DESC_QUERY, "desc_12b_query.json"),
    ]
    for years, query_type, file_name in jobs:
        # Resolve the output path right before each job, as in a plain
        # sequence of calls.
        work(years, query_type, at_output_dir("clueweb", file_name))
def generate_and_write(file_name, generate_fn, tokenizer):
    """Generate instances, write them as records and save the data-id info.

    Args:
        file_name: Name of the record file under ``alamri_tfrecord``; the
            info JSON is saved next to it as ``file_name + ".info"``.
        generate_fn: Callable that receives a fresh ``DataIDManager`` and
            returns the instances to encode.
        tokenizer: Tokenizer passed to ``get_encode_fn``.
    """
    data_id_man = DataIDManager()
    inst_list = generate_fn(data_id_man)
    max_seq_length = 300
    save_path = at_output_dir("alamri_tfrecord", file_name)
    encode_fn = get_encode_fn(max_seq_length, tokenizer)
    write_records_w_encode_fn(save_path, encode_fn, inst_list)

    info_save_path = at_output_dir("alamri_tfrecord", file_name + ".info")
    # Use a context manager so the info file is flushed and closed.
    with open(info_save_path, "w") as info_file:
        json.dump(data_id_man.id_to_info, info_file)
def show_high():
    """Print the text of every prediction whose score falls in the 13% bucket.

    Joins the ``ada_aawd5_clue.4000.score`` predictions with the
    ``clue_f5.tfrecord.info`` JSON and prints ``e['text']`` for each entry
    where ``int(score * 100) == 13``.
    """
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    with open(info_save_path, "r") as info_file:
        info = json.load(info_file)
    # Alternative prediction file:
    # prediction_file = at_output_dir("clue_counter_arg", "ada_aawd4_clue.4000.score")
    prediction_file = at_output_dir("clue_counter_arg", "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)
    for e in pred_data:
        score = logit_to_score_softmax(e['logits'])
        if int(score * 100) == 13:
            print(e['text'])
def get_f5_tids_score_d_from_bert():
    """Return a dict mapping each prediction's text to its softmax score.

    Predictions come from ``ada_aawd5_clue.4000.score`` joined with the
    ``clue_f5.tfrecord.info`` JSON. When the same text occurs more than once,
    the score of the last occurrence is kept.
    """
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    with open(info_save_path, "r") as info_file:
        info = json.load(info_file)
    prediction_file = at_output_dir("clue_counter_arg", "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    score_d = {}
    for e in pred_data:
        score_d[e['text']] = logit_to_score_softmax(e['logits'])
    return score_d
def __init__(self, queries, qid_list, probe_config):
    """Set up score-file path templates, segment info and empty caches.

    Args:
        queries: Stored as ``self.queries``.
        qid_list: Query ids, stored as ``self.qid_list``.
        probe_config: Stored as ``self.probe_config``.
    """
    # Path templates; the "{}" placeholder is left unformatted here.
    score_dir = "rqd"
    self.long_seg_score_path_format = at_output_dir(score_dir, "rqd_{}.score")
    self.short_seg_score_path_format = at_output_dir(score_dir, "rqd_sm_{}.score")

    # Segment info is loaded through the "qc" format handler.
    seg_info_path = at_output_dir("robust", "seg_info")
    self.f_handler = get_format_handler("qc")
    self.info: Dict = load_combine_info_jsons(
        seg_info_path,
        self.f_handler.get_mapping(),
        self.f_handler.drop_kdp(),
    )

    # Caches and counters start out empty.
    self.doc_piece_score_d: Dict[Tuple[str, str], List[ScoredPieceFromPair]] = {}
    self.prepared_qids = set()
    self.not_found_cnt = 0

    self.probe_config = probe_config
    self.queries = queries
    self.tokenizer = get_tokenizer()
    self.qid_list: List[str] = qid_list
def main():
    """Print the per-layer average and maximum of the masked probabilities.

    Reads ``lms_scores/0.pickle``, applies ``sigmoid`` to each layer's logits
    and, for every position whose ``input_mask`` entry is truthy, collects the
    value at index 1 of the last axis. It then prints, per layer, the average
    reported by ``DictValueAverage`` followed by the maximum collected value.
    """
    num_layers = 12
    averages = DictValueAverage()
    values_per_layer = defaultdict(list)
    for file_idx in range(1):
        pickle_path = at_output_dir("lms_scores", str(file_idx) + ".pickle")
        output_d = load_pickle_from(pickle_path)
        # Original author's note: [num_inst, seq_length]
        input_mask = output_d['input_mask']
        for layer_no in range(num_layers):
            # Original author's note: [num_inst, seq_length, 2]
            probs = sigmoid(output_d['logits'][layer_no])
            num_inst, seq_length, _ = np.shape(probs)
            for data_idx in range(num_inst):
                for seq_idx in range(seq_length):
                    if not input_mask[data_idx, seq_idx]:
                        continue
                    prob = probs[data_idx, seq_idx, 1]
                    averages.add(layer_no, prob)
                    values_per_layer[layer_no].append(prob)

    for layer_no, average in averages.all_average().items():
        print(layer_no, average)

    # NOTE(review): this value was previously held in a variable called
    # `min_val` although it is the maximum - confirm which one is wanted.
    for layer_no, values in values_per_layer.items():
        print(layer_no, max(values))
def main():
    """Print the train/dev query pairs whose top-k documents overlap most.

    Loads the grouped ranked lists ``train.txt`` and ``dev.txt`` from
    ``perspective_experiments/clueweb_qres``, compares the top ``k`` document
    ids of every train query with those of every dev query and prints the ten
    pairs with the highest overlap ratio (``len(common) / k``). Only pairs
    with a ratio above 0.1 are considered.
    """
    saved_dir = at_output_dir("perspective_experiments", "clueweb_qres")
    rlg1 = load_ranked_list_grouped(os.path.join(saved_dir, "train.txt"))
    rlg2 = load_ranked_list_grouped(os.path.join(saved_dir, "dev.txt"))
    k = 10

    def top_k_doc_ids(entries):
        # Set of document ids among the first k ranked entries.
        return set(map(TrecRankedListEntry.get_doc_id, entries[:k]))

    # Build each query's top-k set once instead of once per query pair.
    top_k_docs2 = {qid: top_k_doc_ids(rlg2[qid]) for qid in rlg2}

    most_common = []
    for query_id1 in rlg1:
        top_k_docs1 = top_k_doc_ids(rlg1[query_id1])
        for query_id2, docs2 in top_k_docs2.items():
            percent_common = len(top_k_docs1 & docs2) / k
            if percent_common > 0.1:
                most_common.append((percent_common, query_id1, query_id2))

    most_common.sort(key=get_first, reverse=True)
    for rate_common, qid1, qid2 in most_common[:10]:
        print(rate_common, qid1, qid2)
def main():
    """Sample claim/perspective entries and dump them to ``claims.step1.txt``.

    Takes the 100 claims with the most perspectives. For each of their gold
    perspective clusters, with probability 0.3, one entry is emitted holding
    the claim, a running ``ca_cid`` and the cluster's first perspective
    (stance, id and text). The entries are written as indented JSON to
    ``ca_building/claims.step1.txt``.
    """
    pc_data: List[Dict] = load_claim_perspective_pair()
    pc_data.sort(key=lambda e: len(e['perspectives']), reverse=True)
    gold_d: Dict[int, List[PerspectiveCluster]] = load_perspectrum_golds()

    ca_cid = 1
    out_j = []
    for e in pc_data[:100]:
        cid = e['cId']
        if not gold_d[cid]:
            continue
        c_text = e['text']
        for pc in gold_d[cid]:
            if random.random() < 0.3:
                first_pid = pc.perspective_ids[0]
                p_text = perspective_getter(first_pid)
                j_entry = {
                    'cid': cid,
                    'claim_text': c_text,
                    'ca_cid': ca_cid,
                    'perspective': {
                        'stance': pc.stance_label_3,
                        'pid': first_pid,
                        'p_text': p_text
                    }
                }
                ca_cid += 1
                out_j.append(j_entry)

    print("total of {}".format(len(out_j)))
    out_path = at_output_dir("ca_building", "claims.step1.txt")
    # Context manager guarantees the JSON is flushed and the file closed.
    with open(out_path, "w", encoding="utf-8") as out_f:
        json.dump(out_j, out_f, indent=True)
def make_tfrecord(source_name, target_name):
    """Combine two datasets from ``data_d`` and write them as training records.

    Args:
        source_name: Key of the source dataset in ``data_d``.
        target_name: Key of the target dataset in ``data_d``.

    The output file is named ``"<source>_to_<target>_train"`` inside
    ``dir_name``.
    """
    combined = combine_source_and_target(
        data_d[source_name], data_d[target_name], 1)
    out_name = "{}_to_{}_train".format(source_name, target_name)
    out_path = at_output_dir(dir_name, out_name)
    write_records_w_encode_fn(out_path, encode_fn, combined)
def do_predict(bert_hp, train_config, data, lms_config, modeling_option,
               init_fn):
    """Run ``predict_fn`` over the training batches and pickle each chunk.

    Args:
        bert_hp: Passed to ``LMSModel``.
        train_config: Provides ``num_gpu`` for ``LMSModel``.
        data: Pair ``(train_batches, dev_batches)``; only the train batches
            are used.
        lms_config: Passed to ``LMSModel``.
        modeling_option: Passed to ``LMSModel``.
        init_fn: Called with the session after global variable
            initialization.

    The train batches are processed in 100 consecutive slices of 100 batches.
    The output for slice ``i`` is saved at ``lms_scores/<i>``.
    """
    num_gpu = train_config.num_gpu
    train_batches, _dev_batches = data
    lms_model = LMSModel(modeling_option, bert_hp, lms_config, num_gpu)

    sess = init_session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess)

    step_size = 100
    num_chunks = 100
    for chunk_idx in range(num_chunks):
        begin = chunk_idx * step_size
        end = begin + step_size
        tprint(begin, end)
        chunk_output = predict_fn(
            sess,
            train_batches[begin:end],
            lms_model.logits,
            lms_model.loss_tensor,
            lms_model.ex_score_tensor,
            lms_model.per_layer_logit_tensor,
            lms_model.batch2feed_dict,
        )
        chunk_path = at_output_dir("lms_scores", str(chunk_idx))
        save_to_pickle(chunk_output, chunk_path)
def main():
    """Encode the f5 texts as records and save their data-id info as JSON.

    Every text from ``enum_f5_data()`` becomes a ``TextInstance`` with label 0
    and a data id assigned by ``DataIDManager``. The records go to
    ``clue_counter_arg/clue_f5.tfrecord`` and the id-to-info mapping to
    ``clue_counter_arg/clue_f5.tfrecord.info``.
    """
    data_id_manager = DataIDManager()
    data = []
    for text in enum_f5_data():
        info = {
            'text': text,
        }
        data_id = data_id_manager.assign(info)
        label = 0
        data.append(TextInstance(text, label, data_id))

    encode_fn = get_encode_fn_w_data_id(512, False)
    save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord")
    write_records_w_encode_fn(save_path, encode_fn, data)

    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    # Context manager guarantees the info JSON is flushed and closed.
    with open(info_save_path, "w") as info_file:
        json.dump(data_id_manager.id_to_info, info_file)
def main():
    """Write the AAWD train/dev/test splits as records under ``aawd_tfrecord``."""
    out_dir_name = "aawd_tfrecord"
    exist_or_mkdir(os.path.join(output_path, out_dir_name))
    train, dev, test = load_aawd_splits()
    encode_fn = get_encode_fn(256)
    split_names = ("train", "dev", "test")
    for split, data in zip(split_names, (train, dev, test)):
        write_records_w_encode_fn(
            at_output_dir(out_dir_name, split), encode_fn, data)
def num_files_to_touch():
    """Print, per document group, the missing-doc count and the gz-file count.

    Document ids are read from ``clueweb/not_found.sort.txt`` and grouped
    with ``get_doc_group``.
    """
    not_found_path = at_output_dir("clueweb", "not_found.sort.txt")
    grouped = group_by(readlines_strip(not_found_path), get_doc_group)
    corpus_helper = get_sydney_clueweb09_corpus_helper()
    for group_id, doc_ids in grouped.items():
        # NOTE(review): len() below needs a sized return value (e.g. a list)
        # from iter_gz_files_for_group - confirm it is not a generator.
        gz_files = corpus_helper.iter_gz_files_for_group(group_id)
        print(len(doc_ids), len(gz_files))
def aawd_pred_histogram():
    """Print a histogram of softmax scores from a prediction file.

    Scores are bucketed by ``str(int(score * 1000))``; the counts of the
    buckets keyed ``"0"`` .. ``"100"`` are printed when present.
    """
    # Alternative prediction file:
    # prediction_file = at_output_dir("clue_counter_arg", "ada_argu3_aawd_20000.score")
    prediction_file = at_output_dir("clue_counter_arg", "ada_aawd5_clue.4000.score")
    pred_data = EstimatorPredictionViewer(prediction_file)

    def bin_fn(score):
        return str(int(score * 1000))

    # Named `histogram` so the builtin `bin` is not shadowed.
    histogram = BinHistogram(bin_fn)
    for e in pred_data:
        histogram.add(logit_to_score_softmax(e.get_vector('logits')))

    # NOTE(review): buckets use a x1000 resolution but only keys 0..100 are
    # printed, so buckets above 100 are never shown - confirm this is intended.
    for i in range(101):
        key = str(i)
        if key in histogram.counter:
            print(key, histogram.counter[key])
def main():
    """Print a percent-bucket histogram of softmax scores.

    Predictions from ``ada_aawd4_clue.4000.score`` are joined with the
    ``clue_f5.tfrecord.info`` JSON and bucketed by ``str(int(score * 100))``.
    The counts of the buckets keyed ``"0"`` .. ``"100"`` are printed when
    present.
    """
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    with open(info_save_path, "r") as info_file:
        info = json.load(info_file)
    prediction_file = at_output_dir("clue_counter_arg", "ada_aawd4_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    def bin_fn(score):
        return str(int(score * 100))

    # Named `histogram` so the builtin `bin` is not shadowed.
    histogram = BinHistogram(bin_fn)
    for e in pred_data:
        histogram.add(logit_to_score_softmax(e['logits']))

    for i in range(101):
        key = str(i)
        if key in histogram.counter:
            print(key, histogram.counter[key])
def binary_gen():
    """Write the pointwise train and dev data as records under ``argu_ana_tfrecord``.

    Each split is passed to the writer as a ``zip`` of its inputs and labels.
    """
    out_dir_name = "argu_ana_tfrecord"
    exist_or_mkdir(os.path.join(output_path, out_dir_name))
    train_x, train_y, dev_x, dev_y = get_argu_pointwise_data()
    splits = {
        "train": zip(train_x, train_y),
        "dev": zip(dev_x, dev_y),
    }
    encode_fn = get_encode_fn(512)
    for split, pairs in splits.items():
        write_records_w_encode_fn(
            at_output_dir(out_dir_name, split), encode_fn, pairs)
def main():
    """Generate selected training data from the score dirs of the other splits.

    ``sys.argv[1]`` is the target data index. For every ``(split, repeat)``
    combination in 5 x 5 whose split index differs from the target, a
    ``(score_dir, save_dir)`` pair is collected. All pairs are then passed to
    ``generate_selected_training_data_for_many_runs``.
    """
    target_data_idx = int(sys.argv[1])
    info_dir = "/mnt/disks/disk100/data_info/robust_w_data_id_desc_info_pickle/"
    max_seq_length = 512
    base_model_name = "robust_3A"

    score_and_save_dir = []
    for split_idx in range(5):
        for repeat_idx in range(5):
            # Skip the runs belonging to the target split itself.
            if split_idx == target_data_idx:
                continue
            run_dir_name = "seg_score_{}_{}_{}".format(
                base_model_name, split_idx, repeat_idx)
            score_dir = at_output_dir("robust", run_dir_name)
            save_dir = at_output_dir("robust_seg_sel", run_dir_name)
            score_and_save_dir.append((score_dir, save_dir))

    generate_selected_training_data_for_many_runs(
        target_data_idx,
        info_dir,
        max_seq_length,
        score_and_save_dir,
        generate_selected_training_data,
    )
def main():
    """Write the texts of all true claim pairs to ``alamri_pilot/true_pairs_all.csv``.

    Each CSV row holds ``(claim1.text, claim2.text)`` for one pair yielded by
    ``enum_true_instance()``.
    """
    # NOTE(review): this creates the "alamri_tfrecord" directory although the
    # CSV is saved under "alamri_pilot" - confirm which directory is meant.
    exist_or_mkdir(os.path.join(output_path, "alamri_tfrecord"))
    entries = [(claim1.text, claim2.text)
               for claim1, claim2 in enum_true_instance()]
    save_path = at_output_dir("alamri_pilot", "true_pairs_all.csv")
    # Context manager guarantees the rows are flushed and the file closed.
    with open(save_path, "w", newline='', encoding="utf-8") as csv_file:
        csv.writer(csv_file).writerows(entries)
def main():
    """Pickle the (title, text) entries of the sampled ClueWeb document ids.

    Reads one document id per line from ``clueweb/doc_ids_sample.txt``, looks
    each up in the mapping returned by ``read_doc_id_title_text()`` and saves
    the resulting dict as ``clean_clueweb_doc_sample``. An id missing from
    the mapping raises ``KeyError``.
    """
    with open(at_output_dir("clueweb", "doc_ids_sample.txt"), "r") as f:
        doc_ids = [line.strip() for line in f]
    doc_contents = read_doc_id_title_text()
    new_d = {doc_id: doc_contents[doc_id] for doc_id in doc_ids}
    save_to_pickle(new_d, "clean_clueweb_doc_sample")
def main():
    """Print the pilot claim pairs and write them to ``alamri_pilot/pilot_pairs.csv``.

    Pairs come from ``enum_true_instance(3)``. Each pair is echoed to stdout
    (a ``--`` separator line followed by both texts) and stored as one CSV
    row ``(claim1.text, claim2.text)``.
    """
    save_path = at_output_dir("alamri_pilot", "pilot_pairs.csv")
    entries = []
    for claim1, claim2 in enum_true_instance(3):
        print("--")
        print("{}".format(claim1.text))
        print("{}".format(claim2.text))
        entries.append((claim1.text, claim2.text))

    # Context manager guarantees the rows are flushed and the file closed.
    with open(save_path, "w", newline='', encoding="utf-8") as csv_file:
        csv.writer(csv_file).writerows(entries)
def main():
    """Write the token count of each Robust04 description query, one per line.

    Queries are loaded with ``load_robust_04_query("desc")`` and tokenized
    with ``get_tokenizer()``. Counts are written to
    ``robust/desc_query_len.txt`` in the order of ``get_robust_qid_list()``.
    """
    query_type = "desc"
    queries = load_robust_04_query(query_type)
    qid_list = get_robust_qid_list()
    tokenizer = get_tokenizer()
    # `with` closes the file even when a query lookup or tokenization fails.
    with open(at_output_dir("robust", "desc_query_len.txt"), "w") as f:
        for qid in qid_list:
            query_tokens = tokenizer.tokenize(queries[str(qid)])
            f.write("{}\n".format(len(query_tokens)))
def main():
    """Generate exact-match selected training data for one data index.

    Command-line arguments, all integers:
        1. target data index, which selects the pickled info file
        2. maximum sequence length
        3. maximum number of segments

    Output goes to ``robust_seg_sel/exact_match<max_seq_length>_<max_seg>``.
    """
    target_data_idx, max_seq_length, max_seg = (
        int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))

    info_path = os.path.join(
        job_man_dir,
        "robust_w_data_id_desc_info_pickle",
        "{}".format(target_data_idx),
    )
    info = load_pickle_from(info_path)

    out_dir_name = "exact_match{}_{}".format(max_seq_length, max_seg)
    save_dir_path = at_output_dir("robust_seg_sel", out_dir_name)
    exist_or_mkdir(save_dir_path)

    score_fn = get_score_fn_functor()
    generate_selected_training_data(
        info, max_seq_length, save_dir_path, score_fn, max_seg)
def print_top_k():
    """Print the 30 highest-scoring distinct texts with their scores in percent.

    Predictions from ``ada_aawd5_clue.4000.score`` are joined with the
    ``clue_f5.tfrecord.info`` JSON. For a text that occurs more than once,
    only the first occurrence is kept.
    """
    k = 30
    info_save_path = at_output_dir("clue_counter_arg", "clue_f5.tfrecord.info")
    with open(info_save_path, "r") as info_file:
        info = json.load(info_file)
    prediction_file = at_output_dir("clue_counter_arg", "ada_aawd5_clue.4000.score")
    pred_data = join_prediction_with_info(prediction_file, info)

    simple_data = []
    seen_texts = set()
    for e in pred_data:
        score = logit_to_score_softmax(e['logits'])
        text = e['text']
        if text in seen_texts:
            continue
        seen_texts.add(text)
        simple_data.append((text, score))

    simple_data.sort(key=get_second, reverse=True)
    for text, score in simple_data[:k]:
        tab_print(score * 100, text)
def main():
    """Generate exact-match selected training data from JSON info files.

    Options are parsed by ``arg_parser`` from the command line:
    ``target_data``, ``max_seq_length``, ``max_seg`` and ``info_path``.
    Output goes to ``robust_seg_sel/exact_match<max_seq_length>_<max_seg>``.
    """
    args = arg_parser.parse_args(sys.argv[1:])
    # Parsed to validate it is an integer; it is not used further here.
    target_data_idx = int(args.target_data)
    max_seq_length = int(args.max_seq_length)
    max_seg = int(args.max_seg)

    info = load_combine_info_jsons(args.info_path)

    out_dir_name = "exact_match{}_{}".format(max_seq_length, max_seg)
    save_dir_path = at_output_dir("robust_seg_sel", out_dir_name)
    exist_or_mkdir(save_dir_path)

    score_fn = get_score_fn_functor()
    generate_selected_training_data_w_json(
        info, max_seq_length, save_dir_path, score_fn, max_seg)
def main():
    """List training documents that are missing from the doc-id/title/text file.

    For every query in ``cr.qrels``, each training doc id that is absent from
    ``read_doc_id_title_text()`` is written on its own line to
    ``clueweb/not_found.txt``. The total number of misses is printed at the
    end.
    """
    cr = CluewebReranking(list(range(2010, 2013)))
    all_docs = read_doc_id_title_text()
    not_found = 0
    # Context manager guarantees buffered ids are flushed and the file closed.
    with open(at_output_dir("clueweb", "not_found.txt"), "w") as f:
        for qid in cr.qrels.keys():
            for doc_id in cr.get_docs_for_training(qid):
                if doc_id not in all_docs:
                    not_found += 1
                    f.write("{}\n".format(doc_id))
    print("not found", not_found)
def main():
    """Start a ``JobRunner`` that fetches the missing ClueWeb09 documents.

    Document ids from ``clueweb/not_found.sort.txt`` are grouped with
    ``get_doc_group``. The ``(group, doc_ids)`` pairs are sorted with
    ``get_first`` as the key and handed to each ``GetDocWorker``.
    """
    not_found_path = at_output_dir("clueweb", "not_found.sort.txt")
    grouped = group_by(readlines_strip(not_found_path), get_doc_group)
    todo: List[Tuple[str, List]] = sorted(grouped.items(), key=get_first)
    num_jobs = len(grouped)

    def make_worker(out_dir):
        return GetDocWorker(todo, out_dir)

    print("num jobs", num_jobs)
    JobRunner(
        job_man_dir,
        num_jobs - 1,
        "get_missing_clueweb09_docs",
        make_worker,
    ).start()
def read_doc_id_title_text():
    """Load ``clueweb/doc_id_title_text.txt`` into ``{doc_id: (title, content)}``.

    Each line is split at its first two ``[SEP]`` markers into doc id, title
    and content. The content is everything after the second marker, so it
    keeps any further ``[SEP]`` and the line's trailing newline. When a doc
    id repeats, the last line wins. The number of unique doc ids is printed.

    Returns:
        dict mapping doc id to a ``(title, content)`` tuple.
    """
    doc_id_title_text = at_output_dir("clueweb", "doc_id_title_text.txt")
    sep = "[SEP]"
    sep_len = len(sep)
    out_d = {}
    # Context manager closes the input file once every line is parsed.
    with open(doc_id_title_text, "r") as f:
        for line in f:
            first_sep = line.find(sep)
            second_sep = line.find(sep, first_sep + 1)
            doc_id = line[:first_sep]
            title = line[first_sep + sep_len:second_sep]
            content = line[second_sep + sep_len:]
            out_d[doc_id] = title, content
    # The dict keys are exactly the distinct doc ids seen.
    print("num unique docs:", len(out_d))
    return out_d
def main():
    """Re-rank the MS MARCO dev top-100 lists with BM25 and save a TREC run.

    For each query in the grouped ranked list, the per-query documents are
    loaded and scored with ``bm25_module.score(query_text, title + " " + body)``.
    They are then sorted by descending score and turned into
    ``TrecRankedListEntry`` rows. The number of ranked-list doc ids that were
    not among the loaded documents is reported. Processing stops once more
    than 100 * 100 entries have been collected; the run is written to
    ``ranked_list/mmd_dev_BM25_df100.txt``.
    """
    split = "dev"
    query_d = dict(load_queries(split))
    bm25_module = get_bm25_module()
    ranked_list_path = at_working_dir("msmarco-doc{}-top100".format(split))
    run_name = "BM25_df100"
    rlg = load_ranked_list_grouped(ranked_list_path)
    save_path = at_output_dir("ranked_list", "mmd_dev_{}.txt".format(run_name))
    te = TimeEstimator(100)
    max_entries = 100 * 100

    out_entries = []
    for query_id, entries in rlg.items():
        docs = load_per_query_docs(query_id, None)
        # Set membership keeps the missing-document check linear.
        found_doc_ids = {d.doc_id for d in docs}
        num_not_found = sum(
            1 for e in entries if e.doc_id not in found_doc_ids)
        if num_not_found:
            print("{} docs not found".format(num_not_found))

        query_text = query_d[QueryID(query_id)]

        def score_doc(doc):
            content = doc.title + " " + doc.body
            return bm25_module.score(query_text, content)

        scored_docs = [(d, score_doc(d)) for d in docs]
        scored_docs.sort(key=get_second, reverse=True)

        # `doc_score` is kept distinct from the scoring function's name.
        for rank, (doc, doc_score) in enumerate(scored_docs):
            out_entries.append(
                TrecRankedListEntry(query_id, doc.doc_id, rank, doc_score,
                                    run_name))
        te.tick()
        if len(out_entries) > max_entries:
            break

    write_trec_ranked_list_entry(out_entries, save_path)