def select_question_types(sample_probs, selected_answer, num_sample_style=2, max_sample_times=20):
    """Sample up to ``num_sample_style`` distinct question types for an answer.

    Each candidate style in ``QUESTION_TYPES`` is weighted by the learned
    conditional probability ``p(s | c, a)`` keyed on ``"<style>_<pos>-<ner>"``;
    styles without a learned probability fall back to weight 1.

    Args:
        sample_probs: dict of probability tables; ``sample_probs["s|c,a"]``
            maps condition strings to weights.
        selected_answer: 8-tuple ``(answer_text, char_st, char_ed, st, ed,
            answer_bio_ids, answer_pos_tag, answer_ner_tag)``.
        num_sample_style: maximum number of distinct styles to return.
        max_sample_times: cap on sampling attempts (sampling with replacement
            may repeatedly draw the same style).

    Returns:
        List of distinct question-type strings (possibly fewer than
        ``num_sample_style`` if the attempt cap is hit).
    """
    (answer_text, char_st, char_ed, st, ed,
     answer_bio_ids, answer_pos_tag, answer_ner_tag) = selected_answer
    a_tag = "-".join([answer_pos_tag, answer_ner_tag])

    # Build the weight vector p(s | c, a); unseen conditions get weight 1.
    styles = QUESTION_TYPES  # question types
    s_probs = [sample_probs["s|c,a"].get("_".join([s, a_tag]), 1)
               for s in QUESTION_TYPES]

    # Draw with replacement, keeping only distinct styles, until we have
    # enough or run out of attempts.
    sampled_styles = []
    for _ in range(max_sample_times):
        sampled_s = weighted_sample(styles, s_probs)
        if sampled_s not in sampled_styles:
            sampled_styles.append(sampled_s)
        if len(sampled_styles) >= num_sample_style:
            break
    return sampled_styles
def select_clues(chunklist, doc, sample_probs, selected_answer, num_sample_clue=2,
                 clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0,
                 clue_dep_dist_max_val=20, max_sample_times=20):
    """Sample up to ``num_sample_clue`` clue chunks for a selected answer.

    Each chunk is weighted by ``p(c | a)`` keyed on
    ``"<pos>-<ner>_<dep_dist_bin>"``, where the dependency distance is the
    length of the dependency path from the answer start token to the clue
    start token (falling back to plain token offset when no path is found).

    Args:
        chunklist: candidate chunks; each chunk is a 5-tuple
            ``(ner_tag, pos_tag, leaves, start_token_idx, end_token_idx)``.
        doc: parsed sentence (token iterable, e.g. a spaCy ``Doc``).
        sample_probs: dict of probability tables; uses ``sample_probs["c|a"]``.
        selected_answer: 8-tuple as produced by ``select_answers``.
        num_sample_clue: maximum number of distinct clues to return.
        clue_dep_dist_bin_width / _min_val / _max_val: binning parameters
            passed to ``val2bin`` for the dependency distance.
        max_sample_times: cap on sampling attempts.

    Returns:
        List of dicts ``{"clue_text": str, "clue_binary_ids": list[int]}``
        where ``clue_binary_ids`` marks clue tokens with 1.
    """
    (answer_text, char_st, char_ed, st, ed,
     answer_bio_ids, answer_pos_tag, answer_ner_tag) = selected_answer
    doc_token_list = [token for token in doc]  # doc is sentence_doc
    idx2token, idx2related, context_tokens = get_all_related(doc, doc_token_list)

    # Build the weight vector p(c | a); unseen conditions (or clues starting
    # with a blacklisted token) get weight 1.
    c_probs = []
    for chunk in chunklist:
        chunk_ner_tag, chunk_pos_tag = chunk[0], chunk[1]
        c_tag = "-".join([chunk_pos_tag, chunk_ner_tag])
        answer_start = st
        clue_start, clue_end = chunk[3], chunk[4]
        # Fallback distance: raw token offset. Overwritten by the dependency
        # path length when the clue start is reachable from the answer start
        # (last matching path wins, as in the original scan).
        clue_answer_dep_path_len = abs(clue_start - answer_start)
        for tk_id, path in idx2related[answer_start]:
            if tk_id == clue_start:
                clue_answer_dep_path_len = len(path)
        dep_dist_bin = val2bin(clue_answer_dep_path_len, clue_dep_dist_min_val,
                               clue_dep_dist_max_val, clue_dep_dist_bin_width)
        c_condition = "_".join([c_tag, str(dep_dist_bin)])  # condition of p(c|...)
        if (c_condition in sample_probs["c|a"]
                and chunk[2][0].lower() not in NOT_BEGIN_TOKENS_FOR_ANSWER_CLUE):
            c_probs.append(sample_probs["c|a"][c_condition])
        else:
            c_probs.append(1)

    # Draw distinct chunk ids with replacement until enough or attempts spent.
    chunk_ids = list(range(len(chunklist)))
    sampled_clue_chunk_ids = []
    for _ in range(max_sample_times):
        sampled_chunk_id = weighted_sample(chunk_ids, c_probs)
        if sampled_chunk_id not in sampled_clue_chunk_ids:
            sampled_clue_chunk_ids.append(sampled_chunk_id)
        if len(sampled_clue_chunk_ids) >= num_sample_clue:
            break

    # Materialize each sampled chunk as text + binary token mask.
    sampled_clues = []
    for chunk_id in sampled_clue_chunk_ids:
        chunk = chunklist[chunk_id]
        clue_start, clue_end = chunk[3], chunk[4]
        clue_text = ' '.join(context_tokens[clue_start:clue_end + 1])
        clue_binary_ids = [0] * len(doc_token_list)
        clue_binary_ids[clue_start:clue_end + 1] = [1] * (clue_end - clue_start + 1)
        sampled_clues.append({"clue_text": clue_text,
                              "clue_binary_ids": clue_binary_ids})
    return sampled_clues
def select_answers(sentence, sample_probs, num_sample_answer=5, answer_length_bin_width=3,
                   answer_length_min_val=0, answer_length_max_val=30,
                   max_sample_times=20, original_answer=None):
    """Sample up to ``num_sample_answer`` answer chunks from a sentence.

    Chunks are extracted with ``get_chunks`` and weighted by ``p(a)`` via
    ``compute_answers_probs``. If ``original_answer`` is given, delegates to
    ``select_most_similar_answer`` instead of sampling.

    Args:
        sentence: raw sentence string.
        sample_probs: dict of probability tables used for weighting.
        num_sample_answer: maximum number of distinct answers to sample.
        answer_length_bin_width / _min_val / _max_val: binning parameters for
            answer length, forwarded to ``compute_answers_probs``.
        max_sample_times: cap on sampling attempts.
        original_answer: optional reference answer text to match against.

    Returns:
        Tuple ``(sampled_answers, chunklist, tree, doc)`` where each sampled
        answer is ``(answer_text, char_st, char_ed, st, ed, answer_bio_ids,
        chunk_pos_tag, chunk_ner_tag)`` — except on the ``original_answer``
        path, which returns whatever ``select_most_similar_answer`` returns.
    """
    # get all chunks
    chunklist, tree, doc = get_chunks(sentence)
    token2idx, idx2token = get_token2char(doc)

    # If the original answer was provided, choose the chunk of text most
    # similar to it instead of sampling.
    if original_answer is not None:
        return select_most_similar_answer(
            sample_probs, chunklist, tree, doc, token2idx, idx2token,
            sentence, original_answer
        )

    # Sample answer chunks: draw distinct ids with replacement, weighted by
    # p(a), until enough are collected or attempts run out.
    chunk_ids = list(range(len(chunklist)))
    a_probs = compute_answers_probs(
        sample_probs, chunklist, answer_length_bin_width,
        answer_length_min_val, answer_length_max_val
    )
    sampled_answer_chunk_ids = []
    for _ in range(max_sample_times):
        sampled_chunk_id = weighted_sample(chunk_ids, a_probs)
        if sampled_chunk_id not in sampled_answer_chunk_ids:
            sampled_answer_chunk_ids.append(sampled_chunk_id)
        if len(sampled_answer_chunk_ids) >= num_sample_answer:
            break

    # Resolve each sampled chunk to character offsets and BIO token tags.
    sampled_answers = []
    for chunk_id in sampled_answer_chunk_ids:
        chunk = chunklist[chunk_id]
        chunk_ner_tag, chunk_pos_tag, leaves, st, ed = chunk
        try:
            context = sentence
            char_st, char_ed = str_find(context, leaves)
            if char_st < 0:
                continue
            answer_text = context[char_st:char_ed + 1]
            st = idx2token[char_st]
            ed = idx2token[char_ed]
            answer_bio_ids = ['O'] * len(doc)
            answer_bio_ids[st:ed + 1] = ['I'] * (ed - st + 1)
            answer_bio_ids[st] = 'B'
            char_st = token2idx[st][0]
            char_ed = token2idx[ed][1]
            sampled_answers.append(
                (answer_text, char_st, char_ed, st, ed, answer_bio_ids,
                 chunk_pos_tag, chunk_ner_tag))
        except Exception:
            # Best-effort: skip chunks whose char/token alignment fails.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            continue
    return sampled_answers, chunklist, tree, doc
# Ad-hoc test/driver script for the sampling pipeline. Relies on `example`
# and the functions below being defined earlier in the file.
print(example)
print("... above tests get_answer_clue_style_info\n")

# test get_sample_probs
# NOTE(review): paths assume a specific local dataset layout — verify before running.
filename = "../../../../../Datasets/original/SQuAD1.1-Zhou/train.txt"
filetype = "squad"
save_dataset_info_file = "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_ans_clue_style_info.pkl"
save_sample_probs_file = "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_sample_probs.pkl"
# Estimate the sampling probability tables from the training data
# (debug mode: only `debug_length` examples are processed).
sample_probs = get_sample_probs(
    filename, filetype, save_dataset_info_file, save_sample_probs_file,
    sent_limit=100, ques_limit=50, answer_limit=30, is_clue_topN=20,
    debug=True, debug_length=20,
    answer_length_bin_width=3, answer_length_min_val=0, answer_length_max_val=30,
    clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0, clue_dep_dist_max_val=20)
print("... above tests get_sample_probs\n")
# NOTICE: we have run for non debug mode on squad. The files are saved to:
# "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_ans_clue_style_info_full_train.pkl"
# "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_sample_probs_full_train.pkl"
# use these files to plot figures.

# test weighted_sample: with equal weights the draws should look uniform.
result = [weighted_sample(["a", "b"], [0.5, 0.5]) for _ in range(30)]
print(result)
print("... above tests weighted_sample\n")

# test augment_qg_data: end-to-end answer/clue/style sampling on one sentence.
result = augment_qg_data("Bob is eating a delicious cake in Vancouver.", sample_probs)
print("sampled result is: ")
print(result)
print("... above tests augment_qg_data\n")
def select_answers(sentence, sample_probs, num_sample_answer=5, answer_length_bin_width=3,
                   answer_length_min_val=0, answer_length_max_val=30,
                   max_sample_times=20, original_answer=None):
    """Sample up to ``num_sample_answer`` answer chunks from a sentence.

    NOTE(review): this file contains two definitions of ``select_answers``;
    this one computes the answer weights inline and calls
    ``select_most_similar_answer`` WITHOUT ``sample_probs``, unlike the other
    definition. Confirm which version is intended and remove the duplicate.

    Args:
        sentence: raw sentence string.
        sample_probs: dict of probability tables; uses ``sample_probs["a"]``
            keyed on ``"<pos>-<ner>_<length_bin>"``.
        num_sample_answer: maximum number of distinct answers to sample.
        answer_length_bin_width / _min_val / _max_val: ``val2bin`` parameters
            for binning the answer token length.
        max_sample_times: cap on sampling attempts.
        original_answer: optional reference answer text to match against.

    Returns:
        Tuple ``(sampled_answers, chunklist, tree, doc)`` where each sampled
        answer is ``(answer_text, char_st, char_ed, st, ed, answer_bio_ids,
        chunk_pos_tag, chunk_ner_tag)``.
    """
    # get all chunks
    chunklist, tree, doc = get_chunks(sentence)
    token2idx, idx2token = get_token2char(doc)
    if original_answer is not None:
        return select_most_similar_answer(chunklist, tree, doc, token2idx,
                                          idx2token, sentence, original_answer)

    # Build the weight vector p(a); unseen conditions (or answers starting
    # with a blacklisted token) get weight 1.
    chunk_ids = list(range(len(chunklist)))
    a_probs = []
    for chunk in chunklist:
        chunk_ner_tag, chunk_pos_tag = chunk[0], chunk[1]
        a_tag = "-".join([chunk_pos_tag, chunk_ner_tag])
        a_length = abs(chunk[3] - chunk[4] + 1)
        a_length_bin = val2bin(a_length, answer_length_min_val,
                               answer_length_max_val, answer_length_bin_width)
        a_condition = "_".join([a_tag, str(a_length_bin)])  # condition of p(a|...)
        if (a_condition in sample_probs["a"]
                and chunk[2][0].lower() not in NOT_BEGIN_TOKENS_FOR_ANSWER_CLUE):
            a_probs.append(sample_probs["a"][a_condition])
        else:
            a_probs.append(1)

    # Draw distinct chunk ids with replacement until enough or attempts spent.
    sampled_answer_chunk_ids = []
    for _ in range(max_sample_times):
        sampled_chunk_id = weighted_sample(chunk_ids, a_probs)
        if sampled_chunk_id not in sampled_answer_chunk_ids:
            sampled_answer_chunk_ids.append(sampled_chunk_id)
        if len(sampled_answer_chunk_ids) >= num_sample_answer:
            break

    # Resolve each sampled chunk to character offsets and BIO token tags.
    sampled_answers = []
    for chunk_id in sampled_answer_chunk_ids:
        chunk = chunklist[chunk_id]
        chunk_ner_tag, chunk_pos_tag, leaves, st, ed = chunk
        try:
            context = sentence
            char_st, char_ed = str_find(context, leaves)
            if char_st < 0:
                continue
            answer_text = context[char_st:char_ed + 1]
            st = idx2token[char_st]
            ed = idx2token[char_ed]
            answer_bio_ids = ['O'] * len(doc)
            answer_bio_ids[st:ed + 1] = ['I'] * (ed - st + 1)
            answer_bio_ids[st] = 'B'
            char_st = token2idx[st][0]
            char_ed = token2idx[ed][1]
            sampled_answers.append(
                (answer_text, char_st, char_ed, st, ed, answer_bio_ids,
                 chunk_pos_tag, chunk_ner_tag))
        except Exception:
            # Best-effort: skip chunks whose char/token alignment fails.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            continue
    return sampled_answers, chunklist, tree, doc