def ans_stat(stat_ans_list):
    """Build answer <-> index lookup tables from a list of annotations.

    Each entry's ``'multiple_choice_answer'`` is normalised with ``prep_ans``
    and counted. Answers that occur 8 times or fewer are discarded; the
    remaining ones are numbered consecutively from 0 in the order in which
    they were first seen.

    Args:
        stat_ans_list: Iterable of mappings that provide a
            ``'multiple_choice_answer'`` entry.

    Returns:
        A ``(ans_to_ix, ix_to_ans)`` tuple of dicts that are inverse to
        each other.
    """
    # Count how many times each normalised answer occurs.
    occurrences = {}
    for entry in stat_ans_list:
        key = prep_ans(entry['multiple_choice_answer'])
        occurrences[key] = occurrences.get(key, 0) + 1

    # Keep only the answers that occur more than 8 times.
    kept = [key for key, count in occurrences.items() if count > 8]

    ans_to_ix = {}
    ix_to_ans = {}
    for position, key in enumerate(kept):
        ix_to_ans[position] = key
        ans_to_ix[key] = position

    return ans_to_ix, ix_to_ans
def get_top_answers(examples, occurs=0):
    """
    Extract all of the correct answers in the dataset.
    Build the list of answers which appear more than the pre-defined
    "occurs" times, and print a short summary of the answer statistics.
    --------------------
    Arguments:
        examples (list): the json data loaded from disk. Every element of
            each example's "mc_ans" entry is counted as one answer
            (assumes "mc_ans" is a list of answers -- TODO confirm).
        occurs (int): a threshold that determines which answers are kept;
            an answer is kept when its count is strictly greater.
    Return:
        vocab_ans (list): the kept answers, normalised with prep_ans, in
            first-seen order.
    """
    counter = Counter(
        prep_ans(str(ans).lower())
        for ex in examples
        for ans in ex["mc_ans"]
    )

    frequent_answers = [
        (answer, count) for answer, count in counter.items() if count > occurs
    ]
    total_ans = sum(counter.values())
    total_freq_ans = sum(count for _, count in frequent_answers)

    # With no answers at all there is nothing to divide by; report 0%
    # coverage instead of raising ZeroDivisionError.
    coverage = total_freq_ans * 100.0 / total_ans if total_ans else 0.0

    print("Number of unique answers:", len(counter))
    print("Total number of answers:", total_ans)
    print("Top %i answers account for %f%%" % (len(frequent_answers), coverage))
    print("Sample frequent answers:")
    print("\n".join(map(str, frequent_answers[:20])))

    return [answer for answer, _ in frequent_answers]
def proc_ans_oe(ques, ans_to_ix):
    """Turn a question's scored answers into a dense score vector.

    Args:
        ques: Mapping whose ``'ans_score'`` entry is a sequence of items;
            item ``[0]`` is the answer text and item ``[1]`` its score.
        ans_to_ix: Dict mapping a normalised answer to its vector index.

    Returns:
        A float32 array of length ``len(ans_to_ix)`` that holds each
        answer's score at its index and 0 everywhere else.

    Raises:
        KeyError: If a normalised answer is missing from ``ans_to_ix``.
    """
    scored_answers = ques['ans_score']
    score_vec = np.zeros(len(ans_to_ix), dtype=np.float32)

    for pair in scored_answers:
        slot = ans_to_ix[prep_ans(pair[0])]
        score_vec[slot] = pair[1]

    return score_vec
def filter_answers(examples, ans2idx):
    """
    Drop the answers that are not part of our answer set.
    The examples are modified in place.
    --------------------
    Arguments:
        examples (list): the json data that contains all of the answers in
            the dataset. Each element of an example's "ans_score" entry is
            itself iterated, and item [0] of every inner entry is taken as
            the answer text.
        ans2idx (dict): the set of considered answers (normalised form).
    Return:
        examples (list): the same list, where "ans_score" now only holds
            entries whose normalised answer is in ans2idx.
    """
    for example in examples:
        kept_groups = []
        for group in example["ans_score"]:
            kept_groups.append(
                [entry for entry in group if prep_ans(entry[0]) in ans2idx]
            )
        example["ans_score"] = kept_groups
    return examples
def proc_ans(ans, ans_to_ix):
    """Convert the annotated answers of one question into a score vector.

    Every ``'answer'`` in ``ans['answers']`` is normalised with ``prep_ans``
    and counted. For each distinct answer present in ``ans_to_ix`` the
    count is passed through ``get_score`` and stored at the answer's index.

    Args:
        ans: Mapping with an ``'answers'`` list of mappings that each
            provide an ``'answer'`` entry.
        ans_to_ix: Dict mapping a normalised answer to its vector index.

    Returns:
        A float32 array of length ``len(ans_to_ix)``; positions of answers
        that were not given stay 0.
    """
    score_vec = np.zeros(len(ans_to_ix), np.float32)

    # Tally how often each normalised answer was given.
    votes = {}
    for item in ans['answers']:
        key = prep_ans(item['answer'])
        votes[key] = votes.get(key, 0) + 1

    # Answers outside the vocabulary are ignored.
    for key, n_votes in votes.items():
        if key not in ans_to_ix:
            continue
        score_vec[ans_to_ix[key]] = get_score(n_votes)

    return score_vec
def proc_ans_and_abs(self, ans):
    """Build answer/abstraction score vectors and the group masks.

    Reads ``self.ans_to_ix``, ``self.abs_to_ix``, ``self.ans_to_abspath``
    and ``self.abs_tree``.

    Args:
        ans: Mapping with an ``'answers'`` list of mappings that each
            provide an ``'answer'`` entry.

    Returns:
        ``(ans_score, abs_score, (abs_group, ans_group))`` where

        * ``ans_score`` is a float32 array over ``ans_to_ix`` holding
          ``get_score(count)`` for every given answer in the vocabulary,
        * ``abs_score`` is a float32 array over ``abs_to_ix`` with 1.0 at
          every element of the most frequent answer's abstraction path
          except the first,
        * ``abs_group`` / ``ans_group`` are boolean masks marking the
          children of every node on that path.

        When there are no answers, the most frequent answer is not in the
        vocabulary, or its path is empty, ``abs_group`` is all False and
        ``ans_group`` is all True.
    """
    ans_to_ix = self.ans_to_ix
    abs_to_ix = self.abs_to_ix
    n_ans = len(ans_to_ix)
    n_abs = len(abs_to_ix)

    ans_score = np.zeros(n_ans, np.float32)
    abs_score = np.zeros(n_abs, np.float32)

    # Process answers: count every normalised answer, then score the ones
    # that belong to the vocabulary.
    ans_prob_dict = {}
    for ans_ in ans['answers']:
        ans_proc = prep_ans(ans_['answer'])
        ans_prob_dict[ans_proc] = ans_prob_dict.get(ans_proc, 0) + 1

    for ans_, count in ans_prob_dict.items():
        if ans_ in ans_to_ix:
            ans_score[ans_to_ix[ans_]] = get_score(count)

    # Process abstraction. An empty answer list has no most frequent answer,
    # so it falls through to the default masks instead of raising IndexError.
    if ans_prob_dict:
        # max() returns the first key with the highest count, matching a
        # stable descending sort on the count.
        ans_appear_most = max(ans_prob_dict, key=ans_prob_dict.get)

        if ans_appear_most in ans_to_ix:
            # Path is ordered from top to down.
            abspath = self.ans_to_abspath[ans_appear_most]

            # Select groups for computing losses.
            if len(abspath) != 0:
                # Builtin bool replaces the np.bool alias, which was removed
                # in NumPy 1.24; the resulting dtype is the same.
                ans_group = np.zeros(n_ans, bool)
                abs_group = np.zeros(n_abs, bool)

                for abs_ in abspath[1:]:
                    abs_score[abs_to_ix[abs_]] = 1.0

                for node in abspath:
                    children = self.abs_tree[node]
                    # The first child decides whether this node's children
                    # are answers or abstractions.
                    if children[0] in ans_to_ix:
                        ids = [ans_to_ix[child] for child in children]
                        ans_group[ids] = True
                    else:
                        ids = [abs_to_ix[child] for child in children]
                        abs_group[ids] = True

                return ans_score, abs_score, (abs_group, ans_group)

    # Default: no abstraction group, every answer selected.
    return ans_score, abs_score, (np.zeros(n_abs, bool), np.ones(n_ans, bool))
def proc_ans_mc(ques, ans_to_ix, token_to_ix):
    """Encode the multiple-choice candidates and answer scores of a question.

    Args:
        ques: Mapping with ``'mc'`` (list of candidate strings),
            ``'mc_ans'`` (the ground-truth candidate string) and
            ``'ans_score'`` (sequence of items whose ``[0]`` is the answer
            text and ``[1]`` its score).
        ans_to_ix: Dict mapping an answer to its index.
        token_to_ix: Dict mapping a word to its token id; must contain
            ``'UNK'`` when a candidate holds an unknown word.

    Returns:
        ``(ans_score, ans_label, ans_mc_ix, ans_ix)`` where

        * ``ans_score`` is a float32 array over ``ans_to_ix`` holding each
          listed answer's score,
        * ``ans_label`` is a float32 array of length 18 with 1.0 at the
          candidate that equals the ground truth (case-insensitive),
        * ``ans_mc_ix`` is an ``(18, 4)`` int array; row ``i`` holds the
          token ids of the first 4 words of candidate ``i``, 0 elsewhere,
        * ``ans_ix`` is an int array of length 18 with the ``ans_to_ix``
          index of each candidate, or -1 when it is not in ``ans_to_ix``.

    Raises:
        AssertionError: If not exactly one candidate equals the ground truth.
        KeyError: If a normalised ``'ans_score'`` answer is missing from
            ``ans_to_ix``.
    """
    max_choices = 18
    max_tokens = 4

    answers = ques['ans_score']
    mcs = ques['mc']
    ans_gt = ques['mc_ans']

    ans_label = np.zeros(max_choices, np.float32)
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24;
    # the resulting dtype is the same.
    ans_mc_ix = np.zeros((max_choices, max_tokens), int)
    ans_ix = np.full(max_choices, -1, dtype=int)

    gt_lower = ans_gt.lower()
    for index, choice in enumerate(mcs):
        choice_lower = choice.lower()
        if choice_lower == gt_lower:
            ans_label[index] = 1.0

        if index >= max_choices:
            # The output arrays only have room for max_choices candidates;
            # surplus candidates are ignored.
            continue

        words = re.sub(
            r"([.,'!?\"()*#:;])", '', choice_lower
        ).replace('-', ' ').replace('/', ' ').split()

        # Write into this candidate's own row. Indexing by word position
        # alone overwrote whole rows shared by all candidates.
        for ix, word in enumerate(words[:max_tokens]):
            if word in token_to_ix:
                ans_mc_ix[index, ix] = token_to_ix[word]
            else:
                ans_mc_ix[index, ix] = token_to_ix['UNK']

        # Candidates outside the answer vocabulary keep the -1 marker.
        ans_ix[index] = ans_to_ix.get(choice, -1)

    assert sum(ans_label) == 1.0

    ans_score = np.zeros(len(ans_to_ix), np.float32)
    for ans in answers:
        ans_proc = prep_ans(ans[0])
        ans_score[ans_to_ix[ans_proc]] = ans[1]

    return ans_score, ans_label, ans_mc_ix, ans_ix
def proc_ans(ans, ans_to_ix):
    '''
    Score the annotated answers of one question.

    :param ans: the input answer record; its 'answers' list holds mappings
        that each provide an 'answer' entry
    :param ans_to_ix: {"ans": 'ix'} lookup; per the original author's note
        it holds 3129 entries, the 3129 most frequent answers
    :return: ans_score: float32 vector of answer scores, one slot per entry
        of ans_to_ix
    '''
    scores = np.zeros(len(ans_to_ix), np.float32)

    # Dictionary of answer -> number of times it was given.
    counts = {}
    for record in ans['answers']:
        normalized = prep_ans(record['answer'])
        counts.setdefault(normalized, 0)
        counts[normalized] += 1

    # Only answers that are part of the vocabulary receive a score.
    in_vocab = [name for name in counts if name in ans_to_ix]
    for name in in_vocab:
        scores[ans_to_ix[name]] = get_score(counts[name])

    return scores
def ans_stat(stat_ans_list):
    """Create the answer vocabulary from a list of annotations.

    The ``'multiple_choice_answer'`` of every entry is normalised with
    ``prep_ans`` and tallied. Only answers seen more than 8 times are kept;
    they receive consecutive indices starting at 0, in first-seen order.

    Args:
        stat_ans_list: Iterable of mappings that provide a
            ``'multiple_choice_answer'`` entry.

    Returns:
        ``(ans_to_ix, ix_to_ans)``: answer -> index and index -> answer.
    """
    tally = {}
    for record in stat_ans_list:
        label = prep_ans(record['multiple_choice_answer'])
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1

    # Answers with a count of 8 or less are left out of the vocabulary.
    frequent = (label for label, seen in tally.items() if not seen <= 8)

    ix_to_ans = dict(enumerate(frequent))
    ans_to_ix = {label: ix for ix, label in ix_to_ans.items()}
    return ans_to_ix, ix_to_ans