def proc_ans_tokens(self, ans, token_to_ix_ans, max_token):
    """Encode an answer as a fixed-length vector of token indices.

    When ``self.__C.AUGMENTED_ANSWER`` is truthy, the tokenized text is the
    lower-cased question with punctuation removed, ``-`` and ``/`` turned
    into spaces, and every occurrence of ``ans['question_type']`` replaced
    by ``ans['multiple_choice_answer']``.  Otherwise only the
    multiple-choice answer is tokenized.

    Args:
        ans: Annotation mapping.  ``'multiple_choice_answer'`` is always
            read; ``'question'`` and ``'question_type'`` are read only in
            augmented mode.
        token_to_ix_ans: Mapping from token to integer index.  It must
            contain ``'UNK'`` whenever an unknown word can occur.
        max_token: Length of the returned vector; extra words are dropped.

    Returns:
        ``np.int64`` array of length ``max_token``.  Positions after the
        last encoded word keep the initial value 0.
    """
    ans_ix = np.zeros(max_token, np.int64)

    if self.__C.AUGMENTED_ANSWER:
        question = ans['question']
        augmented_ans = re.sub(
            r"([.,'!?\"()*#:;])", '', question.lower()
        ).replace('-', ' ').replace('/', ' ').replace(
            ans['question_type'], ans['multiple_choice_answer'])
        words = prep_ans(augmented_ans).split()
    else:
        words = prep_ans(ans['multiple_choice_answer']).split()

    # Truncate before writing: the previous "write, then check the limit"
    # loop indexed past the end of an empty vector when max_token was 0.
    for ix, word in enumerate(words[:max_token]):
        if word in token_to_ix_ans:
            ans_ix[ix] = token_to_ix_ans[word]
        else:
            ans_ix[ix] = token_to_ix_ans['UNK']

    return ans_ix
def ans_stat(self, stat_ans_list, ans_freq):
    """Build the answer vocabulary from a list of annotation records.

    Each record's ``'multiple_choice_answer'`` is normalized with
    ``prep_ans`` and counted as a whole string (it may contain several
    words).  Answers seen ``ans_freq`` times or fewer are dropped; the
    remaining ones are numbered in first-seen order.

    Example record::

        {
            "image_id": 393714,
            "question_id": 393714000,
            "question_type": "what color are the",
            "answer_type": "other",
            "multiple_choice_answer": "white",
            "answers": [
                {"answer": "light blue", "answer_confidence": "yes",
                 "answer_id": 1},
                {"answer": "white", "answer_confidence": "yes",
                 "answer_id": 2},
                ...
            ]
        }

    Args:
        stat_ans_list: Iterable of annotation mappings; only
            ``'multiple_choice_answer'`` is read.
        ans_freq: Frequency threshold.  An answer is kept only when its
            count is strictly greater than this value.

    Returns:
        ``(ans_to_ix, ix_to_ans)``: answer -> index and index -> answer.

    Side effects:
        Prints the total number of records whose answer was dropped.
    """
    # The per-record list of unique annotator answers used to be computed
    # here and then discarded; only the multiple-choice answer is counted.
    ans_freq_dict = {}
    for ans in stat_ans_list:
        ans_proc = prep_ans(ans['multiple_choice_answer'])
        ans_freq_dict[ans_proc] = ans_freq_dict.get(ans_proc, 0) + 1

    ans_to_ix = {}
    ix_to_ans = {}
    count_rejected = 0
    for answer, freq in ans_freq_dict.items():
        if freq <= ans_freq:
            # Every record carrying a dropped answer counts as rejected.
            count_rejected += freq
            continue
        ix = len(ans_to_ix)
        ix_to_ans[ix] = answer
        ans_to_ix[answer] = ix

    print(' ========== Unanswerable questions:', count_rejected)

    return ans_to_ix, ix_to_ans
def proc_ans(self, ans, ans_to_ix):
    """Turn the annotator answers of one question into a score vector.

    Every entry of ``ans['answers']`` is normalized with ``prep_ans`` and
    the occurrences of each normalized answer are counted.  For each
    answer present in ``ans_to_ix`` the score at its index is:

    * ``count / 10.`` when ``self.__C.LOSS_FUNC`` is ``'kld'``;
    * ``self.get_score(count)`` otherwise.

    Args:
        ans: Annotation mapping with an ``'answers'`` list whose items
            each carry an ``'answer'`` string.
        ans_to_ix: Mapping from normalized answer to vector index.

    Returns:
        ``np.float32`` array of length ``len(ans_to_ix)``; answers outside
        the vocabulary contribute nothing and their slots stay 0.
    """
    ans_score = np.zeros(len(ans_to_ix), np.float32)

    ans_prob_dict = {}
    for ans_ in ans['answers']:
        ans_proc = prep_ans(ans_['answer'])
        ans_prob_dict[ans_proc] = ans_prob_dict.get(ans_proc, 0) + 1

    # The loss type does not change per answer, so decide once.
    use_kld = self.__C.LOSS_FUNC in ['kld']
    for ans_proc, count in ans_prob_dict.items():
        if ans_proc not in ans_to_ix:
            continue
        if use_kld:
            # NOTE(review): the divisor presumably reflects ten annotator
            # answers per question -- confirm against the dataset.
            ans_score[ans_to_ix[ans_proc]] = count / 10.
        else:
            ans_score[ans_to_ix[ans_proc]] = self.get_score(count)

    return ans_score
def tokenize_ans(self, ans_list, use_glove):
    """Build the answer-token vocabulary and, optionally, its embeddings.

    The vocabulary starts with the special tokens ``PAD`` (0), ``UNK`` (1)
    and ``CLS`` (2).  Every word of each record's normalized
    ``'multiple_choice_answer'`` is then appended in first-seen order.

    Args:
        ans_list: Iterable of annotation mappings; only
            ``'multiple_choice_answer'`` is read.
        use_glove: When truthy, ``self.spacy_tool(token).vector`` is
            collected for every vocabulary entry, in index order.

    Returns:
        ``(token_to_ix_ans, pretrained_emb_ans)`` where the second item is
        ``np.array`` of the collected vectors (built from an empty list
        when ``use_glove`` is falsy).
    """
    token_to_ix_ans = {
        'PAD': 0,
        'UNK': 1,
        'CLS': 2,
    }

    pretrained_emb_ans = []
    if use_glove:
        # Seed the embedding rows in the same order as the special indices.
        for special in ('PAD', 'UNK', 'CLS'):
            pretrained_emb_ans.append(self.spacy_tool(special).vector)

    for ans in ans_list:
        # Only the multiple-choice (most voted) answer feeds the vocabulary.
        for word in prep_ans(ans['multiple_choice_answer']).split():
            if word in token_to_ix_ans:
                continue
            token_to_ix_ans[word] = len(token_to_ix_ans)
            if use_glove:
                pretrained_emb_ans.append(self.spacy_tool(word).vector)

    return token_to_ix_ans, np.array(pretrained_emb_ans)
def ans_stat(self, stat_ans_dict):
    """Number every distinct normalized answer in first-seen order.

    Args:
        stat_ans_dict: Mapping whose values each carry an ``'answer'``
            entry; keys are only used to reach the values.

    Returns:
        ``(ans_to_ix, ix_to_ans)``: answer -> index and index -> answer.
    """
    ans_to_ix = {}
    ix_to_ans = {}

    raw_answers = (stat_ans_dict[qid]['answer'] for qid in stat_ans_dict)
    for raw in raw_answers:
        answer = prep_ans(raw)
        if answer in ans_to_ix:
            continue
        next_ix = len(ans_to_ix)
        ix_to_ans[next_ix] = answer
        ans_to_ix[answer] = next_ix

    return ans_to_ix, ix_to_ans
def proc_ans_tokens(self, ans, token_to_ix_ans, max_token):
    """Encode the multiple-choice answer as a fixed-length index vector.

    Args:
        ans: Annotation mapping; ``'multiple_choice_answer'`` is read and
            normalized with ``prep_ans`` before being split on whitespace.
        token_to_ix_ans: Mapping from token to integer index.  It must
            contain ``'UNK'`` whenever an unknown word can occur.
        max_token: Length of the returned vector; extra words are dropped.

    Returns:
        ``np.int64`` array of length ``max_token``.  Positions after the
        last encoded word keep the initial value 0.
    """
    ans_ix = np.zeros(max_token, np.int64)
    words = prep_ans(ans['multiple_choice_answer']).split()

    # Truncate before writing: the previous "write, then check the limit"
    # loop indexed past the end of an empty vector when max_token was 0.
    for ix, word in enumerate(words[:max_token]):
        if word in token_to_ix_ans:
            ans_ix[ix] = token_to_ix_ans[word]
        else:
            ans_ix[ix] = token_to_ix_ans['UNK']

    return ans_ix
def proc_ans(self, ans, ans_to_ix):
    """Score each in-vocabulary answer by how often annotators gave it.

    Args:
        ans: Annotation mapping with an ``'answers'`` list whose items
            each carry an ``'answer'`` string.
        ans_to_ix: Mapping from normalized answer to vector index.

    Returns:
        ``np.float32`` array of length ``len(ans_to_ix)`` holding
        ``self.get_score(count)`` at the index of every counted answer
        found in ``ans_to_ix``; all other positions stay 0.
    """
    ans_score = np.zeros(len(ans_to_ix), np.float32)

    # Tally how many annotators gave each normalized answer.
    votes = {}
    for item in ans['answers']:
        key = prep_ans(item['answer'])
        votes[key] = votes.get(key, 0) + 1

    for key, count in votes.items():
        if key in ans_to_ix:
            ans_score[ans_to_ix[key]] = self.get_score(count)

    return ans_score
def ans_stat(stat_ans_list, ans_freq=8):
    """Build the answer vocabulary from a list of annotation records.

    Each record's ``'multiple_choice_answer'`` is normalized with
    ``prep_ans`` and counted.  Answers seen ``ans_freq`` times or fewer
    are dropped; the remaining ones are numbered in first-seen order.

    Args:
        stat_ans_list: Iterable of annotation mappings; only
            ``'multiple_choice_answer'`` is read.
        ans_freq: Frequency threshold.  An answer is kept only when its
            count is strictly greater than this value.  Defaults to 8,
            the cut-off that was previously hard-coded.

    Returns:
        ``(ans_to_ix, ix_to_ans)``: answer -> index and index -> answer.
    """
    ans_freq_dict = {}
    for ans in stat_ans_list:
        ans_proc = prep_ans(ans['multiple_choice_answer'])
        ans_freq_dict[ans_proc] = ans_freq_dict.get(ans_proc, 0) + 1

    ans_to_ix = {}
    ix_to_ans = {}
    for answer, freq in ans_freq_dict.items():
        if freq <= ans_freq:
            continue
        ix = len(ans_to_ix)
        ix_to_ans[ix] = answer
        ans_to_ix[answer] = ix

    return ans_to_ix, ix_to_ans
def proc_ans(self, ans, ans_to_ix):
    """Look up the vocabulary index of a single answer.

    Args:
        ans: Raw answer, normalized with ``prep_ans`` before the lookup.
        ans_to_ix: Mapping from normalized answer to integer index.

    Returns:
        One-element ``np.int64`` array holding the answer's index.

    Raises:
        KeyError: If the normalized answer is not in ``ans_to_ix``.
    """
    index = ans_to_ix[prep_ans(ans)]
    return np.array([index], dtype=np.int64)