Esempio n. 1
0
    def proc_ans_tokens(self, ans, token_to_ix_ans, max_token):
        """Encode an answer as a fixed-length array of token indices.

        When ``self.__C.AUGMENTED_ANSWER`` is truthy, the text to tokenize is
        built from the question: it is lower-cased, the punctuation characters
        ``. , ' ! ? " ( ) * # : ;`` are removed, ``-`` and ``/`` become spaces,
        and every occurrence of ``ans['question_type']`` is replaced by
        ``ans['multiple_choice_answer']``. Otherwise the text is
        ``ans['multiple_choice_answer']`` alone. In both cases the text is
        passed through ``prep_ans`` and split on whitespace.

        Args:
            ans: Mapping with a ``'multiple_choice_answer'`` entry and, in
                augmented mode, ``'question'`` and ``'question_type'`` too.
            token_to_ix_ans: Mapping from token to integer index. Must hold
                an ``'UNK'`` entry if any word is missing from it.
            max_token: Length of the returned array; extra words are dropped.

        Returns:
            ``np.int64`` array of length ``max_token``. Positions beyond the
            last word keep the value 0.
        """
        ans_ix = np.zeros(max_token, np.int64)

        if self.__C.AUGMENTED_ANSWER:
            no_punct = re.sub(
                r"([.,'!?\"()*#:;])",
                '',
                ans['question'].lower()
            )
            spaced = no_punct.replace('-', ' ').replace('/', ' ')
            text = spaced.replace(ans['question_type'], ans['multiple_choice_answer'])
        else:
            text = ans['multiple_choice_answer']

        words = prep_ans(text).split()

        # Truncate by slicing rather than breaking after the write, so that
        # max_token == 0 yields an empty array instead of an IndexError.
        for ix, word in enumerate(words[:max_token]):
            if word in token_to_ix_ans:
                ans_ix[ix] = token_to_ix_ans[word]
            else:
                ans_ix[ix] = token_to_ix_ans['UNK']

        return ans_ix
Esempio n. 2
0
    def ans_stat(self, stat_ans_list, ans_freq):
        """Build answer <-> index vocabularies from frequent answers.

        Each entry's ``'multiple_choice_answer'`` is normalised with
        ``prep_ans`` (the result is a string and may contain several words)
        and counted. Answers whose count is ``<= ans_freq`` are discarded;
        the rest receive consecutive indices in order of first appearance.
        The total number of entries whose answer was discarded is printed.

        Example entry (only ``'multiple_choice_answer'`` is read here)::

            {
                "image_id": 393714,
                "question_id": 393714000,
                "question_type": "what color are the",
                "answer_type": "other",
                "multiple_choice_answer": "white",
                "answers": [
                    {"answer": "light blue", "answer_confidence": "yes", "answer_id": 1},
                    {"answer": "white", "answer_confidence": "yes", "answer_id": 2},
                    ...
                ]
            }

        Args:
            stat_ans_list: Iterable of answer entries as shown above.
            ans_freq: Frequency cutoff; an answer is kept only if it occurs
                strictly more than this many times.

        Returns:
            Tuple ``(ans_to_ix, ix_to_ans)`` of dicts mapping answer string
            to index and index to answer string.
        """
        # Count the normalised majority answer of every entry. The original
        # also normalised each annotator answer here, but never used the
        # result, so that per-entry work has been dropped.
        ans_freq_dict = {}
        for ans in stat_ans_list:
            ans_proc = prep_ans(ans['multiple_choice_answer'])
            ans_freq_dict[ans_proc] = ans_freq_dict.get(ans_proc, 0) + 1

        ans_to_ix = {}
        ix_to_ans = {}
        count_rejected = 0
        for ans_proc, freq in ans_freq_dict.items():
            if freq <= ans_freq:
                # Too rare: every entry with this answer becomes unanswerable.
                count_rejected += freq
                continue
            ix = len(ans_to_ix)
            ix_to_ans[ix] = ans_proc
            ans_to_ix[ans_proc] = ix

        print(' ========== Unanswerable questions:', count_rejected)

        return ans_to_ix, ix_to_ans
Esempio n. 3
0
    def proc_ans(self, ans, ans_to_ix):
        """Turn the annotator answers of one entry into a soft target vector.

        Every ``ans['answers'][i]['answer']`` is normalised with ``prep_ans``
        and tallied. For each distinct answer present in ``ans_to_ix`` the
        matching slot of the result is set to:

        * ``count / 10.`` when ``self.__C.LOSS_FUNC`` is ``'kld'``
          (presumably 10 is the number of annotators per question - confirm
          against the dataset);
        * ``self.get_score(count)`` otherwise.

        Answers absent from ``ans_to_ix`` are ignored.

        Args:
            ans: Mapping with an ``'answers'`` list of dicts, each holding
                an ``'answer'`` string.
            ans_to_ix: Mapping from normalised answer string to index.

        Returns:
            ``np.float32`` array of length ``len(ans_to_ix)``.
        """
        target = np.zeros(len(ans_to_ix), np.float32)

        # Tally how many annotators gave each normalised answer.
        votes = {}
        for entry in ans['answers']:
            key = prep_ans(entry['answer'])
            votes[key] = votes.get(key, 0) + 1

        use_kld = self.__C.LOSS_FUNC in ['kld']
        for key, n_votes in votes.items():
            if key not in ans_to_ix:
                continue
            if use_kld:
                target[ans_to_ix[key]] = n_votes / 10.
            else:
                target[ans_to_ix[key]] = self.get_score(n_votes)

        return target
Esempio n. 4
0
    def tokenize_ans(self, ans_list, use_glove):
        """Build the answer-token vocabulary and, optionally, its vectors.

        The vocabulary starts with the special tokens ``'PAD'`` (0),
        ``'UNK'`` (1) and ``'CLS'`` (2). Each entry's
        ``'multiple_choice_answer'`` is then normalised with ``prep_ans``
        and split on whitespace, and every unseen word gets the next free
        index.

        Args:
            ans_list: Iterable of mappings with a
                ``'multiple_choice_answer'`` entry.
            use_glove: When truthy, ``self.spacy_tool(token).vector`` is
                collected for every vocabulary token, in index order.

        Returns:
            Tuple ``(token_to_ix_ans, pretrained_emb_ans)``: the token ->
            index dict and an ``np.array`` of the collected vectors (built
            from an empty list when ``use_glove`` is falsy).
        """
        vocab = {'PAD': 0, 'UNK': 1, 'CLS': 2}

        vectors = []
        if use_glove:
            for special in ('PAD', 'UNK', 'CLS'):
                vectors.append(self.spacy_tool(special).vector)

        for record in ans_list:
            # Only the multiple-choice (most voted) answer contributes tokens.
            for token in prep_ans(record['multiple_choice_answer']).split():
                if token in vocab:
                    continue
                vocab[token] = len(vocab)
                if use_glove:
                    vectors.append(self.spacy_tool(token).vector)

        return vocab, np.array(vectors)
Esempio n. 5
0
    def ans_stat(self, stat_ans_dict):
        """Index every distinct normalised answer found in ``stat_ans_dict``.

        For each key of ``stat_ans_dict``, the ``'answer'`` field of its
        value is normalised with ``prep_ans``. Distinct results receive
        consecutive indices in order of first appearance.

        Args:
            stat_ans_dict: Mapping whose values are mappings with an
                ``'answer'`` entry.

        Returns:
            Tuple ``(ans_to_ix, ix_to_ans)`` of dicts mapping answer string
            to index and index to answer string.
        """
        answer_index = {}
        index_answer = {}

        for key in stat_ans_dict:
            label = prep_ans(stat_ans_dict[key]['answer'])
            if label in answer_index:
                continue
            position = len(answer_index)
            index_answer[position] = label
            answer_index[label] = position

        return answer_index, index_answer
Esempio n. 6
0
    def proc_ans_tokens(self, ans, token_to_ix_ans, max_token):
        """Encode the multiple-choice answer as a fixed-length index array.

        ``ans['multiple_choice_answer']`` is normalised with ``prep_ans``
        and split on whitespace. Each of the first ``max_token`` words is
        mapped through ``token_to_ix_ans``, falling back to the ``'UNK'``
        entry for words that are missing from it.

        Args:
            ans: Mapping with a ``'multiple_choice_answer'`` entry.
            token_to_ix_ans: Mapping from token to integer index. Must hold
                an ``'UNK'`` entry if any word is missing from it.
            max_token: Length of the returned array; extra words are dropped.

        Returns:
            ``np.int64`` array of length ``max_token``. Positions beyond the
            last word keep the value 0.
        """
        ans_ix = np.zeros(max_token, np.int64)

        words = prep_ans(ans['multiple_choice_answer']).split()

        # Truncate by slicing rather than breaking after the write, so that
        # max_token == 0 yields an empty array instead of an IndexError.
        for ix, word in enumerate(words[:max_token]):
            if word in token_to_ix_ans:
                ans_ix[ix] = token_to_ix_ans[word]
            else:
                ans_ix[ix] = token_to_ix_ans['UNK']

        return ans_ix
Esempio n. 7
0
    def proc_ans(self, ans, ans_to_ix):
        """Turn the annotator answers of one entry into a score vector.

        Every ``ans['answers'][i]['answer']`` is normalised with ``prep_ans``
        and tallied. For each distinct answer present in ``ans_to_ix`` the
        matching slot of the result is set to ``self.get_score(count)``;
        answers absent from ``ans_to_ix`` are ignored.

        Args:
            ans: Mapping with an ``'answers'`` list of dicts, each holding
                an ``'answer'`` string.
            ans_to_ix: Mapping from normalised answer string to index.

        Returns:
            ``np.float32`` array of length ``len(ans_to_ix)``.
        """
        scores = np.zeros(len(ans_to_ix), np.float32)

        # Tally how many annotators gave each normalised answer.
        tally = {}
        for item in ans['answers']:
            label = prep_ans(item['answer'])
            tally[label] = tally.get(label, 0) + 1

        for label, count in tally.items():
            if label not in ans_to_ix:
                continue
            scores[ans_to_ix[label]] = self.get_score(count)

        return scores
def ans_stat(stat_ans_list, ans_freq=8):
    """Build answer <-> index vocabularies from frequent answers.

    Each entry's ``'multiple_choice_answer'`` is normalised with
    ``prep_ans`` and counted. Answers whose count is ``<= ans_freq`` are
    discarded; the rest receive consecutive indices in order of first
    appearance.

    Args:
        stat_ans_list: Iterable of mappings with a
            ``'multiple_choice_answer'`` entry.
        ans_freq: Frequency cutoff; an answer is kept only if it occurs
            strictly more than this many times. Defaults to 8, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(ans_to_ix, ix_to_ans)`` of dicts mapping answer string to
        index and index to answer string.
    """
    ans_freq_dict = {}
    for ans in stat_ans_list:
        ans_proc = prep_ans(ans['multiple_choice_answer'])
        ans_freq_dict[ans_proc] = ans_freq_dict.get(ans_proc, 0) + 1

    ans_to_ix = {}
    ix_to_ans = {}
    for ans_proc, freq in ans_freq_dict.items():
        if freq <= ans_freq:
            # Too rare to be part of the answer vocabulary.
            continue
        ix = len(ans_to_ix)
        ix_to_ans[ix] = ans_proc
        ans_to_ix[ans_proc] = ix

    return ans_to_ix, ix_to_ans
Esempio n. 9
0
    def proc_ans(self, ans, ans_to_ix):
        """Return the index of a single answer as a one-element array.

        Args:
            ans: Raw answer; it is normalised with ``prep_ans`` before the
                lookup.
            ans_to_ix: Mapping from normalised answer to index. The lookup
                is a plain subscript, so a missing answer raises whatever
                the mapping raises (``KeyError`` for a dict).

        Returns:
            ``np.int64`` array of shape ``(1,)`` holding the answer's index.
        """
        label = np.zeros(1, np.int64)
        normalised = prep_ans(ans)
        label[0] = ans_to_ix[normalised]
        return label