Ejemplo n.º 1
0
def ans_stat(stat_ans_list, freq_threshold=8):
    """Build answer <-> index vocabularies from a list of annotations.

    Each entry's ``'multiple_choice_answer'`` is normalised with ``prep_ans``
    and counted. Answers that occur ``freq_threshold`` times or fewer are
    discarded; the survivors are numbered consecutively from 0 in dict
    iteration order (first-seen order on Python 3.7+).

    Args:
        stat_ans_list: iterable of mappings, each providing a
            ``'multiple_choice_answer'`` entry.
        freq_threshold: an answer is kept only when it occurs strictly more
            often than this value. Defaults to 8, the value that used to be
            hard-coded.

    Returns:
        tuple: ``(ans_to_ix, ix_to_ans)`` where ``ans_to_ix`` maps each kept
        answer to its index and ``ix_to_ans`` is the inverse mapping.
    """
    # Count how many times each normalised answer occurs.
    ans_freq_dict = {}
    for ans in stat_ans_list:
        ans_proc = prep_ans(ans['multiple_choice_answer'])
        ans_freq_dict[ans_proc] = ans_freq_dict.get(ans_proc, 0) + 1

    # Drop every answer whose count is <= freq_threshold.
    kept_answers = [
        ans for ans, freq in ans_freq_dict.items() if freq > freq_threshold
    ]

    ans_to_ix = {ans: ix for ix, ans in enumerate(kept_answers)}
    ix_to_ans = dict(enumerate(kept_answers))

    return ans_to_ix, ix_to_ans
Ejemplo n.º 2
0
def get_top_answers(examples, occurs=0):
    """
    Collect the answers that occur often enough to be kept in the answer vocabulary.
    Every item of each example's "mc_ans" is converted to a lower-cased string,
    normalised with ``prep_ans`` and counted. A short summary is printed to stdout.
    --------------------
    Arguments:
        examples (list): example dicts; each must provide an iterable under "mc_ans".
        occurs (int): threshold; only answers counted strictly more than "occurs" times are kept.
    Return:
        vocab_ans (list): the kept answers, in the order they were first counted.
    """
    counter = Counter()
    for ex in examples:
        for ans in ex["mc_ans"]:
            counter[prep_ans(str(ans).lower())] += 1

    frequent_answers = [item for item in counter.items() if item[1] > occurs]
    total_ans = sum(counter.values())
    total_freq_ans = sum(count for _, count in frequent_answers)
    # With no answers at all there is nothing to divide by; report 0% coverage
    # instead of raising ZeroDivisionError.
    coverage = total_freq_ans * 100.0 / total_ans if total_ans else 0.0

    print("Number of unique answers:", len(counter))
    print("Total number of answers:", total_ans)
    print("Top %i answers account for %f%%" % (len(frequent_answers), coverage))
    print("Sample frequent answers:")
    print("\n".join(map(str, frequent_answers[:20])))

    return [answer for answer, _ in frequent_answers]
Ejemplo n.º 3
0
def proc_ans_oe(ques, ans_to_ix):
    """Convert a question's scored answers into a dense score vector.

    Args:
        ques: mapping whose ``'ans_score'`` entry is a sequence of items where
            item[0] is the raw answer and item[1] its score.
        ans_to_ix: dict mapping normalised answers to vector positions.

    Returns:
        A float32 array of length ``len(ans_to_ix)``; positions of the listed
        answers hold their score, every other position is 0.

    Raises:
        KeyError: if a normalised answer is missing from ``ans_to_ix``.
    """
    scored_pairs = ques['ans_score']

    score_vec = np.zeros(len(ans_to_ix), np.float32)

    for pair in scored_pairs:
        key = prep_ans(pair[0])
        score_vec[ans_to_ix[key]] = pair[1]

    return score_vec
Ejemplo n.º 4
0
def filter_answers(examples, ans2idx):
    """
    Drop every scored answer whose normalised form is not in the answer set.
    The examples are modified in place and the same list is returned.
    --------------------
    Arguments:
        examples (list): example dicts; each "ans_score" entry is a list of groups,
            and each group is a list of items whose first element is the answer.
        ans2idx (dict): container of the accepted (normalised) answers.
    Return:
        examples (list): the same list, with each group reduced to the accepted answers.
    """
    def _is_known(pair):
        # An item survives only if its normalised answer is in the answer set.
        return prep_ans(pair[0]) in ans2idx

    for ex in examples:
        kept_groups = []
        for group in ex["ans_score"]:
            kept_groups.append([pair for pair in group if _is_known(pair)])
        ex["ans_score"] = kept_groups

    return examples
Ejemplo n.º 5
0
def proc_ans(ans, ans_to_ix):
    """Turn one annotation's answers into a dense score vector.

    Args:
        ans: mapping whose ``'answers'`` entry is an iterable of mappings,
            each with an ``'answer'`` entry.
        ans_to_ix: dict mapping normalised answers to vector positions.

    Returns:
        A float32 array of length ``len(ans_to_ix)``. For every normalised
        answer found in ``ans_to_ix`` its position holds ``get_score(count)``,
        where ``count`` is how often that answer was given; all other
        positions are 0.
    """
    score_vec = np.zeros(len(ans_to_ix), np.float32)

    # Tally how often each normalised answer was given.
    votes = {}
    for entry in ans['answers']:
        key = prep_ans(entry['answer'])
        votes[key] = votes.get(key, 0) + 1

    # Answers outside the vocabulary are ignored.
    for key, count in votes.items():
        if key in ans_to_ix:
            score_vec[ans_to_ix[key]] = get_score(count)

    return score_vec
Ejemplo n.º 6
0
    def proc_ans_and_abs(self, ans):
        """Build answer/abstraction targets and group masks for one annotation.

        Reads ``self.ans_to_ix``, ``self.abs_to_ix``, ``self.ans_to_abspath``
        and ``self.abs_tree``.

        Args:
            ans: mapping whose ``'answers'`` entry is an iterable of mappings,
                each with an ``'answer'`` entry.

        Returns:
            tuple: ``(ans_score, abs_score, (abs_group, ans_group))``.

            * ``ans_score``: float32 array over ``ans_to_ix``; each given
              answer that is in the vocabulary gets ``get_score(count)``.
            * ``abs_score``: float32 array over ``abs_to_ix``; set to 1.0 for
              every element of the most frequent answer's abstraction path
              except the first one.
            * ``abs_group`` / ``ans_group``: boolean masks marking the children
              (looked up in ``self.abs_tree``) of every node on that path.

            When there are no answers, the most frequent answer is not in the
            vocabulary, or its path is empty, ``abs_score`` stays all zero,
            ``abs_group`` is all False and ``ans_group`` is all True.
        """
        ans_to_ix = self.ans_to_ix
        abs_to_ix = self.abs_to_ix
        ans_score = np.zeros(len(ans_to_ix), np.float32)
        abs_score = np.zeros(len(abs_to_ix), np.float32)
        # The builtin ``bool`` is used as dtype because the ``np.bool`` alias
        # was removed in NumPy 1.24.
        ans_group = np.zeros(len(ans_to_ix), bool)
        abs_group = np.zeros(len(abs_to_ix), bool)

        # Process answers: count every normalised answer.
        ans_prob_dict = {}
        for ans_ in ans['answers']:
            ans_proc = prep_ans(ans_['answer'])
            ans_prob_dict[ans_proc] = ans_prob_dict.get(ans_proc, 0) + 1

        for ans_, count in ans_prob_dict.items():
            if ans_ in ans_to_ix:
                ans_score[ans_to_ix[ans_]] = get_score(count)

        # Process abstraction: follow the path of the most frequent answer.
        abspath = []
        if ans_prob_dict:  # an annotation without answers has no path to follow
            # max() returns the first maximal key, the same tie-break a stable
            # descending sort by count would give.
            ans_appear_most = max(ans_prob_dict, key=ans_prob_dict.get)
            if ans_appear_most in ans_to_ix:
                # presumably ordered from the top of the tree down -- confirm
                abspath = self.ans_to_abspath[ans_appear_most]

        if len(abspath) == 0:
            return ans_score, abs_score, (
                np.zeros(len(abs_to_ix), bool),
                np.ones(len(ans_to_ix), bool),
            )

        for abs_ in abspath[1:]:
            abs_score[abs_to_ix[abs_]] = 1.0

        # Select groups for computing losses: the children of each path node
        # are either all answers or all abstractions, decided by the first child.
        for x in abspath:
            children = self.abs_tree[x]
            if children[0] in ans_to_ix:
                ids = [ans_to_ix[a] for a in children]
                ans_group[ids] = True
            else:
                ids = [abs_to_ix[a] for a in children]
                abs_group[ids] = True

        return ans_score, abs_score, (abs_group, ans_group)
Ejemplo n.º 7
0
def proc_ans_mc(ques, ans_to_ix, token_to_ix):
    """Encode a multiple-choice question's answers into fixed-size arrays.

    Args:
        ques: mapping with
            ``'ans_score'`` -- sequence of items (item[0] raw answer,
            item[1] score),
            ``'mc'`` -- list of choice strings,
            ``'mc_ans'`` -- the ground-truth choice string.
        ans_to_ix: dict mapping answers to indices.
        token_to_ix: dict mapping word tokens to indices; must contain
            ``'UNK'`` if any choice has an unknown word.

    Returns:
        tuple: ``(ans_score, ans_label, ans_mc_ix, ans_ix)``.

        * ``ans_score``: float32 array over ``ans_to_ix`` holding the given
          scores.
        * ``ans_label``: float32 array of length 18, 1.0 at the choice that
          equals the ground truth (case-insensitive).
        * ``ans_mc_ix``: int array of shape (18, 4); row ``i`` holds the token
          indices of the first 4 words of choice ``i``, zero padded.
        * ``ans_ix``: int array of length 18; ``ans_to_ix`` index of each
          choice, -1 where the choice is not in ``ans_to_ix`` or absent.

    Raises:
        AssertionError: unless exactly one choice matches the ground truth.
        KeyError: if a normalised scored answer is missing from ``ans_to_ix``,
            or ``'UNK'`` is needed but missing from ``token_to_ix``.
    """
    num_choices = 18  # number of choice slots in every fixed-size output
    max_tokens = 4    # words kept per choice

    answers = ques['ans_score']
    # Only the first ``num_choices`` choices fit into the fixed-size outputs.
    mcs = ques['mc'][:num_choices]
    ans_gt = ques['mc_ans'].lower()

    ans_label = np.zeros(num_choices, np.float32)
    # Builtin ``int`` is used as dtype: the ``np.int`` alias was removed in
    # NumPy 1.24.
    ans_mc_ix = np.zeros((num_choices, max_tokens), int)
    ans_ix = np.full(num_choices, -1, dtype=int)

    for index, choice in enumerate(mcs):
        choice_lower = choice.lower()
        if choice_lower == ans_gt:
            ans_label[index] = 1.0

        words = re.sub(
            r"([.,'!?\"()*#:;])",
            '',
            choice_lower
        ).replace('-', ' ').replace('/', ' ').split()

        # Write token ``ix`` of choice ``index`` into its own cell; indexing
        # by ``ix`` alone would overwrite a whole row shared by all choices.
        for ix, word in enumerate(words[:max_tokens]):
            if word in token_to_ix:
                ans_mc_ix[index, ix] = token_to_ix[word]
            else:
                ans_mc_ix[index, ix] = token_to_ix['UNK']

        # Choices outside the answer vocabulary keep the -1 placeholder.
        ans_ix[index] = ans_to_ix.get(choice, -1)

    assert ans_label.sum() == 1.0, \
        'exactly one choice must match the ground-truth answer'

    ans_score = np.zeros(len(ans_to_ix), np.float32)
    for ans in answers:
        ans_proc = prep_ans(ans[0])
        ans_score[ans_to_ix[ans_proc]] = ans[1]

    return ans_score, ans_label, ans_mc_ix, ans_ix
def proc_ans(ans, ans_to_ix):
    '''
    Compute the score vector of one annotation's answers.

    :param ans: the input answer annotation; its 'answers' entry is iterated
        and each item's 'answer' entry is normalised with prep_ans
    :param ans_to_ix: {answer: index} mapping; the original note states it
        holds 3129 entries, the 3129 most frequent answers (not verifiable
        from this function)
    :return: ans_score: float32 array of length len(ans_to_ix) holding
        get_score(count) for every given answer found in ans_to_ix, 0 elsewhere
    '''
    ans_score = np.zeros(len(ans_to_ix), np.float32)

    # Dictionary of normalised answers -> number of occurrences.
    occurrences = {}
    for item in ans['answers']:
        normalized = prep_ans(item['answer'])
        occurrences.setdefault(normalized, 0)
        occurrences[normalized] += 1

    # Only answers present in the vocabulary receive a score.
    for known in (a for a in occurrences if a in ans_to_ix):
        ans_score[ans_to_ix[known]] = get_score(occurrences[known])

    return ans_score
Ejemplo n.º 9
0
def ans_stat(stat_ans_list, freq_threshold=8):
    """Create the answer vocabulary from annotated answers.

    The ``'multiple_choice_answer'`` of every entry is normalised with
    ``prep_ans`` and tallied. Only answers seen strictly more than
    ``freq_threshold`` times receive an index; indices start at 0 and follow
    dict iteration order (first-seen order on Python 3.7+).

    Args:
        stat_ans_list: iterable of mappings, each providing a
            ``'multiple_choice_answer'`` entry.
        freq_threshold: answers occurring this many times or fewer are
            dropped. Defaults to 8, the previously hard-coded cut-off.

    Returns:
        tuple: ``(ans_to_ix, ix_to_ans)`` -- answer-to-index dict and its
        inverse.
    """
    ans_freq_dict = {}
    for ans in stat_ans_list:
        ans_proc = prep_ans(ans['multiple_choice_answer'])
        ans_freq_dict[ans_proc] = ans_freq_dict.get(ans_proc, 0) + 1

    ans_to_ix = {}
    ix_to_ans = {}
    for ans_proc, freq in ans_freq_dict.items():
        if freq <= freq_threshold:
            continue  # too rare to be part of the vocabulary
        ix = len(ans_to_ix)
        ix_to_ans[ix] = ans_proc
        ans_to_ix[ans_proc] = ix

    return ans_to_ix, ix_to_ans