def select_question_types(sample_probs, selected_answer,
                          num_sample_style=2,
                          max_sample_times=20):
    """Sample up to ``num_sample_style`` distinct question types for an answer.

    Each style in ``QUESTION_TYPES`` is weighted by p(s|c,a), looked up in
    ``sample_probs["s|c,a"]`` under the key ``"<style>_<pos>-<ner>"`` built
    from the answer's POS and NER tags. Unseen (style, tag) conditions fall
    back to weight 1 so every style remains sampleable.

    :param sample_probs: dict of probability tables; must contain "s|c,a".
    :param selected_answer: 8-tuple as produced by ``select_answers``.
    :param num_sample_style: max number of distinct styles to return.
    :param max_sample_times: sampling budget before giving up.
    :return: list of sampled style names (may be shorter than requested).
    """
    (answer_text, char_st, char_ed, st, ed,
     answer_bio_ids, answer_pos_tag, answer_ner_tag) = selected_answer
    a_tag = "-".join([answer_pos_tag, answer_ner_tag])

    # Style weights: p(s|c,a) with a fallback of 1 for unseen conditions.
    styles = QUESTION_TYPES  # question types
    s_probs = [sample_probs["s|c,a"].get("_".join([s, a_tag]), 1)
               for s in styles]

    # Draw weighted samples, keeping distinct styles, until we have enough
    # or exhaust the budget.
    sampled_styles = []
    for _ in range(max_sample_times):
        sampled_s = weighted_sample(styles, s_probs)
        if sampled_s not in sampled_styles:
            sampled_styles.append(sampled_s)
        if len(sampled_styles) >= num_sample_style:
            break

    return sampled_styles
def select_clues(chunklist, doc, sample_probs, selected_answer,
                 num_sample_clue=2,
                 clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0, clue_dep_dist_max_val=20,
                 max_sample_times=20):
    """Sample up to ``num_sample_clue`` distinct clue chunks for an answer.

    Each chunk is weighted by p(c|a), looked up in ``sample_probs["c|a"]``
    under a key combining the chunk's POS/NER tag with the binned
    dependency distance from the chunk's start token to the answer's start
    token. Unseen conditions (or chunks starting with a forbidden token)
    fall back to weight 1.

    :param chunklist: chunks as (ner_tag, pos_tag, leaves, st, ed) tuples.
    :param doc: parsed sentence doc (token sequence).
    :param selected_answer: 8-tuple as produced by ``select_answers``.
    :return: list of dicts with ``clue_text`` and ``clue_binary_ids``
        (a 0/1 mask over sentence tokens marking the clue span).
    """
    (answer_text, char_st, char_ed, st, ed,
     answer_bio_ids, answer_pos_tag, answer_ner_tag) = selected_answer

    doc_token_list = list(doc)  # doc is sentence_doc
    idx2token, idx2related, context_tokens = get_all_related(doc, doc_token_list)

    # --- clue weights: p(c|a) per chunk --------------------------------
    answer_start = st
    # Loop-invariant: dependency relations of the answer's start token,
    # hoisted out of the per-chunk loop.
    answer_related = idx2related[answer_start]
    c_probs = []
    for chunk in chunklist:
        chunk_ner_tag, chunk_pos_tag = chunk[0], chunk[1]
        c_tag = "-".join([chunk_pos_tag, chunk_ner_tag])

        clue_start = chunk[3]
        # Default to linear token distance; replaced by the dependency
        # path length when the clue's start token appears in the answer's
        # related-token list.
        dep_dist = abs(clue_start - answer_start)
        for tk_id, path in answer_related:
            if tk_id == clue_start:
                dep_dist = len(path)

        dep_dist_bin = val2bin(dep_dist, clue_dep_dist_min_val,
                               clue_dep_dist_max_val, clue_dep_dist_bin_width)

        c_condition = "_".join([c_tag, str(dep_dist_bin)])  # condition of p(c|...)
        if (c_condition in sample_probs["c|a"]
                and chunk[2][0].lower() not in NOT_BEGIN_TOKENS_FOR_ANSWER_CLUE):
            c_probs.append(sample_probs["c|a"][c_condition])
        else:
            c_probs.append(1)

    # --- sample distinct clue chunks -----------------------------------
    chunk_ids = list(range(len(chunklist)))
    sampled_clue_chunk_ids = []
    for _ in range(max_sample_times):
        sampled_chunk_id = weighted_sample(chunk_ids, c_probs)
        if sampled_chunk_id not in sampled_clue_chunk_ids:
            sampled_clue_chunk_ids.append(sampled_chunk_id)
        if len(sampled_clue_chunk_ids) >= num_sample_clue:
            break

    # --- materialize sampled clues -------------------------------------
    sampled_clues = []
    for chunk_id in sampled_clue_chunk_ids:
        chunk = chunklist[chunk_id]
        clue_start, clue_end = chunk[3], chunk[4]
        clue_text = ' '.join(context_tokens[clue_start:clue_end + 1])
        clue_binary_ids = [0] * len(doc_token_list)
        clue_binary_ids[clue_start:clue_end + 1] = [1] * (clue_end - clue_start + 1)
        sampled_clues.append(
            {"clue_text": clue_text, "clue_binary_ids": clue_binary_ids})

    return sampled_clues
def select_answers(sentence, sample_probs,
                   num_sample_answer=5,
                   answer_length_bin_width=3, answer_length_min_val=0, answer_length_max_val=30,
                   max_sample_times=20, original_answer=None):
    """Sample up to ``num_sample_answer`` distinct answer chunks from a sentence.

    When ``original_answer`` is given, delegates to
    ``select_most_similar_answer`` (NOTE(review): that path returns
    whatever the helper returns, not the 4-tuple below — confirm callers
    handle both shapes).

    :param sentence: raw sentence string to chunk and sample from.
    :param sample_probs: dict of probability tables used for p(a).
    :return: ``(sampled_answers, chunklist, tree, doc)`` where each sampled
        answer is ``(answer_text, char_st, char_ed, st, ed, answer_bio_ids,
        pos_tag, ner_tag)``.
    """
    # get all chunks
    chunklist, tree, doc = get_chunks(sentence)
    token2idx, idx2token = get_token2char(doc)

    # If the original answer was provided, choose a chunk of text the most similar to it
    if original_answer is not None:
        return select_most_similar_answer(
            sample_probs, chunklist, tree, doc, token2idx, idx2token, sentence, original_answer
        )

    # sample answer chunk
    chunk_ids = list(range(len(chunklist)))
    a_probs = compute_answers_probs(
        sample_probs, chunklist, answer_length_bin_width, answer_length_min_val, answer_length_max_val
    )

    # Draw weighted samples, keeping distinct chunk ids, until we have
    # enough or exhaust the budget.
    sampled_answer_chunk_ids = []
    for _ in range(max_sample_times):
        sampled_chunk_id = weighted_sample(chunk_ids, a_probs)
        if sampled_chunk_id not in sampled_answer_chunk_ids:
            sampled_answer_chunk_ids.append(sampled_chunk_id)
        if len(sampled_answer_chunk_ids) >= num_sample_answer:
            break

    sampled_answers = []
    for chunk_id in sampled_answer_chunk_ids:
        chunk = chunklist[chunk_id]
        chunk_ner_tag, chunk_pos_tag, leaves, st, ed = chunk
        try:
            context = sentence
            char_st, char_ed = str_find(context, leaves)
            if char_st < 0:
                continue  # chunk text not found in the sentence
            answer_text = context[char_st:char_ed + 1]
            st = idx2token[char_st]
            ed = idx2token[char_ed]
            answer_bio_ids = ['O'] * len(doc)
            answer_bio_ids[st: ed + 1] = ['I'] * (ed - st + 1)
            answer_bio_ids[st] = 'B'
            # Snap char offsets to the selected tokens' boundaries.
            char_st = token2idx[st][0]
            char_ed = token2idx[ed][1]
            sampled_answers.append(
                (answer_text, char_st, char_ed, st, ed, answer_bio_ids, chunk_pos_tag, chunk_ner_tag))
        except Exception:
            # Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt. Keep the best-effort skip of unmappable
            # chunks, but only for ordinary exceptions.
            continue

    return sampled_answers, chunklist, tree, doc
    # NOTE(review): everything below is unreachable — it follows the
    # ``return`` of ``select_answers`` above and looks like a pasted-in
    # ``__main__`` test snippet whose enclosing definition is missing.
    # Kept verbatim; confirm intent and move it to a proper entry point
    # or remove it.
    print(example)
    print("... above tests get_answer_clue_style_info\n")

    # test get_sample_probs
    filename = "../../../../../Datasets/original/SQuAD1.1-Zhou/train.txt"
    filetype = "squad"
    save_dataset_info_file = "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_ans_clue_style_info.pkl"
    save_sample_probs_file = "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_sample_probs.pkl"
    sample_probs = get_sample_probs(
        filename, filetype, save_dataset_info_file, save_sample_probs_file,
        sent_limit=100, ques_limit=50, answer_limit=30, is_clue_topN=20,
        debug=True, debug_length=20,
        answer_length_bin_width=3, answer_length_min_val=0, answer_length_max_val=30,
        clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0, clue_dep_dist_max_val=20)
    print("... above tests get_sample_probs\n")
    # NOTICE: we have run for non debug mode on squad. The files are saved to:
    # "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_ans_clue_style_info_full_train.pkl"
    # "../../../../../Datasets/processed/SQuAD1.1-Zhou/squad_sample_probs_full_train.pkl"
    # use these files to plot figures.

    # test weighted_sample
    result = [weighted_sample(["a", "b"], [0.5, 0.5]) for _ in range(30)]
    print(result)
    print("... above tests weighted_sample\n")

    # test augment_qg_data
    result = augment_qg_data("Bob is eating a delicious cake in Vancouver.", sample_probs)
    print("sampled result is:    ")
    print(result)
    print("... above tests augment_qg_data\n")
# Example #5
# 0
def select_answers(sentence,
                   sample_probs,
                   num_sample_answer=5,
                   answer_length_bin_width=3,
                   answer_length_min_val=0,
                   answer_length_max_val=30,
                   max_sample_times=20,
                   original_answer=None):
    """Sample up to ``num_sample_answer`` distinct answer chunks from a sentence.

    Chunks are weighted by p(a), looked up in ``sample_probs["a"]`` under a
    key combining the chunk's POS/NER tag with its binned token length;
    unseen conditions (or chunks starting with a forbidden token) fall back
    to weight 1.

    :param sentence: raw sentence string to chunk and sample from.
    :param sample_probs: dict of probability tables; must contain "a".
    :return: ``(sampled_answers, chunklist, tree, doc)`` where each sampled
        answer is ``(answer_text, char_st, char_ed, st, ed, answer_bio_ids,
        pos_tag, ner_tag)``.
    """
    # get all chunks
    chunklist, tree, doc = get_chunks(sentence)
    token2idx, idx2token = get_token2char(doc)

    if original_answer is not None:
        return select_most_similar_answer(chunklist, tree, doc, token2idx,
                                          idx2token, sentence, original_answer)

    # --- answer weights: p(a) per chunk --------------------------------
    chunk_ids = list(range(len(chunklist)))
    a_probs = []
    for chunk in chunklist:
        chunk_ner_tag, chunk_pos_tag = chunk[0], chunk[1]
        a_tag = "-".join([chunk_pos_tag, chunk_ner_tag])
        # BUGFIX: token-span length is ed - st + 1. The original
        # ``abs(chunk[3] - chunk[4] + 1)`` under-counted every span longer
        # than one token (e.g. st=0, ed=4 gave 3 instead of 5; st=0, ed=1
        # gave 0 instead of 2), putting chunks in the wrong length bin.
        a_length = chunk[4] - chunk[3] + 1
        a_length_bin = val2bin(a_length, answer_length_min_val,
                               answer_length_max_val, answer_length_bin_width)
        a_condition = "_".join([a_tag,
                                str(a_length_bin)])  # condition of p(a|...)
        if (a_condition in sample_probs["a"]
                and chunk[2][0].lower() not in NOT_BEGIN_TOKENS_FOR_ANSWER_CLUE):
            a_probs.append(sample_probs["a"][a_condition])
        else:
            a_probs.append(1)

    # --- sample distinct answer chunks ---------------------------------
    sampled_answer_chunk_ids = []
    for _ in range(max_sample_times):
        sampled_chunk_id = weighted_sample(chunk_ids, a_probs)
        if sampled_chunk_id not in sampled_answer_chunk_ids:
            sampled_answer_chunk_ids.append(sampled_chunk_id)
        if len(sampled_answer_chunk_ids) >= num_sample_answer:
            break

    # --- materialize sampled answers -----------------------------------
    sampled_answers = []
    for chunk_id in sampled_answer_chunk_ids:
        chunk = chunklist[chunk_id]
        chunk_ner_tag, chunk_pos_tag, leaves, st, ed = chunk
        try:
            context = sentence
            char_st, char_ed = str_find(context, leaves)
            if char_st < 0:
                continue  # chunk text not found in the sentence
            answer_text = context[char_st:char_ed + 1]
            st = idx2token[char_st]
            ed = idx2token[char_ed]
            answer_bio_ids = ['O'] * len(doc)
            answer_bio_ids[st:ed + 1] = ['I'] * (ed - st + 1)
            answer_bio_ids[st] = 'B'
            # Snap char offsets to the selected tokens' boundaries.
            char_st = token2idx[st][0]
            char_ed = token2idx[ed][1]
            sampled_answers.append(
                (answer_text, char_st, char_ed, st, ed, answer_bio_ids,
                 chunk_pos_tag, chunk_ner_tag))
        except Exception:
            # Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt. Keep the best-effort skip of unmappable
            # chunks, but only for ordinary exceptions.
            continue

    return sampled_answers, chunklist, tree, doc