Example #1
def get_dataset_info(filename,
                     filetype,
                     save_file=None,
                     sent_limit=100,
                     ques_limit=50,
                     answer_limit=30,
                     is_clue_topN=20,
                     debug=False,
                     debug_length=20):
    raw_examples = FQG_data.get_raw_examples(filename, filetype, debug,
                                             debug_length)
    examples_with_info = []
    for i, e in enumerate(raw_examples):
        sentence = e["ans_sent"]
        question = e["question"]
        answer = e["answer_text"]
        answer_start = e["answer_start"]
        new_e = get_answer_clue_style_info(sentence, question, answer,
                                           answer_start, sent_limit,
                                           ques_limit, answer_limit,
                                           is_clue_topN)
        examples_with_info.append(new_e)
        # print(new_e)  # debug
        if debug and i >= debug_length:
            break
    if save_file is None:
        save_file = filetype + "_answer_clue_style_info.pkl"
    save(save_file, examples_with_info)
    return examples_with_info
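The save and load helpers used throughout these examples are not shown on this page. Below is a minimal sketch of what they plausibly look like, assuming pickle-based serialization and the signatures implied by the call sites (save(filepath, obj, message=...) and load(filepath)); the project's actual implementation may differ.

import pickle

def save(filepath, obj, message=None):
    # Hypothetical helper: serialize obj to filepath with pickle,
    # optionally printing a progress message first.
    if message is not None:
        print(message)
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)

def load(filepath):
    # Hypothetical counterpart: deserialize an object from filepath.
    with open(filepath, "rb") as f:
        return pickle.load(f)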
Example #2
def sentences2augmented_sentences(input_path,
                                  output_path,
                                  start_index,
                                  end_index,
                                  sample_probs,
                                  num_sample_answer=5,
                                  num_sample_clue=2,
                                  num_sample_style=2,
                                  max_sample_times=20):
    augmented_sentences = []
    with codecs.open(input_path, "r", encoding='utf8') as infile:
        sentences = infile.readlines()
        assert start_index < end_index
        assert start_index < len(sentences)
        assert end_index <= len(sentences)
        print("Start augment data...")
        for i in range(start_index, end_index):
            print(i)
            s_split = sentences[i].rstrip().split("\t")
            pid = s_split[0]
            sid = s_split[1]
            s = s_split[2]
            # augmented_s = augment_qg_data(s)  # NOTICE: for FQG_data_augmentor_old
            augmented_s = augment_qg_data(s, sample_probs, num_sample_answer,
                                          num_sample_clue, num_sample_style,
                                          max_sample_times)
            augmented_s["pid"] = pid
            augmented_s["sid"] = sid
            augmented_sentences.append(augmented_s)
    save(output_path, augmented_sentences, "save augmented sentences...")
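The function above assumes each input line carries a paragraph id, a sentence id, and the sentence text, separated by tabs. A minimal self-contained illustration of that parsing (the field values are invented):

# One line of the expected input file: <pid>\t<sid>\t<sentence>
demo_line = "12\t3\tThe Eiffel Tower was completed in 1889.\n"
pid, sid, s = demo_line.rstrip().split("\t")
assert pid == "12" and sid == "3"
assert s == "The Eiffel Tower was completed in 1889."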
Example #3
def get_answertag2qtype_mapping(answertag2qtype_dict_file, data_file,
                                data_type):
    """
    Get the mapping between (answer_tags, potential question types).
    We either load a saved dictionary which we calculated and saved before,
    or we create such a dict by analyzing reference_file and save it for future usage.
    :param answertag2qtype_dict_file: we will save the result to this file.
    :param data_file: such as SQuAD data file. We use it to get the mapping.
    :param data_type: SQuAD or NewsQA. See get_raw_examples in FQG_data.py
    :return: a dict maps answer text tags (from the function get_answer_chunk_tags) to question types set.
    """
    examples = get_raw_examples(data_file, data_type)
    answertag2qtype = {}
    i = 0
    for e in examples:
        try:
            context_text = e["ans_sent"]
            answer_start = e["answer_start"]
            answer_text = e["answer_text"]
            answer_end = e["answer_start"] + len(answer_text) - 1
            question = e["question"]
            chunk_tag = get_answer_chunk_tag(context_text, answer_start,
                                             answer_end)
            ner_tag = get_answer_ner_tag(context_text, answer_text)
            answertag = "-".join([chunk_tag, ner_tag])
            qtype, qtype_id = get_question_type(question)
            if answertag in answertag2qtype:
                answertag2qtype[answertag].append(qtype)
            else:
                answertag2qtype[answertag] = [qtype]
        except Exception:
            # skip examples whose tags or question type cannot be extracted
            continue
        i += 1
        print(i)  # progress
        # if i > 20:
        #     break  # for debug

    answertag2qtype_set = {}
    answertag2qtype_counter = {}
    for k in answertag2qtype:
        answertag2qtype_set[k] = set(answertag2qtype[k])
        answertag2qtype_counter[k] = Counter(answertag2qtype[k])
    result = {
        "answertag2qtype": answertag2qtype,
        "answertag2qtype_set": answertag2qtype_set,
        "answertag2qtype_counter": answertag2qtype_counter
    }
    save(answertag2qtype_dict_file,
         result,
         message="save answertag2qtype dict")
    print(answertag2qtype_set)
    print(answertag2qtype_counter)
    return answertag2qtype_set
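The list-to-set and list-to-Counter aggregation at the end of the function is easy to verify in isolation. A minimal sketch with an invented answer tag and question types:

from collections import Counter

answertag2qtype = {"NP-PERSON": ["Who", "Who", "What"]}  # invented example data
answertag2qtype_set = {k: set(v) for k, v in answertag2qtype.items()}
answertag2qtype_counter = {k: Counter(v) for k, v in answertag2qtype.items()}
print(answertag2qtype_set)      # {'NP-PERSON': {'Who', 'What'}} (set order may vary)
print(answertag2qtype_counter)  # {'NP-PERSON': Counter({'Who': 2, 'What': 1})}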
Example #4
def get_sample_probs(filename, filetype, save_dataset_info_file=None, save_sample_probs_file=None,
                     sent_limit=100, ques_limit=50, answer_limit=30, is_clue_topN=20,
                     debug=False, debug_length=20,
                     answer_length_bin_width=3, answer_length_min_val=0, answer_length_max_val=30,
                     clue_dep_dist_bin_width=2, clue_dep_dist_min_val=0,
                     clue_dep_dist_max_val=100):  # NOTE: a small max (e.g., 20) can inflate the probability for clue_dep_dist > 20, so use a large value such as 100
    """
    P(a, c, s) = p(a) * p(c|a) * p(s|c, a)
               = p(a|a_tag, a_length) * p(c|c_tag, dep_dist) * p(s|a_tag)
    """
    examples_with_info = get_dataset_info(
        filename, filetype, save_dataset_info_file,
        sent_limit, ques_limit, answer_limit, is_clue_topN,
        debug, debug_length)

    sla_tag = []  # for p(s|a_tag).  here we use "l" to denote "|"
    clc_tag_dep_dist = []  # for p(c|c_tag, dep_dist).  here we use "l" to denote "|"
    ala_tag_a_length = []  # for p(a|a_tag, a_length).  here we use "l" to denote "|"

    print(f"\n[DEBUG] in get_sample_probs,  examples: {len(examples_with_info)}")
    utu = 0  # progress counter
    for e in examples_with_info:
        utu += 1
        if utu <= 10 or (utu <= 100 and utu % 10 == 0) or (utu <= 1000 and utu % 100 == 0) or (utu % 1000 == 0):
            print(f"[DEBUG] {utu}/{len(examples_with_info)}")
        a_tag = "-".join([e["answer_pos_tag"], e["answer_ner_tag"]])  # answer tag
        s = e["question_type"][0]  # question style (type)
        a_length = e["answer_length"]
        a_length_bin = val2bin(a_length, answer_length_min_val, answer_length_max_val, answer_length_bin_width)
        c_tag = "-".join([e["clue_pos_tag"], e["clue_ner_tag"]])
        dep_dist = e["clue_answer_dep_path_len"]
        dep_dist_bin = val2bin(dep_dist, clue_dep_dist_min_val, clue_dep_dist_max_val, clue_dep_dist_bin_width)

        sla_tag.append("_".join([s, a_tag]))
        clc_tag_dep_dist.append("_".join([c_tag, str(dep_dist_bin)]))
        ala_tag_a_length.append("_".join([a_tag, str(a_length_bin)]))
    sla_tag = Counter(sla_tag)
    clc_tag_dep_dist = Counter(clc_tag_dep_dist)
    ala_tag_a_length = Counter(ala_tag_a_length)
    sample_probs = {
        "a": ala_tag_a_length,
        "c|a": clc_tag_dep_dist,
        "s|c,a": sla_tag}
    if save_sample_probs_file is None:
        save_sample_probs_file = filetype + "_sample_probs.pkl"
    save(save_sample_probs_file, sample_probs)
    print("\n[DEBUG] return probs")
    return sample_probs
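val2bin is called above but not defined on this page. Given how it is used (value, minimum, maximum, bin width), a plausible implementation clamps the value into the allowed range and maps it to a bin index; this is an assumption, not the project's actual code:

def val2bin(val, min_val, max_val, bin_width):
    # Hypothetical binning helper: clamp val into [min_val, max_val],
    # then bucket it by bin_width so nearby values share a bin index.
    val = max(min_val, min(val, max_val))
    return int((val - min_val) // bin_width)

# e.g., with answer_length_bin_width=3: lengths 0-2 -> bin 0, 3-5 -> bin 1, ...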
Example #5
def prepro(config, augmented_sentences_pkl_file,
           processed_augmented_sentences_pkl_file):
    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get train spacy processed examples and counters
    examples = load(augmented_sentences_pkl_file)
    examples = get_spacy_processed_examples(config,
                                            examples,
                                            debug,
                                            debug_length,
                                            shuffle=False)

    # get emb_mats and emb_dicts
    emb_dicts = load(config.emb_dicts_file)

    # get featured examples
    examples = get_featured_examples(config, examples, emb_dicts)
    save(processed_augmented_sentences_pkl_file,
         examples,
         message="processed_augmented_sentences_pkl_file")
Example #6
def prepro(config):
    emb_tags = config.emb_config.keys()
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}

    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get train spacy processed examples and counters
    if not config.processed_by_spacy and not config.processed_example_features:
        train_examples = get_raw_examples(config.train_file, config.data_type,
                                          debug, debug_length)
        train_examples, train_meta, train_eval = get_spacy_processed_examples(
            config, train_examples, debug, debug_length, shuffle=False)

        dev_examples = get_raw_examples(config.dev_file, config.data_type,
                                        debug, debug_length)
        dev_examples, dev_meta, dev_eval = get_spacy_processed_examples(
            config, dev_examples, debug, debug_length, shuffle=False)

        test_examples = get_raw_examples(config.test_file, config.data_type,
                                         debug, debug_length)
        test_examples, test_meta, test_eval = get_spacy_processed_examples(
            config, test_examples, debug, debug_length, shuffle=False)

        counters = get_updated_counters_by_examples(config,
                                                    None,
                                                    train_examples,
                                                    increment=1,
                                                    init=True,
                                                    finish=True)
        # only use train data
        final_counters = copy.deepcopy(counters)

        save(config.train_examples_file,
             train_examples,
             message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
        save(config.counters_file, final_counters, message="counters")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)

        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)

        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)

        final_counters = load(config.counters_file)
        counters = final_counters

    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                final_counters[tag],
                tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))

    # get related_words_dict and related_words_ids_mat
    if not config.processed_related_words:
        related_words_dict = get_related_words_dict(
            list(emb_dicts["word"].keys()), config.max_topN)
        related_words_ids_mat = get_related_words_ids_mat_with_related_words_dict(
            emb_dicts["word"], config.max_topN, related_words_dict)
        save(config.related_words_dict_file,
             related_words_dict,
             message="related words dict")
        save(config.related_words_ids_mat_file,
             related_words_ids_mat,
             message="related words ids mat")
    else:
        related_words_dict = load(config.related_words_dict_file)
        related_words_ids_mat = load(config.related_words_ids_mat_file)

    # get featured examples
    # TODO: handle potential insert SOS EOS problem when extracting tag features
    if not config.processed_example_features:
        train_examples, train_meta = get_featured_examples(
            config, train_examples, train_meta, "train", emb_dicts,
            related_words_ids_mat, related_words_dict)
        dev_examples, dev_meta = get_featured_examples(config, dev_examples,
                                                       dev_meta, "dev",
                                                       emb_dicts,
                                                       related_words_ids_mat,
                                                       related_words_dict)
        test_examples, test_meta = get_featured_examples(
            config, test_examples, test_meta, "test", emb_dicts,
            related_words_ids_mat, related_words_dict)

        save(config.train_examples_file,
             train_examples,
             message="train examples")
        save(config.dev_examples_file, dev_examples, message="dev examples")
        save(config.test_examples_file, test_examples, message="test examples")
        save(config.train_meta_file, train_meta, message="train meta")
        save(config.dev_meta_file, dev_meta, message="dev meta")
        save(config.test_meta_file, test_meta, message="test meta")
        save(config.train_eval_file, train_eval, message="train eval")
        save(config.dev_eval_file, dev_eval, message="dev eval")
        save(config.test_eval_file, test_eval, message="test eval")
    else:
        train_examples = load(config.train_examples_file)
        train_meta = load(config.train_meta_file)
        train_eval = load(config.train_eval_file)
        dev_examples = load(config.dev_examples_file)
        dev_meta = load(config.dev_meta_file)
        dev_eval = load(config.dev_eval_file)
        test_examples = load(config.test_examples_file)
        test_meta = load(config.test_meta_file)
        test_eval = load(config.test_eval_file)

Example #7
def prepro(config):
    emb_tags = config.emb_tags
    emb_config = config.emb_config
    emb_mats = {}
    emb_dicts = {}

    debug = config.debug
    debug_length = config.debug_batchnum * config.batch_size

    # get examples and counters
    if not config.processed_example_features:
        examples = get_raw_examples(config, config.train_file, debug,
                                    debug_length)
        examples = get_featured_examples(config, examples)
        counters = get_counters(examples, config.emb_tags,
                                config.emb_not_count_tags)

        save(config.train_examples_file, (examples, 0), message="examples")
        save(config.counters_file, counters, message="counters")
    else:
        examples, num_relations = load(config.train_examples_file)
        counters = load(config.counters_file)

    # get emb_mats and emb_dicts
    if not config.processed_emb:
        for tag in emb_tags:
            emb_mats[tag], emb_dicts[tag] = get_embedding(
                counters[tag],
                tag,
                emb_file=emb_config[tag]["emb_file"],
                size=emb_config[tag]["emb_size"],
                vec_size=emb_config[tag]["emb_dim"])
        save(config.emb_mats_file, emb_mats, message="embedding mats")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")
    else:
        emb_mats = load(config.emb_mats_file)
        emb_dicts = load(config.emb_dicts_file)
    for k in emb_dicts:
        print("Embedding dict length: " + k + " " + str(len(emb_dicts[k])))

    if not config.processed_example_graph_features:
        # NOTICE: we should set update_edge_types2ids = True only for train dataset
        #if config.processed_emb and "edge_types" in emb_dicts:
        #    edge_types2ids = emb_dicts["edge_types"]
        #else:
        edge_types2ids = {}
        examples, num_relations, edge_types2ids = get_graph_examples(
            config,
            examples,
            config.edge_types_list,
            emb_dicts,
            edge_types2ids,
            update_edge_types2ids=True)
        emb_dicts["edge_types"] = edge_types2ids
        save(config.train_examples_file, (examples, num_relations),
             message="examples")
        save(config.emb_dicts_file, emb_dicts, message="embedding dicts")

    # print to txt to debug
    for k in emb_dicts:
        write_dict(emb_dicts[k],
                   OUTPUT_PATH + "debug/emb_dicts_" + str(k) + ".txt")
    for k in counters:
        write_counter(counters[k],
                      OUTPUT_PATH + "debug/counters_" + str(k) + ".txt")
    write_example(examples[5], OUTPUT_PATH + "debug/example.txt")
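The debug block above writes dicts and counters to text files via write_dict and write_counter, which are not shown on this page. A plausible sketch of both helpers, assuming one tab-separated entry per line (the signatures are inferred from the call sites):

import codecs

def write_dict(d, filepath):
    # Hypothetical debug helper: dump key<TAB>value pairs, one per line.
    with codecs.open(filepath, "w", encoding="utf8") as f:
        for k, v in d.items():
            f.write("{}\t{}\n".format(k, v))

def write_counter(counter, filepath):
    # Hypothetical counterpart for Counter objects, most common entries first.
    with codecs.open(filepath, "w", encoding="utf8") as f:
        for k, c in counter.most_common():
            f.write("{}\t{}\n".format(k, c))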