Example #1
import json

from tqdm import tqdm

# quick_clean and save are project utilities not shown on this page; a
# minimal sketch of them follows this example.
def exp2_transformer(in_file, out_file, logger):
    new_data = {}
    new_data["experiment"] = 2
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
        new_data["version"] = source["version"]
        new_data["data"] = []
        logger.info("Creating all context list")
        for topic_id, topic in tqdm(enumerate(source["data"])):
            context_buffer = 0
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["qas"] = []
            topic_contexts = []
            # topic_contexts = [para["context"] for para in topic["paragraphs"]]
            num_pars = 0

            for para in topic["paragraphs"]:

                if num_pars >= 10:
                    break

                num_pars += 1
                topic_contexts.append(para['context'])
                for qas in para['qas']:
                    counter += 1
                    qas_dict = {}
                    qas_dict["topic_id"] = topic_id
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = (
                                context_buffer + answer["answer_start"])
                            answer_dict["text"] = answer["text"]

                            qas_dict["answers"].append(answer_dict)
                    topic_dict["qas"].append(qas_dict)
                context_buffer += len(para['context']) + 1

            topic_contexts = " ".join(topic_contexts)
            topic_dict["context"] = topic_contexts

            new_data["data"].append(topic_dict)

    logger.info(f"Processed {counter} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)
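
These snippets lean on a handful of project utilities (quick_clean, save, fancyprint) that are not shown on this page. A minimal sketch of what they would have to look like for the examples to run, assuming quick_clean is a light, length-preserving cleanup and save is a plain JSON dump; the real helpers may well differ:

import json


def quick_clean(raw_str):
    # assumption: light normalization; kept length-preserving here because
    # exp2_transformer's answer offsets rely on len(clean) == len(raw)
    return raw_str


def save(filename, obj, message=None):
    # assumption: plain JSON dump with an optional progress message
    if message is not None:
        print(f"Saving {message}")
    with open(filename, "w") as fh:
        json.dump(obj, fh)


def fancyprint(in_str):
    # assumption: decorated console output
    print(f"=== {in_str} ===")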
Example #2
def exp2_transformer(in_file, out_file):
    """
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = 2
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into experiment 2 format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            # merge the contexts within each topic into a giant string and
            # save it as topic_context above the paragraphs; the answer
            # offsets below assume quick_clean preserves string length
            topic_dict["topic_context"] = "".join([
                quick_clean(raw_str=para["context"])
                for para in topic["paragraphs"]
            ])
            context_buffer = 0
            topic_dict["qas"] = []
            for para in topic["paragraphs"]:
                for qas in para['qas']:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            # update the answer start index
                            answer_dict["answer_start"] = answer[
                                "answer_start"] + context_buffer
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    topic_dict["qas"].append(qas_dict)
                context_buffer += len(para["context"])
            new_data["data"].append(topic_dict)

    save(filename=out_file, obj=new_data, message="saving experiment 2 data")
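
The context_buffer bookkeeping above is the heart of the transform: once paragraph contexts are concatenated into one topic-wide string, every answer_start inside paragraph k must be shifted by the combined length of paragraphs 0..k-1. A tiny hand-checkable illustration of that arithmetic (the data here is made up for the demo):

paragraphs = [
    {"context": "Cats sleep a lot.",
     "qas": [{"answers": [{"answer_start": 0, "text": "Cats"}]}]},
    {"context": "Dogs bark loudly.",
     "qas": [{"answers": [{"answer_start": 0, "text": "Dogs"}]}]},
]

merged = "".join(p["context"] for p in paragraphs)

context_buffer = 0
shifted = []
for p in paragraphs:
    for qa in p["qas"]:
        for ans in qa["answers"]:
            start = context_buffer + ans["answer_start"]
            # the shifted offset still points at the answer text
            assert merged[start:start + len(ans["text"])] == ans["text"]
            shifted.append((start, ans["text"]))
    context_buffer += len(p["context"])

print(shifted)  # [(0, 'Cats'), (17, 'Dogs')]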
Example #3
def exp_1_transformer(in_file, out_file, logger):
    new_data = {}
    new_data["experiment"] = 1
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
        new_data["version"] = source["version"]
        new_data["data"] = []
        logger.info("Creating all context list")
        all_contexts = [
            para["context"] for topic in source["data"]
            for para in topic["paragraphs"]
        ]
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"], context_buffer = get_new_context(
                    orig_context=para["context"], all_contexts=all_contexts)
                paragraph["qas"] = []
                for qas in para['qas']:
                    counter += 1
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = (
                                context_buffer + answer["answer_start"])
                            answer_dict["text"] = answer["text"]

                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)
            new_data["data"].append(topic_dict)

    logger.info(f"Processed {counter} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)
Example #4
def toy_transformer(in_file, train_file, dev_file, test_file, train_topic_num,
                    dev_topic_num, test_topic_num, logger):
    """
    distill original data into at most 15 topics, with each having at most 5 paragraphs,
    each of which has 5 questions and 5 answers
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = "toy"
    new_dev_data = {}
    new_dev_data['experiment'] = "toy_dev"
    new_test_data = {}
    new_test_data['experiment'] = "toy_test"
    with open(in_file, "r") as fh:
        logger.info(f"Importing: {in_file}")
        source = json.load(fh)
        logger.info("Converting into toy format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        new_dev_data["version"] = source["version"]
        new_dev_data["data"] = []
        new_test_data["version"] = source["version"]
        new_test_data["data"] = []
        topic_counter = train_topic_num
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"] = para["context"]
                paragraph["qas"] = []
                for qas in para['qas']:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = answer[
                                "answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)

            # strictly greater-than so each split gets exactly its topic count
            if topic_counter > 0:
                new_data["data"].append(topic_dict)
            elif topic_counter > -1 * dev_topic_num:
                new_dev_data["data"].append(topic_dict)
            elif topic_counter > -1 * (dev_topic_num + test_topic_num):
                new_test_data["data"].append(topic_dict)
            else:
                break

            topic_counter -= 1

    logger.info(f"Saving new data to {train_file}")
    save(filename=train_file, obj=new_data)
    logger.info(f"Saving new dev data to {dev_file}")
    save(filename=dev_file, obj=new_dev_data)
    logger.info(f"Saving new test data to {test_file}")
    save(filename=test_file, obj=new_test_data)
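
A usage sketch, assuming a SQuAD-style input file and made-up output paths; with these arguments the first 10 topics land in train, the next 2 in dev, and the next 2 in test:

import logging

logging.basicConfig(level=logging.INFO)
toy_transformer(in_file="data/train-v2.0.json",  # hypothetical paths
                train_file="data/toy-train.json",
                dev_file="data/toy-dev.json",
                test_file="data/toy-test.json",
                train_topic_num=10,
                dev_topic_num=2,
                test_topic_num=2,
                logger=logging.getLogger("toy"))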
Example #5
def exp3_transformer(in_file_1, in_file_2, train_out_file, test_out_file,
                     logger):
    """
    convert data into (question, topic_id, topic title) format
    args:
        - in_file: the file name of the data to be transformed to experiment 3 format
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    train_new_data = {}
    train_new_data["experiment"] = "exp3"
    test_new_data = {}
    test_new_data["experiment"] = "exp3"

    q_count = 0

    with open(in_file_1, "r") as fh:
        logger.info(f"Importing: {in_file_1}")
        source = json.load(fh)
        train_new_data["version"] = source["version"]
        train_new_data["data"] = []
        test_new_data["version"] = source["version"]
        test_new_data["data"] = []
        for topic_id, topic in tqdm(enumerate(source["data"])):
            test_count = 0
            for para in topic["paragraphs"]:
                for qas in para['qas']:
                    if test_count < 5:
                        test_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                        test_count += 1
                    else:
                        train_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                    q_count += 1
        topic_id = len(source['data'])  # continue ids where the first file left off

    with open(in_file_2, "r") as fh:
        logger.info(f"Importing: {in_file_2}")
        source = json.load(fh)
        for topic in tqdm(source["data"]):
            test_count = 0

            for para in topic["paragraphs"]:
                for qas in para['qas']:
                    if test_count < 5:
                        test_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                        test_count += 1
                    else:
                        train_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                    q_count += 1
            topic_id += 1


    #    print('tests, should all be true')
    #    print(topic_id)
    #    print(len(train_new_data))
    #    print(len(test_new_data))
    #    print(len(train_new_data) == len(test_new_data))
    #    print(train_new_data.keys() == test_new_data.keys())
    #    print(len(train_new_data.keys()) == len(set(train_new_data.keys())))
    #    print(len(test_new_data.keys()) == len(set(test_new_data.keys())))

    logger.info(f"Saving new train data to {train_out_file}")
    save(filename=train_out_file, obj=train_new_data)
    logger.info(f"Saving new test data to {test_out_file}")
    save(filename=test_out_file, obj=test_new_data)
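
Each record written here is a plain (question, topic_id, topic_title) tuple, with the first five questions of every topic held out for test. A sketch of reading a result back, assuming save wrote JSON and using a made-up path:

import json

with open("data/exp3-train.json") as fh:  # hypothetical path
    exp3 = json.load(fh)

question, topic_id, title = exp3["data"][0]
print(f"topic {topic_id} ({title}): {question}")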
        }, f"./data/torch-test-{data_type}")
    logger.info(f"Built and saved {total_}/{total} fully featurized examples")


Example #6
from collections import Counter

import spacy

# get_exp3_featurize_args, get_logger, pre_process, get_word_embedding, and
# save are helpers from the surrounding project.
if __name__ == "__main__":
    global nlp
    args = get_exp3_featurize_args()
    log = get_logger(log_dir=args.logging_dir, name="data-gen")
    nlp = spacy.blank("en")
    word_counter = Counter()
    examples, eval_examples, topic_title_id_map = pre_process(
        args=args,
        in_file=args.train_in_file,
        word_counter=word_counter,
        logger=log)
    save(filename=args.train_topic_title_id_map_file, obj=topic_title_id_map)
    save(filename=args.train_eval_file, obj=eval_examples)

    word_emb_mat, word2idx_dict = get_word_embedding(args=args,
                                                     counter=word_counter,
                                                     logger=log)
    save(args.word_emb_file, word_emb_mat)

    dev_examples, dev_eval_examples, dev_topic_title_id_map = pre_process(
        args=args,
        in_file=args.dev_in_file,
        word_counter=word_counter,
        logger=log)

    save(filename=args.dev_topic_title_id_map_file, obj=dev_topic_title_id_map)
    save(filename=args.dev_eval_file, obj=dev_eval_examples)
Example #7
def pre_process(args, logger):
    # Process training set and use it to decide on the word/character vocabularies

    word_counter, char_counter = Counter(), Counter()
    examples, eval_obj = process_file(filename=args.train_data_exp1,
                                      data_type="train",
                                      word_counter=word_counter,
                                      char_counter=char_counter,
                                      chunk_size=args.chunk_size)
    save(args.train_eval_file, eval_obj)
    del eval_obj

    word_emb_mat, word2idx_dict = get_embedding(
        counter=word_counter,
        data_type='word',
        emb_file=args.glove_word_file,
        vec_size=args.glove_word_dim,
        num_vectors=args.glove_word_num_vecs)

    char_emb_mat, char2idx_dict = get_embedding(counter=char_counter,
                                                data_type='char',
                                                emb_file=None,
                                                vec_size=args.glove_char_dim)
    save(args.word_emb_file, word_emb_mat)
    save(args.char_emb_file, char_emb_mat)
    del word_emb_mat
    del char_emb_mat

    dev_examples, dev_eval = process_file_dev(filename=args.dev_data_exp1,
                                              data_type="dev",
                                              word_counter=word_counter,
                                              char_counter=char_counter)

    build_features(args=args,
                   examples=examples,
                   data_type="train",
                   out_file=args.train_record_file_exp1,
                   word2idx_dict=word2idx_dict,
                   char2idx_dict=char2idx_dict,
                   is_test=False,
                   chunk_size=args.chunk_size)

    del examples

    # Process dev and test sets
    dev_meta = build_features_dev(args=args,
                                  examples=dev_examples,
                                  data_type="dev",
                                  out_file=args.dev_record_file_exp1,
                                  word2idx_dict=word2idx_dict,
                                  char2idx_dict=char2idx_dict,
                                  is_test=False)
    save(args.dev_meta_file, dev_meta)
    save(args.dev_eval_file, dev_eval)
    del dev_meta
    del dev_eval

    #    test_examples, test_eval = process_file(filename=args.test_data_exp1,
    #                                            data_type="test",
    #                                            word_counter=word_counter,
    #                                            char_counter=char_counter,
    #                                            logger=logger)
    #    test_meta = build_features(args=args, examples=test_examples, data_type="test",
    #                               out_file=args.test_record_file_exp1, word2idx_dict=word2idx_dict,
    #                               char2idx_dict=char2idx_dict, is_test=True)
    #    save(args.test_meta_file, test_meta)
    #    save(args.test_eval_file, test_eval)

    save(args.word2idx_file, word2idx_dict)
    save(args.char2idx_file, char2idx_dict)
Example #8
def pre_process(data, flags):
    """
    authors:
        @rohitmusti
        @chrischute
    """

    if flags[1] == "dev":
        exp3_data = data.dev_data_exp3
        eval_file = data.dev_eval_exp3
        record_file = data.dev_record_file_exp3

    elif flags[1] == "train":
        exp3_data = data.train_data_exp3
        eval_file = data.train_eval_exp3
        record_file = data.train_record_file_exp3

    elif flags[1] == "toy":
        exp3_data = data.toy_data_exp3
        eval_file = data.toy_eval_exp3
        record_file = data.toy_record_file_exp3
    else:
        # fail fast instead of hitting a NameError on exp3_data below
        raise ValueError(
            "No valid flag was passed in; valid flags: dev, train, toy")

    # Process training set and use it to decide on the word/character vocabularies
    word_counter, char_counter = Counter(), Counter()

    examples, eval_obj = process_file(exp3_data, flags[1], word_counter,
                                      char_counter)

    save(eval_file, eval_obj, message=(flags[1] + " eval"))

    word_emb_mat, word2idx_dict = get_embedding(
        word_counter,
        'word',
        emb_file=data.glove_word_file,
        vec_size=data.glove_word_dim,
        num_vectors=data.glove_word_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(char_counter,
                                                'char',
                                                emb_file=data.glove_char_file,
                                                vec_size=data.char_emb_size)

    if flags[1] == "train":
        save(data.word_emb_file, word_emb_mat, message="word embedding")
        save(data.char_emb_file, char_emb_mat, message="char embedding")
        save(data.word2idx_file, word2idx_dict, message="word dictionary")
        save(data.char2idx_file, char2idx_dict, message="char dictionary")
    elif flags[1] == "toy":
        save(data.toy_word_emb_file, word_emb_mat, message="word embedding")
        save(data.toy_char_emb_file, char_emb_mat, message="char embedding")
        save(data.toy_word2idx_file, word2idx_dict, message="word dictionary")
        save(data.toy_char2idx_file, char2idx_dict, message="char dictionary")

    build_features(data, examples, flags[1], record_file, word2idx_dict,
                   char2idx_dict)
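
This variant picks its inputs and outputs off a config object via flags[1], so it is meant to be driven from the command line; a minimal driver sketch, where get_setup_args is a hypothetical loader for the config object holding the *_exp3 paths:

import sys

if __name__ == "__main__":
    # e.g. `python setup.py train`; sys.argv[1] is the dataset flag
    pre_process(data=get_setup_args(), flags=sys.argv)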
Example #9
def toy_transformer(in_file, out_file):
    """
    distill original data into at most 15 topics, with each having at most 5 paragraphs,
    each of which has 5 questions and 5 answers
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = "toy"
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into toy format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        topic_counter = 3
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            para_counter = 3
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"] = para["context"]
                paragraph["qas"] = []
                qa_counter = 3
                for qas in para['qas']:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = answer[
                                "answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)

                    qa_counter -= 1
                    if qa_counter == 0:
                        break

                topic_dict["paragraphs"].append(paragraph)
                para_counter -= 1
                if para_counter == 0:
                    break

            new_data["data"].append(topic_dict)

            topic_counter -= 1
            if topic_counter == 0:
                break

    save(filename=out_file, obj=new_data, message="saving toy data")
Example #10
def pre_process(args, logger):
    # Process training set and use it to decide on the word/character vocabularies

    word_counter, char_counter = Counter(), Counter()
    examples, eval_obj, topic_contexts_examples = process_file(
        filename=args.train_data_exp2,
        data_type="train",
        word_counter=word_counter,
        char_counter=char_counter,
        logger=logger,
        chunk_size=args.chunk_size)

    save(args.train_eval_file, eval_obj)
    del eval_obj

    word_emb_mat, word2idx_dict = get_embedding(
        word_counter,
        'word',
        emb_file=args.glove_word_file,
        vec_size=args.glove_word_dim,
        num_vectors=args.glove_word_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(char_counter,
                                                'char',
                                                emb_file=None,
                                                vec_size=args.glove_char_dim)

    save(args.word_emb_file, word_emb_mat)
    save(args.char_emb_file, char_emb_mat)
    del word_emb_mat
    del char_emb_mat

    save(args.word2idx_file, word2idx_dict)
    save(args.char2idx_file, char2idx_dict)

    build_features(args=args,
                   examples=examples,
                   topic_contexts=topic_contexts_examples,
                   data_type="train",
                   out_file=args.train_record_file_exp2,
                   word2idx_dict=word2idx_dict,
                   char2idx_dict=char2idx_dict,
                   exp2_topic_contexts_file=args.exp2_train_topic_contexts,
                   is_test=False,
                   chunk_size=args.chunk_size)
    del topic_contexts_examples
    del examples

    # Process dev and test sets
    dev_examples, dev_eval, dev_topic_contexts = process_file(
        filename=args.dev_data_exp2,
        data_type="dev",
        word_counter=word_counter,
        char_counter=char_counter,
        logger=logger,
        chunk_size=args.chunk_size)
    dev_meta = build_features(
        args=args,
        examples=dev_examples,
        topic_contexts=dev_topic_contexts,
        data_type="dev",
        out_file=args.dev_record_file_exp2,
        word2idx_dict=word2idx_dict,
        char2idx_dict=char2idx_dict,
        exp2_topic_contexts_file=args.exp2_dev_topic_contexts,
        chunk_size=args.chunk_size)
    del dev_topic_contexts
    del dev_examples

    save(args.dev_eval_file, dev_eval)
    save(args.dev_meta_file, dev_meta)