Example #1
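All of the snippets below lean on a few shared imports and project helpers. The helpers (quick_clean, fancyprint, save, and later get_new_context, word_tokenize, convert_idx) are not defined anywhere in this listing; the comments below record only what their call sites imply, not their actual definitions.

import json

from tqdm import tqdm

# Project helpers assumed by the snippets below (inferred from call sites only):
#   quick_clean(raw_str)              -> cleaned copy of the string
#   fancyprint(in_str)                -> decorated progress print
#   save(filename, obj, message=None) -> serialize obj to filename as JSON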
def exp2_transformer(in_file, out_file):
    """
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = 2
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into experiment 2 format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            # merge the cleaned contexts within each topic into one giant
            # string and store it once at the topic level
            topic_dict["topic_context"] = "".join([
                quick_clean(raw_str=para["context"])
                for para in topic["paragraphs"]
            ])
            context_buffer = 0
            topic_dict["qas"] = []
            for para in topic["paragraphs"]:
                for qas in para['qas']:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            # update the answer start index
                            answer_dict["answer_start"] = answer[
                                "answer_start"] + context_buffer
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    topic_dict["qas"].append(qas_dict)
                context_buffer += len(para["context"])
            new_data["data"].append(topic_dict)

    save(filename=out_file, obj=new_data, message="saving experiment 2 data")
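A minimal usage sketch, assuming a SQuAD-style JSON input; the file names are illustrative:

exp2_transformer(in_file="data/train-v2.0.json",
                 out_file="data/exp2-train.json")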
Example #2
def exp2_transformer(in_file, out_file, logger):
    new_data = {}
    new_data["experiment"] = 1
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
        new_data["version"] = source["version"]
        new_data["data"] = []
        logger.info("Creating all context list")
        for topic_id, topic in enumerate(tqdm(source["data"])):
            context_buffer = 0
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["qas"] = []
            topic_contexts = []
            num_pars = 0

            for para in topic["paragraphs"]:
                # cap each topic at 10 paragraphs
                if num_pars >= 10:
                    break
                num_pars += 1
                topic_contexts.append(para['context'])
                for qas in para['qas']:
                    counter += 1
                    qas_dict = {}
                    qas_dict["topic_id"] = topic_id
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = context_buffer + answer["answer_start"]
                            answer_dict["text"] = answer["text"]

                            qas_dict["answers"].append(answer_dict)
                    topic_dict["qas"].append(qas_dict)
                # +1 accounts for the space inserted by " ".join below
                context_buffer += len(para['context']) + 1

            topic_contexts = " ".join(topic_contexts)
            topic_dict["context"] = topic_contexts

            new_data["data"].append(topic_dict)

    logger.info(f"Processed {counter} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)
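The context_buffer arithmetic above can be sanity-checked in isolation: when paragraphs are joined with a single space, an answer at local offset k in a later paragraph lands at the sum of the earlier paragraph lengths (plus one separator each) plus k. A minimal check, with made-up strings:

p1, p2 = "Cats purr.", "Dogs bark."
merged = " ".join([p1, p2])
local_start = p2.find("bark")   # answer_start within its own paragraph
buffer = len(p1) + 1            # earlier paragraph plus the joining space
assert merged[buffer + local_start:].startswith("bark")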
def exp_1_transformer(in_file, out_file, logger):
    """
    transform data into the experiment-1 format: each paragraph's context is
    rebuilt with get_new_context and answer offsets are shifted accordingly
    args:
        - in_file: the file name of the data to be transformed
        - out_file: the file name of where the output ought to be written

    return:
        None; the data is written to out_file
    """
    new_data = {}
    new_data["experiment"] = 1
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
        new_data["version"] = source["version"]
        new_data["data"] = []
        logger.info("Creating all context list")
        all_contexts = [
            para["context"] for topic in source["data"]
            for para in topic["paragraphs"]
        ]
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"], context_buffer = get_new_context(
                    orig_context=para["context"], all_contexts=all_contexts)
                paragraph["qas"] = []
                for qas in para['qas']:
                    counter += 1
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = context_buffer + answer["answer_start"]
                            answer_dict["text"] = answer["text"]

                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)
            new_data["data"].append(topic_dict)

    logger.info(f"Processed {counter} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)
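get_new_context is not included in this listing. From the call site it must return the rebuilt context together with the character offset of the original paragraph within it; the sketch below is only a guess at one plausible implementation (the distractor-sampling strategy is an assumption):

import random

def get_new_context(orig_context, all_contexts, num_distractors=3):
    # Hypothetical reconstruction: pad the original context with randomly
    # sampled distractor contexts and report where the original starts.
    distractors = random.sample(all_contexts, num_distractors)
    k = random.randint(0, num_distractors)
    pieces = distractors[:k] + [orig_context] + distractors[k:]
    new_context = " ".join(pieces)
    # character offset of orig_context inside new_context
    context_buffer = sum(len(p) + 1 for p in distractors[:k])
    return new_context, context_buffer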
def toy_transformer(in_file, train_file, dev_file, test_file, train_topic_num,
                    dev_topic_num, test_topic_num, logger):
    """
    distill original data into at most 15 topics, with each having at most 5 paragraphs,
    each of which has 5 questions and 5 answers
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = "toy"
    new_dev_data = {}
    new_dev_data['experiment'] = "toy_dev"
    new_test_data = {}
    new_test_data['experiment'] = "toy_test"
    with open(in_file, "r") as fh:
        logger.info(f"Importing: {in_file}")
        source = json.load(fh)
        logger.info("Converting into toy format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        new_dev_data["version"] = source["version"]
        new_dev_data["data"] = []
        new_test_data["version"] = source["version"]
        new_test_data["data"] = []
        topic_counter = train_topic_num
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"] = para["context"]
                paragraph["qas"] = []
                for qas in para['qas']:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = answer[
                                "answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)

            # counter runs train_topic_num..1 for train, then the next
            # dev_topic_num values for dev and test_topic_num values for test
            if topic_counter > 0:
                new_data["data"].append(topic_dict)
            elif topic_counter > -dev_topic_num:
                new_dev_data["data"].append(topic_dict)
            elif topic_counter > -(dev_topic_num + test_topic_num):
                new_test_data["data"].append(topic_dict)
            else:
                break

            topic_counter -= 1

    logger.info(f"Saving new data to {train_file}")
    save(filename=train_file, obj=new_data)
    logger.info(f"Saving new dev data to {dev_file}")
    save(filename=dev_file, obj=new_dev_data)
    logger.info(f"Saving new test data to {test_file}")
    save(filename=test_file, obj=new_test_data)
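A minimal usage sketch; the file names and topic counts are illustrative:

import logging

logger = logging.getLogger(__name__)
toy_transformer(in_file="data/train-v2.0.json",
                train_file="data/toy-train.json",
                dev_file="data/toy-dev.json",
                test_file="data/toy-test.json",
                train_topic_num=10, dev_topic_num=3, test_topic_num=2,
                logger=logger)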
Example #5
def exp3_transformer(in_file_1, in_file_2, train_out_file, test_out_file,
                     logger):
    """
    convert data into (question, topic_id, topic title) format
    args:
        - in_file: the file name of the data to be transformed to experiment 3 format
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    train_new_data = {}
    train_new_data["experiment"] = "exp3"
    test_new_data = {}
    test_new_data["experiment"] = "exp3"

    q_count = 0

    with open(in_file_1, "r") as fh:
        logger.info(f"Importing: {in_file_1}")
        source = json.load(fh)
        train_new_data["version"] = source["version"]
        train_new_data["data"] = []
        test_new_data["version"] = source["version"]
        test_new_data["data"] = []
        for topic_id, topic in enumerate(tqdm(source["data"])):
            test_count = 0
            for para in topic["paragraphs"]:
                for qas in para['qas']:
                    if test_count < 5:
                        test_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                        test_count += 1
                    else:
                        train_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                    q_count += 1
        # continue topic numbering where the first file left off
        topic_id = len(source['data'])

    with open(in_file_2, "r") as fh:
        logger.info(f"Importing: {in_file_2}")
        source = json.load(fh)
        for topic in tqdm(source["data"]):
            test_count = 0

            for para in topic["paragraphs"]:
                for qas in para['qas']:
                    if test_count < 5:
                        test_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                        test_count += 1
                    else:
                        train_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                    q_count += 1
            topic_id += 1


    logger.info(f"Saving new train data to {train_out_file}")
    save(filename=train_out_file, obj=train_new_data)
    logger.info(f"Saving new test data to {test_out_file}")
    save(filename=test_out_file, obj=test_new_data)
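Unlike the other transformers, exp3 emits flat triples rather than nested dicts; each entry of data has this shape (the values here are made up for illustration):

# ("What year was the university founded?", 42, "University_of_Chicago")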
def process_file(filename, data_type, word_counter, char_counter):
    """
    modified to fit the super context experiment
    author: @rohitmusti
    """
    examples = []
    eval_examples = {}
    total = 0
    with open(filename, "r") as fh:
        source = json.load(fh)
        sc = source["super_context"]
        sc_tokens = word_tokenize(sc)
        sc_chars = [list(token) for token in sc_tokens]
        sc_spans = convert_idx(sc, sc_tokens)
        print("Creating the word indices from the contexts")
        for token in tqdm(sc_tokens):
            word_counter[token] += 1
            # I changed this to 1 instead of the len(para["qas"]) it was originally.
            for char in token:
                char_counter[char] += 1
            # I changed this to 1 instead of the len(para["qas"]) it was originally.
        examples.append({
            "super_context_tokens": sc_tokens,
            "super_context_chars": sc_chars,
            "spans": sc_spans
        })
        print()
        print("Pre-processing {} examples...".format(data_type))
        for topic in tqdm(source["data"]):
            for qas in topic["qas"]:
                total += 1
                ques = quick_clean(qas["question"])
                ques_tokens = word_tokenize(ques)
                ques_chars = [list(token) for token in ques_tokens]
                for token in ques_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1
                y1s, y2s = [], []
                answer_texts = []
                for answer in qas["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer["answer_start"]
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    answer_span = []
                    if not qas["is_impossible"]:
                        # collect the indices of super-context token spans
                        # that overlap the answer's character range
                        for idx, span in enumerate(sc_spans):
                            if not (answer_end <= span[0]
                                    or answer_start >= span[1]):
                                answer_span.append(idx)
                        y1, y2 = answer_span[0], answer_span[-1]
                    else:
                        y1, y2 = -1, -1  # signifying no answer
                    y1s.append(y1)
                    y2s.append(y2)

                example = {
                    "ques_tokens": ques_tokens,
                    "ques_chars": ques_chars,
                    "y1s": y1s,
                    "y2s": y2s,
                    "id": total
                }
                examples.append(example)
                eval_examples[str(total)] = {
                    "question": ques,
                    "answers": answer_texts,
                    "uuid": qas["id"]
                }

        print("{} questions in total".format(len(examples)))
    return examples, eval_examples
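convert_idx is assumed to map each token back to its (start, end) character span in the source text, which is what the overlap check above requires; a standard sketch under that assumption:

def convert_idx(text, tokens):
    # Walk through text, locating each token in order and recording
    # its (start, end) character span.
    spans = []
    current = 0
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            raise ValueError(f"token {token!r} not found in text")
        spans.append((current, current + len(token)))
        current += len(token)
    return spans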
def toy_transformer(in_file, out_file):
    """
    distill original data into at most 15 topics, with each having at most 5 paragraphs,
    each of which has 5 questions and 5 answers
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the ought to be written

    return:
        none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = "toy"
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into toy format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        topic_counter = 3
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            para_counter = 3
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"] = para["context"]
                paragraph["qas"] = []
                qa_counter = 3
                for qas in para['qas']:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = answer[
                                "answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)

                    qa_counter -= 1
                    if qa_counter == 0:
                        break

                topic_dict["paragraphs"].append(paragraph)
                para_counter -= 1
                if para_counter == 0:
                    break

            new_data["data"].append(topic_dict)

            topic_counter -= 1
            if topic_counter == 0:
                break

    save(filename=out_file, obj=new_data, message="saving toy data")
def process_file(filename, data_type, word_counter, char_counter, logger,
                 chunk_size):
    """
    pre-process experiment-2 data: tokenize each topic-level context and
    question, build word/char counters, and emit examples in chunks of
    chunk_size topics to bound memory
    """
    logger.info(f"Pre-processing {data_type} examples...")
    ret_examples = []
    examples = []
    eval_examples = {}
    topic_context_examples = []
    total = 0
    with open(filename, "r") as fh:
        source = json.load(fh)
        chunk_tracker = chunk_size
        for topic_id, topic in enumerate(tqdm(source["data"])):
            chunk_tracker -= 1
            # context processing
            topic_context = quick_clean(raw_str=topic["context"])
            topic_context_tokens = word_tokenize(topic_context)
            topic_context_chars = [
                list(token) for token in topic_context_tokens
            ]
            spans = convert_idx(topic_context, topic_context_tokens)
            for token in topic_context_tokens:
                # count each token once (originally weighted by
                # len(para['qas']), which seemed arbitrary)
                word_counter[token] += 1
                for char in token:
                    char_counter[char] += 1

            topic_context_dict = {
                "context_tokens": topic_context_tokens,
                "context_chars": topic_context_chars,
                "context": topic_context
            }
            topic_context_examples.append(topic_context_dict)

            # qas processing
            for qa in topic["qas"]:
                total += 1

                #question processing
                ques = quick_clean(qa["question"])
                ques_tokens = word_tokenize(ques)
                ques_chars = [list(token) for token in ques_tokens]
                for token in ques_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1

                # answer processing
                y1s, y2s = [], []
                answer_texts = []
                for answer in qa["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer['answer_start']
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    answer_span = []
                    for idx, span in enumerate(spans):
                        if not (answer_end <= span[0]
                                or answer_start >= span[1]):
                            answer_span.append(idx)
                    y1, y2 = answer_span[0], answer_span[-1]
                    y1s.append(y1)
                    y2s.append(y2)
                example = {
                    "ques_tokens": ques_tokens,
                    "ques_chars": ques_chars,
                    "topic_context_id": topic_id,
                    "y1s": y1s,
                    "y2s": y2s,
                    "id": total
                }
                examples.append(example)
                eval_examples[str(total)] = {
                    "question": ques,
                    "context": topic_context_examples[topic_id]["context"],
                    "spans": spans,
                    "answers": answer_texts,
                    "uuid": qa["id"]
                }
            if chunk_tracker == 0 or topic_id == (len(source['data']) - 1):
                # flush the accumulated examples as one chunk
                ret_examples.append(examples)
                examples = []
                chunk_tracker = chunk_size

    return ret_examples, eval_examples, topic_context_examples
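A minimal sketch of consuming the chunked output; the file name and chunk size are illustrative:

from collections import Counter
import logging

logger = logging.getLogger(__name__)
word_counter, char_counter = Counter(), Counter()
chunks, eval_examples, topic_contexts = process_file(
    filename="data/exp2-train.json", data_type="train",
    word_counter=word_counter, char_counter=char_counter,
    logger=logger, chunk_size=100)
for chunk in chunks:
    # each chunk holds the examples for up to chunk_size topics, so
    # downstream feature building can run with bounded memory
    logger.info(f"chunk with {len(chunk)} examples")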