def exp2_transformer(in_file, out_file):
    """ args:
    - in_file: the file name of the data to be transformed to experiment 2
    - out_file: the file name of where the ought to be written
    return: none, the data is written to an output

    Each topic's paragraph contexts are merged into one "topic_context"
    string and every answer's start index is shifted by the combined length
    of the paragraphs that precede it.
    """
    result = {"experiment": 2}
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into experiment 2 format")
        result["version"] = source["version"]
        result["data"] = []
        for topic in tqdm(source["data"]):
            paragraphs = topic["paragraphs"]
            # merge the contexts within each topic into a giant string;
            # keep the topic_context above the per-question entries
            entry = {
                "title": topic["title"],
                "topic_context": "".join(
                    quick_clean(raw_str=para["context"])
                    for para in paragraphs
                ),
                "qas": [],
            }
            # NOTE(review): the offsets below advance by the *raw* context
            # length while the merged string holds *cleaned* contexts — if
            # quick_clean changes string length, answer_start will drift.
            # Confirm quick_clean is length-preserving.
            offset = 0
            for para in paragraphs:
                for qas in para["qas"]:
                    qas_entry = {
                        "id": qas["id"],
                        "is_impossible": qas["is_impossible"],
                        "question": quick_clean(raw_str=qas["question"]),
                        "answers": [],
                    }
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            # shift the answer into the merged context
                            qas_entry["answers"].append({
                                "answer_start":
                                    answer["answer_start"] + offset,
                                "text": answer["text"],
                            })
                    entry["qas"].append(qas_entry)
                offset += len(para["context"])
            result["data"].append(entry)
    save(filename=out_file, obj=result, message="saving experiment 2 data")
def exp2_transformer(in_file, out_file, logger):
    """Convert SQuAD-style data into the experiment 2 format.

    For each topic, up to the first 10 paragraph contexts are joined with
    single spaces into one topic-level "context" string; every answer's
    answer_start is shifted by the lengths (+1 for the joining space) of
    the paragraphs that precede it so it indexes into the merged context.

    args:
        - in_file: name of the JSON file holding the data to transform
        - out_file: name of the file the transformed data is written to
        - logger: logger used for progress messages

    return: none, the result is written to out_file via save()
    """
    new_data = {}
    # BUG FIX: this is the experiment *2* transformer; the tag had been
    # copy-pasted from the experiment 1 transformer as 1.
    new_data["experiment"] = 2
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
    new_data["version"] = source["version"]
    new_data["data"] = []
    logger.info("Creating all context list")
    for topic_id, topic in tqdm(enumerate(source["data"])):
        context_buffer = 0
        topic_dict = {}
        topic_dict["title"] = topic["title"]
        topic_dict["qas"] = []
        topic_contexts = []
        num_pars = 0
        for para in topic["paragraphs"]:
            # cap each topic at its first 10 paragraphs
            if num_pars >= 10:
                break
            num_pars += 1
            topic_contexts.append(para['context'])
            for qas in para['qas']:
                counter += 1
                qas_dict = {}
                qas_dict["topic_id"] = topic_id
                qas_dict["id"] = qas["id"]
                qas_dict["is_impossible"] = qas["is_impossible"]
                qas_dict["question"] = quick_clean(raw_str=qas["question"])
                qas_dict["answers"] = []
                if not qas["is_impossible"]:
                    for answer in qas["answers"]:
                        answer_dict = {}
                        # shift the answer into the merged topic context
                        answer_dict["answer_start"] = (
                            context_buffer + answer["answer_start"])
                        answer_dict["text"] = answer["text"]
                        qas_dict["answers"].append(answer_dict)
                topic_dict["qas"].append(qas_dict)
            # +1 accounts for the space inserted by " ".join below
            context_buffer += len(para['context']) + 1
        # NOTE: a dead `paragraph` dict that was never read was removed here
        topic_dict["context"] = " ".join(topic_contexts)
        new_data["data"].append(topic_dict)
    logger.info(f"Processed {counter} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)
def exp_1_transformer(in_file, out_file, logger):
    """Rewrite the data into the experiment 1 layout.

    Each paragraph keeps its own qas, but its context is replaced by the
    output of get_new_context(); answer start indices are shifted by the
    buffer it reports so they remain valid in the new context.

    args:
        - in_file: JSON file holding the source data
        - out_file: destination file for the transformed data
        - logger: logger used for progress messages

    return: none, the result is written to out_file via save()
    """
    out_data = {"experiment": 1}
    qa_total = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
    out_data["version"] = source["version"]
    out_data["data"] = []
    logger.info("Creating all context list")
    # flat list of every context in the corpus, consumed by get_new_context
    all_contexts = [
        para["context"]
        for topic in source["data"]
        for para in topic["paragraphs"]
    ]
    for topic in tqdm(source["data"]):
        new_topic = {"title": topic["title"], "paragraphs": []}
        for para in topic["paragraphs"]:
            new_context, shift = get_new_context(
                orig_context=para["context"], all_contexts=all_contexts)
            new_para = {"context": new_context, "qas": []}
            for qas in para['qas']:
                qa_total += 1
                qa_entry = {
                    "id": qas["id"],
                    "is_impossible": qas["is_impossible"],
                    "question": quick_clean(raw_str=qas["question"]),
                    "answers": [],
                }
                if not qas["is_impossible"]:
                    qa_entry["answers"] = [
                        {"answer_start": shift + ans["answer_start"],
                         "text": ans["text"]}
                        for ans in qas["answers"]
                    ]
                new_para["qas"].append(qa_entry)
            new_topic["paragraphs"].append(new_para)
        out_data["data"].append(new_topic)
    logger.info(f"Processed {qa_total} question, answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=out_data)
def toy_transformer(in_file, train_file, dev_file, test_file, train_topic_num,
                    dev_topic_num, test_topic_num, logger):
    """ distill original data into small toy train/dev/test splits

    The first train_topic_num topics go to the train split, the next
    dev_topic_num topics to the dev split, the following test_topic_num
    topics to the test split; any remaining topics are dropped.

    args:
    - in_file: the file name of the data to be split
    - train_file / dev_file / test_file: output file names per split
    - train_topic_num / dev_topic_num / test_topic_num: topics per split
    - logger: logger used for progress messages
    return: none, the data is written to the three output files
    """
    new_data = {}
    new_data['experiment'] = "toy"
    new_dev_data = {}
    new_dev_data['experiment'] = "toy_dev"
    new_test_data = {}
    # BUG FIX: the test split was mislabelled "toy_train"
    new_test_data['experiment'] = "toy_test"
    with open(in_file, "r") as fh:
        logger.info(f"Importing: {in_file}")
        source = json.load(fh)
    logger.info("Converting into toy format")
    for split in (new_data, new_dev_data, new_test_data):
        split["version"] = source["version"]
        split["data"] = []
    topic_counter = train_topic_num
    for topic in tqdm(source["data"]):
        topic_dict = {}
        topic_dict["title"] = topic["title"]
        topic_dict["paragraphs"] = []
        for para in topic["paragraphs"]:
            paragraph = {}
            paragraph["context"] = para["context"]
            paragraph["qas"] = []
            for qas in para['qas']:
                qas_dict = {}
                qas_dict["id"] = qas["id"]
                qas_dict["is_impossible"] = qas["is_impossible"]
                qas_dict["question"] = quick_clean(raw_str=qas["question"])
                qas_dict["answers"] = []
                if not qas["is_impossible"]:
                    for answer in qas["answers"]:
                        answer_dict = {}
                        answer_dict["answer_start"] = answer["answer_start"]
                        answer_dict["text"] = answer["text"]
                        qas_dict["answers"].append(answer_dict)
                paragraph["qas"].append(qas_dict)
            topic_dict["paragraphs"].append(paragraph)
        # BUG FIX: the old ">= 0" boundaries put train_topic_num + 1 topics
        # in the train split; strict ">" thresholds give each split exactly
        # the requested number of topics.
        if topic_counter > 0:
            new_data["data"].append(topic_dict)
        elif topic_counter > -1 * dev_topic_num:
            new_dev_data["data"].append(topic_dict)
        elif topic_counter > -1 * (dev_topic_num + test_topic_num):
            new_test_data["data"].append(topic_dict)
        else:
            break
        topic_counter -= 1
    logger.info(f"Saving new data to {train_file}")
    save(filename=train_file, obj=new_data)
    logger.info(f"Saving new dev data to {dev_file}")
    save(filename=dev_file, obj=new_dev_data)
    logger.info(f"Saving new test data to {test_file}")
    save(filename=test_file, obj=new_test_data)
def exp3_transformer(in_file_1, in_file_2, train_out_file, test_out_file,
                     logger):
    """ convert data into (question, topic_id, topic title) format

    The first five questions seen in each topic go to the test set and the
    remainder to the train set.  Topics from the second input file continue
    the topic_id numbering where the first file's topics left off.

    args:
    - in_file_1 / in_file_2: data files to be transformed to experiment 3
      format
    - train_out_file / test_out_file: the file names of where the output
      ought to be written
    return: none, the data is written to the two output files
    """
    train_new_data = {"experiment": "exp3"}
    test_new_data = {"experiment": "exp3"}
    q_count = 0

    def split_topic(topic, topic_id):
        # first 5 questions of a topic -> test set, the rest -> train set
        nonlocal q_count
        test_count = 0
        for para in topic["paragraphs"]:
            for qas in para['qas']:
                triple = (quick_clean(raw_str=qas["question"]), topic_id,
                          topic["title"])
                if test_count < 5:
                    test_new_data["data"].append(triple)
                    test_count += 1
                else:
                    train_new_data["data"].append(triple)
                q_count += 1

    with open(in_file_1, "r") as fh:
        logger.info(f"Importing: {in_file_1}")
        source = json.load(fh)
        train_new_data["version"] = source["version"]
        train_new_data["data"] = []
        test_new_data["version"] = source["version"]
        test_new_data["data"] = []
        for topic_id, topic in tqdm(enumerate(source["data"])):
            split_topic(topic, topic_id)
        # continue numbering after the first file's topics
        topic_id = len(source['data'])
    with open(in_file_2, "r") as fh:
        logger.info(f"Importing: {in_file_2}")
        source = json.load(fh)
        for topic in tqdm(source["data"]):
            split_topic(topic, topic_id)
            topic_id += 1
    logger.info(f"Saving new train data to {train_out_file}")
    save(filename=train_out_file, obj=train_new_data)
    logger.info(f"Saving new test data to {test_out_file}")
    save(filename=test_out_file, obj=test_new_data)
def process_file(filename, data_type, word_counter, char_counter):
    """ tokenize the super-context and every question into model features

    modified to fit the super context experiment
    author: @rohitmusti

    args:
    - filename: JSON data file with "super_context" and "data" fields
    - data_type: label used only in progress messages
    - word_counter / char_counter: counters updated in place

    return: (examples, eval_examples); examples[0] carries the
    super-context tokens/chars/spans and later entries are per-question
    """
    examples = []
    eval_examples = {}
    total = 0
    with open(filename, "r") as fh:
        source = json.load(fh)
        sc = source["super_context"]
        sc_tokens = word_tokenize(sc)
        sc_chars = [list(token) for token in sc_tokens]
        sc_spans = convert_idx(sc, sc_tokens)
        print("Creating the word indices from the contexts")
        for token in tqdm(sc_tokens):
            word_counter[token] += 1  # weight 1 (was len(para["qas"]))
            for char in token:
                char_counter[char] += 1  # weight 1 (was len(para["qas"]))
        examples.append({
            "super_context_tokens": sc_tokens,
            "super_context_chars": sc_chars,
            "spans": sc_spans
        })
        print()
        print("Pre-processing {} examples...".format(data_type))
        for topic in tqdm(source["data"]):
            for qas in topic["qas"]:
                total += 1
                ques = quick_clean(qas["question"])
                ques_tokens = word_tokenize(ques)
                ques_chars = [list(token) for token in ques_tokens]
                for token in ques_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1
                y1s, y2s = [], []
                answer_texts = []
                for answer in qas["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer["answer_start"]
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    if not qas["is_impossible"]:
                        # BUG FIX: the overlap filter had been commented out
                        # and every index was appended, so each answer
                        # spanned the whole super-context.  Restored to match
                        # the chunked process_file variant.
                        # NOTE(review): assumes answer_start indexes into the
                        # super-context; raises IndexError if no span
                        # overlaps — confirm against the data pipeline.
                        answer_span = [
                            idx for idx, span in enumerate(sc_spans)
                            if not (answer_end <= span[0]
                                    or answer_start >= span[1])
                        ]
                        y1, y2 = answer_span[0], answer_span[-1]
                    else:
                        y1, y2 = -1, -1  # signifying no answer
                    y1s.append(y1)
                    y2s.append(y2)
                example = {
                    "ques_tokens": ques_tokens,
                    "ques_chars": ques_chars,
                    "y1s": y1s,
                    "y2s": y2s,
                    "id": total
                }
                examples.append(example)
                eval_examples[str(total)] = {
                    "question": ques,
                    "answers": answer_texts,
                    "uuid": qas["id"]
                }
    print("{} questions in total".format(len(examples)))
    return examples, eval_examples
def toy_transformer(in_file, out_file, topic_limit=3, para_limit=3,
                    qa_limit=3):
    """ distill original data into at most topic_limit topics, each keeping
    at most para_limit paragraphs, each of which keeps at most qa_limit
    question/answer sets

    (The old docstring claimed 15 topics / 5 paragraphs / 5 questions while
    the code always used 3/3/3; the caps are now parameters whose defaults
    preserve the original behavior.)

    args:
    - in_file: the file name of the data to be transformed
    - out_file: the file name of where the output ought to be written
    - topic_limit / para_limit / qa_limit: per-level caps (default 3)
    return: none, the data is written to an output
    """
    new_data = {}
    new_data['experiment'] = "toy"
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
    fancyprint(in_str="Converting into toy format")
    new_data["version"] = source["version"]
    new_data["data"] = []
    topic_counter = topic_limit
    for topic in tqdm(source["data"]):
        topic_dict = {}
        topic_dict["title"] = topic["title"]
        topic_dict["paragraphs"] = []
        para_counter = para_limit
        for para in topic["paragraphs"]:
            paragraph = {}
            paragraph["context"] = para["context"]
            paragraph["qas"] = []
            qa_counter = qa_limit
            for qas in para['qas']:
                qas_dict = {}
                qas_dict["id"] = qas["id"]
                qas_dict["is_impossible"] = qas["is_impossible"]
                qas_dict["question"] = quick_clean(raw_str=qas["question"])
                qas_dict["answers"] = []
                if not qas["is_impossible"]:
                    for answer in qas["answers"]:
                        answer_dict = {}
                        answer_dict["answer_start"] = answer["answer_start"]
                        answer_dict["text"] = answer["text"]
                        qas_dict["answers"].append(answer_dict)
                paragraph["qas"].append(qas_dict)
                qa_counter -= 1
                if qa_counter == 0:
                    break
            topic_dict["paragraphs"].append(paragraph)
            para_counter -= 1
            if para_counter == 0:
                break
        new_data["data"].append(topic_dict)
        topic_counter -= 1
        if topic_counter == 0:
            break
    save(filename=out_file, obj=new_data, message="saving toy data")
def process_file(filename, data_type, word_counter, char_counter, logger,
                 chunk_size):
    """ tokenize topic contexts and questions, yielding chunked examples

    args:
    - filename: JSON file whose "data" holds topics with a merged
      "context" string and a flat "qas" list
    - data_type: label used only in the progress message
    - word_counter / char_counter: counters updated in place
    - logger: logger used for progress messages
    - chunk_size: number of topics per chunk of examples

    return: (ret_examples, eval_examples, topic_context_examples) where
    ret_examples is a list of example-lists (one per chunk of topics),
    eval_examples maps example id -> evaluation info, and
    topic_context_examples holds per-topic token/char/context data
    """
    logger.info(f"Pre-processing {data_type} examples...")
    # NOTE: a dead `ret_eval_examples` local that was never used or returned
    # has been removed.
    ret_examples = []  # chunked example lists, one entry per chunk
    examples = []      # examples accumulated for the current chunk
    eval_examples = {}
    topic_context_examples = []
    total = 0
    with open(filename, "r") as fh:
        source = json.load(fh)
        chunk_tracker = chunk_size
        for topic_id, topic in tqdm(enumerate(source["data"])):
            chunk_tracker -= 1
            # context processing
            topic_context = quick_clean(raw_str=topic["context"])
            topic_context_tokens = word_tokenize(topic_context)
            topic_context_chars = [
                list(token) for token in topic_context_tokens
            ]
            spans = convert_idx(topic_context, topic_context_tokens)
            for token in topic_context_tokens:
                # it was originally len(para['qas']) but that seemed
                # arbitrary so 1s are used
                word_counter[token] += 1
                for char in token:
                    char_counter[char] += 1
            topic_context_examples.append({
                "context_tokens": topic_context_tokens,
                "context_chars": topic_context_chars,
                "context": topic_context
            })
            # qas processing
            for qa in topic["qas"]:
                total += 1
                # question processing
                ques = quick_clean(qa["question"])
                ques_tokens = word_tokenize(ques)
                ques_chars = [list(token) for token in ques_tokens]
                for token in ques_tokens:
                    word_counter[token] += 1
                    for char in token:
                        char_counter[char] += 1
                # answer processing
                y1s, y2s = [], []
                answer_texts = []
                for answer in qa["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer['answer_start']
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    answer_span = [
                        idx for idx, span in enumerate(spans)
                        if not (answer_end <= span[0]
                                or answer_start >= span[1])
                    ]
                    # NOTE(review): raises IndexError when the answer
                    # overlaps no token span — confirm inputs guarantee an
                    # in-context answer.
                    y1, y2 = answer_span[0], answer_span[-1]
                    y1s.append(y1)
                    y2s.append(y2)
                examples.append({
                    "ques_tokens": ques_tokens,
                    "ques_chars": ques_chars,
                    "topic_context_id": topic_id,
                    "y1s": y1s,
                    "y2s": y2s,
                    "id": total
                })
                eval_examples[str(total)] = {
                    "question": ques,
                    "context": topic_context_examples[topic_id]["context"],
                    "spans": spans,
                    "answers": answer_texts,
                    "uuid": qa["id"]
                }
            # close out a chunk when it is full or at the last topic
            if chunk_tracker == 0 or topic_id == (len(source['data']) - 1):
                ret_examples.append(examples)
                examples = []
                chunk_tracker = chunk_size
    return ret_examples, eval_examples, topic_context_examples