def exp2_transformer(in_file, out_file, logger):
    new_data = {}
    new_data["experiment"] = 2
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
        new_data["version"] = source["version"]
        new_data["data"] = []
        logger.info("Merging paragraph contexts per topic")
        for topic_id, topic in tqdm(enumerate(source["data"])):
            context_buffer = 0
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["qas"] = []
            topic_contexts = []
            # cap each topic at its first 10 paragraphs
            num_pars = 0
            for para in topic["paragraphs"]:
                if num_pars >= 10:
                    break
                num_pars += 1
                topic_contexts.append(para["context"])
                for qas in para["qas"]:
                    counter += 1
                    qas_dict = {}
                    qas_dict["topic_id"] = topic_id
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            # shift the answer into the merged topic context
                            answer_dict["answer_start"] = context_buffer + answer["answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    topic_dict["qas"].append(qas_dict)
                # +1 accounts for the space used to join the contexts below
                context_buffer += len(para["context"]) + 1
            topic_dict["context"] = " ".join(topic_contexts)
            new_data["data"].append(topic_dict)
    logger.info(f"Processed {counter} question-answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)

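
# Worked example of the buffer arithmetic above (illustrative numbers, not
# from the repo): with paragraph contexts "ab" and "cde" joined by a space,
# the merged context is "ab cde"; an answer at answer_start=0 in the second
# paragraph shifts to 0 + len("ab") + 1 = 3, which indexes "c" in the merged
# string, so the +1 accounts for the joining space.
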
def exp2_transformer(in_file, out_file):
    """
    args:
        - in_file: the file name of the data to be transformed to experiment 2
        - out_file: the file name of where the output ought to be written

    return: none, the data is written to an output file
    """
    new_data = {}
    new_data["experiment"] = 2
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into experiment 2 format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            # merge the contexts within each topic into one giant string and
            # save the topic_context above the paragraphs
            topic_dict["topic_context"] = "".join([
                quick_clean(raw_str=para["context"])
                for para in topic["paragraphs"]
            ])
            context_buffer = 0
            topic_dict["qas"] = []
            for para in topic["paragraphs"]:
                for qas in para["qas"]:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            # shift the answer start index into the merged
                            # context; this stays aligned only if quick_clean
                            # preserves string length
                            answer_dict["answer_start"] = answer["answer_start"] + context_buffer
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    topic_dict["qas"].append(qas_dict)
                context_buffer += len(para["context"])
            new_data["data"].append(topic_dict)
    save(filename=out_file, obj=new_data, message="saving experiment 2 data")

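
# A hedged sanity check for the transformer above (not part of the original
# pipeline): it verifies that every shifted answer_start indexes the answer
# text inside the merged topic_context. It assumes save() wrote plain JSON
# and that quick_clean() is length-preserving; both are assumptions here,
# not facts confirmed by this file.
import json

def _check_exp2_offsets(exp2_file):
    with open(exp2_file, "r") as fh:
        check = json.load(fh)
    for topic in check["data"]:
        for qas in topic["qas"]:
            for answer in qas["answers"]:
                start = answer["answer_start"]
                end = start + len(answer["text"])
                assert topic["topic_context"][start:end] == answer["text"], \
                    f"misaligned answer for question {qas['id']}"
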
def exp_1_transformer(in_file, out_file, logger):
    new_data = {}
    new_data["experiment"] = 1
    counter = 0
    with open(in_file, "r") as fh:
        logger.info(f"Importing {fh.name}")
        source = json.load(fh)
        new_data["version"] = source["version"]
        new_data["data"] = []
        logger.info("Creating all context list")
        all_contexts = [
            para["context"] for topic in source["data"]
            for para in topic["paragraphs"]
        ]
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"], context_buffer = get_new_context(
                    orig_context=para["context"], all_contexts=all_contexts)
                paragraph["qas"] = []
                for qas in para["qas"]:
                    counter += 1
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            # shift the answer into the rebuilt context
                            answer_dict["answer_start"] = context_buffer + answer["answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)
            new_data["data"].append(topic_dict)
    logger.info(f"Processed {counter} question-answer pairs")
    logger.info(f"Saving to {out_file}")
    save(filename=out_file, obj=new_data)

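
# For reference, a hedged sketch of the contract get_new_context() has to
# satisfy for the answer_start arithmetic above to stay valid: it returns a
# new context string together with the character offset of the original
# paragraph inside it. The distractor sampling below is an illustrative
# assumption, not the repo's actual implementation.
import random

def _get_new_context_sketch(orig_context, all_contexts, num_distractors=3):
    distractors = random.sample(all_contexts, num_distractors)
    insert_at = random.randint(0, num_distractors)
    pieces = distractors[:insert_at] + [orig_context] + distractors[insert_at:]
    # offset of the original paragraph inside the space-joined string
    context_buffer = sum(len(p) + 1 for p in pieces[:insert_at])
    return " ".join(pieces), context_buffer
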
def toy_transformer(in_file, train_file, dev_file, test_file, train_topic_num,
                    dev_topic_num, test_topic_num, logger):
    """
    distill the original data into a small toy set: the first train_topic_num
    topics go to train, the next dev_topic_num to dev, and the next
    test_topic_num to test

    args:
        - in_file: the file name of the data to be transformed into toy format
        - train_file / dev_file / test_file: the file names of where the three
          splits ought to be written
        - train_topic_num / dev_topic_num / test_topic_num: how many topics
          each split receives

    return: none, the data is written to the three output files
    """
    new_data = {}
    new_data["experiment"] = "toy"
    new_dev_data = {}
    new_dev_data["experiment"] = "toy_dev"
    new_test_data = {}
    new_test_data["experiment"] = "toy_test"
    with open(in_file, "r") as fh:
        logger.info(f"Importing: {in_file}")
        source = json.load(fh)
        logger.info("Converting into toy format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        new_dev_data["version"] = source["version"]
        new_dev_data["data"] = []
        new_test_data["version"] = source["version"]
        new_test_data["data"] = []
        topic_counter = train_topic_num
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"] = para["context"]
                paragraph["qas"] = []
                for qas in para["qas"]:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = answer["answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                topic_dict["paragraphs"].append(paragraph)
            # route the topic to a split; the strict comparisons keep each
            # split at exactly its requested topic count
            if topic_counter > 0:
                new_data["data"].append(topic_dict)
            elif topic_counter > -1 * dev_topic_num:
                new_dev_data["data"].append(topic_dict)
            elif topic_counter > -1 * (dev_topic_num + test_topic_num):
                new_test_data["data"].append(topic_dict)
            else:
                break
            topic_counter -= 1
    logger.info(f"Saving new train data to {train_file}")
    save(filename=train_file, obj=new_data)
    logger.info(f"Saving new dev data to {dev_file}")
    save(filename=dev_file, obj=new_dev_data)
    logger.info(f"Saving new test data to {test_file}")
    save(filename=test_file, obj=new_test_data)

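
# Worked example of the counter-based routing above (illustrative numbers):
# with train_topic_num=2, dev_topic_num=2, test_topic_num=1 the counter takes
# the values 2, 1, 0, -1, -2 across the first five topics, sending topics one
# and two to train, three and four to dev, and five to test, after which the
# loop breaks.
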
def exp3_transformer(in_file_1, in_file_2, train_out_file, test_out_file,
                     logger):
    """
    convert data into (question, topic_id, topic title) format

    args:
        - in_file_1 / in_file_2: the file names of the data to be transformed
          to experiment 3 format; topic ids from the second file continue
          where the first file's leave off
        - train_out_file / test_out_file: the file names of where the train
          and test splits ought to be written

    return: none, the data is written to an output file
    """
    train_new_data = {}
    train_new_data["experiment"] = "exp3"
    test_new_data = {}
    test_new_data["experiment"] = "exp3"
    q_count = 0
    with open(in_file_1, "r") as fh:
        logger.info(f"Importing: {in_file_1}")
        source = json.load(fh)
        train_new_data["version"] = source["version"]
        train_new_data["data"] = []
        test_new_data["version"] = source["version"]
        test_new_data["data"] = []
        for topic_id, topic in tqdm(enumerate(source["data"])):
            test_count = 0
            for para in topic["paragraphs"]:
                for qas in para["qas"]:
                    q_count += 1
                    # the first 5 questions of each topic go to test,
                    # the rest to train
                    if test_count < 5:
                        test_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                        test_count += 1
                    else:
                        train_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
    # topic ids for the second file continue after the first file's topics
    topic_id = len(source["data"])
    with open(in_file_2, "r") as fh:
        logger.info(f"Importing: {in_file_2}")
        source = json.load(fh)
        for topic in tqdm(source["data"]):
            test_count = 0
            for para in topic["paragraphs"]:
                for qas in para["qas"]:
                    q_count += 1
                    if test_count < 5:
                        test_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
                        test_count += 1
                    else:
                        train_new_data["data"].append(
                            (quick_clean(raw_str=qas["question"]), topic_id,
                             topic["title"]))
            topic_id += 1
    # invariants this split should satisfy are spelled out in
    # _check_exp3_split below
    logger.info(f"Processed {q_count} questions")
    logger.info(f"Saving new train data to {train_out_file}")
    save(filename=train_out_file, obj=train_new_data)
    logger.info(f"Saving new test data to {test_out_file}")
    save(filename=test_out_file, obj=test_new_data)

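
# A hedged sanity check for the exp3 split above (not part of the original
# pipeline): at most 5 questions per topic land in the test set, and every
# (question, topic_id, title) triple carries an in-range topic_id. It
# assumes save() wrote plain JSON; that is an assumption, not confirmed here.
import json
from collections import Counter

def _check_exp3_split(test_file, num_topics):
    with open(test_file, "r") as fh:
        test_data = json.load(fh)
    per_topic = Counter(topic_id for _, topic_id, _ in test_data["data"])
    assert all(count <= 5 for count in per_topic.values())
    assert all(0 <= topic_id < num_topics for topic_id in per_topic)
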
}, f"./data/torch-test-{data_type}") logger.info(f"Built and saved {total_}/{total} fully featurized examples") if __name__ == "__main__": global nlp args = get_exp3_featurize_args() log = get_logger(log_dir=args.logging_dir, name="data-gen") nlp = spacy.blank("en") word_counter = Counter() examples, eval_examples, topic_title_id_map = pre_process( args=args, in_file=args.train_in_file, word_counter=word_counter, logger=log) save(filename=args.train_topic_title_id_map_file, obj=topic_title_id_map) save(filename=args.train_eval_file, obj=eval_examples) word_emb_mat, word2idx_dict = get_word_embedding(args=args, counter=word_counter, logger=log) save(args.word_emb_file, word_emb_mat) dev_examples, dev_eval_examples, dev_topic_title_id_map = pre_process( args=args, in_file=args.dev_in_file, word_counter=word_counter, logger=log) save(filename=args.dev_topic_title_id_map_file, obj=dev_topic_title_id_map) save(filename=args.dev_eval_file, obj=dev_eval_examples)
def pre_process(args, logger):
    # Process training set and use it to decide on the word/character
    # vocabularies
    word_counter, char_counter = Counter(), Counter()
    examples, eval_obj = process_file(filename=args.train_data_exp1,
                                      data_type="train",
                                      word_counter=word_counter,
                                      char_counter=char_counter,
                                      chunk_size=args.chunk_size)
    save(args.train_eval_file, eval_obj)
    del eval_obj

    word_emb_mat, word2idx_dict = get_embedding(
        counter=word_counter,
        data_type="word",
        emb_file=args.glove_word_file,
        vec_size=args.glove_word_dim,
        num_vectors=args.glove_word_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(counter=char_counter,
                                                data_type="char",
                                                emb_file=None,
                                                vec_size=args.glove_char_dim)
    save(args.word_emb_file, word_emb_mat)
    save(args.char_emb_file, char_emb_mat)
    del word_emb_mat
    del char_emb_mat

    dev_examples, dev_eval = process_file_dev(filename=args.dev_data_exp1,
                                              data_type="dev",
                                              word_counter=word_counter,
                                              char_counter=char_counter)

    build_features(args=args,
                   examples=examples,
                   data_type="train",
                   out_file=args.train_record_file_exp1,
                   word2idx_dict=word2idx_dict,
                   char2idx_dict=char2idx_dict,
                   is_test=False,
                   chunk_size=args.chunk_size)
    del examples

    # Process dev and test sets
    dev_meta = build_features_dev(args=args,
                                  examples=dev_examples,
                                  data_type="dev",
                                  out_file=args.dev_record_file_exp1,
                                  word2idx_dict=word2idx_dict,
                                  char2idx_dict=char2idx_dict,
                                  is_test=False)
    save(args.dev_meta_file, dev_meta)
    save(args.dev_eval_file, dev_eval)
    del dev_meta
    del dev_eval

    # test_examples, test_eval = process_file(filename=args.test_data_exp1,
    #                                         data_type="test",
    #                                         word_counter=word_counter,
    #                                         char_counter=char_counter,
    #                                         logger=logger)
    # test_meta = build_features(args=args, examples=test_examples,
    #                            data_type="test",
    #                            out_file=args.test_record_file_exp1,
    #                            word2idx_dict=word2idx_dict,
    #                            char2idx_dict=char2idx_dict, is_test=True)
    # save(args.test_meta_file, test_meta)
    # save(args.test_eval_file, test_eval)

    save(args.word2idx_file, word2idx_dict)
    save(args.char2idx_file, char2idx_dict)

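
# A hedged sketch of the contract get_embedding() is used under above: given
# a token counter it returns (embedding matrix, token-to-row index), reading
# pretrained vectors from emb_file when one is given and falling back to
# small random vectors otherwise. Illustrative only; the repo's real
# implementation may differ in details such as count filtering and the
# reserved rows.
import numpy as np

def _get_embedding_sketch(counter, emb_file, vec_size):
    tokens = list(counter)
    tok2idx = {tok: i + 2 for i, tok in enumerate(tokens)}  # rows 0/1: NULL/OOV
    emb_mat = np.random.normal(0.0, 0.1, (len(tokens) + 2, vec_size))
    emb_mat[0] = emb_mat[1] = 0.0
    if emb_file is not None:
        with open(emb_file, "r", encoding="utf-8") as fh:
            for line in fh:
                parts = line.rstrip().split(" ")
                word, vec = parts[0], parts[-vec_size:]
                if word in tok2idx:
                    emb_mat[tok2idx[word]] = np.asarray(vec, dtype=float)
    return emb_mat.tolist(), tok2idx
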
def pre_process(data, flags):
    """
    authors: @rohitmusti @chrischute
    """
    if flags[1] == "dev":
        exp3_data = data.dev_data_exp3
        eval_file = data.dev_eval_exp3
        record_file = data.dev_record_file_exp3
    elif flags[1] == "train":
        exp3_data = data.train_data_exp3
        eval_file = data.train_eval_exp3
        record_file = data.train_record_file_exp3
    elif flags[1] == "toy":
        exp3_data = data.toy_data_exp3
        eval_file = data.toy_eval_exp3
        record_file = data.toy_record_file_exp3
    else:
        # fail fast rather than crash later on an undefined variable
        raise ValueError("No valid flag was passed in; valid flags: dev, train, toy")

    # Process the chosen split and use it to decide on the word/character
    # vocabularies
    word_counter, char_counter = Counter(), Counter()
    examples, eval_obj = process_file(exp3_data, flags[1], word_counter,
                                      char_counter)
    save(eval_file, eval_obj, message=(flags[1] + " eval"))
    word_emb_mat, word2idx_dict = get_embedding(
        word_counter,
        "word",
        emb_file=data.glove_word_file,
        vec_size=data.glove_word_dim,
        num_vectors=data.glove_word_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(char_counter,
                                                "char",
                                                emb_file=data.glove_char_file,
                                                vec_size=data.char_emb_size)
    if flags[1] == "train":
        save(data.word_emb_file, word_emb_mat, message="word embedding")
        save(data.char_emb_file, char_emb_mat, message="char embedding")
        save(data.word2idx_file, word2idx_dict, message="word dictionary")
        save(data.char2idx_file, char2idx_dict, message="char dictionary")
    elif flags[1] == "toy":
        save(data.toy_word_emb_file, word_emb_mat, message="word embedding")
        save(data.toy_char_emb_file, char_emb_mat, message="char embedding")
        save(data.toy_word2idx_file, word2idx_dict, message="word dictionary")
        save(data.toy_char2idx_file, char2idx_dict, message="char dictionary")

    build_features(data, examples, flags[1], record_file, word2idx_dict,
                   char2idx_dict)

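
# Example invocation of this variant (flags mirrors sys.argv, so flags[1]
# names the split; `config` stands in for whatever object defines the
# *_exp3 paths, a detail not shown in this file):
#
#     import sys
#     pre_process(data=config, flags=sys.argv)  # e.g. `python setup.py toy`
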
def toy_transformer(in_file, out_file):
    """
    distill the original data into at most 3 topics, with each having at most
    3 paragraphs, each of which keeps at most 3 question-answer pairs

    args:
        - in_file: the file name of the data to be transformed into toy format
        - out_file: the file name of where the output ought to be written

    return: none, the data is written to an output file
    """
    new_data = {}
    new_data["experiment"] = "toy"
    with open(in_file, "r") as fh:
        fancyprint(in_str=("Importing: " + in_file))
        source = json.load(fh)
        fancyprint(in_str="Converting into toy format")
        new_data["version"] = source["version"]
        new_data["data"] = []
        topic_counter = 3
        for topic in tqdm(source["data"]):
            topic_dict = {}
            topic_dict["title"] = topic["title"]
            topic_dict["paragraphs"] = []
            para_counter = 3
            for para in topic["paragraphs"]:
                paragraph = {}
                paragraph["context"] = para["context"]
                paragraph["qas"] = []
                qa_counter = 3
                for qas in para["qas"]:
                    qas_dict = {}
                    qas_dict["id"] = qas["id"]
                    qas_dict["is_impossible"] = qas["is_impossible"]
                    qas_dict["question"] = quick_clean(raw_str=qas["question"])
                    qas_dict["answers"] = []
                    if not qas["is_impossible"]:
                        for answer in qas["answers"]:
                            answer_dict = {}
                            answer_dict["answer_start"] = answer["answer_start"]
                            answer_dict["text"] = answer["text"]
                            qas_dict["answers"].append(answer_dict)
                    paragraph["qas"].append(qas_dict)
                    qa_counter -= 1
                    if qa_counter == 0:
                        break
                topic_dict["paragraphs"].append(paragraph)
                para_counter -= 1
                if para_counter == 0:
                    break
            new_data["data"].append(topic_dict)
            topic_counter -= 1
            if topic_counter == 0:
                break
    save(filename=out_file, obj=new_data, message="saving toy data")

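
# Example invocation (hypothetical paths, not from the repo):
#
#     toy_transformer(in_file="./data/train-v2.0.json",
#                     out_file="./data/toy-train.json")
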
def pre_process(args, logger):
    # Process the training set and use it to decide on the word/character
    # vocabularies
    word_counter, char_counter = Counter(), Counter()
    examples, eval_obj, topic_contexts_examples = process_file(
        filename=args.train_data_exp2,
        data_type="train",
        word_counter=word_counter,
        char_counter=char_counter,
        logger=logger,
        chunk_size=args.chunk_size)
    save(args.train_eval_file, eval_obj)
    del eval_obj

    word_emb_mat, word2idx_dict = get_embedding(
        word_counter,
        "word",
        emb_file=args.glove_word_file,
        vec_size=args.glove_word_dim,
        num_vectors=args.glove_word_num_vecs)
    char_emb_mat, char2idx_dict = get_embedding(char_counter,
                                                "char",
                                                emb_file=None,
                                                vec_size=args.glove_char_dim)
    save(args.word_emb_file, word_emb_mat)
    save(args.char_emb_file, char_emb_mat)
    del word_emb_mat
    del char_emb_mat

    save(args.word2idx_file, word2idx_dict)
    save(args.char2idx_file, char2idx_dict)

    build_features(args=args,
                   examples=examples,
                   topic_contexts=topic_contexts_examples,
                   data_type="train",
                   out_file=args.train_record_file_exp2,
                   word2idx_dict=word2idx_dict,
                   char2idx_dict=char2idx_dict,
                   exp2_topic_contexts_file=args.exp2_train_topic_contexts,
                   is_test=False,
                   chunk_size=args.chunk_size)
    del topic_contexts_examples
    del examples

    # Process the dev set
    dev_examples, dev_eval, dev_topic_contexts = process_file(
        filename=args.dev_data_exp2,
        data_type="dev",
        word_counter=word_counter,
        char_counter=char_counter,
        logger=logger,
        chunk_size=args.chunk_size)
    dev_meta = build_features(
        args=args,
        examples=dev_examples,
        topic_contexts=dev_topic_contexts,
        data_type="dev",
        out_file=args.dev_record_file_exp2,
        word2idx_dict=word2idx_dict,
        char2idx_dict=char2idx_dict,
        exp2_topic_contexts_file=args.exp2_dev_topic_contexts,
        chunk_size=args.chunk_size)
    del dev_topic_contexts
    del dev_examples
    save(args.dev_eval_file, dev_eval)
    save(args.dev_meta_file, dev_meta)

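
# A hedged driver sketch for this pre_process, mirroring the exp3 entry
# point shown earlier; get_exp2_featurize_args is a hypothetical argument
# helper, named by analogy with get_exp3_featurize_args:
#
#     if __name__ == "__main__":
#         args = get_exp2_featurize_args()
#         log = get_logger(log_dir=args.logging_dir, name="exp2-data-gen")
#         pre_process(args=args, logger=log)
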