def convert_to_squad_format(qa_json_file, squad_file):
    """Convert a TriviaQA json file into SQuAD-format json.

    Reads the raw TriviaQA data, builds one (question, answer, document)
    triple per paragraph, locates the answer span inside a truncated portion
    of the document text, and writes the result to ``squad_file``.

    Args:
        qa_json_file: path to the raw TriviaQA json file.
        squad_file: output path for the SQuAD-format json.
    """
    qa_json = dataset_utils.read_triviaqa_data(qa_json_file)
    qad_triples = get_qad_triples(qa_json)

    # Shuffle deterministically so the sub-sampling below is reproducible.
    random.seed(args.seed)
    random.shuffle(qad_triples)

    data = []
    for qad in tqdm(qad_triples):
        qid = qad['QuestionId']
        text = get_text(qad, qad['Source'])
        selected_text = select_relevant_portion(text)
        question = qad['Question']
        para = {'context': selected_text,
                'qas': [{'question': question, 'answers': []}]}
        data.append({'paragraphs': [para]})
        qa = para['qas'][0]
        qa['id'] = dataset_utils.get_question_doc_string(qid, qad['Filename'])
        qa['qid'] = qid

        ans_string, index = dataset_utils.answer_index_in_document(
            qad['Answer'], selected_text)
        if index == -1 and qa_json['Split'] == 'train':
            # Training samples whose answer does not occur in the truncated
            # document are useless for span supervision: drop the entry that
            # was just appended. (The redundant else-after-continue was
            # removed; control flow is unchanged.)
            data.pop()
            continue
        qa['answers'].append({'text': ans_string, 'answer_start': index})

        # Cap the number of Web-domain training samples at args.sample_size.
        if (qa_json['Split'] == 'train'
                and len(data) >= args.sample_size
                and qa_json['Domain'] == 'Web'):
            break

    squad = {'data': data, 'version': qa_json['Version']}
    utils.write_json_to_file(squad, squad_file)
    print ('Added', len(data))
learning_rate = 0.001 print "Loading embedding matrix.." if args.embed_mat_path is not None: embed_mat = numpy.load(args.embed_mat_path) else: embed_mat = get_embed_mat(args.glovefile, args.id2word) args.char_vocab_size = char_vocab_size args.embed_mat = numpy.load(args.embed_mat_path) print "Loading Dev data..." textdatapath = args.dev_json processed_data = args.tok_dev_json datafile = args.indexed_dev_json dev_data = load_data(datafile) dataset = dataset_utils.read_triviaqa_data(args.raw_dev_json) print "Loading verified Dev data" v_textdatapath = args.verified_dev_json v_processed_data = args.verified_tok_dev_json v_datafile = args.verified_indexed_dev_json v_dev_data = load_data(v_datafile) v_dataset = dataset_utils.read_triviaqa_data(args.verified_raw_dev_json) # print "Loading Test data" # t_textdatapath = args.test_json # t_processed_data = args.tok_test_json # t_datafile = args.indexed_test_json # t_dev_data = load_data(t_datafile) # t_dataset = dataset_utils.read_triviaqa_data(args.raw_test_json)
# Embedding matrix and character-vocabulary size are shared downstream via
# the args namespace.
embed_mat = numpy.load(args.embed_mat_path)
args.embed_mat = embed_mat
args.char_vocab_size = char_vocab_size

# -----------------------------------------------------------------------------#
logging.info("Training data loading...")
train_data = load_data(args.indexed_train_json)

##Testing
logging.info("Loading dev data")
textdatapath = args.dev_json
processed_data = args.tok_dev_json
datafile = args.indexed_dev_json
dev_data = load_data(datafile)
# dataset = load_data(textdatapath)['data']
dataset = dataset_utils.read_triviaqa_data(args.raw_dev_json)
key_to_ground_truth = dataset_utils.get_key_to_ground_truth(dataset)
# -----------------------------------------------------------------------------#

# print "Number of Training Samples: ", len(train_data)
# print "Number of Dev samples: ", len(dev_data)

# Append to an existing results file, otherwise start a fresh one.
results_file = baseexp + '/results.txt'
mode = 'a' if os.path.isfile(results_file) else 'w'
with open(results_file, mode) as fp:
    fp.write("######RESULTS######\n")

# Trackers for the best exact-match / F1 scores seen so far.
prev_best_em = 0.0
prev_best_f1 = 0.0