Esempio n. 1
0
def convert_to_squad_format(qa_json_file, squad_file):
    """Convert a TriviaQA question file into a SQuAD-style JSON file.

    The question/answer/document triples derived from ``qa_json_file`` are
    shuffled with the seed taken from the module-level ``args``; each kept
    triple becomes one single-paragraph, single-question entry, and the
    result is written to ``squad_file``.

    Args:
        qa_json_file: Path handed to ``dataset_utils.read_triviaqa_data``.
        squad_file: Destination path handed to ``utils.write_json_to_file``.

    Notes:
        * On the ``'train'`` split, triples whose answer is not located in
          the selected text (index ``-1``) are left out.  On any other split
          the entry is kept and the answer is recorded with whatever index
          was returned, including ``-1``.
        * On the ``'train'`` split of the ``'Web'`` domain, conversion stops
          as soon as ``args.sample_size`` entries have been collected.
    """
    qa_json = dataset_utils.read_triviaqa_data(qa_json_file)
    triples = get_qad_triples(qa_json)

    # Seeded shuffle so the sampled subset is reproducible across runs.
    random.seed(args.seed)
    random.shuffle(triples)

    entries = []
    for triple in tqdm(triples):
        question_id = triple['QuestionId']
        context = select_relevant_portion(get_text(triple, triple['Source']))

        qa = {
            'question': triple['Question'],
            'answers': [],
            'id': dataset_utils.get_question_doc_string(question_id, triple['Filename']),
            'qid': question_id,
        }

        answer_text, answer_start = dataset_utils.answer_index_in_document(
            triple['Answer'], context)

        # Training examples without a locatable answer are dropped entirely.
        if answer_start == -1 and qa_json['Split'] == 'train':
            continue

        qa['answers'].append({'text': answer_text, 'answer_start': answer_start})
        entries.append({'paragraphs': [{'context': context, 'qas': [qa]}]})

        # Cap the size of the Web-domain training set.
        if (qa_json['Split'] == 'train'
                and len(entries) >= args.sample_size
                and qa_json['Domain'] == 'Web'):
            break

    utils.write_json_to_file({'data': entries, 'version': qa_json['Version']}, squad_file)
    print('Added', len(entries))
Esempio n. 2
0
	learning_rate = 0.001
	print "Loading embedding matrix.."
	if args.embed_mat_path is not None:
		embed_mat = numpy.load(args.embed_mat_path)
	else:
		embed_mat = get_embed_mat(args.glovefile,
					  args.id2word)
	args.char_vocab_size = char_vocab_size
	args.embed_mat = numpy.load(args.embed_mat_path)

	print "Loading Dev data..."
	textdatapath = args.dev_json
	processed_data = args.tok_dev_json
	datafile = args.indexed_dev_json
	dev_data = load_data(datafile)
	dataset = dataset_utils.read_triviaqa_data(args.raw_dev_json)

	print "Loading verified Dev data"
	v_textdatapath = args.verified_dev_json
	v_processed_data = args.verified_tok_dev_json
	v_datafile = args.verified_indexed_dev_json
	v_dev_data = load_data(v_datafile)
	v_dataset = dataset_utils.read_triviaqa_data(args.verified_raw_dev_json)

	# print "Loading Test data"
	# t_textdatapath = args.test_json
	# t_processed_data = args.tok_test_json
	# t_datafile = args.indexed_test_json
	# t_dev_data = load_data(t_datafile)
	# t_dataset = dataset_utils.read_triviaqa_data(args.raw_test_json)
Esempio n. 3
0
    embed_mat = numpy.load(args.embed_mat_path)
    args.embed_mat = embed_mat
    args.char_vocab_size = char_vocab_size
    # -----------------------------------------------------------------------------#

    logging.info("Training data loading...")
    train_data = load_data(args.indexed_train_json)

    ##Testing
    logging.info("Loading dev data")
    textdatapath = args.dev_json
    processed_data = args.tok_dev_json
    datafile = args.indexed_dev_json
    dev_data = load_data(datafile)
    # dataset = load_data(textdatapath)['data']
    dataset = dataset_utils.read_triviaqa_data(args.raw_dev_json)
    key_to_ground_truth = dataset_utils.get_key_to_ground_truth(dataset)
    # -----------------------------------------------------------------------------#
    # print "Number of Training Samples: ", len(train_data)
    # print "Number of Dev samples: ", len(dev_data)

    if os.path.isfile(baseexp + '/results.txt'):
        mode = 'a'
    else:
        mode = 'w'
    with open(baseexp + '/results.txt', mode) as fp:
        fp.write("######RESULTS######\n")

    # Initializations for tracking the best model
    prev_best_em = 0.0
    prev_best_f1 = 0.0