# Shared third-party imports for the snippets below; each function also
# relies on project-specific globals (config, util, FLAGS, ...) from its
# original repository.
import random

import numpy as np
import torch
from torch.autograd import Variable
from tqdm import tqdm


def get_batch(index, contexts, start_token):
    """Build an ICT (Inverse Cloze Task) training batch."""
    # Sentences of each sampled paragraph.
    sentence = [contexts[i]["sentence"] for i in index]
    # Pick one target sentence per paragraph for ICT training.
    target_sentence = [random.randint(0, len(sen) - 1) for sen in sentence]
    # Decide per example whether to remove the original sentence from its
    # context, as mentioned in the paper.
    remove_target = [
        random.random() < (1 - config.remove_percent)
        for _ in range(len(target_sentence))
    ]
    # Build each target context, optionally dropping the target sentence
    # (the bool `remove` slices as 0 or 1 here).
    target_context = [
        sen[:i] + sen[i + remove:]
        for i, sen, remove in zip(target_sentence, sentence, remove_target)
    ]
    # Flatten the context sentences into a single token list.
    target_context = [
        [y for x in context for y in x] for context in target_context
    ]
    target_context = [[start_token] + context for context in target_context]
    target_sentence = [sen[i] for i, sen in zip(target_sentence, sentence)]
    target_sentence = [[start_token] + sen for sen in target_sentence]
    s, s_mask = util.pad_sequence(target_sentence,
                                  max_seq=config.max_seq,
                                  device=config.device)
    c, c_mask = util.pad_sequence(target_context,
                                  max_seq=config.max_seq,
                                  device=config.device)
    return s, s_mask, c, c_mask
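
# util.pad_sequence is not shown in this file. A minimal sketch under
# assumed semantics -- pad id 0 and a mask of 1s over real tokens are both
# assumptions, not confirmed by the source:
def _pad_sequence_sketch(sequences, max_seq, device):
    padded, masks = [], []
    for seq in sequences:
        seq = seq[:max_seq]                       # truncate to max_seq
        pad = max_seq - len(seq)
        padded.append(seq + [0] * pad)            # assumed pad id 0
        masks.append([1] * len(seq) + [0] * pad)  # 1 = real token
    return (torch.tensor(padded, dtype=torch.long, device=device),
            torch.tensor(masks, dtype=torch.long, device=device))
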
def get_semantic_sim(model):
    """Embed every context and question, then return their similarity."""
    context_embedding = []
    question_embedding = []
    model.eval()
    with torch.no_grad():
        # Encode contexts batch by batch (contexts is a module-level global).
        for i in tqdm(range(0, len(contexts), config.test_batch_size)):
            c = [[y for x in context["sentence"] for y in x]
                 for context in contexts[i:i + config.test_batch_size]]
            c, c_mask = util.pad_sequence(c,
                                          max_seq=config.max_seq,
                                          device=config.device)
            c_encode = model(x=c, x_mask=c_mask)
            context_embedding.append(c_encode.detach().cpu().numpy())
        # Encode the word-piece tokenized questions the same way.
        for i in tqdm(range(0, len(q_wordpiece), config.test_batch_size)):
            q = list(q_wordpiece[i:i + config.test_batch_size])
            q, q_mask = util.pad_sequence(q,
                                          max_seq=config.max_seq,
                                          device=config.device)
            q_encode = model(x=q, x_mask=q_mask)
            question_embedding.append(q_encode.detach().cpu().numpy())
    context_embedding = np.concatenate(context_embedding, axis=0)
    question_embedding = np.concatenate(question_embedding, axis=0)
    return util.get_sim(question_embedding, context_embedding)
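
# util.get_sim is also external. One plausible implementation (an
# assumption: dense dot-product retrieval over the two embedding matrices):
def _get_sim_sketch(question_embedding, context_embedding):
    # (num_questions, dim) x (dim, num_contexts) -> similarity matrix
    return np.matmul(question_embedding, context_embedding.T)
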
def preprocess_batch(a_batch):
    """Convert a batch of samples into padded input/output feature arrays."""
    global FEATURE_LENGTH, VECTOR_EMBEDDINGS, NB_ARGUMENT_LABELS
    _input_seqs = []
    _output_seqs = []
    _lengths = []
    for _ds in a_batch:
        _lengths.append(len(_ds.seq_features))
    _max_length = max(_lengths)
    for _ds in a_batch:
        _seq = []
        for feature in _ds.seq_features:
            element = []
            for l in feature:
                if isinstance(l, str):
                    # Replace string features with their embedding vector,
                    # falling back to the unknown-word embedding.
                    element.extend(
                        list(VECTOR_EMBEDDINGS.get(l,
                                                   VECTOR_EMBEDDINGS["_unk_"])))
                else:
                    element.append(l)
            _seq.append(element)
        _input_seqs.append(util.pad_sequence(_seq, _max_length, FEATURE_LENGTH))
        _output_seqs.append(
            util.pad_sequence(list(_ds.seq_labels), _max_length, 1))
    _input_seqs = np.array(_input_seqs)
    _output_seqs = np.array(_output_seqs)
    _lengths = np.array(_lengths)
    return _input_seqs, _output_seqs, _lengths, _max_length
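
# This repository's util.pad_sequence has a different signature from the
# one sketched above. One plausible reading, given how it is called here
# (the zero padding value and row shape are assumptions):
def _pad_sequence_rows_sketch(seq, max_length, feature_length):
    # Pad with zero rows for feature matrices, scalar zeros for labels.
    pad_row = [0] * feature_length if feature_length > 1 else 0
    return list(seq) + [pad_row] * (max_length - len(seq))
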
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answers.

    Note: the output format must be answers[uuid] = "real answer", a string
    of words rather than a list or a pair of start/end indices.

    In main() the result is dumped to a JSON file; evaluate.py takes that
    JSON together with the original JSON file and reports F1 and EM. This
    function must be implemented in order to submit to the leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param rev_vocab: a list of vocabulary that maps index to actual words
    :return: dict mapping question uuid to the predicted answer string
    """
    answers = {}
    (context, question, question_uuid_data) = dataset
    context_data = convert_data_to_list(context)
    question_data = convert_data_to_list(question)
    context_padded, context_mask = pad_sequence(context_data,
                                                FLAGS.max_context_len)
    question_padded, question_mask = pad_sequence(question_data,
                                                  FLAGS.max_question_len)
    input_data = vectorize(context_padded, context_mask, question_padded,
                           question_mask)

    minibatch_size = 20
    for start in tqdm(range(0, len(question_uuid_data), minibatch_size),
                      desc="predicting on test"):
        h_s, h_e = model.decode(sess, input_data[start:start + minibatch_size])
        # Number of examples actually present in this (possibly short
        # final) minibatch.
        iter_num = min(minibatch_size, len(question_uuid_data) - start)
        for i in range(iter_num):
            a_s = np.argmax(h_s[i])
            a_e = np.argmax(h_e[i])
            if a_s > a_e:  # keep the span well-formed
                a_s, a_e = a_e, a_s
            uuid = question_uuid_data[start + i]
            context = input_data[start + i][0]
            predicted_answer = model.formulate_answer(context, rev_vocab,
                                                      a_s, a_e)
            answers[uuid] = predicted_answer
    return answers
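
# The swap above is a quick fix for when argmax(start) lands after
# argmax(end). A common alternative (not what this code does) is a
# constrained joint search over (start, end) pairs; a minimal sketch,
# assuming h_s/h_e are per-position scores for a single example:
def _best_span_sketch(h_s, h_e, max_span=None):
    best_score, best = float("-inf"), (0, 0)
    for s in range(len(h_s)):
        end_hi = len(h_e) if max_span is None else min(len(h_e), s + max_span)
        for e in range(s, end_hi):
            if h_s[s] + h_e[e] > best_score:
                best_score, best = h_s[s] + h_e[e], (s, e)
    return best
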
def preprocess_batch(a_batch):
    """
    Preprocess a batch and return input/output sequences, the actual length
    of each sequence in the batch, and the max length in the batch.

    :param a_batch:
    :return:
    """
    global FEATURE_LENGTH, VECTOR_EMBEDDINGS, NB_ARGUMENT_LABELS, USE_BI_LSTM
    _input_seqs = []
    _output_seqs = []
    _lengths = []
    _input_fw_seqs = []
    _input_bw_seqs = []
    for _ds in a_batch:
        _lengths.append(len(_ds.seq_features))
    _max_length = max(_lengths)
    for _ds in a_batch:
        _seq = []
        for feature in _ds.seq_features:
            element = []
            for l in feature:
                if isinstance(l, str):
                    # Look up string features in the embedding table,
                    # falling back to the unknown-word vector.
                    element.extend(
                        list(VECTOR_EMBEDDINGS.get(l,
                                                   VECTOR_EMBEDDINGS["_unk_"])))
                else:
                    element.append(l)
            _seq.append(element)
        if USE_BI_LSTM:
            # The backward input is the reversed sequence, padded at the
            # tail just like the forward one.
            _input_fw_seqs.append(
                util.pad_sequence(_seq, _max_length, FEATURE_LENGTH))
            _input_bw_seqs.append(
                util.pad_sequence(list(reversed(_seq)), _max_length,
                                  FEATURE_LENGTH))
        else:
            _input_seqs.append(
                util.pad_sequence(_seq, _max_length, FEATURE_LENGTH))
        _output_seqs.append(
            util.pad_sequence(list(_ds.seq_labels), _max_length, 1))
    _input_seqs = np.array(_input_seqs)
    _input_fw_seqs = np.array(_input_fw_seqs)
    _input_bw_seqs = np.array(_input_bw_seqs)
    _output_seqs = np.array(_output_seqs)
    _lengths = np.array(_lengths)
    if USE_BI_LSTM:
        return (_input_fw_seqs, _input_bw_seqs, _output_seqs, _lengths,
                _max_length)
    return _input_seqs, _output_seqs, _lengths, _max_length
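
# Toy illustration of the forward/backward layout (hypothetical values),
# reusing the _pad_sequence_rows_sketch stand-in defined earlier:
_toy = [[1, 1], [2, 2], [3, 3]]  # 3 timesteps, FEATURE_LENGTH == 2
_fw = _pad_sequence_rows_sketch(_toy, 5, 2)
_bw = _pad_sequence_rows_sketch(list(reversed(_toy)), 5, 2)
# _fw == [[1, 1], [2, 2], [3, 3], [0, 0], [0, 0]]
# _bw == [[3, 3], [2, 2], [1, 1], [0, 0], [0, 0]]
# Reversal happens before padding, so zeros stay at the tail in both
# directions and the returned per-example lengths let a dynamic RNN skip
# the padded steps.
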
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answers.

    Note: the output format must be answers[uuid] = "real answer", a string
    of words rather than a list or a pair of start/end indices.

    In main() the result is dumped to a JSON file; evaluate.py takes that
    JSON together with the original JSON file and reports F1 and EM. This
    function must be implemented in order to submit to the leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param rev_vocab: a list of vocabulary that maps index to actual words
    :return: dict mapping question uuid to the predicted answer string
    """
    answers = {}
    (context, question, question_uuid_data) = dataset
    context_data = convert_data_to_list(context)
    question_data = convert_data_to_list(question)
    context_padded, context_mask = pad_sequence(context_data,
                                                FLAGS.max_context_len)
    question_padded, question_mask = pad_sequence(question_data,
                                                  FLAGS.max_question_len)
    input_data = vectorize(context_padded, context_mask, question_padded,
                           question_mask, question_uuid_data)

    batch_size = 32
    # Ceiling division, so a partial final batch is counted exactly once.
    num_batches = (len(input_data) + batch_size - 1) // batch_size
    prog = Progbar(target=num_batches)
    for i, batch in enumerate(minibatches(input_data, batch_size)):
        a_s_vec, a_e_vec = model.answer(sess, batch)
        prog.update(i + 1)
        for (a_s, a_e, context, uuid) in zip(a_s_vec, a_e_vec, batch[0],
                                             batch[4]):
            if a_s > a_e:  # keep the span well-formed
                a_s, a_e = a_e, a_s
            predicted_answer = model.formulate_answer(context, rev_vocab,
                                                      a_s, a_e)
            answers[uuid] = predicted_answer
    return answers
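
# minibatches is an external helper. A plausible sketch consistent with
# how it is indexed above (batch[0] = contexts, batch[4] = uuids); the
# exact field order inside vectorize's output is an assumption:
def _minibatches_sketch(data, batch_size):
    for start in range(0, len(data), batch_size):
        chunk = data[start:start + batch_size]
        # Transpose the list of 5-tuples into a tuple of 5 field lists.
        yield tuple(list(field) for field in zip(*chunk))
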
def get_inputs(question, evidences, word2idx):
    """Tensorize one question and its evidence passages into padded,
    masked CUDA batches."""
    question_list = []
    evidence_list = []
    q_list = []
    e_list = []
    q_mask_list = []
    e_mask_list = []
    ques, q_len = get_chars(question, word2idx)
    question, q_mask = pad_sequence(ques, param.question_size, word2idx)
    nb_evid = len(evidences)
    for i, e in enumerate(evidences):
        e, e_len = get_chars(e, word2idx)
        if e_len == 0:
            continue
        # Sample a different evidence passage to build contrastive features.
        other_id = random.randint(0, nb_evid - 1)
        if nb_evid != 1:
            while other_id == i:
                other_id = random.randint(0, nb_evid - 1)
        other_evidence = evidences[other_id]
        other_evidence, _ = get_chars(other_evidence, word2idx)
        q_feat = get_feats(ques, e)
        e_feat = get_feats(other_evidence, e)
        evidence, e_mask = pad_sequence(e, param.evidence_size, word2idx)
        q_tags, _ = pad_sequence(q_feat, param.evidence_size, word2idx)
        e_tags, _ = pad_sequence(e_feat, param.evidence_size, word2idx)
        question_list.append(question)
        evidence_list.append(evidence)
        q_list.append(q_tags)
        e_list.append(e_tags)
        q_mask_list.append(q_mask)
        e_mask_list.append(e_mask)
    # Variable is a no-op wrapper in modern PyTorch but is kept here to
    # preserve the original (pre-0.4) API usage.
    question = Variable(torch.LongTensor(question_list)).cuda()
    evidence = Variable(torch.LongTensor(evidence_list)).cuda()
    e_feat = Variable(torch.LongTensor(e_list)).cuda()
    q_feat = Variable(torch.LongTensor(q_list)).cuda()
    q_mask = Variable(torch.ByteTensor(q_mask_list)).cuda()
    e_mask = Variable(torch.ByteTensor(e_mask_list)).cuda()
    return question, evidence, q_mask, e_mask, q_feat, e_feat
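
# get_feats is external. A hypothetical reading consistent with its call
# sites above (tagging evidence tokens that also occur in a reference
# sequence, as in common QA word-overlap features) -- this is an
# assumption, not confirmed by the source:
def _get_feats_sketch(reference, evidence):
    ref = set(reference)
    return [1 if tok in ref else 0 for tok in evidence]
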