def sample_a_partition(self, max_matching=False):
    word_size = self.vocab.vocab_size + 1
    sentences = []
    prediction_lengths = []
    generator_input_idx = []
    generator_output_idx = []
    for i, cur_lattice in enumerate(self.target_lattices):
        (cur_phrases, cur_phrase_ids) = cur_lattice.sample_a_partition(
            max_matching=max_matching)
        sentences.append(" ".join(cur_phrases))
        prediction_lengths.append(len(cur_phrases))
        generator_output_idx.append(cur_phrase_ids)
        cur_input_idx = [self.gen_input_words[i][0]]
        for cur_phrase, cur_phrase_id in zip(cur_phrases, cur_phrase_ids):
            if cur_phrase_id < word_size:
                cur_word_id = cur_phrase_id
            elif not self.phrase_vocabs[i].has_phrase_id(cur_phrase_id):
                # if an OOV phrase is sampled, reset it to UNK
                cur_word_id = self.vocab.vocab_size
            else:
                # take the last word of a phrase as the input word for decoding
                cur_word_id = self.vocab.getIndex(re.split("\\s+", cur_phrase)[-1])
            cur_input_idx.append(cur_word_id)
        generator_input_idx.append(cur_input_idx[:-1])
    generator_input_idx = padding_utils.pad_2d_vals(
        generator_input_idx, len(generator_input_idx), self.options.max_answer_len)
    generator_output_idx = padding_utils.pad_2d_vals(
        generator_output_idx, len(generator_output_idx), self.options.max_answer_len)
    return (sentences, prediction_lengths, generator_input_idx, generator_output_idx)
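# Illustrative sketch (not from the repo): padding_utils is not shown in this dump.
# From the call pattern pad_2d_vals(values, dim1, dim2) used above, it appears to
# pack a ragged list of index lists into a fixed-size 2D array, truncating or
# zero-padding each row. A minimal version under that assumption; the real
# implementation may differ, e.g. in dtype or padding value.
import numpy as np

def pad_2d_vals(in_vals, dim1_size, dim2_size, dtype=np.int32):
    # Pack a list of index lists into a [dim1_size, dim2_size] array:
    # rows longer than dim2_size are truncated, shorter rows stay zero-padded.
    out = np.zeros((dim1_size, dim2_size), dtype=dtype)
    for i, row in enumerate(in_vals[:dim1_size]):
        cur_len = min(len(row), dim2_size)
        out[i, :cur_len] = row[:cur_len]
    return out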
def __init__(self, ori_batch):
    self.options = ori_batch.options
    self.amr_node = ori_batch.amr_node
    self.id = ori_batch.id
    self.target_ref = ori_batch.target_ref
    self.batch_size = ori_batch.batch_size
    self.vocab = ori_batch.vocab
    self.node_num = np.array(ori_batch.node_num, dtype=np.int32)
    self.sent_len = np.array(ori_batch.sent_len, dtype=np.int32)
    self.sent_pos_len = np.array(ori_batch.sent_pos_len, dtype=np.int32)
    self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(ori_batch.in_neigh_mask)
    self.out_neigh_mask = padding_utils.pad_3d_vals_no_size(ori_batch.out_neigh_mask)

    # making ndarray
    self.nodes = padding_utils.pad_2d_vals_no_size(ori_batch.nodes)
    if self.options.with_char:
        self.nodes_chars = padding_utils.pad_3d_vals_no_size(ori_batch.nodes_chars)
    self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(ori_batch.in_neigh_indices)
    self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(ori_batch.in_neigh_edges)
    self.out_neigh_indices = padding_utils.pad_3d_vals_no_size(ori_batch.out_neigh_indices)
    self.out_neigh_edges = padding_utils.pad_3d_vals_no_size(ori_batch.out_neigh_edges)
    assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
    assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
    assert self.out_neigh_mask.shape == self.out_neigh_edges.shape

    # [batch_size, sent_len_max]
    self.sent_inp = padding_utils.pad_2d_vals(
        ori_batch.sent_inp, len(ori_batch.sent_inp), self.options.max_answer_len)
    self.sent_out = padding_utils.pad_2d_vals(
        ori_batch.sent_out, len(ori_batch.sent_out), self.options.max_answer_len)
    self.sent_pos_inp = padding_utils.pad_2d_vals(
        ori_batch.sent_pos_inp, len(ori_batch.sent_pos_inp), self.options.max_answer_len)
    self.sent_pos_out = padding_utils.pad_2d_vals(
        ori_batch.sent_pos_out, len(ori_batch.sent_pos_out), self.options.max_answer_len)
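# Illustrative sketch (not from the repo): the "_no_size" padding variants appear
# to infer the target dimensions from the data itself rather than taking them as
# arguments, which is also why the mask/indices/edges arrays built from equally
# shaped ragged lists end up with identical shapes. A minimal version under that
# assumption, padding with zeros:
import numpy as np

def pad_3d_vals_no_size(in_vals, dtype=np.int32):
    # in_vals: ragged [batch][row][col] lists; assumes a non-empty batch.
    dim1 = len(in_vals)
    dim2 = max(len(mat) for mat in in_vals)
    dim3 = max((len(row) for mat in in_vals for row in mat), default=0)
    out = np.zeros((dim1, dim2, dim3), dtype=dtype)
    for i, mat in enumerate(in_vals):
        for j, row in enumerate(mat):
            out[i, j, :len(row)] = row
    return out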
def run_rl_training(self, sess, batch, options):
    feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
    feed_dict[self.gen_input_words] = batch.gen_input_words
    sample_output, greedy_output = sess.run(
        [self.sampled_words, self.greedy_words], feed_dict)
    sample_output = sample_output.tolist()
    greedy_output = greedy_output.tolist()

    rl_inputs = []
    rl_outputs = []
    rl_input_lengths = []
    reward = []
    for i, (sout, gout) in enumerate(zip(sample_output, greedy_output)):
        sout, slex = self.dec_word_vocab.getLexical(sout)
        gout, glex = self.dec_word_vocab.getLexical(gout)
        rl_inputs.append([int(batch.gen_input_words[i, 0])] + sout[:-1])
        rl_outputs.append(sout)
        rl_input_lengths.append(len(sout))
        ref_lex = batch.instances[i][1].tokText
        #r = metric_utils.evaluate_captions([ref_lex,],[slex,])
        #b = metric_utils.evaluate_captions([ref_lex,],[glex,])
        slst = slex.split()
        glst = glex.split()
        rlst = ref_lex.split()
        if options.reward_type == 'bleu':
            r = sentence_bleu([rlst], slst, smoothing_function=cc.method3)
            b = sentence_bleu([rlst], glst, smoothing_function=cc.method3)
        elif options.reward_type == 'rouge':
            # smoothing_function applies to BLEU only, so it is not passed here
            r = sentence_rouge(ref_lex, slex)
            b = sentence_rouge(ref_lex, glex)
        reward.append(r - b)
        #print('Ref: {}'.format(ref_lex.encode('utf-8','ignore')))
        #print('Sample: {}'.format(slex.encode('utf-8','ignore')))
        #print('Greedy: {}'.format(glex.encode('utf-8','ignore')))
        #print('R-B: {}'.format(reward[-1]))
        #print('-----')

    rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs), self.options.max_answer_len)
    rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs), self.options.max_answer_len)
    rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
    reward = np.array(reward, dtype=np.float32)
    assert rl_inputs.shape == rl_outputs.shape

    feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
    feed_dict[self.reward] = reward
    feed_dict[self.gen_input_words] = rl_inputs
    feed_dict[self.in_answer_words] = rl_outputs
    feed_dict[self.answer_lengths] = rl_input_lengths
    _, loss = sess.run([self.train_op, self.loss], feed_dict)
    return loss
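# Illustrative sketch (not from the repo): the reward r - b above is a
# self-critical baseline, i.e. the sampled output's score minus the score of the
# model's own greedy decode, so only samples that beat the greedy output receive
# positive reward. A self-contained toy version of that arithmetic with NLTK's
# sentence_bleu (the sentences below are made up; cc is assumed to be an nltk
# SmoothingFunction instance, matching the cc.method3 usage above):
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

cc = SmoothingFunction()
ref = "the boy wants to go".split()
sampled = "the boy wants to leave".split()
greedy = "the boy want go".split()
r = sentence_bleu([ref], sampled, smoothing_function=cc.method3)
b = sentence_bleu([ref], greedy, smoothing_function=cc.method3)
reward = r - b  # positive only if the sample outscores the greedy baseline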
def run_rl_training(self, sess, batch, options):
    assert False, 'not supported yet'
    flipp = options.flipp if 'flipp' in options.__dict__ else 0.1
    # make feed_dict
    feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
    feed_dict[self.action_inp] = batch.action_inp
    # get greedy and gold outputs
    greedy_output = sess.run(self.greedy_words, feed_dict)
    greedy_output = greedy_output.tolist()
    gold_output = batch.in_answer_words.tolist()
    # generate sample_output by flipping coins
    sample_output = np.copy(batch.action_out)
    for i in range(batch.in_answer_words.shape[0]):
        # don't change stop token '</s>'
        seq_len = min(options.max_answer_len, batch.action_length[i] - 1)
        for j in range(seq_len):
            if greedy_output[i][j] != 0 and random.random() < flipp:
                sample_output[i, j] = greedy_output[i][j]
    sample_output = sample_output.tolist()

    rl_inputs = []
    rl_outputs = []
    rl_input_lengths = []
    reward = []
    for i, (sout, gout) in enumerate(zip(sample_output, greedy_output)):
        sout, slex = self.word_vocab.getLexical(sout)
        gout, glex = self.word_vocab.getLexical(gout)
        rl_inputs.append([int(batch.gen_input_words[i, 0])] + sout[:-1])
        rl_outputs.append(sout)
        rl_input_lengths.append(len(sout))
        _, ref_lex = self.word_vocab.getLexical(gold_output[i])
        slst = slex.split()
        glst = glex.split()
        rlst = ref_lex.split()
        # assumed reward computation, mirroring run_rl_training_subsample
        if options.reward_type == 'bleu':
            r = sentence_bleu([rlst], slst, smoothing_function=cc.method3)
            b = sentence_bleu([rlst], glst, smoothing_function=cc.method3)
        elif options.reward_type == 'rouge':
            r = sentence_rouge(ref_lex, slex)
            b = sentence_rouge(ref_lex, glex)
        reward.append(r - b)

    rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs), self.options.max_answer_len)
    rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs), self.options.max_answer_len)
    rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
    reward = np.array(reward, dtype=np.float32)
    assert rl_inputs.shape == rl_outputs.shape

    feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
    feed_dict[self.reward] = reward
    feed_dict[self.gen_input_words] = rl_inputs
    feed_dict[self.in_answer_words] = rl_outputs
    feed_dict[self.answer_lengths] = rl_input_lengths
    _, loss = sess.run([self.train_op, self.loss], feed_dict)
    return loss
def run_rl_training_subsample(self, sess, batch, options):
    flipp = options.flipp if 'flipp' in options.__dict__ else 0.1
    # make feed_dict
    feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
    feed_dict[self.answer_inp] = batch.sent_inp
    # get greedy and gold outputs
    greedy_output = sess.run(self.greedy_words, feed_dict)  # [batch, sent_len]
    greedy_output = greedy_output.tolist()
    gold_output = batch.sent_out.tolist()
    # generate sample_output by flipping coins
    sample_output = np.copy(batch.sent_out)
    for i in range(batch.sent_out.shape[0]):
        # don't change stop token '</s>'
        seq_len = min(options.max_answer_len, batch.sent_len[i] - 1)
        for j in range(seq_len):
            if greedy_output[i][j] != 0 and random.random() < flipp:
                sample_output[i, j] = greedy_output[i][j]
    sample_output = sample_output.tolist()

    st_wid = self.word_vocab.getIndex('<s>')
    en_wid = self.word_vocab.getIndex('</s>')
    rl_inputs = []
    rl_outputs = []
    rl_input_lengths = []
    reward = []
    for i, (sout, gout) in enumerate(zip(sample_output, greedy_output)):
        sout, slex = self.word_vocab.getLexical(sout)
        gout, glex = self.word_vocab.getLexical(gout)
        rl_inputs.append([st_wid] + sout[:-1])
        rl_outputs.append(sout)
        rl_input_lengths.append(len(sout))
        _, ref_lex = self.word_vocab.getLexical(gold_output[i])
        slst = slex.split()
        glst = glex.split()
        rlst = ref_lex.split()
        if options.reward_type == 'bleu':
            r = sentence_bleu([rlst], slst, smoothing_function=cc.method3)
            b = sentence_bleu([rlst], glst, smoothing_function=cc.method3)
        elif options.reward_type == 'rouge':
            # smoothing_function applies to BLEU only, so it is not passed here
            r = sentence_rouge(ref_lex, slex)
            b = sentence_rouge(ref_lex, glex)
        reward.append(r - b)
        #print('Ref: {}'.format(ref_lex.encode('utf-8','ignore')))
        #print('Sample: {}'.format(slex.encode('utf-8','ignore')))
        #print('Greedy: {}'.format(glex.encode('utf-8','ignore')))
        #print('R-B: {}'.format(reward[-1]))
        #print('-----')

    rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs), self.options.max_answer_len)
    rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs), self.options.max_answer_len)
    rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
    reward = np.array(reward, dtype=np.float32)
    assert rl_inputs.shape == rl_outputs.shape

    feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
    feed_dict[self.reward] = reward
    feed_dict[self.answer_inp] = rl_inputs
    feed_dict[self.answer_ref] = rl_outputs
    feed_dict[self.answer_len] = rl_input_lengths
    _, loss = sess.run([self.train_op, self.loss], feed_dict)
    return loss
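# Illustrative sketch (not from the repo): unlike run_rl_training, this routine
# does not sample from the decoder; it perturbs the gold sequence by swapping
# each position (except the final '</s>') to the greedy prediction with
# probability flipp (default 0.1), keeping the "sample" close to the reference.
# A toy version of that coin-flipping step with made-up token ids:
import random
import numpy as np

random.seed(0)
flipp = 0.1
gold = np.array([[12, 7, 41, 3, 0, 0]])   # toy gold ids; 0 = padding, 3 = '</s>'
greedy = [[12, 9, 41, 5, 3, 0]]           # toy greedy decode
sent_len = [4]                            # gold length including '</s>'

sample = np.copy(gold)
for i in range(gold.shape[0]):
    for j in range(min(sent_len[i] - 1, gold.shape[1])):  # skip the final '</s>'
        if greedy[i][j] != 0 and random.random() < flipp:
            sample[i, j] = greedy[i][j]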
def __init__(self, instances, options, word_vocab=None, char_vocab=None,
             POS_vocab=None, NER_vocab=None):
    self.options = options
    self.instances = instances
    self.batch_size = len(instances)
    self.vocab = word_vocab
    self.passage_words = [
        instances[i][0].tokText.split() for i in range(self.batch_size)
    ]
    self.has_sent3 = False
    if instances[0][2] is not None:
        self.has_sent3 = True

    # create length
    self.sent1_length = []  # [batch_size]
    self.sent2_length = []  # [batch_size]
    if self.has_sent3:
        self.sent3_length = []  # [batch_size]
    for (sent1, sent2, sent3) in instances:
        self.sent1_length.append(sent1.get_length())
        self.sent2_length.append(sent2.get_length())
        if self.has_sent3:
            self.sent3_length.append(sent3.get_length())
    self.sent1_length = np.array(self.sent1_length, dtype=np.int32)
    self.sent2_length = np.array(self.sent2_length, dtype=np.int32)
    if self.has_sent3:
        self.sent3_length = np.array(self.sent3_length, dtype=np.int32)

    # create word representation
    start_id = word_vocab.getIndex('<s>')
    end_id = word_vocab.getIndex('</s>')
    if options.with_word:
        self.sent1_word = []  # [batch_size, sent1_len]
        self.sent2_word = []  # [batch_size, sent2_len]
        self.sent2_input_word = []
        if self.has_sent3:
            self.sent3_word = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_word.append(sent1.word_idx_seq)
            self.sent2_word.append(sent2.word_idx_seq)
            self.sent2_input_word.append([start_id] + sent2.word_idx_seq[:-1])
            if self.has_sent3:
                self.sent3_word.append(sent3.word_idx_seq)
        self.sent1_word = padding_utils.pad_2d_vals_no_size(self.sent1_word)
        self.sent2_word = padding_utils.pad_2d_vals(
            self.sent2_word, len(self.sent2_word), options.max_answer_len)
        self.sent2_input_word = padding_utils.pad_2d_vals(
            self.sent2_input_word, len(self.sent2_input_word), options.max_answer_len)
        if self.has_sent3:
            self.sent3_word = padding_utils.pad_2d_vals_no_size(self.sent3_word)
        self.in_answer_words = self.sent2_word
        self.gen_input_words = self.sent2_input_word
        self.answer_lengths = self.sent2_length

    if options.with_char:
        self.sent1_char = []  # [batch_size, sent1_len]
        self.sent2_char = []  # [batch_size, sent2_len]
        if self.has_sent3:
            self.sent3_char = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_char.append(sent1.char_idx_seq)
            self.sent2_char.append(sent2.char_idx_seq)
            if self.has_sent3:
                self.sent3_char.append(sent3.char_idx_seq)
        self.sent1_char = padding_utils.pad_3d_vals_no_size(self.sent1_char)
        self.sent2_char = padding_utils.pad_3d_vals_no_size(self.sent2_char)
        if self.has_sent3:
            self.sent3_char = padding_utils.pad_3d_vals_no_size(self.sent3_char)

    if options.with_POS:
        self.sent1_POS = []  # [batch_size, sent1_len]
        self.sent2_POS = []  # [batch_size, sent2_len]
        if self.has_sent3:
            self.sent3_POS = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_POS.append(sent1.POS_idx_seq)
            self.sent2_POS.append(sent2.POS_idx_seq)
            if self.has_sent3:
                self.sent3_POS.append(sent3.POS_idx_seq)
        self.sent1_POS = padding_utils.pad_2d_vals_no_size(self.sent1_POS)
        self.sent2_POS = padding_utils.pad_2d_vals_no_size(self.sent2_POS)
        if self.has_sent3:
            self.sent3_POS = padding_utils.pad_2d_vals_no_size(self.sent3_POS)

    if options.with_NER:
        self.sent1_NER = []  # [batch_size, sent1_len]
        self.sent2_NER = []  # [batch_size, sent2_len]
        if self.has_sent3:
            self.sent3_NER = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_NER.append(sent1.NER_idx_seq)
            self.sent2_NER.append(sent2.NER_idx_seq)
            if self.has_sent3:
                self.sent3_NER.append(sent3.NER_idx_seq)
        self.sent1_NER = padding_utils.pad_2d_vals_no_size(self.sent1_NER)
        self.sent2_NER = padding_utils.pad_2d_vals_no_size(self.sent2_NER)
        if self.has_sent3:
            self.sent3_NER = padding_utils.pad_2d_vals_no_size(self.sent3_NER)

    if options.with_phrase_projection:
        self.build_phrase_vocabs()
        if options.pretrain_with_max_matching and options.with_target_lattice:
            (_, prediction_lengths, generator_input_idx,
             generator_output_idx) = self.sample_a_partition(max_matching=True)
            self.in_answer_words = generator_output_idx
            self.gen_input_words = generator_input_idx
            self.answer_lengths = prediction_lengths
def __init__(self, instances, options, word_vocab=None):
    self.options = options
    self.amr_node = [x[0] for x in instances]
    self.id = [x[-1] for x in instances]
    self.target_ref = [x[-2] for x in instances]  # list of tuples
    self.batch_size = len(instances)
    self.vocab = word_vocab

    # create length
    self.node_num = []  # [batch_size]
    self.sent_len = []  # [batch_size]
    for (nodes_idx, nodes_chars_idx, in_neigh_indices, in_neigh_edges_idx,
         out_neigh_indices, out_neigh_edges_idx, sentence_idx, sentence, id) in instances:
        self.node_num.append(len(nodes_idx))
        self.sent_len.append(min(len(sentence_idx) + 1, options.max_answer_len))
    self.node_num = np.array(self.node_num, dtype=np.int32)
    self.sent_len = np.array(self.sent_len, dtype=np.int32)

    # node char num
    if options.with_char:
        self.nodes_chars_num = [
            [len(nodes_chars_idx) for nodes_chars_idx in instance[1]]
            for instance in instances
        ]
        self.nodes_chars_num = padding_utils.pad_2d_vals_no_size(self.nodes_chars_num)

    # neigh mask
    self.in_neigh_mask = []  # [batch_size, node_num, neigh_num]
    self.out_neigh_mask = []
    for instance in instances:
        ins = []
        for in_neighs in instance[2]:
            ins.append([1 for _ in in_neighs])
        self.in_neigh_mask.append(ins)
        outs = []
        for out_neighs in instance[4]:
            outs.append([1 for _ in out_neighs])
        self.out_neigh_mask.append(outs)
    self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(self.in_neigh_mask)
    self.out_neigh_mask = padding_utils.pad_3d_vals_no_size(self.out_neigh_mask)

    # create word representation
    start_id = word_vocab.getIndex('<s>')
    end_id = word_vocab.getIndex('</s>')
    self.nodes = [x[0] for x in instances]
    if options.with_char:
        self.nodes_chars = [inst[1] for inst in instances]  # [batch_size, sent_len, char_num]
    self.in_neigh_indices = [x[2] for x in instances]
    self.in_neigh_edges = [x[3] for x in instances]
    self.out_neigh_indices = [x[4] for x in instances]
    self.out_neigh_edges = [x[5] for x in instances]

    self.sent_inp = []
    self.sent_out = []
    for _, _, _, _, _, _, sentence_idx, sentence, id in instances:
        if len(sentence_idx) < options.max_answer_len:
            self.sent_inp.append([start_id] + sentence_idx)
            self.sent_out.append(sentence_idx + [end_id])
        else:
            self.sent_inp.append([start_id] + sentence_idx[:-1])
            self.sent_out.append(sentence_idx)

    # making ndarray
    self.nodes = padding_utils.pad_2d_vals_no_size(self.nodes)
    if options.with_char:
        self.nodes_chars = padding_utils.pad_3d_vals_no_size(self.nodes_chars)
    self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(self.in_neigh_indices)
    self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(self.in_neigh_edges)
    self.out_neigh_indices = padding_utils.pad_3d_vals_no_size(self.out_neigh_indices)
    self.out_neigh_edges = padding_utils.pad_3d_vals_no_size(self.out_neigh_edges)
    assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
    assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
    assert self.out_neigh_mask.shape == self.out_neigh_edges.shape

    # [batch_size, sent_len_max]
    self.sent_inp = padding_utils.pad_2d_vals(
        self.sent_inp, len(self.sent_inp), options.max_answer_len)
    self.sent_out = padding_utils.pad_2d_vals(
        self.sent_out, len(self.sent_out), options.max_answer_len)
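# Illustrative sketch (not from the repo, and not necessarily how this model's
# graph encoder works): one common way padded neighbor indices and masks like
# the ones built above are consumed is a masked gather-and-average over each
# node's incoming neighbors. Padded neighbor slots point at node 0, which is
# why a 0/1 mask with the same shape is carried alongside the indices.
import numpy as np

def masked_neighbor_mean(node_repr, neigh_indices, neigh_mask):
    # node_repr:     [batch, node_num, dim]
    # neigh_indices: [batch, node_num, neigh_num], indices into the node axis
    # neigh_mask:    [batch, node_num, neigh_num], 1 for real neighbors, 0 for padding
    batch_idx = np.arange(node_repr.shape[0])[:, None, None]
    gathered = node_repr[batch_idx, neigh_indices]        # [batch, node_num, neigh_num, dim]
    masked = gathered * neigh_mask[..., None]
    denom = np.maximum(neigh_mask.sum(axis=2, keepdims=True), 1.0)
    return masked.sum(axis=2) / denom                     # [batch, node_num, dim]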
def __init__(self, instances, options, word_vocab=None, char_vocab=None,
             POS_vocab=None, NER_vocab=None):
    self.options = options
    self.batch_size = len(instances)
    self.vocab = word_vocab
    self.id = [inst[-1] for inst in instances]
    self.source = [inst[-3] for inst in instances]
    self.target_ref = [inst[-2] for inst in instances]

    # create length
    self.sent1_length = []  # [batch_size]
    self.sent2_length = []  # [batch_size]
    for (sent1_idx, sent2_idx, _, _, _) in instances:
        self.sent1_length.append(len(sent1_idx))
        self.sent2_length.append(min(len(sent2_idx) + 1, options.max_answer_len))
    self.sent1_length = np.array(self.sent1_length, dtype=np.int32)
    self.sent2_length = np.array(self.sent2_length, dtype=np.int32)

    # create word representation
    start_id = word_vocab.getIndex('<s>')
    end_id = word_vocab.getIndex('</s>')
    if options.with_word:
        self.sent1_word = []  # [batch_size, sent1_len]
        self.sent2_word = []  # [batch_size, sent2_len]
        self.sent2_input_word = []
        for (sent1_idx, sent2_idx, _, _, _) in instances:
            self.sent1_word.append(sent1_idx)
            self.sent2_word.append(sent2_idx + [end_id])
            self.sent2_input_word.append([start_id] + sent2_idx)
        self.sent1_word = padding_utils.pad_2d_vals(
            self.sent1_word, len(instances), np.max(self.sent1_length))
        self.sent2_word = padding_utils.pad_2d_vals(
            self.sent2_word, len(instances), options.max_answer_len)
        self.sent2_input_word = padding_utils.pad_2d_vals(
            self.sent2_input_word, len(instances), options.max_answer_len)
        self.in_answer_words = self.sent2_word
        self.gen_input_words = self.sent2_input_word
        self.answer_lengths = self.sent2_length

    if options.with_char:
        self.sent1_char = []  # [batch_size, sent1_len]
        self.sent1_char_lengths = []
        for (_, _, sent1, sent2, _) in instances:
            sent1_char_idx = char_vocab.to_character_matrix_for_list(
                sent1.split()[:options.max_passage_len])
            self.sent1_char.append(sent1_char_idx)
            self.sent1_char_lengths.append([len(x) for x in sent1_char_idx])
        self.sent1_char = padding_utils.pad_3d_vals_no_size(self.sent1_char)
        self.sent1_char_lengths = padding_utils.pad_2d_vals_no_size(self.sent1_char_lengths)
def __init__(self, instances, options, word_vocab=None, char_vocab=None,
             POS_vocab=None, feat_vocab=None, action_vocab=None):
    self.options = options
    self.instances = instances
    self.batch_size = len(instances)
    self.action_vocab = action_vocab
    self.feat_vocab = feat_vocab  # Added the feature indexer as batch attributes.

    # create length
    self.input_length = []  # [batch_size]
    self.concept_length = []  # [batch_size]
    self.action_length = []  # [batch_size]
    for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
         action2cid, action2wid) in instances:
        self.input_length.append(input_sent.get_length() + 1)
        self.concept_length.append(len(concepts_idx) + 1)
        self.action_length.append(min(options.max_answer_len, len(actions_idx)))
    self.input_length = np.array(self.input_length, dtype=np.int32)
    self.concept_length = np.array(self.concept_length, dtype=np.int32)
    self.action_length = np.array(self.action_length, dtype=np.int32)

    start_id = action_vocab.getIndex('<s>')
    self.action_inp = []
    self.action_ref = []
    self.feats = []
    self.action2cid = []
    self.action2wid = []
    for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
         action2cid, action2wid) in instances:
        self.action_inp.append([start_id] + actions_idx[:-1])
        self.action_ref.append(actions_idx)
        self.feats.append(feats_idx)
        self.action2cid.append(action2cid)
        self.action2wid.append(action2wid)
    self.action_inp = padding_utils.pad_2d_vals(
        self.action_inp, len(self.action_inp), options.max_answer_len)
    self.action_ref = padding_utils.pad_2d_vals(
        self.action_ref, len(self.action_ref), options.max_answer_len)
    self.feats = padding_utils.pad_3d_vals(
        self.feats, len(self.feats), options.max_answer_len, len(self.feats[0][0]))
    self.action2cid = padding_utils.pad_2d_vals(
        self.action2cid, len(self.action2cid), options.max_answer_len)
    self.action2wid = padding_utils.pad_2d_vals(
        self.action2wid, len(self.action2wid), options.max_answer_len)

    append_id = word_vocab.getIndex('-NULL-')
    self.input_word = []  # [batch_size, sent_len]
    self.concept_word = []  # [batch_size, sent_len]
    for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
         action2cid, action2wid) in instances:
        self.input_word.append(input_sent.word_idx_seq + [append_id])
        self.concept_word.append(concepts_idx + [append_id])
    self.input_word = padding_utils.pad_2d_vals_no_size(self.input_word)
    self.concept_word = padding_utils.pad_2d_vals_no_size(self.concept_word)

    if options.with_lemma:
        self.input_lemma = []
        for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
             action2cid, action2wid) in instances:
            self.input_lemma.append(input_sent.lemma_idx_seq + [append_id])
        self.input_lemma = padding_utils.pad_2d_vals_no_size(self.input_lemma)

    if options.with_char:
        assert False
        self.input_char = []  # [batch_size, sent_len, char_size]
        self.input_char_len = []  # [batch_size, sent_len]
        for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
             action2cid, action2wid) in instances:
            self.input_char.append(input_sent.char_idx_matrix)
            self.input_char_len.append([len(x) for x in input_sent.tok])
        self.input_char = padding_utils.pad_3d_vals_no_size(self.input_char)
        self.input_char_len = padding_utils.pad_2d_vals_no_size(self.input_char_len)

    if options.with_POS:
        append_pos_id = POS_vocab.getIndex('-NULL-')
        self.input_POS = []  # [batch_size, sent1_len]
        for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
             action2cid, action2wid) in instances:
            self.input_POS.append(input_sent.POS_idx_seq + [append_pos_id])
        self.input_POS = padding_utils.pad_2d_vals_no_size(self.input_POS)