def __init__(self, instances, options, word_vocab=None):
    """Pad a list of graph instances into dense batch tensors.

    Each instance is a tuple:
    (lex_idx, lex_chars_idx, in_neigh, in_neigh_hidden, in_label_idx,
     entity_indices, y)
    """
    self.options = options
    self.instances = instances  # list of tuples
    self.batch_size = len(instances)
    self.vocab = word_vocab

    # number of graph nodes per instance: [batch_size]
    self.node_num = np.array([len(inst[0]) for inst in instances],
                             dtype=np.int32)

    # per-node character counts, padded to [batch_size, node_num]
    if options.with_char:
        self.nodes_chars_num = [[len(chars) for chars in inst[1]]
                                for inst in instances]
        self.nodes_chars_num = padding_utils.pad_2d_vals_no_size(
            self.nodes_chars_num)

    # 0/1 masks marking the real (unpadded) neighbor / entity-index slots
    self.in_neigh_mask = []  # [batch_size, node_num, neigh_num]
    self.entity_indices_mask = []
    for inst in instances:
        self.in_neigh_mask.append([[1] * len(neighs) for neighs in inst[2]])
        self.entity_indices_mask.append(
            [[1] * len(indices) for indices in inst[5]])
    self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(self.in_neigh_mask)
    self.entity_indices_mask = padding_utils.pad_3d_vals_no_size(
        self.entity_indices_mask)

    # gather the ragged contents column-wise
    self.nodes = [inst[0] for inst in instances]
    if options.with_char:
        # [batch_size, sent_len, char_num]
        self.nodes_chars = [inst[1] for inst in instances]
    self.in_neigh_indices = [inst[2] for inst in instances]
    self.in_neigh_hidden_indices = [inst[3] for inst in instances]
    self.in_neigh_edges = [inst[4] for inst in instances]
    self.entity_indices = [inst[5] for inst in instances]
    self.y = [inst[6] for inst in instances]

    # pad everything into dense ndarrays
    self.nodes = padding_utils.pad_2d_vals_no_size(self.nodes)
    if options.with_char:
        self.nodes_chars = padding_utils.pad_3d_vals_no_size(self.nodes_chars)
    self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_indices)
    self.in_neigh_hidden_indices = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_hidden_indices)
    self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_edges)
    self.entity_indices = padding_utils.pad_3d_vals_no_size(
        self.entity_indices)
    self.y = np.asarray(self.y, dtype='int32')

    # the masks must line up exactly with the padded index tensors
    assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_hidden_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
    assert self.entity_indices_mask.shape == self.entity_indices.shape
    assert self.entity_indices.shape[1] == options.entity_num
    assert self.entity_indices_mask.shape[1] == options.entity_num
def build_phrase_vocabs(self):
    """Collect candidate phrases (chunks) per source sentence and assign ids.

    Single-token phrases reuse the word-vocab index; multi-token phrases get
    fresh ids offset by ``word_size`` so they never collide with word ids.
    Also builds a per-sentence prefix tree (and, optionally, a target
    lattice) and pads the span/idx arrays into batch tensors.

    Fixes: replaced Python-2-only ``dict.has_key`` and ``xrange`` with the
    ``in`` operator and ``zip`` — identical behavior, Python 3 compatible.
    """
    self.phrase_vocabs = []
    # phrase ids start right after the word-vocab range
    word_size = self.vocab.vocab_size + 1
    self.phrase_starts = []
    self.phrase_ends = []
    self.phrase_idx = []
    self.phrase_lengths = []
    self.max_phrase_size = 0
    if self.options.with_target_lattice:
        self.target_lattices = []
    for (sent1, sent2, sent3) in self.instances:
        # collect all phrases
        if self.options.withSyntaxChunk:
            (cur_phrase_starts, cur_phrase_ends, _) = \
                sent1.collect_all_syntax_chunks(self.options.max_chunk_len)
        else:
            (cur_phrase_starts, cur_phrase_ends) = \
                sent1.collect_all_possible_chunks(self.options.max_chunk_len)
        # collect phrase vocab and map each phrase span into a phrase id
        cur_phrase2id = {}
        cur_phrase_idx = []
        for cur_start, cur_end in zip(cur_phrase_starts, cur_phrase_ends):
            cur_phrase = sent1.getTokChunk(cur_start, cur_end)
            if cur_start == cur_end:
                # single token: reuse the word-vocab id
                cur_index = self.vocab.getIndex(cur_phrase)
            elif cur_phrase in cur_phrase2id:
                cur_index = cur_phrase2id[cur_phrase]
            else:
                cur_index = len(cur_phrase2id) + word_size
                cur_phrase2id[cur_phrase] = cur_index
            cur_phrase_idx.append(cur_index)
        cur_phrase_vocab = phrase_lattice_utils.prefix_tree(cur_phrase2id)
        self.phrase_vocabs.append(cur_phrase_vocab)
        self.phrase_starts.append(cur_phrase_starts)
        self.phrase_ends.append(cur_phrase_ends)
        self.phrase_idx.append(cur_phrase_idx)
        self.phrase_lengths.append(len(cur_phrase_starts))
        cur_phrase_size = len(cur_phrase2id)
        if self.max_phrase_size < cur_phrase_size:
            self.max_phrase_size = cur_phrase_size
        if self.options.with_target_lattice:
            cur_lattice = phrase_lattice_utils.phrase_lattice(
                sent2.words, word_vocab=self.vocab,
                prefix_tree=cur_phrase_vocab)
            self.target_lattices.append(cur_lattice)
    self.phrase_starts = padding_utils.pad_2d_vals_no_size(
        self.phrase_starts)  # [batch_size, phrase_size]
    self.phrase_ends = padding_utils.pad_2d_vals_no_size(
        self.phrase_ends)  # [batch_size, phrase_size]
    self.phrase_idx = padding_utils.pad_2d_vals_no_size(
        self.phrase_idx)  # [batch_size, phrase_size]
    self.phrase_lengths = np.array(self.phrase_lengths,
                                   dtype=np.int32)  # [batch_size]
def __init__(self, ori_batch):
    """Build a dense (padded) view of a previously collected AMR batch.

    Scalar bookkeeping fields are copied verbatim from *ori_batch*; all
    ragged list fields are padded into ndarrays.
    """
    pad2 = padding_utils.pad_2d_vals_no_size
    pad3 = padding_utils.pad_3d_vals_no_size

    self.options = ori_batch.options
    self.amr_node = ori_batch.amr_node
    self.id = ori_batch.id
    self.target_ref = ori_batch.target_ref
    self.batch_size = ori_batch.batch_size
    self.vocab = ori_batch.vocab
    self.node_num = np.array(ori_batch.node_num, dtype=np.int32)
    self.sent_len = np.array(ori_batch.sent_len, dtype=np.int32)
    self.sent_pos_len = np.array(ori_batch.sent_pos_len, dtype=np.int32)

    self.in_neigh_mask = pad3(ori_batch.in_neigh_mask)
    self.out_neigh_mask = pad3(ori_batch.out_neigh_mask)

    # dense node / edge tensors
    self.nodes = pad2(ori_batch.nodes)
    if self.options.with_char:
        self.nodes_chars = pad3(ori_batch.nodes_chars)
    self.in_neigh_indices = pad3(ori_batch.in_neigh_indices)
    self.in_neigh_edges = pad3(ori_batch.in_neigh_edges)
    self.out_neigh_indices = pad3(ori_batch.out_neigh_indices)
    self.out_neigh_edges = pad3(ori_batch.out_neigh_edges)

    # masks must align with the padded index tensors
    assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
    assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
    assert self.out_neigh_mask.shape == self.out_neigh_edges.shape

    # decoder-side sequences padded to the fixed answer length:
    # [batch_size, sent_len_max]
    max_len = self.options.max_answer_len
    self.sent_inp = padding_utils.pad_2d_vals(
        ori_batch.sent_inp, len(ori_batch.sent_inp), max_len)
    self.sent_out = padding_utils.pad_2d_vals(
        ori_batch.sent_out, len(ori_batch.sent_out), max_len)
    self.sent_pos_inp = padding_utils.pad_2d_vals(
        ori_batch.sent_pos_inp, len(ori_batch.sent_pos_inp), max_len)
    self.sent_pos_out = padding_utils.pad_2d_vals(
        ori_batch.sent_pos_out, len(ori_batch.sent_pos_out), max_len)
def __init__(self, ori_batch):
    """Build a dense (padded) view of a previously collected QA batch.

    Copies scalar/bookkeeping fields from *ori_batch* verbatim and pads the
    ragged question/passage/entity/candidate fields into ndarrays.

    Fix: compare ``ref`` against ``None`` with ``is not None`` instead of
    ``!= None`` — ``!=`` dispatches to ``__ne__`` and misbehaves
    (elementwise comparison) if ``ref`` is ever an ndarray.
    """
    self.batch_size = ori_batch.batch_size
    self.options = ori_batch.options
    self.vocab = ori_batch.vocab
    self.char_vocab = ori_batch.char_vocab
    self.ids = ori_batch.ids
    self.candidates_str = ori_batch.candidates_str
    self.ref_str = ori_batch.ref_str
    # making ndarray
    self.question = padding_utils.pad_2d_vals_no_size(ori_batch.question)
    self.question_len = np.array(ori_batch.question_len, dtype=np.int32)
    self.passage = padding_utils.pad_2d_vals_no_size(ori_batch.passage)
    self.passage_len = np.array(ori_batch.passage_len, dtype=np.int32)
    self.entity_start = padding_utils.pad_2d_vals_no_size(
        ori_batch.entity_start)
    self.entity_end = padding_utils.pad_2d_vals_no_size(
        ori_batch.entity_end)
    self.entity_len = np.array(ori_batch.entity_len, dtype=np.int32)
    # the entity graph is only materialized for graph-based encoders
    if self.options.with_grn or self.options.with_gcn:
        self.entity_edges = padding_utils.pad_3d_vals_no_size(
            ori_batch.entity_edges)
        self.entity_edges_mask = padding_utils.pad_3d_vals_no_size(
            ori_batch.entity_edges_mask, dtype=np.float32)
    else:
        self.entity_edges = None
        self.entity_edges_mask = None
    self.cands = padding_utils.pad_3d_vals_no_size(ori_batch.cands)
    self.cands_len = np.array(ori_batch.cands_len, dtype=np.int32)
    self.cands_occur_mask = padding_utils.pad_3d_vals_no_size(
        ori_batch.cands_occur_mask, dtype=np.float32)
    # gold reference may be absent at test time
    if ori_batch.ref is not None:
        self.ref = np.array(ori_batch.ref, dtype=np.int32)
    else:
        self.ref = None
    if self.options.with_char:
        self.question_chars = padding_utils.pad_3d_vals_no_size(
            ori_batch.question_chars)
        self.question_chars_num = padding_utils.pad_2d_vals_no_size(
            ori_batch.question_chars_num)
        self.passage_chars = padding_utils.pad_3d_vals_no_size(
            ori_batch.passage_chars)
        self.passage_chars_num = padding_utils.pad_2d_vals_no_size(
            ori_batch.passage_chars_num)
def __init__(self, instances, options, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None):
    """Batch of (sent1, sent2, sent3) triples padded for seq2seq training.

    sent1 supplies the passage tokens; sent2 is the target side
    (teacher-forced with <s>); sent3 is optional and only processed when
    instances[0][2] is not None.  NOTE(review): char_vocab/POS_vocab/
    NER_vocab are accepted but unused here — the *_idx_seq fields are
    assumed precomputed on each sentence object; confirm with callers.
    """
    self.options = options
    self.instances = instances
    self.batch_size = len(instances)
    self.vocab = word_vocab
    # raw passage tokens (kept unpadded)
    self.passage_words = [
        instances[i][0].tokText.split() for i in range(self.batch_size)
    ]
    self.has_sent3 = False
    if instances[0][2] is not None:
        self.has_sent3 = True
    # create length
    self.sent1_length = []  # [batch_size]
    self.sent2_length = []  # [batch_size]
    if self.has_sent3:
        self.sent3_length = []  # [batch_size]
    for (sent1, sent2, sent3) in instances:
        self.sent1_length.append(sent1.get_length())
        self.sent2_length.append(sent2.get_length())
        if self.has_sent3:
            self.sent3_length.append(sent3.get_length())
    self.sent1_length = np.array(self.sent1_length, dtype=np.int32)
    self.sent2_length = np.array(self.sent2_length, dtype=np.int32)
    if self.has_sent3:
        self.sent3_length = np.array(self.sent3_length, dtype=np.int32)
    # create word representation
    start_id = word_vocab.getIndex('<s>')
    end_id = word_vocab.getIndex('</s>')  # NOTE(review): unused below
    if options.with_word:
        self.sent1_word = []  # [batch_size, sent1_len]
        self.sent2_word = []  # [batch_size, sent2_len]
        self.sent2_input_word = []
        if self.has_sent3:
            self.sent3_word = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_word.append(sent1.word_idx_seq)
            self.sent2_word.append(sent2.word_idx_seq)
            # decoder input: shift right — prepend <s>, drop last token
            self.sent2_input_word.append([start_id] + sent2.word_idx_seq[:-1])
            if self.has_sent3:
                self.sent3_word.append(sent3.word_idx_seq)
        self.sent1_word = padding_utils.pad_2d_vals_no_size(
            self.sent1_word)
        # target-side tensors are padded to the fixed decoder length
        self.sent2_word = padding_utils.pad_2d_vals(
            self.sent2_word, len(self.sent2_word), options.max_answer_len)
        self.sent2_input_word = padding_utils.pad_2d_vals(
            self.sent2_input_word, len(self.sent2_input_word),
            options.max_answer_len)
        if self.has_sent3:
            self.sent3_word = padding_utils.pad_2d_vals_no_size(
                self.sent3_word)
        # aliases consumed by the generator model
        self.in_answer_words = self.sent2_word
        self.gen_input_words = self.sent2_input_word
        self.answer_lengths = self.sent2_length
    if options.with_char:
        self.sent1_char = []  # [batch_size, sent1_len]
        self.sent2_char = []  # [batch_size, sent2_len]
        if self.has_sent3:
            self.sent3_char = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_char.append(sent1.char_idx_seq)
            self.sent2_char.append(sent2.char_idx_seq)
            if self.has_sent3:
                self.sent3_char.append(sent3.char_idx_seq)
        self.sent1_char = padding_utils.pad_3d_vals_no_size(
            self.sent1_char)
        self.sent2_char = padding_utils.pad_3d_vals_no_size(
            self.sent2_char)
        if self.has_sent3:
            self.sent3_char = padding_utils.pad_3d_vals_no_size(
                self.sent3_char)
    if options.with_POS:
        self.sent1_POS = []  # [batch_size, sent1_len]
        self.sent2_POS = []  # [batch_size, sent2_len]
        if self.has_sent3:
            self.sent3_POS = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_POS.append(sent1.POS_idx_seq)
            self.sent2_POS.append(sent2.POS_idx_seq)
            if self.has_sent3:
                self.sent3_POS.append(sent3.POS_idx_seq)
        self.sent1_POS = padding_utils.pad_2d_vals_no_size(self.sent1_POS)
        self.sent2_POS = padding_utils.pad_2d_vals_no_size(self.sent2_POS)
        if self.has_sent3:
            self.sent3_POS = padding_utils.pad_2d_vals_no_size(
                self.sent3_POS)
    if options.with_NER:
        self.sent1_NER = []  # [batch_size, sent1_len]
        self.sent2_NER = []  # [batch_size, sent2_len]
        if self.has_sent3:
            self.sent3_NER = []  # [batch_size, sent3_len]
        for (sent1, sent2, sent3) in instances:
            self.sent1_NER.append(sent1.NER_idx_seq)
            self.sent2_NER.append(sent2.NER_idx_seq)
            if self.has_sent3:
                self.sent3_NER.append(sent3.NER_idx_seq)
        self.sent1_NER = padding_utils.pad_2d_vals_no_size(self.sent1_NER)
        self.sent2_NER = padding_utils.pad_2d_vals_no_size(self.sent2_NER)
        if self.has_sent3:
            self.sent3_NER = padding_utils.pad_2d_vals_no_size(
                self.sent3_NER)
    if options.with_phrase_projection:
        self.build_phrase_vocabs()
        # optionally pre-train against the max-matching phrase partition,
        # overriding the plain word-level targets set above
        if options.pretrain_with_max_matching and options.with_target_lattice:
            (_, prediction_lengths, generator_input_idx,
             generator_output_idx) = self.sample_a_partition(
                max_matching=True)
            self.in_answer_words = generator_output_idx
            self.gen_input_words = generator_input_idx
            self.answer_lengths = prediction_lengths
def __init__(self, instances, options, word_vocab=None):
    """Pad AMR graph-to-sequence instances into dense batch tensors.

    Each instance is a tuple:
    (nodes_idx, nodes_chars_idx, in_neigh_indices, in_neigh_edges_idx,
     out_neigh_indices, out_neigh_edges_idx, sentence_idx, sentence, id)
    """
    self.options = options
    self.amr_node = [inst[0] for inst in instances]
    self.id = [inst[-1] for inst in instances]
    self.target_ref = [inst[-2] for inst in instances]  # list of tuples
    self.batch_size = len(instances)
    self.vocab = word_vocab

    # lengths: node count per graph, and target length (+1 for the end
    # token), clipped at the decoder limit: both [batch_size]
    self.node_num = np.array([len(inst[0]) for inst in instances],
                             dtype=np.int32)
    self.sent_len = np.array(
        [min(len(inst[6]) + 1, options.max_answer_len)
         for inst in instances],
        dtype=np.int32)

    # per-node character counts, padded to [batch_size, node_num]
    if options.with_char:
        self.nodes_chars_num = [[len(chars) for chars in inst[1]]
                                for inst in instances]
        self.nodes_chars_num = padding_utils.pad_2d_vals_no_size(
            self.nodes_chars_num)

    # 0/1 masks over the real (unpadded) neighbor slots
    self.in_neigh_mask = []  # [batch_size, node_num, neigh_num]
    self.out_neigh_mask = []
    for inst in instances:
        self.in_neigh_mask.append([[1] * len(ns) for ns in inst[2]])
        self.out_neigh_mask.append([[1] * len(ns) for ns in inst[4]])
    self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_mask)
    self.out_neigh_mask = padding_utils.pad_3d_vals_no_size(
        self.out_neigh_mask)

    # word-level decoder sequences
    start_id = word_vocab.getIndex('<s>')
    end_id = word_vocab.getIndex('</s>')
    self.nodes = [inst[0] for inst in instances]
    if options.with_char:
        # [batch_size, sent_len, char_num]
        self.nodes_chars = [inst[1] for inst in instances]
    self.in_neigh_indices = [inst[2] for inst in instances]
    self.in_neigh_edges = [inst[3] for inst in instances]
    self.out_neigh_indices = [inst[4] for inst in instances]
    self.out_neigh_edges = [inst[5] for inst in instances]

    # decoder input (<s> + tokens) and output (tokens + </s>); when the
    # sentence already fills max_answer_len, drop the last input token and
    # skip </s> so both stay within the limit
    self.sent_inp = []
    self.sent_out = []
    for inst in instances:
        sentence_idx = inst[6]
        if len(sentence_idx) < options.max_answer_len:
            self.sent_inp.append([start_id] + sentence_idx)
            self.sent_out.append(sentence_idx + [end_id])
        else:
            self.sent_inp.append([start_id] + sentence_idx[:-1])
            self.sent_out.append(sentence_idx)

    # making ndarray
    self.nodes = padding_utils.pad_2d_vals_no_size(self.nodes)
    if options.with_char:
        self.nodes_chars = padding_utils.pad_3d_vals_no_size(
            self.nodes_chars)
    self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_indices)
    self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_edges)
    self.out_neigh_indices = padding_utils.pad_3d_vals_no_size(
        self.out_neigh_indices)
    self.out_neigh_edges = padding_utils.pad_3d_vals_no_size(
        self.out_neigh_edges)

    assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
    assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
    assert self.out_neigh_mask.shape == self.out_neigh_edges.shape

    # [batch_size, sent_len_max]
    self.sent_inp = padding_utils.pad_2d_vals(
        self.sent_inp, len(self.sent_inp), options.max_answer_len)
    self.sent_out = padding_utils.pad_2d_vals(
        self.sent_out, len(self.sent_out), options.max_answer_len)
def __init__(self, instances, options, word_vocab=None):
    """Pad dependency-graph sentence instances into dense batch tensors.

    Instance layout (by position):
    (0-toks_idx, 1-toks_chars_idx, 2-poses_idx, 3-nes, 4-entity_indices,
     5-in_neigh, 6-in_label_idx, 7-in_prob, 8-out_neigh, 9-out_label_idx,
     10-out_prob, 11-ref, 12-id)

    Fix: the two ``instances[0][7] != None`` checks now use ``is not None``
    (identity comparison; ``!=`` dispatches to ``__ne__`` and is unsafe for
    array-like values).
    """
    self.options = options
    self.instances = instances  # list of tuples
    self.batch_size = len(instances)
    self.vocab = word_vocab
    # sentence length: [batch_size]
    self.sentence_lengths = []
    for inst in instances:
        self.sentence_lengths.append(len(inst[0]))
    self.sentence_lengths = np.array(self.sentence_lengths, dtype=np.int32)
    # per-token character counts, padded to [batch_size, sent_len]
    if options.with_char:
        self.sentence_chars_lengths = [[
            len(toks_chars_idx) for toks_chars_idx in inst[1]
        ] for inst in instances]
        self.sentence_chars_lengths = padding_utils.pad_2d_vals_no_size(
            self.sentence_chars_lengths)
    # 0/1 masks over the real (unpadded) slots
    self.in_neigh_mask = []  # [batch_size, sentence_num, neigh_num]
    self.out_neigh_mask = []  # [batch_size, sentence_num, neigh_num]
    self.entity_indices_mask = []  # [batch_size, 2, indices]
    for inst in instances:
        eee = [[1 for x in entity] for entity in inst[4]]
        self.entity_indices_mask.append(eee)
        iii = [[1 for x in in_neigh] for in_neigh in inst[5]]
        self.in_neigh_mask.append(iii)
        ooo = [[1 for x in out_neigh] for out_neigh in inst[8]]
        self.out_neigh_mask.append(ooo)
    self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_mask)
    self.out_neigh_mask = padding_utils.pad_3d_vals_no_size(
        self.out_neigh_mask)
    self.entity_indices_mask = padding_utils.pad_3d_vals_no_size(
        self.entity_indices_mask)
    # the actual contents, gathered column-wise
    self.sentence_words = [x[0] for x in instances]
    if options.with_char:
        # [batch_size, sent_len, char_num]
        self.sentence_chars = [x[1] for x in instances]
    if options.with_POS:
        self.sentence_POSs = [x[2] for x in instances]
    self.nes = [x[3] for x in instances]  # [batch_size, sent_len]
    self.entity_indices = [x[4]
                           for x in instances]  # [batch_size, 2, indices]
    self.in_neigh_indices = [x[5] for x in instances]
    self.in_neigh_edges = [x[6] for x in instances]
    self.out_neigh_indices = [x[8] for x in instances]
    self.out_neigh_edges = [x[9] for x in instances]
    # edge probabilities are optional; presence is decided by the first
    # instance (all instances are assumed consistent)
    if instances[0][7] is not None:
        self.in_neigh_prob = [x[7] for x in instances]
        self.out_neigh_prob = [x[10] for x in instances]
    self.refs = [x[11] for x in instances]
    self.ids = [x[12] for x in instances]
    # making ndarray
    self.sentence_words = padding_utils.pad_2d_vals_no_size(
        self.sentence_words)
    if options.with_char:
        self.sentence_chars = padding_utils.pad_3d_vals_no_size(
            self.sentence_chars)
    if options.with_POS:
        self.sentence_POSs = padding_utils.pad_2d_vals_no_size(
            self.sentence_POSs)
    self.nes = padding_utils.pad_2d_vals_no_size(self.nes)
    self.entity_indices = padding_utils.pad_3d_vals_no_size(
        self.entity_indices)
    self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_indices)
    self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(
        self.in_neigh_edges)
    self.out_neigh_indices = padding_utils.pad_3d_vals_no_size(
        self.out_neigh_indices)
    self.out_neigh_edges = padding_utils.pad_3d_vals_no_size(
        self.out_neigh_edges)
    if instances[0][7] is not None:
        self.in_neigh_prob = padding_utils.pad_3d_vals_no_size(
            self.in_neigh_prob)
        self.out_neigh_prob = padding_utils.pad_3d_vals_no_size(
            self.out_neigh_prob)
    self.refs = np.asarray(self.refs, dtype='int32')
    # masks must align with the padded index tensors
    assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
    assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
    assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
    assert self.out_neigh_mask.shape == self.out_neigh_edges.shape
def __init__(self, instances, options, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None):
    """Pad plain seq2seq instances into dense batch tensors.

    Each instance is a 5-tuple whose first two slots are the source and
    target index sequences; the last three slots carry the raw source
    string, the reference string, and the instance id.
    """
    self.options = options
    self.batch_size = len(instances)
    self.vocab = word_vocab
    self.id = [inst[-1] for inst in instances]
    self.source = [inst[-3] for inst in instances]
    self.target_ref = [inst[-2] for inst in instances]

    # lengths; the target gets +1 for the appended </s>, clipped at the
    # decoder limit: both [batch_size]
    self.sent1_length = np.array([len(inst[0]) for inst in instances],
                                 dtype=np.int32)
    self.sent2_length = np.array(
        [min(len(inst[1]) + 1, options.max_answer_len)
         for inst in instances],
        dtype=np.int32)

    # word-level representation
    start_id = word_vocab.getIndex('<s>')
    end_id = word_vocab.getIndex('</s>')
    if options.with_word:
        self.sent1_word = [inst[0] for inst in instances]
        self.sent2_word = [inst[1] + [end_id] for inst in instances]
        self.sent2_input_word = [[start_id] + inst[1]
                                 for inst in instances]
        self.sent1_word = padding_utils.pad_2d_vals(
            self.sent1_word, len(instances), np.max(self.sent1_length))
        self.sent2_word = padding_utils.pad_2d_vals(
            self.sent2_word, len(instances), options.max_answer_len)
        self.sent2_input_word = padding_utils.pad_2d_vals(
            self.sent2_input_word, len(instances), options.max_answer_len)
        # aliases consumed by the generator model
        self.in_answer_words = self.sent2_word
        self.gen_input_words = self.sent2_input_word
        self.answer_lengths = self.sent2_length

    if options.with_char:
        # character matrices are built on the fly from the raw source
        # string, truncated to the passage limit
        self.sent1_char = []  # [batch_size, sent1_len]
        self.sent1_char_lengths = []
        for inst in instances:
            char_matrix = char_vocab.to_character_matrix_for_list(
                inst[2].split()[:options.max_passage_len])
            self.sent1_char.append(char_matrix)
            self.sent1_char_lengths.append([len(row)
                                            for row in char_matrix])
        self.sent1_char = padding_utils.pad_3d_vals_no_size(
            self.sent1_char)
        self.sent1_char_lengths = padding_utils.pad_2d_vals_no_size(
            self.sent1_char_lengths)
def __init__(self, instances, options, word_vocab=None, char_vocab=None, POS_vocab=None, feat_vocab=None, action_vocab=None):
    """Batch for transition-based parsing.

    Pads word/concept inputs (each terminated with a -NULL- sentinel),
    teacher-forced action sequences, per-action feature matrices, and the
    action->concept / action->word index maps.

    Instance layout: (input_sent, concepts_idx, cid2wid, feats_idx,
    actions_idx, action2cid, action2wid).
    """
    self.options = options
    self.instances = instances
    self.batch_size = len(instances)
    self.action_vocab = action_vocab
    self.feat_vocab = feat_vocab  # feature indexer kept as a batch attribute

    # lengths; the +1 slots account for the -NULL- sentinel appended below
    self.input_length = np.array(
        [inst[0].get_length() + 1 for inst in instances], dtype=np.int32)
    self.concept_length = np.array(
        [len(inst[1]) + 1 for inst in instances], dtype=np.int32)
    self.action_length = np.array(
        [min(options.max_answer_len, len(inst[4])) for inst in instances],
        dtype=np.int32)

    # action sequences: input is shifted right with <s>; ref is the gold
    # sequence; feats/action2cid/action2wid run parallel to the actions
    start_id = action_vocab.getIndex('<s>')
    self.action_inp = [[start_id] + inst[4][:-1] for inst in instances]
    self.action_ref = [inst[4] for inst in instances]
    self.feats = [inst[3] for inst in instances]
    self.action2cid = [inst[5] for inst in instances]
    self.action2wid = [inst[6] for inst in instances]
    self.action_inp = padding_utils.pad_2d_vals(
        self.action_inp, len(self.action_inp), options.max_answer_len)
    self.action_ref = padding_utils.pad_2d_vals(
        self.action_ref, len(self.action_ref), options.max_answer_len)
    self.feats = padding_utils.pad_3d_vals(
        self.feats, len(self.feats), options.max_answer_len,
        len(self.feats[0][0]))
    self.action2cid = padding_utils.pad_2d_vals(
        self.action2cid, len(self.action2cid), options.max_answer_len)
    self.action2wid = padding_utils.pad_2d_vals(
        self.action2wid, len(self.action2wid), options.max_answer_len)

    # word-level inputs, each terminated with a -NULL- sentinel
    append_id = word_vocab.getIndex('-NULL-')
    self.input_word = [inst[0].word_idx_seq + [append_id]
                       for inst in instances]  # [batch_size, sent_len]
    self.concept_word = [inst[1] + [append_id]
                         for inst in instances]  # [batch_size, sent_len]
    self.input_word = padding_utils.pad_2d_vals_no_size(self.input_word)
    self.concept_word = padding_utils.pad_2d_vals_no_size(self.concept_word)

    if options.with_lemma:
        self.input_lemma = [inst[0].lemma_idx_seq + [append_id]
                            for inst in instances]
        self.input_lemma = padding_utils.pad_2d_vals_no_size(
            self.input_lemma)

    if options.with_char:
        assert False  # char features are deliberately unsupported here
        self.input_char = [inst[0].char_idx_matrix
                           for inst in instances]  # [batch, sent, char]
        self.input_char_len = [[len(x) for x in inst[0].tok]
                               for inst in instances]  # [batch, sent]
        self.input_char = padding_utils.pad_3d_vals_no_size(self.input_char)
        self.input_char_len = padding_utils.pad_2d_vals_no_size(
            self.input_char_len)

    if options.with_POS:
        append_pos_id = POS_vocab.getIndex('-NULL-')
        self.input_POS = [inst[0].POS_idx_seq + [append_pos_id]
                          for inst in instances]  # [batch_size, sent1_len]
        self.input_POS = padding_utils.pad_2d_vals_no_size(self.input_POS)