Ejemplo n.º 1
0
    def sample_a_partition(self, max_matching=False):
        """Sample one phrase partition per target lattice.

        Returns a tuple (sentences, prediction_lengths, generator_input_idx,
        generator_output_idx); the two idx arrays are padded to
        self.options.max_answer_len via padding_utils.pad_2d_vals.
        """
        unk_id = self.vocab.vocab_size           # OOV phrases fall back to UNK
        plain_word_limit = self.vocab.vocab_size + 1  # ids below this are single words
        sentences = []
        prediction_lengths = []
        generator_input_idx = []
        generator_output_idx = []
        for idx, lattice in enumerate(self.target_lattices):
            phrases, phrase_ids = lattice.sample_a_partition(
                max_matching=max_matching)
            sentences.append(" ".join(phrases))
            prediction_lengths.append(len(phrases))
            generator_output_idx.append(phrase_ids)
            # Decoder input starts from the original first input token; each
            # later position feeds the word chosen for the preceding phrase.
            input_ids = [self.gen_input_words[idx][0]]
            for phrase, phrase_id in zip(phrases, phrase_ids):
                if phrase_id < plain_word_limit:
                    input_ids.append(phrase_id)
                elif self.phrase_vocabs[idx].has_phrase_id(phrase_id):
                    # take the last word of a phrase as the input word for decoding
                    last_word = re.split("\\s+", phrase)[-1]
                    input_ids.append(self.vocab.getIndex(last_word))
                else:
                    # an OOV phrase was sampled: reset it to UNK
                    input_ids.append(unk_id)
            generator_input_idx.append(input_ids[:-1])

        max_len = self.options.max_answer_len
        generator_input_idx = padding_utils.pad_2d_vals(
            generator_input_idx, len(generator_input_idx), max_len)
        generator_output_idx = padding_utils.pad_2d_vals(
            generator_output_idx, len(generator_output_idx), max_len)
        return (sentences, prediction_lengths, generator_input_idx,
                generator_output_idx)
    def __init__(self, ori_batch):
        """Re-batch *ori_batch*: copy its metadata verbatim and pad every
        sequence field into fixed-shape ndarrays."""
        # Plain (unpadded) fields are carried over as-is.
        for attr in ('options', 'amr_node', 'id', 'target_ref',
                     'batch_size', 'vocab'):
            setattr(self, attr, getattr(ori_batch, attr))

        # Per-instance lengths as int32 arrays.
        self.node_num = np.array(ori_batch.node_num, dtype=np.int32)
        self.sent_len = np.array(ori_batch.sent_len, dtype=np.int32)
        self.sent_pos_len = np.array(ori_batch.sent_pos_len, dtype=np.int32)

        # Graph-neighbourhood masks.
        self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(ori_batch.in_neigh_mask)
        self.out_neigh_mask = padding_utils.pad_3d_vals_no_size(ori_batch.out_neigh_mask)

        # Node / edge index tensors, padded to common shapes (making ndarray).
        self.nodes = padding_utils.pad_2d_vals_no_size(ori_batch.nodes)
        if self.options.with_char:
            self.nodes_chars = padding_utils.pad_3d_vals_no_size(ori_batch.nodes_chars)
        self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(ori_batch.in_neigh_indices)
        self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(ori_batch.in_neigh_edges)
        self.out_neigh_indices = padding_utils.pad_3d_vals_no_size(ori_batch.out_neigh_indices)
        self.out_neigh_edges = padding_utils.pad_3d_vals_no_size(ori_batch.out_neigh_edges)

        # Masks must line up element-for-element with indices and edges.
        for direction in ('in', 'out'):
            mask = getattr(self, direction + '_neigh_mask')
            assert mask.shape == getattr(self, direction + '_neigh_indices').shape
            assert mask.shape == getattr(self, direction + '_neigh_edges').shape

        # Decoder-side sequences: [batch_size, max_answer_len].
        max_len = self.options.max_answer_len
        for attr in ('sent_inp', 'sent_out', 'sent_pos_inp', 'sent_pos_out'):
            vals = getattr(ori_batch, attr)
            setattr(self, attr, padding_utils.pad_2d_vals(vals, len(vals), max_len))
Ejemplo n.º 3
0
    def run_rl_training(self, sess, batch, options):
        """Run one reinforcement-learning training step.

        Samples an output sequence and a greedy baseline from the decoder,
        scores both against the reference with BLEU or ROUGE, and feeds the
        sampled sequences back with reward = (sample score - greedy score).

        Args:
            sess: TensorFlow session.
            batch: batch object providing gen_input_words and instances.
            options: config; options.reward_type selects 'bleu' or 'rouge'.

        Returns:
            The scalar training loss from the final sess.run.

        Raises:
            ValueError: if options.reward_type is not 'bleu' or 'rouge'
                (previously this fell through and crashed later with a
                NameError on r/b).
        """
        feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
        feed_dict[self.gen_input_words] = batch.gen_input_words

        sample_output, greedy_output = sess.run(
            [self.sampled_words, self.greedy_words], feed_dict)

        sample_output = sample_output.tolist()
        greedy_output = greedy_output.tolist()

        rl_inputs = []
        rl_outputs = []
        rl_input_lengths = []
        reward = []
        for i, (sout, gout) in enumerate(zip(sample_output, greedy_output)):
            sout, slex = self.dec_word_vocab.getLexical(sout)
            gout, glex = self.dec_word_vocab.getLexical(gout)
            # Decoder input = original start token followed by the sample,
            # shifted right by one position.
            rl_inputs.append([int(batch.gen_input_words[i, 0])] + sout[:-1])
            rl_outputs.append(sout)
            rl_input_lengths.append(len(sout))
            ref_lex = batch.instances[i][1].tokText
            slst = slex.split()
            glst = glex.split()
            rlst = ref_lex.split()
            if options.reward_type == 'bleu':
                r = sentence_bleu([rlst], slst, smoothing_function=cc.method3)
                b = sentence_bleu([rlst], glst, smoothing_function=cc.method3)
            elif options.reward_type == 'rouge':
                # NOTE(review): passing smoothing_function to sentence_rouge
                # looks copy-pasted from the BLEU branch — confirm that the
                # helper actually accepts this keyword.
                r = sentence_rouge(ref_lex,
                                   slex,
                                   smoothing_function=cc.method3)
                b = sentence_rouge(ref_lex,
                                   glex,
                                   smoothing_function=cc.method3)
            else:
                raise ValueError(
                    'unsupported reward_type: %s' % options.reward_type)
            # Reward is sample score minus the greedy baseline score.
            reward.append(r - b)

        rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs),
                                              self.options.max_answer_len)
        rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs),
                                               self.options.max_answer_len)
        rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
        reward = np.array(reward, dtype=np.float32)
        assert rl_inputs.shape == rl_outputs.shape

        # Second pass: feed the sampled sequences back in for the update step.
        feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
        feed_dict[self.reward] = reward
        feed_dict[self.gen_input_words] = rl_inputs
        feed_dict[self.in_answer_words] = rl_outputs
        feed_dict[self.answer_lengths] = rl_input_lengths

        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss
Ejemplo n.º 4
0
    def run_rl_training(self, sess, batch, options):
        """RL training step via coin-flip mixing of gold and greedy outputs.

        Not implemented.  The original body was unreachable (guarded by
        ``assert False`` on its first line) and also broken: it referenced
        undefined names ``r`` and ``b`` when computing the reward, and used
        the Python-2-only ``dict.has_key``.  Raise explicitly instead of
        relying on ``assert``, which is stripped under ``python -O`` (where
        the broken body would then have executed and crashed).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('run_rl_training is not supported yet')
Ejemplo n.º 5
0
    def run_rl_training_subsample(self, sess, batch, options):
        """RL training step where the "sample" is the gold sequence with some
        tokens flipped to the greedy decoder's choice.

        Each gold token except the final stop token '</s>' is replaced by the
        greedy prediction with probability ``options.flipp`` (default 0.1).
        Reward = score(sample) - score(greedy) under BLEU or ROUGE against
        the gold text.

        Args:
            sess: TensorFlow session.
            batch: batch providing sent_inp/sent_out/sent_len.
            options: config; options.reward_type selects 'bleu' or 'rouge'.

        Returns:
            The scalar training loss from the final sess.run.

        Raises:
            ValueError: if options.reward_type is not 'bleu' or 'rouge'
                (previously this fell through to a NameError on r/b).
        """
        # Py3 fix: options.__dict__.has_key() no longer exists; getattr with a
        # default covers the missing-attribute case the same way.
        flipp = getattr(options, 'flipp', 0.1)

        # make feed_dict
        feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
        feed_dict[self.answer_inp] = batch.sent_inp

        # get greedy and gold outputs
        greedy_output = sess.run(self.greedy_words,
                                 feed_dict)  # [batch, sent_len]
        greedy_output = greedy_output.tolist()
        gold_output = batch.sent_out.tolist()

        # generate sample_output by flipping coins
        sample_output = np.copy(batch.sent_out)
        for i in range(batch.sent_out.shape[0]):
            seq_len = min(options.max_answer_len, batch.sent_len[i] -
                          1)  # don't change stop token '</s>'
            for j in range(seq_len):
                # id 0 is never copied in; presumably 0 is padding — confirm
                if greedy_output[i][j] != 0 and random.random() < flipp:
                    sample_output[i, j] = greedy_output[i][j]
        sample_output = sample_output.tolist()

        st_wid = self.word_vocab.getIndex('<s>')

        rl_inputs = []
        rl_outputs = []
        rl_input_lengths = []
        reward = []
        for i, (sout, gout) in enumerate(zip(sample_output, greedy_output)):
            sout, slex = self.word_vocab.getLexical(sout)
            gout, glex = self.word_vocab.getLexical(gout)
            # Decoder input = <s> followed by the sample, shifted right.
            rl_inputs.append([
                st_wid,
            ] + sout[:-1])
            rl_outputs.append(sout)
            rl_input_lengths.append(len(sout))
            _, ref_lex = self.word_vocab.getLexical(gold_output[i])
            slst = slex.split()
            glst = glex.split()
            rlst = ref_lex.split()
            if options.reward_type == 'bleu':
                r = sentence_bleu([rlst], slst, smoothing_function=cc.method3)
                b = sentence_bleu([rlst], glst, smoothing_function=cc.method3)
            elif options.reward_type == 'rouge':
                # NOTE(review): passing smoothing_function to sentence_rouge
                # looks copy-pasted from the BLEU branch — confirm signature.
                r = sentence_rouge(ref_lex,
                                   slex,
                                   smoothing_function=cc.method3)
                b = sentence_rouge(ref_lex,
                                   glex,
                                   smoothing_function=cc.method3)
            else:
                raise ValueError(
                    'unsupported reward_type: %s' % options.reward_type)
            reward.append(r - b)

        rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs),
                                              self.options.max_answer_len)
        rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs),
                                               self.options.max_answer_len)
        rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
        reward = np.array(reward, dtype=np.float32)
        assert rl_inputs.shape == rl_outputs.shape

        feed_dict = self.run_encoder(sess, batch, options, only_feed_dict=True)
        feed_dict[self.reward] = reward
        feed_dict[self.answer_inp] = rl_inputs
        feed_dict[self.answer_ref] = rl_outputs
        feed_dict[self.answer_len] = rl_input_lengths

        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss
Ejemplo n.º 6
0
    def __init__(self,
                 instances,
                 options,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 NER_vocab=None):
        """Batch over (sent1, sent2[, sent3]) instances, padded to ndarrays.

        Args:
            instances: list of 3-tuples of sentence objects; element 2 may be
                None, in which case no sent3_* fields are created.
            options: config; with_word / with_char / with_POS / with_NER /
                with_phrase_projection toggle which representations are built.
            word_vocab: word vocabulary (stored as self.vocab; supplies the
                '<s>' start id).
            char_vocab, POS_vocab, NER_vocab: accepted for interface
                compatibility with callers; not used directly here.
        """
        self.options = options

        self.instances = instances
        self.batch_size = len(instances)
        self.vocab = word_vocab

        self.passage_words = [
            instances[i][0].tokText.split() for i in range(self.batch_size)
        ]

        # sent3 is optional; its presence is detected from the first instance.
        self.has_sent3 = False
        if instances[0][2] is not None: self.has_sent3 = True

        # create length arrays, one entry per instance
        self.sent1_length = []  # [batch_size]
        self.sent2_length = []  # [batch_size]
        if self.has_sent3: self.sent3_length = []  # [batch_size]
        for (sent1, sent2, sent3) in instances:
            self.sent1_length.append(sent1.get_length())
            self.sent2_length.append(sent2.get_length())
            if self.has_sent3: self.sent3_length.append(sent3.get_length())
        self.sent1_length = np.array(self.sent1_length, dtype=np.int32)
        self.sent2_length = np.array(self.sent2_length, dtype=np.int32)
        if self.has_sent3:
            self.sent3_length = np.array(self.sent3_length, dtype=np.int32)

        # create word representation
        # (removed: end_id = word_vocab.getIndex('</s>') — it was never used)
        start_id = word_vocab.getIndex('<s>')
        if options.with_word:
            self.sent1_word = []  # [batch_size, sent1_len]
            self.sent2_word = []  # [batch_size, sent2_len]
            self.sent2_input_word = []
            if self.has_sent3: self.sent3_word = []  # [batch_size, sent3_len]
            for (sent1, sent2, sent3) in instances:
                self.sent1_word.append(sent1.word_idx_seq)
                self.sent2_word.append(sent2.word_idx_seq)
                # decoder input: <s> prepended, last token dropped
                self.sent2_input_word.append([start_id] +
                                             sent2.word_idx_seq[:-1])
                if self.has_sent3: self.sent3_word.append(sent3.word_idx_seq)
            self.sent1_word = padding_utils.pad_2d_vals_no_size(
                self.sent1_word)
            self.sent2_word = padding_utils.pad_2d_vals(
                self.sent2_word, len(self.sent2_word), options.max_answer_len)
            self.sent2_input_word = padding_utils.pad_2d_vals(
                self.sent2_input_word, len(self.sent2_input_word),
                options.max_answer_len)
            if self.has_sent3:
                self.sent3_word = padding_utils.pad_2d_vals_no_size(
                    self.sent3_word)

            # aliases consumed by the decoder
            self.in_answer_words = self.sent2_word
            self.gen_input_words = self.sent2_input_word
            self.answer_lengths = self.sent2_length

        if options.with_char:
            self.sent1_char = []  # [batch_size, sent1_len]
            self.sent2_char = []  # [batch_size, sent2_len]
            if self.has_sent3: self.sent3_char = []  # [batch_size, sent3_len]
            for (sent1, sent2, sent3) in instances:
                self.sent1_char.append(sent1.char_idx_seq)
                self.sent2_char.append(sent2.char_idx_seq)
                if self.has_sent3: self.sent3_char.append(sent3.char_idx_seq)
            self.sent1_char = padding_utils.pad_3d_vals_no_size(
                self.sent1_char)
            self.sent2_char = padding_utils.pad_3d_vals_no_size(
                self.sent2_char)
            if self.has_sent3:
                self.sent3_char = padding_utils.pad_3d_vals_no_size(
                    self.sent3_char)

        if options.with_POS:
            self.sent1_POS = []  # [batch_size, sent1_len]
            self.sent2_POS = []  # [batch_size, sent2_len]
            if self.has_sent3: self.sent3_POS = []  # [batch_size, sent3_len]
            for (sent1, sent2, sent3) in instances:
                self.sent1_POS.append(sent1.POS_idx_seq)
                self.sent2_POS.append(sent2.POS_idx_seq)
                if self.has_sent3: self.sent3_POS.append(sent3.POS_idx_seq)
            self.sent1_POS = padding_utils.pad_2d_vals_no_size(self.sent1_POS)
            self.sent2_POS = padding_utils.pad_2d_vals_no_size(self.sent2_POS)
            if self.has_sent3:
                self.sent3_POS = padding_utils.pad_2d_vals_no_size(
                    self.sent3_POS)

        if options.with_NER:
            self.sent1_NER = []  # [batch_size, sent1_len]
            self.sent2_NER = []  # [batch_size, sent2_len]
            if self.has_sent3: self.sent3_NER = []  # [batch_size, sent3_len]
            for (sent1, sent2, sent3) in instances:
                self.sent1_NER.append(sent1.NER_idx_seq)
                self.sent2_NER.append(sent2.NER_idx_seq)
                if self.has_sent3: self.sent3_NER.append(sent3.NER_idx_seq)
            self.sent1_NER = padding_utils.pad_2d_vals_no_size(self.sent1_NER)
            self.sent2_NER = padding_utils.pad_2d_vals_no_size(self.sent2_NER)
            if self.has_sent3:
                self.sent3_NER = padding_utils.pad_2d_vals_no_size(
                    self.sent3_NER)

        if options.with_phrase_projection:
            self.build_phrase_vocabs()
            if options.pretrain_with_max_matching and options.with_target_lattice:
                # overwrite the word-level decoder targets with a max-matching
                # phrase partition sampled from the target lattice
                (_, prediction_lengths, generator_input_idx,
                 generator_output_idx) = self.sample_a_partition(
                     max_matching=True)
                self.in_answer_words = generator_output_idx
                self.gen_input_words = generator_input_idx
                self.answer_lengths = prediction_lengths
Ejemplo n.º 7
0
    def __init__(self, instances, options, word_vocab=None):
        """Batch over graph-to-sequence instances, padded into ndarrays.

        Each instance is a 9-tuple (see the unpack below):
        (nodes_idx, nodes_chars_idx, in_neigh_indices, in_neigh_edges_idx,
         out_neigh_indices, out_neigh_edges_idx, sentence_idx, sentence, id).
        Much of this constructor indexes those tuples positionally
        (x[0], instance[2], instance[4], ...), so the tuple layout is part of
        the contract with the data loader.
        """
        self.options = options

        self.amr_node = [x[0] for x in instances]
        self.id = [x[-1] for x in instances]
        self.target_ref = [x[-2] for x in instances]  # list of tuples
        self.batch_size = len(instances)
        self.vocab = word_vocab

        # create length
        self.node_num = []  # [batch_size]
        self.sent_len = []  # [batch_size]
        for (nodes_idx, nodes_chars_idx, in_neigh_indices, in_neigh_edges_idx,
             out_neigh_indices, out_neigh_edges_idx, sentence_idx, sentence,
             id) in instances:
            self.node_num.append(len(nodes_idx))
            # +1 accounts for the appended end token; clipped to max_answer_len
            self.sent_len.append(
                min(len(sentence_idx) + 1, options.max_answer_len))
        self.node_num = np.array(self.node_num, dtype=np.int32)
        self.sent_len = np.array(self.sent_len, dtype=np.int32)

        # node char num: per-node character counts, padded to a rectangle
        if options.with_char:
            self.nodes_chars_num = [[
                len(nodes_chars_idx) for nodes_chars_idx in instance[1]
            ] for instance in instances]
            self.nodes_chars_num = padding_utils.pad_2d_vals_no_size(
                self.nodes_chars_num)

        # neigh mask: 1 for each real neighbour, 0 after padding
        self.in_neigh_mask = []  # [batch_size, node_num, neigh_num]
        self.out_neigh_mask = []
        for instance in instances:
            ins = []
            for in_neighs in instance[2]:
                ins.append([1 for _ in in_neighs])
            self.in_neigh_mask.append(ins)
            outs = []
            for out_neighs in instance[4]:
                outs.append([1 for _ in out_neighs])
            self.out_neigh_mask.append(outs)
        self.in_neigh_mask = padding_utils.pad_3d_vals_no_size(
            self.in_neigh_mask)
        self.out_neigh_mask = padding_utils.pad_3d_vals_no_size(
            self.out_neigh_mask)

        # create word representation
        start_id = word_vocab.getIndex('<s>')
        end_id = word_vocab.getIndex('</s>')

        self.nodes = [x[0] for x in instances]
        if options.with_char:
            self.nodes_chars = [inst[1] for inst in instances
                                ]  # [batch_size, sent_len, char_num]
        self.in_neigh_indices = [x[2] for x in instances]
        self.in_neigh_edges = [x[3] for x in instances]
        self.out_neigh_indices = [x[4] for x in instances]
        self.out_neigh_edges = [x[5] for x in instances]

        # Build shifted decoder sequences: sent_inp = <s> + tokens,
        # sent_out = tokens + </s>; when the sentence is already at the
        # length limit, the last token is dropped instead of appending </s>.
        self.sent_inp = []
        self.sent_out = []
        for _, _, _, _, _, _, sentence_idx, sentence, id in instances:
            if len(sentence_idx) < options.max_answer_len:
                self.sent_inp.append([
                    start_id,
                ] + sentence_idx)
                self.sent_out.append(sentence_idx + [
                    end_id,
                ])
            else:
                self.sent_inp.append([
                    start_id,
                ] + sentence_idx[:-1])
                self.sent_out.append(sentence_idx)

        # making ndarray
        self.nodes = padding_utils.pad_2d_vals_no_size(self.nodes)
        if options.with_char:
            self.nodes_chars = padding_utils.pad_3d_vals_no_size(
                self.nodes_chars)
        self.in_neigh_indices = padding_utils.pad_3d_vals_no_size(
            self.in_neigh_indices)
        self.in_neigh_edges = padding_utils.pad_3d_vals_no_size(
            self.in_neigh_edges)
        self.out_neigh_indices = padding_utils.pad_3d_vals_no_size(
            self.out_neigh_indices)
        self.out_neigh_edges = padding_utils.pad_3d_vals_no_size(
            self.out_neigh_edges)

        # masks must line up element-for-element with indices and edges
        assert self.in_neigh_mask.shape == self.in_neigh_indices.shape
        assert self.in_neigh_mask.shape == self.in_neigh_edges.shape
        assert self.out_neigh_mask.shape == self.out_neigh_indices.shape
        assert self.out_neigh_mask.shape == self.out_neigh_edges.shape

        # [batch_size, sent_len_max]
        self.sent_inp = padding_utils.pad_2d_vals(self.sent_inp,
                                                  len(self.sent_inp),
                                                  options.max_answer_len)
        self.sent_out = padding_utils.pad_2d_vals(self.sent_out,
                                                  len(self.sent_out),
                                                  options.max_answer_len)
Ejemplo n.º 8
0
    def __init__(self,
                 instances,
                 options,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 NER_vocab=None):
        """Batch of (sent1_idx, sent2_idx, sent1_text, sent2_text, ...)
        rows, padded into ndarrays.

        POS_vocab and NER_vocab are accepted for interface compatibility;
        they are not used here.
        """
        self.options = options

        self.batch_size = len(instances)
        self.vocab = word_vocab

        # bookkeeping fields stored at the tail of each instance tuple
        self.id = [row[-1] for row in instances]
        self.source = [row[-3] for row in instances]
        self.target_ref = [row[-2] for row in instances]

        # per-instance lengths; the target length counts the appended </s>
        # but is clipped to max_answer_len
        src_lengths = []
        tgt_lengths = []
        for sent1_idx, sent2_idx, _, _, _ in instances:
            src_lengths.append(len(sent1_idx))
            tgt_lengths.append(min(len(sent2_idx) + 1, options.max_answer_len))
        self.sent1_length = np.array(src_lengths, dtype=np.int32)
        self.sent2_length = np.array(tgt_lengths, dtype=np.int32)

        # create word representation
        start_id = word_vocab.getIndex('<s>')
        end_id = word_vocab.getIndex('</s>')
        if options.with_word:
            # source kept as-is; target shifted: input gets <s> prepended,
            # reference gets </s> appended
            self.sent1_word = [row[0] for row in instances]
            self.sent2_word = [row[1] + [end_id] for row in instances]
            self.sent2_input_word = [[start_id] + row[1] for row in instances]
            self.sent1_word = padding_utils.pad_2d_vals(
                self.sent1_word, len(instances), np.max(self.sent1_length))
            self.sent2_word = padding_utils.pad_2d_vals(
                self.sent2_word, len(instances), options.max_answer_len)
            self.sent2_input_word = padding_utils.pad_2d_vals(
                self.sent2_input_word, len(instances), options.max_answer_len)

            # aliases consumed by the decoder
            self.in_answer_words = self.sent2_word
            self.gen_input_words = self.sent2_input_word
            self.answer_lengths = self.sent2_length

        if options.with_char:
            self.sent1_char = []  # [batch_size, sent1_len, char_num]
            self.sent1_char_lengths = []
            for _, _, sent1_text, _, _ in instances:
                char_matrix = char_vocab.to_character_matrix_for_list(
                    sent1_text.split()[:options.max_passage_len])
                self.sent1_char.append(char_matrix)
                self.sent1_char_lengths.append(
                    [len(chars) for chars in char_matrix])
            self.sent1_char = padding_utils.pad_3d_vals_no_size(
                self.sent1_char)
            self.sent1_char_lengths = padding_utils.pad_2d_vals_no_size(
                self.sent1_char_lengths)
    def __init__(self, instances, options, word_vocab=None, char_vocab=None, POS_vocab=None, feat_vocab=None, action_vocab=None):
        """Batch over transition-based parsing instances.

        Each instance is a 7-tuple (see the unpacks below):
        (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx,
         action2cid, action2wid).  All action-aligned fields are padded to
        options.max_answer_len; word-aligned fields are padded to the batch
        maximum.
        """
        self.options = options

        self.instances = instances
        self.batch_size = len(instances)
        self.action_vocab = action_vocab
        self.feat_vocab = feat_vocab  # feature indexer kept as a batch attribute

        # create length arrays; +1 on input/concept lengths accounts for the
        # appended -NULL- token below
        self.input_length = [] # [batch_size]
        self.concept_length = [] # [batch_size]
        self.action_length = [] # [batch_size]
        for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx, action2cid, action2wid) in instances:
            self.input_length.append(input_sent.get_length()+1)
            self.concept_length.append(len(concepts_idx)+1)
            self.action_length.append(min(options.max_answer_len,len(actions_idx)))
        self.input_length = np.array(self.input_length, dtype=np.int32)
        self.concept_length = np.array(self.concept_length, dtype=np.int32)
        self.action_length = np.array(self.action_length, dtype=np.int32)

        # action sequences: input is <s> + actions shifted right by one,
        # reference is the unshifted action sequence
        start_id = action_vocab.getIndex('<s>')
        self.action_inp = []
        self.action_ref = []
        self.feats = []
        self.action2cid = []
        self.action2wid = []
        for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx, action2cid, action2wid) in instances:
            self.action_inp.append([start_id,]+actions_idx[:-1])
            self.action_ref.append(actions_idx)
            self.feats.append(feats_idx)
            self.action2cid.append(action2cid)
            self.action2wid.append(action2wid)
        self.action_inp = padding_utils.pad_2d_vals(self.action_inp, len(self.action_inp), options.max_answer_len)
        self.action_ref = padding_utils.pad_2d_vals(self.action_ref, len(self.action_ref), options.max_answer_len)
        # feats is 3-D: [batch, action_step, feat_num]; feat_num is taken from
        # the first instance's first step
        self.feats = padding_utils.pad_3d_vals(self.feats, len(self.feats), options.max_answer_len, len(self.feats[0][0]))
        self.action2cid = padding_utils.pad_2d_vals(self.action2cid, len(self.action2cid), options.max_answer_len)
        self.action2wid = padding_utils.pad_2d_vals(self.action2wid, len(self.action2wid), options.max_answer_len)

        # word/concept sequences, each terminated with a -NULL- sentinel
        append_id = word_vocab.getIndex('-NULL-')
        self.input_word = [] # [batch_size, sent_len]
        self.concept_word = [] # [batch_size, sent_len]
        for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx, action2cid, action2wid) in instances:
            self.input_word.append(input_sent.word_idx_seq+[append_id,])
            self.concept_word.append(concepts_idx+[append_id,])
        self.input_word = padding_utils.pad_2d_vals_no_size(self.input_word)
        self.concept_word = padding_utils.pad_2d_vals_no_size(self.concept_word)

        if options.with_lemma:
            self.input_lemma = []
            for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx, action2cid, action2wid) in instances:
                self.input_lemma.append(input_sent.lemma_idx_seq+[append_id,])
            self.input_lemma = padding_utils.pad_2d_vals_no_size(self.input_lemma)

        if options.with_char:
            # NOTE(review): deliberately disabled with `assert False`; the code
            # below it is currently unreachable when with_char is set.
            assert False
            self.input_char = [] # [batch_size, sent_len, char_size]
            self.input_char_len = [] # [batch_size, sent_len]
            for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx, action2cid, action2wid) in instances:
                self.input_char.append(input_sent.char_idx_matrix)
                self.input_char_len.append([len(x) for x in input_sent.tok])
            self.input_char = padding_utils.pad_3d_vals_no_size(self.input_char)
            self.input_char_len = padding_utils.pad_2d_vals_no_size(self.input_char_len)

        if options.with_POS:
            append_pos_id = POS_vocab.getIndex('-NULL-')
            self.input_POS = [] # [batch_size, sent1_len]
            for (input_sent, concepts_idx, cid2wid, feats_idx, actions_idx, action2cid, action2wid) in instances:
                self.input_POS.append(input_sent.POS_idx_seq+[append_pos_id,])
            self.input_POS = padding_utils.pad_2d_vals_no_size(self.input_POS)