Example #1
    def encode(self, sequence, emb=None):
        if emb is None:
            emb = self.emb

        #--for debug
        sequence, sequence_length = melt.pad(
            sequence,
            start_id=(vocabulary.vocab.start_id()
                      if FLAGS.encode_start_mark else None),
            end_id=(self.end_id if FLAGS.encode_end_mark else None))

        #for attention, due to a float32 numeric accuracy problem there may be some diff, so do not slice it
        #if self.is_predict:
        #  num_steps = tf.cast(tf.reduce_max(sequence_length), dtype=tf.int32)
        #  sequence = tf.slice(sequence, [0,0], [-1, num_steps])

        inputs = tf.nn.embedding_lookup(emb, sequence)
        if self.is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        output_method = FLAGS.rnn_output_method
        encode_feature, state = melt.rnn.encode(self.cell,
                                                inputs,
                                                sequence_length,
                                                cell_bw=self.bwcell,
                                                encode_method=FLAGS.rnn_method,
                                                output_method=output_method)

        return encode_feature, state
Example #2
def _decode(example, parse, dynamic_batch_length):
    features = parse(example,
                     features={
                         'image_name': tf.FixedLenFeature([], tf.string),
                         'url': tf.FixedLenFeature([], tf.string),
                         'text_str': tf.FixedLenFeature([], tf.string),
                         'ct0_str': tf.FixedLenFeature([], tf.string),
                         'title_str': tf.FixedLenFeature([], tf.string),
                         'real_title_str': tf.FixedLenFeature([], tf.string),
                         'text': tf.VarLenFeature(tf.int64),
                         'ct0': tf.VarLenFeature(tf.int64),
                         'title': tf.VarLenFeature(tf.int64),
                         'real_title': tf.VarLenFeature(tf.int64),
                     })

    image_name = features['image_name']
    text = features['text']
    input_type = 'real_title'
    input_text = features[input_type]

    maxlen = 0 if dynamic_batch_length else TEXT_MAX_WORDS
    text = melt.sparse_tensor_to_dense(text, maxlen)

    text, _ = melt.pad(text,
                       start_id=get_decoder_start_id(),
                       end_id=get_decoder_end_id())

    input_maxlen = 0 if dynamic_batch_length else INPUT_TEXT_MAX_WORDS
    input_text = melt.sparse_tensor_to_dense(input_text, input_maxlen)

    input_text, _ = melt.pad(
        input_text,
        start_id=(vocabulary.vocab.start_id()
                  if FLAGS.encode_start_mark else None),
        end_id=(encoder_end_id if FLAGS.encode_end_mark else None))

    text_str = features['text_str']
    input_text_str = features['{}_str'.format(input_type)]

    return image_name, text, text_str, input_text, input_text_str
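
As a wiring note: _decode above takes a serialized example plus a parse function. Below is a minimal sketch of how such a function is typically attached to a TFRecord input pipeline in TF1, assuming tf.parse_single_example as the parse function and that the surrounding module provides melt, FLAGS and the vocabulary helpers; the file pattern and batch size are placeholders, not from the original code:

import tensorflow as tf

# hypothetical pipeline around the _decode above
files = tf.gfile.Glob('path/to/train-*.tfrecord')  # placeholder file pattern
dataset = tf.data.TFRecordDataset(files)
dataset = dataset.map(
    lambda example: _decode(example, tf.parse_single_example,
                            dynamic_batch_length=False))
dataset = dataset.batch(32)
image_name, text, text_str, input_text, input_text_str = (
    dataset.make_one_shot_iterator().get_next())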
Example #3
def adjust(features, subset):
    if 'hist_len' not in features:
        try:
            features['hist_len'] = mt.length(features['history'])
        except Exception:
            features['hist_len'] = tf.ones_like(features['did'])

    if FLAGS.max_history:
        for key in features:
            if 'history' in key:
                max_history = FLAGS.max_history
                if 'enti' in key:
                    max_history *= 2
                if not FLAGS.fixed_pad:
                    features[key] = features[key][:, :max_history]
                else:
                    features[key] = mt.pad(features[key], max_history)

    # NOTE: news-side information should be fetched by nid; did is only used as an id feature and may be masked
    features['ori_did'] = features['did']
    features['ori_history'] = features['history']
    if 'impressions' in features:
        features['ori_impressions'] = features['impressions']

    features['did'] = mask_dids(features['did'], features['did_in_train'],
                                subset, FLAGS.test_all_mask)

    features['uid'] = mask_uids(features['uid'], subset == 'train')

    if 'history' in features:
        features['history'] = unk_aug(features['history'], subset == 'train')

    mask_negative_weights(features, subset == 'train')

    vs = gezi.get('vocab_sizes')
    if FLAGS.min_count_unk and FLAGS.min_count:
        features['uid'] = get_id(features['uid'], vs['uid'][1])
        features['did'] = get_id(features['did'], vs['did'][1])
        if FLAGS.mask_history:
            features['history'] = get_id(features['history'], vs['did'][1])
        if 'impressions' in features:
            features['impressions'] = get_id(features['impressions'],
                                             vs['did'][1])

    if vs['uid'][1] < vs['uid'][0]:
        features['uid'] = get_id(features['uid'], vs['uid'][1])

    return features
Example #4
    def build_predict_text_graph(self,
                                 input_text,
                                 decode_method=0,
                                 beam_size=5,
                                 convert_unk=True):
        with tf.variable_scope("encode"):
            encoder_output, state = self.encoder.encode(input_text)
            if not FLAGS.use_attention:
                encoder_output = None
        with tf.variable_scope("decode"):
            #---try to use static shape if possible
            batch_size = melt.get_batch_size(input_text)
            decoder_input = self.decoder.get_start_embedding_input(batch_size)
            max_words = FLAGS.decode_max_words if FLAGS.decode_max_words else TEXT_MAX_WORDS
            if decode_method == SeqDecodeMethod.greedy:
                input_text = self.encoder.sequence
                return self.decoder.generate_sequence_greedy(
                    decoder_input,
                    max_words=max_words,
                    initial_state=state,
                    attention_states=encoder_output,
                    convert_unk=convert_unk,
                    input_text=input_text)
            else:
                if decode_method == SeqDecodeMethod.beam:
                    decode_func = self.decoder.generate_sequence_beam
                elif decode_method == SeqDecodeMethod.beam_search:
                    decode_func = self.decoder.generate_sequence_beam_search
                else:
                    raise ValueError('not supported decode_method: %d' %
                                     decode_method)

                input_text, input_text_length = melt.pad(
                    input_text, end_id=self.encoder.end_id)
                #input_text = self.encoder.sequence
                #input_text_length = self.encoder.sequence_length
                return decode_func(decoder_input,
                                   max_words=max_words,
                                   initial_state=state,
                                   attention_states=encoder_output,
                                   beam_size=beam_size,
                                   convert_unk=convert_unk,
                                   length_normalization_factor=FLAGS.length_normalization_factor,
                                   input_text=input_text,
                                   input_text_length=input_text_length)
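
generate_sequence_greedy and the beam variants above belong to the project's decoder and are not shown on this page. As a rough illustration of what a greedy decoding loop of that shape usually does, here is a minimal sketch; all names (emb, w, v, cell, start_ids) and the static unroll are illustrative assumptions, not the decoder's actual API:

import tensorflow as tf

def greedy_decode_sketch(emb, w, v, cell, initial_state, start_ids, max_words):
    # at every step: embed the previous word, run one RNN step, project to the
    # vocabulary, and keep the argmax word as the next input
    state = initial_state
    token = start_ids                                      # [batch_size] int32
    tokens = []
    for _ in range(max_words):                             # static unroll for simplicity
        inputs = tf.nn.embedding_lookup(emb, token)        # [batch_size, emb_dim]
        cell_output, state = cell(inputs, state)           # single RNN step
        logits = tf.matmul(cell_output, w) + v             # [batch_size, vocab_size]
        token = tf.to_int32(tf.argmax(logits, axis=1))     # greedy choice
        tokens.append(token)
    return tf.stack(tokens, axis=1)                        # [batch_size, max_words]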
Example #5
    def build_predict_text_graph(self,
                                 image_feature,
                                 input_text,
                                 decode_method='greedy',
                                 beam_size=5,
                                 convert_unk=True):
        with tf.variable_scope("encode"):
            image_emb = self.build_image_embeddings(image_feature)
            encoder_output, state = self.encoder.encode(input_text,
                                                        input=image_emb)
            #encoder_output, state = self.encoder.encode(input_text, input=None)
            if not FLAGS.use_attention:
                encoder_output = None
        with tf.variable_scope("decode"):
            batch_size = tf.shape(input_text)[0]
            decoder_input = self.decoder.get_start_embedding_input(batch_size)
            max_words = FLAGS.decode_max_words if FLAGS.decode_max_words else TEXT_MAX_WORDS
            if decode_method == SeqDecodeMethod.greedy:
                return self.decoder.generate_sequence_greedy(
                    decoder_input,
                    max_words=max_words,
                    initial_state=state,
                    attention_states=encoder_output,
                    convert_unk=convert_unk)
            else:
                if decode_method == SeqDecodeMethod.ingraph_beam:
                    decode_func = self.decoder.generate_sequence_ingraph_beam
                elif decode_method == SeqDecodeMethod.outgraph_beam:
                    decode_func = self.decoder.generate_sequence_outgraph_beam
                else:
                    raise ValueError('not supported decode_method: %s' %
                                     decode_method)

                input_text, input_text_length = melt.pad(
                    input_text, end_id=self.encoder.end_id)
                return decode_func(decoder_input,
                                   max_words=max_words,
                                   initial_state=state,
                                   attention_states=encoder_output,
                                   beam_size=beam_size,
                                   convert_unk=convert_unk,
                                   length_normalization_factor=FLAGS.length_normalization_factor,
                                   input_text=input_text,
                                   input_text_length=input_text_length)
Example #6
    def pad(self, sequence):
        return melt.pad(
            sequence,
            start_id=(vocabulary.vocab.start_id()
                      if FLAGS.encode_start_mark else None),
            end_id=(self.end_id if FLAGS.encode_end_mark else None))
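
Every snippet on this page goes through melt.pad, which is part of the project-internal melt library and is not shown here. Inferring from how it is called above, the assumed contract is: a zero padded int sequence comes in, optional start/end marks are added, and per-row lengths are returned. A minimal sketch of that contract, purely as an illustration and not melt's actual code:

import tensorflow as tf

def pad_sketch(sequence, start_id=None, end_id=None):
    # sequence: [batch_size, num_steps], zero padded word ids
    sequence = tf.to_int32(sequence)
    length = tf.reduce_sum(tf.to_int32(tf.sign(sequence)), 1)   # real length per row
    batch_size = tf.shape(sequence)[0]
    if start_id is not None:
        # prepend a <GO>-style column
        start_col = tf.fill([batch_size, 1], start_id)
        sequence = tf.concat([start_col, sequence], 1)
        length += 1
    if end_id is not None:
        # add one extra column, then write end_id right after each row's last token
        sequence = tf.concat([sequence, tf.zeros([batch_size, 1], tf.int32)], 1)
        sequence += end_id * tf.one_hot(length, tf.shape(sequence)[1], dtype=tf.int32)
        length += 1
    return sequence, length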
Example #7
    def sequence_loss(self,
                      sequence,
                      initial_state=None,
                      attention_states=None,
                      input=None,
                      input_text=None,
                      exact_prob=False,
                      exact_loss=False,
                      emb=None):
        """
    for general seq2seq input is None, sequence will pad <GO>, inital_state is last state from encoder
    for img2text/showandtell input is image_embedding, inital_state is None/zero set
    TODO since exact_porb and exact_loss same value, may remove exact_prob
    NOTICE! assume sequence to be padded by zero and must have one instance full length(no zero!)
    """
        if emb is None:
            emb = self.emb

        is_training = self.is_training
        batch_size = tf.shape(sequence)[0]

        sequence, sequence_length = melt.pad(sequence,
                                             start_id=self.get_start_id(),
                                             end_id=self.get_end_id())

        #TODO different init state as shown in ptb_word_lm
        state = self.cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        #[batch_size, num_steps - 1, emb_dim], remove last col
        inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

        if is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        #inputs[batch_size, num_steps, emb_dim] input([batch_size, emb_dim] -> [batch_size, 1, emb_dim]) before concat
        if input is not None:
            #used like showandtell where image_emb is as input, additional to sequence
            inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
        else:
            #common usage input is None, sequence as input, notice already pad <GO> before using melt.pad
            sequence_length -= 1
            sequence = sequence[:, 1:]

        if self.is_predict:
            #---only need when predict, since train input already dynamic length, NOTICE this will improve speed a lot
            num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            sequence = sequence[:, :num_steps]
            inputs = inputs[:, :num_steps, :]

        tf.add_to_collection('sequence', sequence)
        tf.add_to_collection('sequence_length', sequence_length)

        if attention_states is None:
            outputs, state = tf.nn.dynamic_rnn(self.cell,
                                               inputs,
                                               initial_state=state,
                                               sequence_length=sequence_length,
                                               scope=self.scope)
            self.final_state = state
        else:
            attention_keys, attention_values, attention_score_fn, attention_construct_fn = \
              self.prepare_attention(attention_states)
            decoder_fn_train = melt.seq2seq.attention_decoder_fn_train(
                encoder_state=state,
                attention_keys=attention_keys,
                attention_values=attention_values,
                attention_score_fn=attention_score_fn,
                attention_construct_fn=attention_construct_fn)
            decoder_outputs_train, decoder_state_train, _ = \
                          melt.seq2seq.dynamic_rnn_decoder(
                              cell=self.cell,
                              decoder_fn=decoder_fn_train,
                              inputs=inputs,
                              sequence_length=tf.to_int32(sequence_length),
                              scope=self.scope)
            outputs = decoder_outputs_train

            self.final_state = decoder_state_train

        tf.add_to_collection('outputs', outputs)

        #[batch_size, num_steps]
        targets = sequence

        if FLAGS.copy_only:
            #TODO now not work!
            attention_scores = tf.get_collection('attention_scores')[-1]
            indices = melt.batch_values_to_indices(input_text)
            #logits = ;
        else:
            #TODO: hack here add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
            softmax_loss_function = self.softmax_loss_function
            if self.is_predict and (exact_prob or exact_loss):
                softmax_loss_function = None

            if softmax_loss_function is None:
                #[batch_size, num_steps, num_units] * [num_units, vocab_size]
                # -> logits [batch_size, num_steps, vocab_size] (if use exact_predict_loss)
                #or [batch_size * num_steps, vocab_size] by default flatten=True
                keep_dims = exact_prob or exact_loss
                logits = melt.batch_matmul_embedding(
                    outputs, self.w, keep_dims=keep_dims) + self.v
                if not keep_dims:
                    targets = tf.reshape(targets, [-1])
            else:
                logits = outputs

            mask = tf.cast(tf.sign(targets), dtype=tf.float32)

            if self.is_predict and exact_prob:
                #generate real prob for sequence
                #for 10w vocab textsum seq2seq 20 -> 4 about
                loss = melt.seq2seq.exact_predict_loss(logits, targets, mask,
                                                       num_steps, batch_size)
            elif self.is_predict and exact_loss:
                #force no sampled softmax loss; the diff with exact_prob is that here we just use the cross entropy error as the result, not the real prob of the seq
                #NOTICE this uses a bit less time, 55 vs 57 (prob), and gives the same result as exact prob and exact score
                #but a 256 vocab sample will use only about 10ms
                #TODO check more with softmax loss and sampled softmax loss, check length normalization
                loss = melt.seq2seq.sequence_loss_by_example(logits,
                                                             targets,
                                                             weights=mask)
            else:
                #loss [batch_size,]
                loss = melt.seq2seq.sequence_loss_by_example(
                    logits,
                    targets,
                    weights=mask,
                    softmax_loss_function=softmax_loss_function)

        #mainly for compat with [batch_size, num_losses]
        loss = tf.reshape(loss, [-1, 1])

        if self.is_predict:
            loss = self.normalize_length(loss, sequence_length, exact_prob)
            #loss = tf.squeeze(loss)  TODO: later will uncomment this with all models rerun
        return loss
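
melt.seq2seq.sequence_loss_by_example is also internal to the project; the mask = tf.sign(targets) trick above works because id 0 is the padding id. A minimal sketch of the kind of masked per-example loss it is assumed to compute, using only standard TF ops (an illustration, not the actual melt implementation):

import tensorflow as tf

def masked_sequence_loss_sketch(logits, targets):
    # logits: [batch_size, num_steps, vocab_size], targets: [batch_size, num_steps] with 0 = padding
    mask = tf.to_float(tf.sign(targets))
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
    # zero the padded steps, then average each example's loss over its real steps
    return tf.reduce_sum(xent * mask, 1) / tf.maximum(tf.reduce_sum(mask, 1), 1.0)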
Example #8
    def sequence_loss(self,
                      sequence,
                      initial_state=None,
                      attention_states=None,
                      input=None,
                      input_text=None,
                      exact_prob=False,
                      exact_loss=False,
                      emb=None):
        """
    for general seq2seq input is None, sequence will pad <GO>, inital_state is last state from encoder
    for img2text/showandtell input is image_embedding, inital_state is None/zero set
    TODO since exact_porb and exact_loss same value, may remove exact_prob
    NOTICE! assume sequence to be padded by zero and must have one instance full length(no zero!)
    """
        if emb is None:
            emb = self.emb

        is_training = self.is_training
        batch_size = tf.shape(sequence)[0]

        sequence, sequence_length = melt.pad(sequence,
                                             start_id=self.get_start_id(),
                                             end_id=self.get_end_id())

        #[batch_size, num_steps - 1, emb_dim], remove last col
        inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

        if is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        #inputs[batch_size, num_steps, emb_dim] input([batch_size, emb_dim] -> [batch_size, 1, emb_dim]) before concat
        if input is not None:
            #used like showandtell where image_emb is as input, additional to sequence
            inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
        else:
            #common usage input is None, sequence as input, notice already pad <GO> before using melt.pad
            sequence_length -= 1
            sequence = sequence[:, 1:]

        if self.is_predict:
            #---only need when predict, since train input already dynamic length, NOTICE this will improve speed a lot
            num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            sequence = sequence[:, :num_steps]
            inputs = inputs[:, :num_steps, :]

        tf.add_to_collection('sequence', sequence)
        tf.add_to_collection('sequence_length', sequence_length)

        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(attention_states,
                                          initial_state=initial_state)
            #initial_state = None
            initial_state = cell.zero_state(batch_size, tf.float32)
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        #if attention_states is None:
        #-----TODO using attention_wrapper works now with dynamic_rnn but is still slower than the old attention method...
        outputs, state = tf.nn.dynamic_rnn(cell,
                                           inputs,
                                           initial_state=state,
                                           sequence_length=sequence_length,
                                           dtype=tf.float32,
                                           scope=self.scope)
        #else:
        #  #---below is also ok but slower: above 16+, below only 13-14 batch/s, may be due to sample id
        #  #TODO: can we make the code below as fast as tf.nn.dynamic_rnn? if sample id is not needed, remove it?
        #  #FIXME... AttentionWrapper is only 1/2 the speed compared to the old function based attention, why?
        #  #helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
        #  helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
        #  #my_decoder = tf.contrib.seq2seq.BasicDecoder(
        #  my_decoder = melt.seq2seq.BasicTrainingDecoder(
        #      cell=cell,
        #      helper=helper,
        #      initial_state=state)
        #  outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
        #  #outputs = outputs.rnn_output

        self.final_state = state

        tf.add_to_collection('outputs', outputs)

        #[batch_size, num_steps]
        targets = sequence

        if FLAGS.copy_only:
            #TODO now not work!
            attention_scores = tf.get_collection('attention_scores')[-1]
            indices = melt.batch_values_to_indices(input_text)
            #logits = ;
        else:
            #TODO: hack here add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
            softmax_loss_function = self.softmax_loss_function
            if self.is_predict and (exact_prob or exact_loss):
                softmax_loss_function = None

            if softmax_loss_function is None:
                #[batch_size, num_steps, num_units] * [num_units, vocab_size]
                # -> logits [batch_size, num_steps, vocab_size] (if use exact_predict_loss)
                #or [batch_size * num_steps, vocab_size] by default flatten=True
                keep_dims = exact_prob or exact_loss
                logits = melt.batch_matmul_embedding(
                    outputs, self.w, keep_dims=keep_dims) + self.v
                if not keep_dims:
                    targets = tf.reshape(targets, [-1])
            else:
                logits = outputs

            mask = tf.cast(tf.sign(targets), dtype=tf.float32)

            if self.is_predict and exact_prob:
                #generate real prob for sequence
                #for 10w vocab textsum seq2seq 20 -> 4 about
                loss = melt.seq2seq.exact_predict_loss(logits, targets, mask,
                                                       num_steps, batch_size)
            elif self.is_predict and exact_loss:
                #force no sampled softmax loss; the diff with exact_prob is that here we just use the cross entropy error as the result, not the real prob of the seq
                #NOTICE this uses a bit less time, 55 vs 57 (prob), and gives the same result as exact prob and exact score
                #but a 256 vocab sample will use only about 10ms
                #TODO check more with softmax loss and sampled softmax loss, check length normalization
                loss = melt.seq2seq.sequence_loss_by_example(logits,
                                                             targets,
                                                             weights=mask)
            else:
                #loss [batch_size,]
                loss = melt.seq2seq.sequence_loss_by_example(
                    logits,
                    targets,
                    weights=mask,
                    softmax_loss_function=softmax_loss_function)

        #mainly for compat with [batch_size, num_losses]
        loss = tf.reshape(loss, [-1, 1])

        if self.is_predict:
            loss = self.normalize_length(loss, sequence_length, exact_prob)
            #loss = tf.squeeze(loss)  TODO: later will uncomment this with all models rerun
        return loss
Example #9
    def sequence_loss(self,
                      sequence,
                      initial_state=None,
                      attention_states=None,
                      input=None,
                      input_text=None,
                      exact_prob=False,
                      exact_loss=False,
                      emb=None):
        """
    for general seq2seq input is None, sequence will pad <GO>, inital_state is last state from encoder
    for img2text/showandtell input is image_embedding, inital_state is None/zero set
    TODO since exact_porb and exact_loss same value, may remove exact_prob
    NOTICE! assume sequence to be padded by zero and must have one instance full length(no zero!)
    """
        if emb is None:
            emb = self.emb

        is_training = self.is_training
        batch_size = melt.get_batch_size(sequence)

        sequence, sequence_length = melt.pad(sequence,
                                             start_id=self.get_start_id(),
                                             end_id=self.get_end_id())

        #[batch_size, num_steps - 1, emb_dim], remove last col
        inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

        if is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        #inputs[batch_size, num_steps, emb_dim] input([batch_size, emb_dim] -> [batch_size, 1, emb_dim]) before concat
        if input is not None:
            #used like showandtell where image_emb is as input, additional to sequence
            inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
        else:
            #common usage input is None, sequence as input, notice already pad <GO> before using melt.pad
            sequence_length -= 1
            sequence = sequence[:, 1:]

        if self.is_predict:
            #---only need when predict, since train input already dynamic length, NOTICE this will improve speed a lot
            num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            sequence = sequence[:, :num_steps]
            inputs = inputs[:, :num_steps, :]

        tf.add_to_collection('sequence', sequence)
        tf.add_to_collection('sequence_length', sequence_length)

        #[batch_size, num_steps]
        targets = sequence

        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        if FLAGS.gen_only:
            #gen only mode
            #for attention wrapper, can not use dynamic_rnn if alignments_history=True; TODO see pointer_network in application, seems ok.. why
            outputs, state = tf.nn.dynamic_rnn(cell,
                                               inputs,
                                               initial_state=state,
                                               sequence_length=sequence_length,
                                               dtype=tf.float32,
                                               scope=self.scope)

            #--------below is ok but slower than dynamic_rnn, 3.4 -> 3.1 batch/s
            #helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            ##helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            #my_decoder = melt.seq2seq.BasicTrainingDecoder(
            ##my_decoder = tf.contrib.seq2seq.BasicDecoder(
            ##my_decoder = melt.seq2seq.BasicDecoder(
            #      cell=cell,
            #      helper=helper,
            #      initial_state=state)
            ##outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            ##outputs = outputs.rnn_output
        else:
            #---copy only or gen copy
            helper = melt.seq2seq.TrainingHelper(inputs,
                                                 tf.to_int32(sequence_length))

            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn = lambda cell_output, cell_state: self.copy_output_fn(
                    indices, batch_size, cell_output, cell_state)
            else:
                #gen_copy right now, not use switch
                sampled_values = None
                if self.softmax_loss_function is not None:
                    sampled_values = tf.nn.log_uniform_candidate_sampler(
                        true_classes=tf.reshape(targets, [-1, 1]),
                        num_true=1,
                        num_sampled=self.num_sampled,
                        unique=True,
                        range_max=self.vocab_size)
                    #TODO since perf of the sampled version here is ok, not modifying now; but actually in addition to sampled_values,
                    #sampled_w, sampled_b could also be pre embedding lookups, may improve but not much
                output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(
                    time, indices, targets, sampled_values, batch_size,
                    cell_output, cell_state)

            my_decoder = melt.seq2seq.BasicTrainingDecoder(
                cell=cell,
                helper=helper,
                initial_state=state,
                vocab_size=self.vocab_size,
                output_fn=output_fn)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)

        tf.add_to_collection('outputs', outputs)

        #TODO: hack here add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
        softmax_loss_function = self.softmax_loss_function
        if self.is_predict and (exact_prob or exact_loss):
            softmax_loss_function = None

        if not FLAGS.gen_only:
            logits = outputs
            softmax_loss_function = None
        elif softmax_loss_function is not None:
            logits = outputs
        else:
            #[batch_size, num_steps, num_units] * [num_units, vocab_size]
            # -> logits [batch_size, num_steps, vocab_size] (if use exact_predict_loss)
            #or [batch_size * num_steps, vocab_size] by default flatten=True
            keep_dims = exact_prob or exact_loss
            logits = melt.batch_matmul_embedding(
                outputs, self.w, keep_dims=keep_dims) + self.v
            if not keep_dims:
                targets = tf.reshape(targets, [-1])

        tf.add_to_collection('logits', logits)

        #if input_text is not None:
        #  logits = outputs

        mask = tf.cast(tf.sign(targets), dtype=tf.float32)

        if FLAGS.gen_copy_switch:
            #TODO why need more gpu mem ? ...  do not save logits ? just calc loss in output_fn ?
            #batch size 256
            #File "/home/gezi/mine/hasky/util/melt/seq2seq/loss.py", line 154, in body
            #step_logits = logits[:, i, :]
            #ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[256,21,33470]
            num_steps = tf.shape(targets)[1]

            loss = melt.seq2seq.exact_predict_loss(logits,
                                                   targets,
                                                   mask,
                                                   num_steps,
                                                   need_softmax=False,
                                                   need_average=True,
                                                   batch_size=batch_size)

            # loss = melt.seq2seq.sequence_loss_by_example(
            #     logits,
            #     targets,
            #     weights=mask)
        elif self.is_predict and exact_prob:
            #generate real prob for sequence
            #for 10w vocab textsum seq2seq 20 -> 4 about
            loss = melt.seq2seq.exact_predict_loss(logits,
                                                   targets,
                                                   mask,
                                                   num_steps,
                                                   batch_size=batch_size)
        elif self.is_predict and exact_loss:
            #force no sampled softmax loss; the diff with exact_prob is that here we just use the cross entropy error as the result, not the real prob of the seq
            #NOTICE this uses a bit less time, 55 vs 57 (prob), and gives the same result as exact prob and exact score
            #but a 256 vocab sample will use only about 10ms
            loss = melt.seq2seq.sequence_loss_by_example(logits,
                                                         targets,
                                                         weights=mask)
        else:
            #loss [batch_size,]
            loss = melt.seq2seq.sequence_loss_by_example(
                logits,
                targets,
                weights=mask,
                softmax_loss_function=softmax_loss_function)

        #mainly for compat with [batch_size, num_losses]
        loss = tf.reshape(loss, [-1, 1])

        if self.is_predict:
            loss = self.normalize_length(loss, sequence_length, exact_prob)
            #loss = tf.squeeze(loss)  TODO: later will uncomment this with all models rerun
        return loss
Example #10
    def sequence_loss(self,
                      sequence,
                      initial_state=None,
                      attention_states=None,
                      input=None,
                      input_text=None,
                      exact_prob=False,
                      exact_loss=False,
                      emb=None):
        """
    for general seq2seq input is None, sequence will pad <GO>, inital_state is last state from encoder
    for showandtell input is image_embedding, inital_state is None/zero set, if use im2txt mode set image_as_init_state=True will do as above, need to PAD <GO> !
    TODO since exact_porb and exact_loss same value, may remove exact_prob
    NOTICE! assume sequence to be padded by zero and must have one instance full length(no zero!)
    """
        if emb is None:
            emb = self.emb

        is_training = self.is_training
        batch_size = melt.get_batch_size(sequence)

        sequence, sequence_length = melt.pad(sequence,
                                             start_id=self.get_start_id(),
                                             end_id=self.get_end_id())

        #[batch_size, num_steps - 1, emb_dim], remove last col
        inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

        if is_training and FLAGS.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

        #inputs[batch_size, num_steps, emb_dim] input([batch_size, emb_dim] -> [batch_size, 1, emb_dim]) before concat
        if input is not None:
            #used like showandtell where image_emb is as input, additional to sequence
            inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
        else:
            #common usage input is None, sequence as input, notice already pad <GO> before using melt.pad
            sequence_length -= 1
            sequence = sequence[:, 1:]

        if self.is_predict:
            #---only need when predict, since train input already dynamic length, NOTICE this will improve speed a lot
            num_steps = tf.to_int32(tf.reduce_max(sequence_length))
            sequence = sequence[:, :num_steps]
            inputs = inputs[:, :num_steps, :]

        tf.add_to_collection('sequence', sequence)
        tf.add_to_collection('sequence_length', sequence_length)

        #[batch_size, num_steps]
        targets = sequence

        if attention_states is None:
            cell = self.cell
        else:
            cell = self.prepare_attention(
                attention_states,
                initial_state=initial_state,
                score_as_alignment=self.score_as_alignment)
            initial_state = None
        state = cell.zero_state(
            batch_size, tf.float32) if initial_state is None else initial_state

        #TODO: hack here add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
        softmax_loss_function = self.softmax_loss_function
        if self.is_predict and (exact_prob or exact_loss):
            softmax_loss_function = None

        scheduled_sampling_probability = FLAGS.scheduled_sampling_probability if self.is_training else 0.
        if FLAGS.gen_only:
            #gen only mode
            #for attention wrapper, can not use dynamic_rnn if alignments_history=True; TODO see pointer_network in application, seems ok.. why
            if scheduled_sampling_probability > 0.:
                helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs, tf.to_int32(sequence_length), emb,
                    tf.constant(FLAGS.scheduled_sampling_probability))
                #helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
                my_decoder = melt.seq2seq.BasicDecoder(
                    #my_decoder = tf.contrib.seq2seq.BasicDecoder(
                    #my_decoder = melt.seq2seq.BasicDecoder(
                    cell=cell,
                    helper=helper,
                    initial_state=state)
                outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                    my_decoder, scope=self.scope)
                #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
                outputs = outputs.rnn_output
            else:
                outputs, state = tf.nn.dynamic_rnn(
                    cell,
                    inputs,
                    initial_state=state,
                    sequence_length=sequence_length,
                    dtype=tf.float32,
                    scope=self.scope)

            #--------below is ok but slower than dynamic_rnn, 3.4 -> 3.1 batch/s
            #helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            ##helper = tf.contrib.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
            #my_decoder = melt.seq2seq.BasicTrainingDecoder(
            ##my_decoder = tf.contrib.seq2seq.BasicDecoder(
            ##my_decoder = melt.seq2seq.BasicDecoder(
            #      cell=cell,
            #      helper=helper,
            #      initial_state=state)
            ##outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            ##outputs = outputs.rnn_output
        else:
            #---copy only or gen copy
            if scheduled_sampling_probability > 0.:
                #not tested yet TODO
                helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs, tf.to_int32(sequence_length), emb,
                    tf.constant(FLAGS.scheduled_sampling_probability))
                Decoder_ = melt.seq2seq.BasicDecoder
            else:
                #as before
                helper = melt.seq2seq.TrainingHelper(
                    inputs, tf.to_int32(sequence_length))
                Decoder_ = melt.seq2seq.BasicTrainingDecoder

            indices = melt.batch_values_to_indices(tf.to_int32(input_text))
            if FLAGS.copy_only:
                output_fn = lambda cell_output, cell_state: self.copy_output_fn(
                    indices, batch_size, cell_output, cell_state)
            else:
                #gen_copy right now, not use switch ? gen_copy and switch?
                sampled_values = None
                #TODO CHECK: is this ok? why are train and predict not equal while score/exact score are the same? FIXME
                #need to first debug why score and exact score are the same; score should be the same as train! TODO
                #sh ./inference/infrence-score.sh to reproduce
                #for now just set num_sampled = 0 to be safe; maybe training here is also not correct FIXME
                if softmax_loss_function is not None:
                    sampled_values = tf.nn.log_uniform_candidate_sampler(
                        true_classes=tf.reshape(targets, [-1, 1]),
                        num_true=1,
                        num_sampled=self.num_sampled,
                        unique=True,
                        range_max=self.vocab_size)
                    #TODO since perf of the sampled version here is ok, not modifying now; but actually in addition to sampled_values,
                    #sampled_w, sampled_b could also be pre embedding lookups, may improve but not much
                output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(
                    time, indices, targets, sampled_values, batch_size,
                    cell_output, cell_state)

            my_decoder = Decoder_(cell=cell,
                                  helper=helper,
                                  initial_state=state,
                                  vocab_size=self.vocab_size,
                                  output_fn=output_fn)
            outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                my_decoder, scope=self.scope)
            #outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
            if hasattr(outputs, 'rnn_output'):
                outputs = outputs.rnn_output

        tf.add_to_collection('outputs', outputs)

        if not FLAGS.gen_only:
            logits = outputs
            softmax_loss_function = None
        elif softmax_loss_function is not None:
            logits = outputs
        else:
            #--softmax_loss_function is None means num_sample = 0 or exact_loss or exact_prob
            #[batch_size, num_steps, num_units] * [num_units, vocab_size]
            # -> logits [batch_size, num_steps, vocab_size] (if use exact_predict_loss)
            #or [batch_size * num_steps, vocab_size] by default flatten=True
            #this will be fine for train [batch_size * num_steps] but not good for eval since we want
            #get score of each instance also not good for predict
            #--------only training mode does not keep dims, but this is dangerous, since the class that calls rnn_decoder
            #needs to manually set rnn_decoder.is_training=False! TODO otherwise it will show incorrect scores in eval mode
            #but it does not affect the final model!
            keep_dims = exact_prob or exact_loss or (not self.is_training)
            logits = melt.batch_matmul_embedding(
                outputs, self.w, keep_dims=keep_dims) + self.v
            if not keep_dims:
                targets = tf.reshape(targets, [-1])

        tf.add_to_collection('logits', logits)

        mask = tf.cast(tf.sign(targets), dtype=tf.float32)

        if FLAGS.gen_copy_switch and FLAGS.switch_after_softmax:
            #TODO why need more gpu mem ? ...  do not save logits ? just calc loss in output_fn ?
            #batch size 256
            #File "/home/gezi/mine/hasky/util/melt/seq2seq/loss.py", line 154, in body
            #step_logits = logits[:, i, :]
            #ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[256,21,33470]
            num_steps = tf.shape(targets)[1]

            loss = melt.seq2seq.exact_predict_loss(
                logits,
                targets,
                mask,
                num_steps,
                need_softmax=False,
                average_across_timesteps=not self.is_predict,
                batch_size=batch_size)
        elif self.is_predict and exact_prob:
            #generate real prob for sequence
            #for 10w vocab textsum seq2seq 20 -> 4 about
            loss = melt.seq2seq.exact_predict_loss(
                logits,
                targets,
                mask,
                num_steps,
                batch_size=batch_size,
                average_across_timesteps=False)
        elif self.is_predict and exact_loss:
            #force no sampled softmax loss; the diff with exact_prob is that here we just use the cross entropy error as the result, not the real prob of the seq
            #NOTICE this uses a bit less time, 55 vs 57 (prob), and gives the same result as exact prob and exact score
            #but a 256 vocab sample will use only about 10ms
            loss = melt.seq2seq.sequence_loss_by_example(
                logits, targets, weights=mask, average_across_timesteps=False)
        else:
            #loss [batch_size,]
            loss = melt.seq2seq.sequence_loss_by_example(
                logits,
                targets,
                weights=mask,
                average_across_timesteps=not self.is_predict,  #train must average, otherwise long sentences get a big loss..
                softmax_loss_function=softmax_loss_function)

        #mainly for compat with [batch_size, num_losses]; here it may be [batch_size * num_steps,] if is_training and not exact loss/prob
        loss = tf.reshape(loss, [-1, 1])

        self.ori_loss = loss
        if self.is_predict:
            #note: use avg_loss so as not to change the loss pointer; avg_loss is the same as average_across_timesteps=True with length_normalization_factor=1.0
            avg_loss = self.normalize_length(loss, sequence_length)
            return avg_loss

        #if not is_predict, loss is averaged per time step; otherwise it is not, but avg_loss will average it
        return loss
Example #11
  def sequence_loss(self, input, sequence, initial_state=None, emb=None):
    if emb is None:
      emb = self.emb
    
    is_training = self.is_training
    batch_size = tf.shape(sequence)[0]

    sequence, sequence_length = melt.pad(sequence,
                                     start_id=self.get_start_id(),
                                     end_id=self.get_end_id())

    #TODO different init state as shown in ptb_word_lm
    state = self.cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

    #[batch_size, num_steps - 1, emb_dim], remove last col
    inputs = tf.nn.embedding_lookup(emb, melt.dynamic_exclude_last_col(sequence))
    
    if is_training and FLAGS.keep_prob < 1:
      inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)
    
    #inputs[batch_size, num_steps, emb_dim] input([batch_size, emb_dim] -> [batch_size, 1, emb_dim]) before concat
    if input is not None:
      #used like showandtell where image_emb is as input, additional to sequence
      inputs = tf.concat(1, [tf.expand_dims(input, 1), inputs])
    else:
      #common usage input is None, sequence as input, notice already pad <GO> before using melt.pad
      sequence_length -= 1
      sequence = sequence[:, 1:]
    
    if self.is_predict:
      #---only need when predict, since train input already dynamic length, NOTICE this will improve speed a lot
      num_steps = tf.cast(tf.reduce_max(sequence_length), dtype=tf.int32)
      sequence = tf.slice(sequence, [0,0], [-1, num_steps])

    outputs, state = tf.nn.dynamic_rnn(self.cell, inputs, 
                                       initial_state=state, 
                                       sequence_length=sequence_length)
    self.final_state = state
    
    if self.softmax_loss_function is None:
      #[batch_size, num_steps, num_units] * [num_units, vocab_size] -> logits [batch_size, num_steps, vocab_size]
      logits = melt.batch_matmul_embedding(outputs, self.w) + self.v
    else:
      logits = outputs

    #[batch_size, num_steps]
    targets = sequence
    mask = tf.cast(tf.sign(sequence), dtype=tf.float32)
    
    if self.is_predict and FLAGS.predict_no_sample:
      loss = melt.seq2seq.exact_predict_loss(logits, batch_size, num_steps)
    else:
      #loss [batch_size,] 
      loss = melt.seq2seq.sequence_loss_by_example(
          logits,
          targets,
          weights=mask,
          softmax_loss_function=self.softmax_loss_function)
    
    #mainly for compat with [batch_size, num_losses]
    loss = tf.reshape(loss, [-1, 1])
    if self.is_predict:
      loss = self.normalize_length(loss, sequence_length)
 
    return loss