Example #1
    def translate(self, source1_segments, source2_segments,
                  translation_settings):
        """
        Returns the translation of @param source_segments.
        """
        logging.info('Translating {0} segments...\n'.format(
            len(source1_segments)))
        n_samples, source1_sentences, source2_sentences = self._send_jobs(
            source1_segments, source2_segments, translation_settings)

        translations = []
        for i, trans in enumerate(
                self._retrieve_jobs(n_samples,
                                    translation_settings.request_id)):

            samples, scores, word_probs, alignment1, alignment2, hyp_graph = trans
            # n-best list
            if translation_settings.n_best is True:
                order = numpy.argsort(scores)
                n_best_list = []
                for j in order:
                    current_alignment1 = alignment1[j] if translation_settings.get_alignment else None
                    current_alignment2 = alignment2[j] if translation_settings.get_alignment else None
                    translation = Translation(
                        sentence_id=i,
                        source1_words=source1_sentences[i],
                        source2_words=source2_sentences[i],
                        target_words=seqs2words(samples[j],
                                                self._word_idict_trg,
                                                join=False),
                        score=scores[j],
                        alignment1=current_alignment1,
                        alignment2=current_alignment2,
                        target_probs=word_probs[j],
                        hyp_graph=hyp_graph,
                        hypothesis_id=j)
                    n_best_list.append(translation)
                translations.append(n_best_list)
            # single-best translation
            else:
                current_alignment1 = alignment1 if translation_settings.get_alignment else None
                current_alignment2 = alignment2 if translation_settings.get_alignment else None
                translation = Translation(sentence_id=i,
                                          source1_words=source1_sentences[i],
                                          source2_words=source2_sentences[i],
                                          target_words=seqs2words(
                                              samples,
                                              self._word_idict_trg,
                                              join=False),
                                          score=scores,
                                          alignment1=current_alignment1,
                                          alignment2=current_alignment2,
                                          target_probs=word_probs,
                                          hyp_graph=hyp_graph)
                translations.append(translation)
        return translations
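
When translation_settings.n_best is enabled, this method returns a list of n-best lists rather than a flat list of Translation objects. A minimal sketch of a consumer that handles both shapes; only the Translation constructor fields used above are relied on, while translator and settings are hypothetical names for illustration:

# hypothetical consumer of translate()'s return value
results = translator.translate(source1_segments, source2_segments, settings)
for item in results:
    if isinstance(item, list):   # n-best mode: one list per input sentence
        for hyp in item:
            print hyp.hypothesis_id, hyp.score, ' '.join(hyp.target_words)
    else:                        # single-best mode: one Translation object
        print item.score, ' '.join(item.target_words)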
Example #2
    def translate(self, source_segments, translation_settings, references=None, scorer=None):
        """
        Returns the translation of @param source_segments.
        """
        logging.info('Translating {0} segments...\n'.format(len(source_segments)))
        n_samples, source_sentences, ref_sentences = self._send_jobs(source_segments, translation_settings, ref_=references)

        translations = []
        for i, trans in enumerate(self._retrieve_jobs(n_samples, translation_settings.request_id)):

            samples, scores, word_probs, alignment, hyp_graph = trans
            # references for the counterfactual log
            reference = None
            if references is not None and scorer is not None:
                scorer.set_reference(ref_sentences[i])
                reference = ref_sentences[i]
            # n-best list
            if translation_settings.n_best is True:
                order = numpy.argsort(scores)
                n_best_list = []
                for j in order:
                    evaluation_score = None
                    target_words = seqs2words(samples[j], self._word_idict_trg, join=False)
                    if references is not None and scorer is not None:
                        evaluation_score = scorer.score(target_words)
                    current_alignment = None if not translation_settings.get_alignment else alignment[j]
                    translation = Translation(sentence_id=i,
                                              source_words=source_sentences[i],
                                              target_words=target_words,
                                              score=scores[j],
                                              alignment=current_alignment,
                                              target_probs=word_probs[j],
                                              hyp_graph=hyp_graph,
                                              hypothesis_id=j,
                                              reference=reference,
                                              evaluation_score=evaluation_score)
                    n_best_list.append(translation)
                translations.append(n_best_list)
            # single-best translation
            else:
                current_alignment = None if not translation_settings.get_alignment else alignment
                evaluation_score = None
                target_words = seqs2words(samples, self._word_idict_trg, join=False)
                if references is not None and scorer is not None:
                    evaluation_score = scorer.score(target_words)
                translation = Translation(sentence_id=i,
                                          source_words=source_sentences[i],
                                          target_words=target_words,
                                          score=scores,
                                          alignment=current_alignment,
                                          target_probs=word_probs,
                                          hyp_graph=hyp_graph,
                                          reference=reference,
                                          evaluation_score=evaluation_score)
                translations.append(translation)
        return translations
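
Example #2 only requires that scorer expose set_reference() and score() over token lists. A minimal sketch of a compatible scorer, here a toy unigram-precision stub rather than the project's real metric:

class SentenceScorerSketch(object):
    # Interface assumed by translate(): set_reference(), then score().
    def __init__(self):
        self.reference = set()

    def set_reference(self, reference_tokens):
        self.reference = set(reference_tokens)

    def score(self, hypothesis_tokens):
        # toy unigram precision, NOT a real MT metric
        if not hypothesis_tokens:
            return 0.0
        hits = sum(1 for w in hypothesis_tokens if w in self.reference)
        return float(hits) / len(hypothesis_tokens)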
Example #3
    def score(self, x_or_y):
        # x shape: (1, N, M); y shape: (N, M)  todo: work with factors
        if len(x_or_y.shape) > 2:
            x_or_y = numpy.squeeze(x_or_y, axis=0)
        # Nematus is generally called on 1) tokenized, 2) truecased, 3) BPE
        # data, so we train KenLM on tokenized, truecased data. Therefore all
        # we need to do here is convert each sequence to a string and de-BPE.
        sentences = [
            deBPE(seqs2words(seq, self.id_to_word)) for seq in x_or_y.T
        ]
        scores = self.model.score(sentences)
        #try:
        #    print 'remote LM sentences/scores:'
        #    for sent, score in zip(sentences, scores):
        #        print '"'+sent+'":', score
        #except Exception, e:
        #    print 'failed to print LM sentences/scores', e
        return scores
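
Here deBPE undoes Nematus-style subword segmentation, where BPE split points are marked by a trailing "@@". A minimal sketch of such a helper; the original codebase's version may differ in detail:

import re

def deBPE(sentence):
    # Rejoin subword units: 'un@@ believ@@ able' -> 'unbelievable'.
    # Also drops a dangling '@@' at the end of the line.
    return re.sub(r'@@( |$)', '', sentence)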
Example #4
def process_examples(f_init,
                     f_next,
                     imgid,
                     contexts,
                     cnn_feats,
                     word_idict,
                     options,
                     k=4,
                     normalize=False,
                     debug=False):
    # NOTE: 'trng' (the Theano random stream) is assumed to be defined at
    # module level; it is used below but never passed in.
    caps = []
    # fewer CNN features than contexts: share cnn_feats[0] across all samples
    if len(cnn_feats) < len(contexts):
        for idx, ctx in enumerate(contexts):
            if options['with_glove']:
                ctx_cutoff = ctx[:options['cutoff'] * options['semantic_dim']].reshape(
                    (options['cutoff'], options['semantic_dim']))
            else:
                ctx_cutoff = ctx[:options['cutoff']]
            if len(f_init) > 1 and len(f_next) > 1:
                sample, score, alpha = gen_sample_ensemble(f_init,
                                                           f_next,
                                                           ctx_cutoff,
                                                           cnn_feats[0],
                                                           options,
                                                           trng=trng,
                                                           k=k,
                                                           maxlen=30)
            else:
                sample, score, alpha = gen_sample(f_init[0],
                                                  f_next[0],
                                                  ctx_cutoff,
                                                  cnn_feats[0],
                                                  options,
                                                  trng=trng,
                                                  k=k,
                                                  maxlen=30)
            if normalize:
                lengths = numpy.array([len(s) for s in sample])
                score = score / lengths
            sidx = numpy.argmin(score)
            # write result into caption format
            caps.append({
                'image_id': imgid[idx],
                'caption': seqs2words(sample[sidx], word_idict)
            })
            if idx % 100 == 0:
                print 'Sample %d/%d' % (idx, len(contexts))
        return caps
    elif len(cnn_feats) == len(contexts):
        for idx, ctx, ctx_cnn in zip(range(len(contexts)), contexts,
                                     cnn_feats):
            if options['with_glove']:
                ctx_cutoff = ctx[:options['cutoff'] * options['semantic_dim']].reshape(
                    (options['cutoff'], options['semantic_dim']))
            else:
                ctx_cutoff = ctx[:options['cutoff']]
            # generate the samples
            if len(f_init) > 1 and len(f_next) > 1:
                sample, score, alpha = gen_sample_ensemble(f_init,
                                                           f_next,
                                                           ctx_cutoff,
                                                           ctx_cnn,
                                                           options,
                                                           trng=trng,
                                                           k=k,
                                                           maxlen=30)
            else:
                sample, score, alpha = gen_sample(f_init[0],
                                                  f_next[0],
                                                  ctx_cutoff,
                                                  ctx_cnn,
                                                  options,
                                                  trng=trng,
                                                  k=k,
                                                  maxlen=30)
            if normalize:
                lengths = numpy.array([len(s) for s in sample])
                score = score / lengths
            sidx = numpy.argmin(score)
            # write result into caption format
            caption = seqs2words(sample[sidx], word_idict)
            caps.append({'image_id': imgid[idx], 'caption': caption})
            if idx % 100 == 0:
                print 'Sample %d/%d' % (idx, len(contexts))
            if debug:
                if idx < 6:
                    for word, weights in zip(caption.split(), alpha[sidx]):
                        print word, weights
                    print
                else:
                    quit()
        return caps
    else:
        raise ValueError(
            "The number of CNN features does not match the number of contexts")
Example #5
    def translate(self,
                  source_segments,
                  translation_settings,
                  aux_source_segments=None):
        """
        Returns the translation of @param source_segments (and @param aux_source_segments if multi-source)
        """
        if aux_source_segments is None:  # avoid a mutable default argument
            aux_source_segments = []
        logging.info('Translating {0} segments...\n'.format(
            len(source_segments)))
        if len(aux_source_segments) > 0:
            n_samples, multiple_source_sentences = self._send_jobs_multisource(
                source_segments, aux_source_segments, translation_settings)
        else:
            # TODO: make this one the generic send jobs
            n_samples, multiple_source_sentences = self._send_jobs_multisource(
                source_segments, [], translation_settings)

        #os.sys.stderr.write(str(translation_settings.predicted_trg)+"\n")

        translations = []

        for i, trans in enumerate(
                self._retrieve_jobs(n_samples,
                                    translation_settings.request_id)):

            # previous target sentence (take predicted previous sentence)
            if translation_settings.predicted_trg and aux_source_segments:
                if i == 0:
                    current_aux = "<START>"
                else:
                    os.sys.stderr.write("Using previous translation...")
                    current_aux = translations[i - 1]

            # just use the auxiliary input provided
            else:
                # handle potential multi-source input
                current_aux = [ss[i] for ss in multiple_source_sentences[1:]]

            samples, scores, word_probs, alignments, hyp_graph = trans

            # n-best list
            if translation_settings.n_best is True:
                order = numpy.argsort(scores)
                n_best_list = []
                for j in order:
                    current_alignment = alignments[0][j] if translation_settings.get_alignment else None

                    aux_current_alignments = []  # list for multi-source
                    for e in range(self.num_encoders - 1):
                        aux_current_alignments.append(
                            alignments[e + 1][j]
                            if translation_settings.get_alignment else None)

                    translation = Translation(
                        sentence_id=i,
                        source_words=multiple_source_sentences[0][i],
                        target_words=seqs2words(samples[j],
                                                self._word_idict_trg,
                                                join=False),
                        score=scores[j],
                        alignment=current_alignment,
                        target_probs=word_probs[j],
                        hyp_graph=hyp_graph,
                        hypothesis_id=j,
                        aux_source_words=current_aux,  # list of extra inputs
                        aux_alignment=aux_current_alignments)
                    n_best_list.append(translation)
                translations.append(n_best_list)
            # single-best translation
            else:
                current_alignment = alignments[0] if translation_settings.get_alignment else None

                aux_current_alignments = []  # list for multi-source
                for e in range(self.num_encoders - 1):
                    aux_current_alignments.append(
                        alignments[e + 1]
                        if translation_settings.get_alignment else None)

                translation = Translation(
                    sentence_id=i,
                    source_words=multiple_source_sentences[0][i],
                    target_words=seqs2words(samples,
                                            self._word_idict_trg,
                                            join=False),
                    score=scores,
                    alignment=current_alignment,
                    target_probs=word_probs,
                    hyp_graph=hyp_graph,
                    aux_source_words=current_aux,  # list of extra inputs
                    aux_alignment=aux_current_alignments)
                translations.append(translation)
        return translations
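
A hypothetical driver for the multi-source translate above; the settings container and its fields are assumptions inferred from how they are used in the method, not a confirmed API:

# minimal sketch, assuming a Nematus-style settings container
settings = TranslationSettings()
settings.request_id = 0           # any unique job id
settings.n_best = False           # single-best output
settings.get_alignment = False    # skip alignment matrices
settings.predicted_trg = False    # use the provided auxiliary input

translations = translator.translate(source_segments,
                                    settings,
                                    aux_source_segments=aux_segments)
for t in translations:
    print ' '.join(t.target_words)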
Example #6
def train(
    dim_word=300,  # word vector dimensionality
    ctx_dim=300,  # context vector dimensionality
    semantic_dim=300,
    dim=1000,  # the number of LSTM units
    cnn_dim=4096,  # CNN feature dimension
    n_layers_att=1,  # number of layers used to compute the attention weights
    n_layers_out=1,  # number of layers used to compute logit
    n_layers_lstm=1,  # number of lstm layers
    n_layers_init=1,  # number of layers to initialize LSTM at time 0
    lstm_encoder=True,  # if True, run bidirectional LSTM on input units
    prev2out=False,  # Feed previous word into logit
    ctx2out=False,  # Feed attention weighted ctx into logit
    cutoff=10,
    patience=5,
    max_epochs=30,
    dispFreq=500,
    decay_c=0.,  # weight decay coeff
    alpha_c=0.,  # doubly stochastic coeff
    lrate=1e-4,  # used only for SGD
    selector=False,  # selector (see paper)
    maxlen=30,  # maximum length of the description
    optimizer='rmsprop',
    pretrained='',
    batch_size=256,
    saveto='model',  # relative path of saved model file
    saveFreq=1000,  # save the parameters after every saveFreq updates
    sampleFreq=100,  # generate some samples after every sampleFreq updates
    embedding='../Data/GloVe/vocab_glove.pkl',
    cnn_type='vgg',
    prefix='../Data',  # path to find data
    dataset='coco',
    criterion='Bleu_4',
    switch_test_val=False,
    use_cnninit=True,
    use_dropout=True,  # setting this true turns on dropout at various points
    use_dropout_lstm=False,  # dropout on lstm gates
    save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(path=osp.join(
            model_options['prefix'], model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_test=True)
    else:
        train, valid, worddict = load_data(path=osp.join(
            model_options['prefix'], model_options['dataset']),
                                           options=model_options,
                                           load_train=True,
                                           load_val=True)

    # Automatically calculate the validation frequency
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    # Initialize it with glove
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(
            model_options['embedding']).astype('float32')

    # If a checkpoint with the same name already exists, reload it instead
    # of using the pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        params = load_params(pretrained, params,
                             False)  # Only pretrain the Language model

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) alphas - attention weights (regularized below)
    #   5) cost - log likelihood for each sentence
    #   6) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, inps, alphas, cost, opt_outs = \
        build_model(tparams, model_options)

    # Load evaluator to calculate bleu score
    evaluator = cocoEvaluation(model_options['dataset'])

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the
    # distribution over words and also the new "initial state/memory" (see the
    # equations in the paper)
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps,
                                  -cost,
                                  profile=False,
                                  updates=None,
                                  allow_input_downcast=True)

    # Define the cost function + Regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([
            alpha_c * ((1. - alpha.sum(0))**2).sum(0).mean()
            for alpha in alphas
        ])
        cost += alpha_reg

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    # to get the cost after regularization or the gradients, use this

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams,
                                                               grads, inps,
                                                               cost)

    print 'Optimization'
    train_iter = HomogeneousData(train,
                                 batch_size=batch_size,
                                 maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log, reload history
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2],
                                                   worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero samples under length ', model_options['maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)

            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)

            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(
                        f_init,
                        f_next,
                        ctx_s[jj],
                        cnn_feats[jj],
                        model_options,
                        trng=trng,
                        maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1],
                                        valid[2], word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'),
                          'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] +
                                        eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation log likelihood is saved
                # separately under a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    numpy.savez(saveto + '_bestll',
                                history_bleu=history_bleu,
                                **params)
                    bad_counter = 0

                # abort training if the validation score has not improved
                # for too long
                if len(history_bleu) > model_options['patience'] and \
                        history_bleu[-1] <= max(
                            history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' Validation %s score ' % model_options['criterion'], history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1),
                        history_bleu=history_bleu,
                        **unzip(tparams))

    # use the best validation parameters for the final checkpoint (if any)
    if best_p is not None:
        zipp(best_p, tparams)
        params = copy.copy(best_p)
    else:
        params = unzip(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_bleu=history_bleu,
                **params)
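
A hypothetical invocation of train() with a few of the defaults overridden; the dataset key and paths are placeholders matching the defaults above:

if __name__ == '__main__':
    train(dataset='coco',          # placeholder dataset key
          prefix='../Data',        # data root, as in the default
          saveto='model_coco',     # checkpoint file prefix
          batch_size=128,
          max_epochs=20,
          save_per_epoch=True)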