Beispiel #1
0
def main(cur_params):
  """Run the discriminator evaluation on 'val' for every listed checkpoint.

  The paths in cur_params['checkpoints'] are processed in order. The data
  provider, the evaluator and its compiled functions are built only once,
  from the params stored in the first checkpoint, and then reused. For each
  checkpoint its 'modelEval' entry is passed to zipp() together with the
  evaluator model before eval_discrm_gen() is called twice with different
  ``probs`` settings.

  Args:
      cur_params: dict with a 'checkpoints' entry holding an iterable of
          checkpoint file paths. An empty iterable makes this a no-op.
  """
  for i, cpf in enumerate(cur_params['checkpoints']):
    # Use a context manager so the checkpoint file is closed right away
    # instead of leaking the handle until garbage collection.
    # NOTE: pickle can execute arbitrary code; only load trusted checkpoints.
    with open(cpf, 'rb') as cp_file:
        checkpoint = pickle.load(cp_file)
    # A checkpoint without 'modelEval' falls back to an empty dict.
    model_init_eval_from = checkpoint.get('modelEval', {})
    params = checkpoint['params']

    # Build the data provider and the evaluator graph on the first pass only.
    if i == 0:
        dp = getDataProvider(params)
        evaluator = decodeEvaluator(params)
        (eval_inp_list, f_pred_fns, costs, predTh,
         modelEval) = evaluator.build_advers_eval(evaluator.model_th, params)

    misc = checkpoint['misc']

    zipp(model_init_eval_from, modelEval)
    evaluator.use_noise.set_value(1.)

    # The label shown is the third-from-last '_'-separated field of the path.
    print('----------------------- Running model %s  -------------------------------'%(cpf.split('_')[-3]))
    print('Evaluating GT 5 vs Negative samples from GT')
    eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, probs = [0.5, 0.5, 0.0])
    print('-------------------------------------------------------------------------')
    print('Evaluating GT vs repeated GT')
    eval_discrm_gen('val', dp, params, f_pred_fns[0], misc, probs = [0.5, 0.0, 0.5])
    print('-------------------------------------------------------------------------')
  def prepPredictor(self, model_npy, checkpoint_params, beam_size):
    """Compile the prediction function and the cost-evaluation function.

    zipp() is first called with model_npy and self.model_th (presumably to
    load the checkpoint weights into the model - confirm against zipp).
    Afterwards two Theano functions are compiled and stored on the instance:
    self.f_pred_th (outputs: log-prob, index and candidate tensors) and
    self.f_eval (output: the cost returned by build_model). The first item
    returned by build_model is kept as self.use_dropout.
    """
    zipp(model_npy, self.model_th)

    # Prediction graph, built for the requested beam size.
    pred_graph = self.build_prediction_model(self.model_th, checkpoint_params, beam_size)
    pred_inputs, log_prob, pred_idx, pred_cand = pred_graph
    self.f_pred_th = theano.function(pred_inputs,
                                     [log_prob, pred_idx, pred_cand],
                                     name='f_pred')

    # Training graph; only its cost output is compiled, for evaluation use.
    train_graph = self.build_model(self.model_th, checkpoint_params)
    (self.use_dropout, eval_inputs, _pred_prob_fn,
     eval_cost, _pred_th, _lstm_updates) = train_graph
    self.f_eval = theano.function(eval_inputs, eval_cost, name='f_eval')
    def prepPredictor(self, model_npy, checkpoint_params, beam_size):
        """Build and store the Theano functions used for prediction and scoring.

        Steps, in order:
          1. zipp(model_npy, self.model_th) - presumably copies the saved
             weights into the live model (confirm against zipp).
          2. Compile self.f_pred_th from build_prediction_model(); it returns
             the log-probability, index and candidate outputs.
          3. Compile self.f_eval from build_model(); it returns the cost.
             build_model's first return value is stored as self.use_dropout.
        """
        zipp(model_npy, self.model_th)

        # Step 2: the predictor.
        pred_parts = self.build_prediction_model(self.model_th,
                                                 checkpoint_params, beam_size)
        (gen_inputs, log_probs, indices, candidates) = pred_parts
        pred_outputs = [log_probs, indices, candidates]
        self.f_pred_th = theano.function(gen_inputs, pred_outputs, name='f_pred')

        # Step 3: the training graph, used here only to evaluate the cost.
        model_parts = self.build_model(self.model_th, checkpoint_params)
        (self.use_dropout, cost_inputs, _unused_pred_fn, cost_expr,
         _unused_pred, _unused_updates) = model_parts
        self.f_eval = theano.function(cost_inputs, cost_expr, name='f_eval')
    def prepPredictor(self,
                      model_npy=None,
                      checkpoint_params=None,
                      beam_size=5,
                      xI=None,
                      xAux=None,
                      inp_list_prev=[],
                      per_word_logweight=None):
        """Compile the prediction function and, optionally, the cost function.

        Args:
            model_npy: optional dict of model parameters. If its values are
                numpy arrays it is passed to zipp() together with
                self.model_th; otherwise it replaces self.model_th outright.
                None or an empty dict leaves the current model untouched.
            checkpoint_params: options dict. Required despite the None
                default. It is mutated: when 'advers_gen' is 1,
                'n_gen_samples' is overwritten with beam_size.
            beam_size: stored as self.beam_size.
            xI, xAux: forwarded unchanged to build_prediction_model() and
                build_model().
            inp_list_prev: list of inputs placed in front of the generator
                inputs when compiling. It is only read, never mutated, so
                the shared default list is safe.
            per_word_logweight: forwarded to build_prediction_model().

        Raises:
            ValueError: if checkpoint_params is None.

        Side effects:
            Sets self.beam_size and self.f_pred_th. When 'advers_gen' is 0
            (or absent) also sets self.use_dropout and self.f_eval.
        """
        if checkpoint_params is None:
            # Fail early with a clear message instead of an AttributeError
            # from the first .get() call below.
            raise ValueError('prepPredictor requires checkpoint_params')

        # An empty dict has no first value to inspect, so treat it like None.
        if model_npy is not None and len(model_npy) > 0:
            # Peek at one value to find out what kind of dict was passed;
            # iterating values works on both Python 2 and 3.
            first_value = next(iter(model_npy.values()))
            if isinstance(first_value, np.ndarray):
                zipp(model_npy, self.model_th)
            else:
                self.model_th = model_npy

        self.beam_size = beam_size

        # Build the predictor graph.
        if checkpoint_params.get('advers_gen', 0) == 1:
            checkpoint_params['n_gen_samples'] = beam_size
        (inp_list_gen, predLogProb, predIdx, predCand, wOut_emb, updates,
         seq_lengths) = self.build_prediction_model(
             self.model_th,
             checkpoint_params,
             xI,
             xAux,
             per_word_logweight=per_word_logweight)
        self.f_pred_th = theano.function(inp_list_prev + inp_list_gen,
                                         [predLogProb, predIdx, predCand],
                                         name='f_pred')

        # Build the training graph to get a cost function for evaluation.
        # It is skipped when 'advers_gen' is set.
        if checkpoint_params.get('advers_gen', 0) == 0:
            (self.use_dropout, inp_list_gen2, f_pred_prob, cost, predTh,
             updatesLSTM) = self.build_model(self.model_th, checkpoint_params,
                                             xI, xAux)
            self.f_eval = theano.function(inp_list_prev + inp_list_gen2,
                                          cost,
                                          name='f_eval')
def main(params):
    """Train the LSTM generator and checkpoint it when validation improves.

    The vocabulary is built from the training sentences, the Theano cost and
    gradient graphs are constructed (with an optional L2 penalty), and
    ``max_epochs`` worth of mini-batch updates are run. Roughly once a minute
    a JSON status file is written. Every ``eval_period`` epochs the validation
    perplexity is computed, the learning rate is decayed if due, and a pickle
    checkpoint is saved when the perplexity is a new best and below
    ``write_checkpoint_ppl_threshold`` (a negative threshold disables that
    second check).

    Args:
        params: dict of run options. It is mutated in place: sizes taken from
            the data provider and vocabulary are added, 'use_dropout' is set
            to 1, and 'learning_rate' / 'lr_decay_st_epoch' change as the
            decay schedule advances.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # tags the status and checkpoint file names

    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size

    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update,
                                                 lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect
    # word frequencies. This is a bit of a hack, not happy about it
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph relating the input image features and
    # word indices to the log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. It is specific to training and is not part
    # of the model definition, so it is added here.
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # Scale once, after summing. This used to happen inside the loop,
        # which multiplied the running total by 0.5 * regc again for every
        # parameter, so all but the last parameter were scaled repeatedly.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # One epoch is one pass over all the sentences, not the images.
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1  # negative means "no validation result recorded yet"
    # initially size of dictionary of confusion
    smooth_train_ppl2 = len(misc['ixtoword'])
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    # NOTE(review): model_init_from, rg_init and checkpoint_init are not
    # defined anywhere in this function; unless they exist at module level
    # this branch raises NameError. Confirm where they are meant to be loaded.
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))

    for it in xrange(max_iters):
        t0 = time.time()
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        if params['use_pos_tag'] != 'None':
            # NOTE(review): sentTagMap is not defined in this function either;
            # confirm it is a module-level name.
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None,
                                               sentTagMap, misc['ixtoword'])
        else:
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

        # Enable using dropout in training
        use_dropout.set_value(1.)

        # evaluate cost, gradient and perform parameter update
        batch_cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # training statistics
        train_ppl2 = (2**(batch_cost[1] / lenS))
        # exponentially decaying moving average, seeded with the first value
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        total_cost = batch_cost[0]

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
                it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            # Status reporting is best effort: report the failure, keep training.
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)

            # perform the evaluation on VAL set
            val_ppl2 = eval_split_theano('val', dp, model, params, misc,
                                         f_eval)

            # Decay the learning rate once the start epoch is reached, then
            # push the start epoch forward by one.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate']))
            if params['sample_by_len'] == 1:
                print(len_hist)

            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                        params['dataset'], host, params['fappend'], val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = unzip(model)
                    checkpoint['rgrads'] = unzip(rg)
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    # A failed write is reported but does not stop training.
                    try:
                        with open(filepath, "wb") as cp_fh:
                            pickle.dump(checkpoint, cp_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # todo be more clever here
                        print('tried to write checkpoint into %s but got error: ' % (
                            filepath, ))
                        print(e)
Beispiel #6
0
def main(scriptparams):
    """Score every sentence of a split with a checkpointed generator.

    The generator graph is rebuilt from the params stored in the checkpoint,
    the saved weights are loaded with zipp(), and the split named by
    scriptparams['split'] is evaluated one image-sentence pair at a time.
    The per-sentence perplexities and their '<cocoid>_<sentidx>' keys are
    saved with np.savez under 'scorelogs/', and the overall perplexity is
    printed.

    Args:
        scriptparams: dict with 'checkpoint' (path to the pickle file),
            'split' (name of the data split) and 'maxlen' (copied into the
            model params).
    """
    # NOTE: pickle can execute arbitrary code; only load trusted checkpoints.
    with open(scriptparams['checkpoint'], 'rb') as cp_file:
        checkpoint = pickle.load(cp_file)
    # Output name: checkpoint base name up to its first '.', plus the split.
    cp_stem = osp.basename(scriptparams['checkpoint']).split('.')[0]
    npfilename = osp.join('scorelogs',
                          cp_stem + '_logprob%s' % (scriptparams['split']))
    misc = checkpoint['misc']

    # fetch the data provider
    params = checkpoint['params']
    params['use_gumbel_mse'] = 0
    params['maxlen'] = scriptparams['maxlen']

    dp = getDataProvider(params)
    # Generator weights are stored under 'model' or, failing that, 'modelGen'.
    if 'model' in checkpoint:
        model_init_gen_from = checkpoint['model']
    else:
        model_init_gen_from = checkpoint['modelGen']

    lstmGenerator = decodeGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update_list,
                                                 lstmGenerator.regularize)

    # Bit 0 of 'use_encoder_for' selects an encoder for the image features.
    if params.get('use_encoder_for', 0) & 1:
        if params.get('encode_gt_sentences', 0):
            # NOTE(review): batch_size is not defined in this function; unless
            # it is a module-level name this line raises NameError.
            xI = tensor.zeros((batch_size, params['image_encoding_size']))
            imgFeatEnc_inp = []
        else:
            imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                                  params['word_encoding_size'],
                                                  params,
                                                  mdl_prefix='img_enc_',
                                                  features=dp.features.T)
            mdlLen = len(model.keys())
            model.update(imgFeatEncoder.model_th)
            # Guards against the encoder overwriting existing model keys.
            assert (len(model.keys()) == (mdlLen +
                                          len(imgFeatEncoder.model_th.keys())))
            misc['update'].extend(imgFeatEncoder.update_list)
            misc['regularize'].extend(imgFeatEncoder.regularize)
            (imgenc_use_dropout, imgFeatEnc_inp, xI,
             updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    # Bit 1 of 'use_encoder_for' selects an encoder for the auxiliary input.
    if params.get('use_encoder_for', 0) & 2:
        aux_enc_inp = model['Wemb'] if params.get('encode_gt_sentences',
                                                  0) else dp.aux_inputs.T
        hid_size = params['featenc_hidden_size']
        auxFeatEncoder = RecurrentFeatEncoder(hid_size,
                                              params['image_encoding_size'],
                                              params,
                                              mdl_prefix='aux_enc_',
                                              features=aux_enc_inp)
        mdlLen = len(model.keys())
        model.update(auxFeatEncoder.model_th)
        # Guards against the encoder overwriting existing model keys.
        assert (len(model.keys()) == (mdlLen +
                                      len(auxFeatEncoder.model_th.keys())))
        misc['update'].extend(auxFeatEncoder.update_list)
        misc['regularize'].extend(auxFeatEncoder.regularize)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux,
         updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)

        if params.get('encode_gt_sentences', 0):
            # Reshape it to size (batch_size, n_gt, hidden_size)
            xAux = xAux.reshape(
                (-1, params['n_encgt_sent'], params['featenc_hidden_size']))
            # Convert it to size (batch_size, n_gt*hidden_size)
            xAux = xAux.flatten(2)
    else:
        auxFeatEnc_inp = []
        xAux = None

    attn_nw_func = None

    (use_dropout, inp_list_gen, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model,
                                              params,
                                              xI,
                                              xAux,
                                              attn_nw=attn_nw_func)
    inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen

    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Load the saved generator weights only after the graph has been built.
    zipp(model_init_gen_from, model)

    logppl = []   # per-sentence second output of f_eval
    logppln = []  # per-sentence length value returned by prepare_data
    imgids = []   # '<cocoid>_<sentidx>' key for every scored sentence

    for batch in dp.iterImageSentencePairBatch(split=scriptparams['split'],
                                               max_batch_size=1,
                                               max_images=-1):
        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params.get('use_encoder_for', 0),
            maxlen=params['maxlen'],
            use_shared_mem=params.get('use_shared_mem_enc', 0),
            enc_gt_sent=params.get('encode_gt_sentences', 0),
            n_enc_sent=params.get('n_encgt_sent', 0),
            wordtoix=misc['wordtoix'])
        gen_inp_list, lenS = prepare_data(
            batch,
            misc['wordtoix'],
            rev_sents=params.get('reverse_sentence', 0),
            use_enc_for=params.get('use_encoder_for', 0),
            use_unk_token=params.get('use_unk_token', 0))

        # Separate names keep the symbolic inp_list / cost from being shadowed.
        batch_inputs = enc_inp_list + gen_inp_list
        batch_cost = f_eval(*batch_inputs)
        logppl.append(batch_cost[1])
        logppln.append(lenS)
        imgids.append(
            str(batch[0]['image']['cocoid']) + '_' + str(batch[0]['sentidx']))

    perplex = 2**(np.array(logppl) / np.array(logppln))
    np.savez(npfilename, pplx=perplex, keys=np.array(imgids))

    # Overall perplexity across all evaluated sentences.
    print(2**(np.array(logppl).sum() / np.array(logppln).sum()))
def main(params):
  """Re-rank generated caption candidates with a checkpointed evaluator.

  The candidate database (JSON) holds, per image, a 'candidatelist' of
  captions. Each image's candidates are scored in one batch with the second
  function returned by the evaluator's build_model(). The score is stored as
  'logprob' on every candidate, the highest-scoring one becomes the image's
  'candidate', and 'candsort' lists the candidate indices from best to worst.
  The updated database is written to
  '<root_path>/<eval_model>_reranked_<fname_append>.json'.

  Args:
      params: dict with 'checkpoint_path', 'candDb', 'fname_append' and
          'root_path'; it is also passed to loadArbitraryFeatures().
  """
  import sys  # local import: only used for the in-place progress line

  checkpoint_path = params['checkpoint_path']
  print('loading checkpoint %s' % (checkpoint_path, ))
  # NOTE: pickle can execute arbitrary code; only load trusted checkpoints.
  with open(checkpoint_path, 'rb') as cp_file:
    checkpoint = pickle.load(cp_file)
  checkpoint_params = checkpoint['params']
  model_npy = checkpoint['model']

  # Load the candidates db generated from rnn's
  with open(params['candDb'], 'r') as cand_file:
    candDb = json.load(cand_file)
  wordtoix = checkpoint['wordtoix']

  # Tokenize every candidate and find the largest number of candidates per
  # image; every batch is padded up to that size.
  batch_size = 0
  for img in candDb['imgblobs']:
    for cand in img['candidatelist']:
        cand['tokens'] = cand['text'].split(' ')
    if batch_size < len(img['candidatelist']):
        batch_size = len(img['candidatelist'])

  # HACK!! The sentence length is fixed at 24 no matter how long the
  # candidates actually are.
  maxlen = 24
  checkpoint_params['maxlen'] = maxlen

  checkpoint_params['batch_size'] = batch_size
  print(maxlen)

  # This initializes the model parameters and does matrix initializations
  checkpoint_params['mode'] = 'predict'
  evalModel = decodeEvaluator(checkpoint_params)
  model = evalModel.model_th

  # Define the computational graph of the evaluator.
  (use_dropout, inp_list,
   f_pred_fns, cost, predTh, model) = evalModel.build_model(model, checkpoint_params)

  # Load the saved weights into the freshly built model.
  zipp(model_npy,model)
  print("\nPredicting using model %s, run for %0.2f epochs with validation perplx at %0.3f\n" % (
      checkpoint_path, checkpoint['epoch'], checkpoint['perplexity']))

  pos_samp = np.arange(1,dtype=np.int32)

  features,_ = loadArbitraryFeatures(params, -1)

  # Disable dropout for prediction
  use_dropout.set_value(0.)
  N = len(candDb['imgblobs'])
  #################### Main Loop ############################################
  for i, img in enumerate(candDb['imgblobs']):
    # Progress line, rewritten in place: carriage return, no newline.
    sys.stdout.write('image %d/%d  \r' % (i, N))
    cands = img['candidatelist']
    cbatch_len = len(cands)
    batch = [{'sentence': s} for s in cands]
    # Pad up to the common batch size by repeating the last candidate.
    for _ in xrange(batch_size - cbatch_len):
        batch.append({'sentence': cands[-1]})

    # Only the first batch entry carries the image feature.
    batch[0]['image'] = {'feat':features[:, img['imgid']]}
    real_inp_list, lenS = prepare_data(batch, wordtoix, maxlen=maxlen, pos_samp=pos_samp, prep_for=checkpoint_params['eval_model'])

    # Score the batch. atleast_1d keeps a one-element batch sliceable, since
    # squeeze alone would collapse it to a 0-d array. The padded entries are
    # then dropped.
    scrs = np.atleast_1d(np.squeeze(f_pred_fns[1](*real_inp_list)))
    scrs = scrs[:cbatch_len]
    for si, cand in enumerate(cands):
        cand['logprob'] = float(scrs[si])
        cand.pop('tokens')
    img['candidate'] = cands[int(scrs.argmax())]
    # tolist() yields plain Python ints, which json can always serialize.
    img['candsort'] = np.argsort(scrs)[::-1].tolist()

  print("")
  jsonFname = '%s_reranked_%s.json' % (checkpoint_params['eval_model'],params['fname_append'])
  save_file = os.path.join(params['root_path'], jsonFname)
  with open(save_file, 'w') as out_file:
    json.dump(candDb, out_file)
  def build_eval_other_sent(self, tparams, options,model_npy):
    """Build and compile a graph that scores given sentences for an image.

    zipp() is first called with model_npy and self.model_th. The graph itself
    is built from ``tparams``. The image embedding is prepended to the word
    embeddings, the first n_timesteps steps are fed to self.lstm_layer, and
    the outputs are projected with Wd / bd. The first timestep of that
    projection is then dropped.

    Two compiled functions are stored on the instance:
      self.f_pred_prob_other - returns the projected outputs ``p``.
      self.f_eval_other      - returns ``cost``.

    Returns:
        (use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM)
        where inp_list is [xW, xI, mask], plus xAux when
        options['en_aux_inp'] is set.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]

    embW = tparams['Wemb'][xW.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape([1,n_samples,options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = self.lstm_layer(tparams, emb[:n_timesteps,:,:], xAux, use_noise, options, prefix=options['generator'],
                                mask=mask)
    p = rval[0]

    p = tensor.dot(p,tparams['Wd']) + tparams['bd']

    # Drop the first prediction: it was made for the image input.
    p = p[1:,:,:]

    def accumCost(pred,xW,m,c_sum,ppl_sum):
        # One scan step: add the masked log-probability of the target words.
        pred = tensor.nnet.softmax(pred)
        c_sum += (tensor.log(pred[tensor.arange(n_samples), xW]+1e-20) * m)
        ppl_sum += -(tensor.log2(pred[tensor.arange(n_samples), xW]+1e-10) * m)
        return c_sum, ppl_sum

    # The scan's update dictionary is not needed, so it is discarded.
    sums, _ = theano.scan(fn=accumCost,
                                outputs_info=[tensor.alloc(numpy_floatX(0.), 1,n_samples),
                                              tensor.alloc(numpy_floatX(0.), 1,n_samples)],
                                sequences = [p, xW[1:,:], mask[1:,:]])

    # sums[0] accumulates the masked natural-log probabilities and sums[1]
    # the negated log2 values. Only the final log-probability total is
    # returned as the cost; it is not negated.
    cost = sums[0][-1]

    inp_list = [xW, xI, mask]

    if options.get('en_aux_inp',0):
        inp_list.append(xAux)

    # A second, identical function used to be compiled into an unused local.
    # It has been removed to avoid the redundant compilation.
    self.f_pred_prob_other = theano.function(inp_list, p, name='f_pred_prob', updates=updatesLSTM)

    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM 
    def build_eval_other_sent(self, tparams, options, model_npy):
        """Build and compile a graph scoring given sentences for an image.

        zipp() is called with model_npy and self.model_th before the graph is
        built from ``tparams``. The image embedding is prepended to the word
        embeddings and the first n_timesteps steps go through
        basic_lstm_layer. The output for the first step is dropped, the rest
        is sliced with sliceT and projected with Wd / bd. A softmax over the
        flattened result gives the probability of each target word.

        The cost is the masked negative log-probability summed over all
        positions and divided by options['batch_size'].

        Side effects: sets self.f_pred_prob_other (returns the sliced LSTM
        output) and self.f_eval_other (returns the cost).

        Returns:
            (use_noise, inp_list, self.f_pred_prob_other, cost, pW,
            updatesLSTM) where inp_list is [xW, mask, xI], plus xAux when
            options['en_aux_inp'] is set, and pW holds the flattened
            pre-softmax scores.
        """
        zipp(model_npy, self.model_th)

        # Shared switch handed to the LSTM layer (used for dropout).
        use_noise = theano.shared(numpy_floatX(0.))

        # Symbolic inputs.
        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        n_steps, n_batch = xW.shape[0], xW.shape[1]
        n_targets = (n_steps - 1) * n_batch

        word_emb_shape = [n_steps, n_batch, options['word_encoding_size']]
        word_emb = tparams['Wemb'][xW.flatten()].reshape(word_emb_shape)
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        # Optionally project the auxiliary input before it reaches the LSTM.
        aux_in = (tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
                  if options.get('swap_aux', 0) else xAux)

        img_proj = tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']
        img_emb = img_proj.reshape(
            [1, n_batch, options['image_encoding_size']])
        full_emb = tensor.concatenate([img_emb, word_emb], axis=0)

        lstm_out, updatesLSTM = basic_lstm_layer(tparams,
                                                 full_emb[:n_steps, :, :],
                                                 aux_in,
                                                 use_noise,
                                                 options,
                                                 prefix=options['generator'])
        # Skip the output of the first step, which was fed the image.
        hidden = sliceT(lstm_out[0][1:, :, :],
                        options.get('hidden_depth', 1),
                        options['hidden_size'])

        scores = tensor.dot(hidden, tparams['Wd']) + tparams['bd']
        pW = scores.reshape([n_targets, options['output_size']])

        # Probability the model assigns to each target word.
        word_probs = tensor.nnet.softmax(pW)
        target_prob = word_probs[tensor.arange(n_targets),
                                 xW[1:, :].flatten()]

        # Masked negative log-likelihood, divided by the configured batch size.
        flat_mask = mask[1:, :].flatten()
        neg_loglik = -(tensor.log(target_prob + 1e-10) * flat_mask).sum()
        cost = neg_loglik / options['batch_size']

        inp_list = [xW, mask, xI]
        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        self.f_pred_prob_other = theano.function(inp_list,
                                                 hidden,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)
        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
def main(params):
    checkpoint_path = params['checkpoint_path']
    print 'loading checkpoint %s' % (checkpoint_path, )
    checkpoint = pickle.load(open(checkpoint_path, 'rb'))
    cp_params = checkpoint['params']
    model_npy = checkpoint['model']

    # Load the candidates db generated from rnn's
    if params['candDb'] != None:
        candDb = json.load(open(params['candDb'], 'r'))
    else:
        candDb = mergeRes(params)

    wordtoix = checkpoint[
        'wordtoix'] if 'wordtoix' in checkpoint else checkpoint['misc'][
            'wordtoix']

    # Read labels and build cocoid to imgid Map
    if params['dataset'] == 'coco':
        lbls = open(params['lblF'], 'r').read().splitlines()
        objId2Imgid = {}
        for lb in lbls:
            objId2Imgid[str(int(lb.split()[1][1:-1]))] = int(lb.split()[0][1:])
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(
            params, Ellipsis)

    elif params['dataset'] == 'msr-vtt':
        img_names_list = open(params['lblF'], 'r').read().splitlines()
        auxidxes = []
        img_names = [x.rsplit(',')[0] for x in img_names_list]
        objId2Imgid = {imn.split('.')[0]: i for i, imn in enumerate(img_names)}
        if len(img_names_list[0].split(',', 1)) > 1:
            if type(
                    ast.literal_eval(img_names_list[0].split(
                        ',', 1)[1].strip())) == tuple:
                idxes = [
                    ast.literal_eval(x.split(',', 1)[1].strip())[0]
                    for x in img_names_list
                ]
                auxidxes = [
                    ast.literal_eval(x.split(',', 1)[1].strip())[1]
                    for x in img_names_list
                ]
            else:
                idxes = [
                    ast.literal_eval(x.split(',', 1)[1].strip())
                    for x in img_names_list
                ]
        else:
            idxes = xrange(len(img_names_list))
        params['poolmethod'] = cp_params['poolmethod'] if params[
            'poolmethod'] == None else params['poolmethod']
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(
            params, idxes, auxidxes=auxidxes)

    elif params['dataset'] == 'lsmdc':
        if params['use_label_file'] == 1:
            params['poolmethod'] = cp_params['poolmethod'] if params[
                'poolmethod'] == None else params['poolmethod']
            params['labels'] = cp_params['labels'] if params[
                'labels'] == None else params['labels']
            params['featfromlbl'] = cp_params['featfromlbl'] if params[
                'featfromlbl'] == None else params['featfromlbl']
            params['uselabel'] = cp_params['uselabel'] if params[
                'uselabel'] == None else params['uselabel']
        else:
            params['uselabel'] = 0
        img_names_list = open(params['lblF'], 'r').read().splitlines()
        img_names = [x.rsplit(',')[0] for x in img_names_list]
        idxes = [int(x.rsplit(',')[1]) for x in img_names_list]
        auxidxes = []
        objId2Imgid = {
            osp.basename(imn).split('.')[0]: i
            for i, imn in enumerate(img_names)
        }

        #import pdb;pdb.set_trace()
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(
            params, idxes, auxidxes=auxidxes)

    if cp_params.get('use_encoder_for', 0) & 1:
        imgFeatEncoder = RecurrentFeatEncoder(cp_params['image_feat_size'],
                                              cp_params['sent_encoding_size'],
                                              cp_params,
                                              mdl_prefix='img_enc_',
                                              features=features.T)
        zipp(model_npy, imgFeatEncoder.model_th)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(
             imgFeatEncoder.model_th, cp_params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if 'eval_model' not in cp_params:
        cp_params['eval_model'] = params['eval_model']
        print 'Using evaluator module: ', cp_params['eval_model']

    #find the number of candidates per image and max sentence len
    batch_size = 0
    maxlen = 0
    for i, img in enumerate(candDb['imgblobs']):
        for ids, cand in enumerate(img['candidatelist']):
            tks = cand['text'].split(' ')
            # Also tokenize the candidates
            candDb['imgblobs'][i]['candidatelist'][ids]['tokens'] = tks
            if len(tks) > maxlen:
                maxlen = len(tks)
        if batch_size < len(img['candidatelist']):
            batch_size = len(img['candidatelist'])

    # Get all images to this batch size!
    # HACK!!
    maxlen = 24
    cp_params['maxlen'] = maxlen

    cp_params['batch_size'] = batch_size
    print maxlen

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times

    # This initializes the model parameters and does matrix initializations
    cp_params['mode'] = 'predict'
    evalModel = decodeEvaluator(cp_params)
    model = evalModel.model_th

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost funtion.
    (use_dropout, inp_list_eval, f_pred_fns, cost, predTh,
     modelUpd) = evalModel.build_model(model,
                                       cp_params,
                                       xI=xI,
                                       prior_inp_list=imgFeatEnc_inp)

    inp_list = imgFeatEnc_inp + inp_list_eval

    # Add the regularization cost. Since this is specific to trainig and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition

    # Now let's build a gradient computation graph and rmsprop update mechanism
    # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
    # Hence in case of coco/flickr this will 5* no of images
    ## Initialize the model parameters from the checkpoint file if we are resuming training
    model = modelUpd if cp_params['eval_model'] == 'cnn' else model
    zipp(model_npy, model)
    print("\nPredicting using model %s, run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_path, checkpoint['epoch'], \
      checkpoint['perplexity']))

    pos_samp = np.arange(
        1, dtype=np.int32) if cp_params['eval_model'] == 'cnn' else []

    #Disable using dropout in training
    use_dropout.set_value(0.)
    if cp_params.get('use_encoder_for', 0) & 1:
        imgenc_use_dropout.set_value(0.)
    N = len(candDb['imgblobs'])
    stats = np.zeros((batch_size))
    #################### Main Loop ############################################
    for i, img in enumerate(candDb['imgblobs']):
        # fetch a batch of data
        print 'image %d/%d  \r' % (i, N),
        batch = []
        cbatch_len = len(img['candidatelist'])
        objid = osp.basename(img['img_path']).split('_')[-1].split('.')[0]
        if params['dataset'] == 'coco':
            objid = str(int(objid))

        for s in img['candidatelist']:
            batch.append({
                'sentence': s,
                'image': {
                    'feat': features[:, feat_idx[objId2Imgid[objid]]].T,
                    'img_idx': feat_idx[objId2Imgid[objid]]
                }
            })
            if params['aux_inp_file'] != None:
                batch[-1]['aux_inp'] = aux_inp[:,
                                               aux_idx[objId2Imgid[objid]]].T

        if cbatch_len < batch_size and (cp_params['eval_model'] == 'cnn'):
            for z in xrange(batch_size - cbatch_len):
                batch.append({'sentence': img['candidatelist'][-1]})

        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=cp_params.get('use_encoder_for', 0),
            use_shared_mem=cp_params.get('use_shared_mem_enc', 0),
            pos_samp=pos_samp)
        eval_inp_list, lenS = prepare_data(batch,
                                           wordtoix,
                                           maxlen=maxlen,
                                           pos_samp=pos_samp,
                                           prep_for=cp_params['eval_model'],
                                           use_enc_for=cp_params.get(
                                               'use_encoder_for', 0))

        real_inp_list = enc_inp_list + eval_inp_list

        #import pdb;pdb.set_trace()
        # evaluate cost, gradient and perform parameter update
        scrs = np.squeeze(f_pred_fns[1](*real_inp_list))
        scrs = scrs[:cbatch_len]  # + scrs[:,cbatch_len:].sum()/cbatch_len
        for si, s in enumerate(img['candidatelist']):
            candDb['imgblobs'][i]['candidatelist'][si]['logprob'] = float(
                scrs[si])
            candDb['imgblobs'][i]['candidatelist'][si].pop('tokens')
        bestcand = scrs.argmax()
        stats[bestcand] += 1.0
        candDb['imgblobs'][i]['candidate'] = candDb['imgblobs'][i][
            'candidatelist'][bestcand]
        srtidx = np.argsort(scrs)[::-1]
        candDb['imgblobs'][i]['candsort'] = list(srtidx)
        # print training statistics

    print ""
    jsonFname = '%s_reranked_%s.json' % (cp_params['eval_model'],
                                         params['fname_append'])
    save_file = os.path.join(params['root_path'], jsonFname)
    json.dump(candDb, open(save_file, 'w'))
    print 'Written to file %s' % save_file
    print 'Final stats are:'
    print stats * 100.0 / N
    def build_eval_other_sent(self, tparams, options, model_npy):
        """Build Theano functions that score given sentences under this model.

        Copies ``model_npy`` into ``self.model_th`` (via ``zipp``) and builds
        a graph that feeds the image embedding followed by the word
        embeddings through ``basic_lstm_layer`` and computes, per sample, the
        masked negative log-probability of the given words. The first LSTM
        output (produced for the image input) is left out.

        Side effects: sets ``self.f_pred_prob_other`` and
        ``self.f_eval_other``.

        Args:
            tparams: dict of Theano shared parameters; reads 'Wemb', 'WIemb',
                'b_Img', 'Wd', 'bd' and, depending on ``options``,
                'WIemb_aux', 'b_Img_aux', 'WdCls', 'bdCls'.
            options: model options dict ('word_encoding_size',
                'image_encoding_size', 'hidden_size', 'output_size',
                'generator', plus the optional 'swap_aux', 'hidden_depth',
                'class_out_factoring', 'nClasses', 'en_aux_inp').
            model_npy: parameter values to load into ``self.model_th``.

        Returns:
            Tuple ``(use_noise, inp_list, self.f_pred_prob_other, cost, pW,
            updatesLSTM)`` where ``cost`` is the symbolic per-sample cost and
            ``pW`` the pre-softmax word scores.
        """
        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        # xW is indexed as (timestep, sample) throughout this function.
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]
        # Number of (timestep, sample) positions after dropping timestep 0.
        n_out_samps = (n_timesteps - 1) * n_samples

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        # Optionally project the auxiliary input before it reaches the LSTM.
        if options.get('swap_aux', 0):
            xAuxEmb = tensor.dot(xAux,
                                 tparams['WIemb_aux']) + tparams['b_Img_aux']
        else:
            xAuxEmb = xAux

        # The image embedding is prepended as the first sequence element.
        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        # Only the first n_timesteps elements are fed, i.e. the last word
        # embedding is dropped after prepending the image.
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'])
        # Skip the output produced for the image input.
        p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                   options['hidden_size'])

        if options.get('class_out_factoring', 0) == 0:
            # Plain softmax over the whole output vocabulary.
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            pWSft = tensor.nnet.softmax(pW)
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
        else:
            # Class-factored output: the word probability is the product of
            # a within-class softmax and a class softmax. self.clsinfo is
            # indexed per word; column 0 is used as the class id and column 3
            # as the column picked from the within-class softmax.
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            pW = ((tparams['Wd'][:, xC, :].T *
                   ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                     tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                  tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])
            pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
                [n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:,:].flatten(),3]] * \
                      pCSft[tensor.arange(n_out_samps), xC]

        # Masked negative log-probability, summed over time for each sample.
        tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()
                     ).reshape([n_timesteps - 1, n_samples])
        cost = tot_cost.sum(axis=0)

        inp_list = [xW, mask, xI]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        # NOTE(review): this function takes a fixed [xW, xI, xAux] input list
        # (no mask, xAux always required) instead of inp_list -- confirm that
        # callers rely on this signature before unifying the two.
        self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                                 pWSft,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
Beispiel #12
0
def main(params):

    # load the checkpoint
    if params['multi_model'] == 0:
        checkpoint_path = params['checkpoint_path']
        print 'loading checkpoint %s' % (checkpoint_path, )
        checkpoint = pickle.load(open(checkpoint_path, 'rb'))
        checkpoint_params = checkpoint['params']
        model_npy = checkpoint['model']
        checkpoint_params['use_theano'] = 1
        if 'image_feat_size' not in checkpoint_params:
            checkpoint_params['image_feat_size'] = 4096

        BatchGenerator = decodeGenerator(checkpoint_params)
        # Compile and init the theano predictor
        BatchGenerator.prepPredictor(model_npy, checkpoint_params,
                                     params['beam_size'])
        model = BatchGenerator.model_th
    else:
        BatchGenerator = []
        model_npy = []
        modelTh = []
        checkpoint_params = []
        for i, checkpoint_path in enumerate(params['checkpoint_path']):
            checkpoint = pickle.load(open(checkpoint_path, 'rb'))
            model_npy.append(checkpoint['model'])
            checkpoint_params.append(checkpoint['params'])
            checkpoint_params[i]['use_theano'] = 1
            BatchGenerator.append(decodeGenerator(checkpoint_params[i]))
            zipp(model_npy[i], BatchGenerator[i].model_th)
            modelTh.append(BatchGenerator[i].model_th)
            modelTh[i]['comb_weight'] = 1.0 / params['nmodels']

        BatchGenerator[0].prepMultiPredictor(modelTh, checkpoint_params,
                                             params['beam_size'],
                                             params['nmodels'])

    misc = {}
    ixtoword = checkpoint['ixtoword']
    misc['wordtoix'] = checkpoint['wordtoix']

    # output blob which we will dump to JSON for visualizing the results
    blob = {}
    blob['params'] = params
    blob['checkpoint_params'] = checkpoint_params
    blob['imgblobs'] = []

    # load the tasks.txt file and setupe feature loading
    root_path = params['root_path']
    img_names_list = open(params['imgList'], 'r').read().splitlines()

    if len(img_names_list[0].rsplit(',')) > 1:
        img_names = [x.rsplit(',')[0] for x in img_names_list]
        idxes = [int(x.rsplit(',')[1]) for x in img_names_list]
    else:
        img_names = img_names_list
        idxes = xrange(len(img_names_list))

    #if checkpoint_params.get('en_aux_inp',0) and (params.get('aux_inp_file','None') == 'None'):
    #  raise ValueError('ERROR: please specify auxillary input feature using --aux_inp_file')
    #  return
    # load the features for all images
    features, aux_inp = loadArbitraryFeatures(params, idxes)

    N = len(img_names)

    # iterate over all images and predict sentences
    print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'], \
      checkpoint['perplexity']))

    kwparams = {'beam_size': params['beam_size']}

    jsonFname = 'result_struct_%s.json' % (params['fname_append'])
    save_file = os.path.join(root_path, jsonFname)

    for n in xrange(N):
        print 'image %d/%d:' % (n, N)

        # encode the image
        if params['multi_model'] == 0:
            D, NN = features.shape
            img = {}
            img['feat'] = features[:, n]
            if checkpoint_params.get('en_aux_inp', 0):
                img['aux_inp'] = aux_inp[:, n]
            img['local_file_path'] = img_names[n]
            # perform the work. heavy lifting happens inside
            Ys = BatchGenerator.predict([{
                'image': img
            }], model, checkpoint_params, **kwparams)
        else:
            kwparams['nmodels'] = params['nmodels']
            batch = []
            for i in xrange(params['nmodels']):
                img = {}
                img['feat'] = features[i][:, n]
                if checkpoint_params[i].get('en_aux_inp', 0):
                    img['aux_inp'] = aux_inp[i][:, n]
                img['local_file_path'] = img_names[n]
                batch.append({'image': img})
            Ys = BatchGenerator[0].predictMulti(batch, checkpoint_params,
                                                **kwparams)

        # build up the output
        img_blob = {}
        img_blob['img_path'] = img['local_file_path']

        # encode the top prediction
        top_predictions = Ys[
            0]  # take predictions for the first (and only) image we passed in
        top_prediction = top_predictions[
            0]  # these are sorted with highest on top
        candidate = ' '.join([
            ixtoword[int(ix)] for ix in top_prediction[1] if ix > 0
        ])  # ix 0 is the END token, skip that
        print 'PRED: (%f) %s' % (float(top_prediction[0]), candidate)
        img_blob['candidate'] = {
            'text': candidate,
            'logprob': float(top_prediction[0])
        }

        # Code to save all the other candidates
        candlist = []
        for ci in xrange(len(top_predictions) - 1):
            prediction = top_predictions[
                ci + 1]  # these are sorted with highest on top
            candidate = ' '.join([
                ixtoword[int(ix)] for ix in prediction[1] if ix > 0
            ])  # ix 0 is the END token, skip that
            candlist.append({
                'text': candidate,
                'logprob': float(prediction[0])
            })

        img_blob['candidatelist'] = candlist
        blob['imgblobs'].append(img_blob)
        if (n % 5000) == 1:
            print 'writing predictions to %s...' % (save_file, )
            json.dump(blob, open(save_file, 'w'))

    # dump result struct to file
    print 'writing predictions to %s...' % (save_file, )
    json.dump(blob, open(save_file, 'w'))

    # dump output html
    html = ''
    for img in blob['imgblobs']:
        html += '<img src="%s" height="400"><br>' % (img['img_path'], )
        html += '(%f) %s <br><br>' % (img['candidate']['logprob'],
                                      img['candidate']['text'])

    html_file = 'result_%s.html' % (params['fname_append'])
    html_file = os.path.join(root_path, html_file)
    print 'writing html result file to %s...' % (html_file, )
    open(html_file, 'w').write(html)
def main(params):
  """Alternately train an evaluator network and a caption generator.

  Builds a generator (decodeGenerator) and an evaluator (decodeEvaluator)
  whose adversarial graph consumes the generator's output embedding, then
  loops: `iters_eval` evaluator updates on costs[0], a checkpoint,
  `iters_gen` generator updates on costs[1], and another checkpoint.

  params is the run configuration dict; it is also mutated here
  ('aux_inp_size', 'image_feat_size', 'vocabulary_size', 'output_size',
  'use_dropout' are written).

  NOTE(review): `model_init_from` and `checkpoint_init` are used below but
  never assigned in this function -- presumably module-level globals set by
  the calling script; confirm before resuming from a checkpoint.
  """
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)
  
  # Initialize the optimizer 
  solver = Solver(params['solver'])

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size

  print 'Image feature size is %d, and aux input size is %d'%(params['image_feat_size'],params['aux_inp_size'])

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)
  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword']) # these should match though
  params['use_dropout'] = 1 

  # This initializes the model parameters and does matrix initializations  
  generator = decodeGenerator(params)
  (gen_inp_list, predLogProb, predIdx, predCand, wOut_emb, updatesLstm) = generator.build_prediction_model(
                                            generator.model_th, params, params['beam_size'])
  # Drop the middle axis of the generator's output embedding before it is fed to the evaluator.
  wOut_emb = wOut_emb.reshape([wOut_emb.shape[0],wOut_emb.shape[2]])
  # NOTE(review): f_gen_only is only referenced from commented-out debugging code below.
  f_gen_only = theano.function(gen_inp_list, [predLogProb, predIdx, wOut_emb], name='f_pred', updates=updatesLstm)
  
  modelGen = generator.model_th
  upListGen = generator.update_list
 
  # Optionally hand the generator's word embedding to the evaluator.
  if params['share_Wemb']:
     evaluator = decodeEvaluator(params, modelGen['Wemb'])
  else:
     evaluator = decodeEvaluator(params)
  modelEval = evaluator.model_th
  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost funtion. 
  
  (use_dropout_eval, eval_inp_list,
     f_pred_fns, costs, predTh, modelEval) = evaluator.build_advers_eval(modelEval, params, gen_inp_list, wOut_emb)
  
  # Merge the generator inputs into the evaluator input list (without duplicates).
  # comb_inp_list is the same list object as eval_inp_list, so that list is extended in place.
  comb_inp_list = eval_inp_list
  for inp in gen_inp_list:
    if inp not in comb_inp_list:
        comb_inp_list.append(inp)
  # Compile an evaluation function.. Doesn't include gradients
  # NOTE(review): f_eval is not called anywhere in this function.
  f_eval= theano.function(comb_inp_list, costs, name='f_eval', updates=updatesLstm)

  # Now let's build the gradient computation graphs and the solver update functions
  # A shared Wemb is removed from the evaluator's parameter dict, so it gets no evaluator gradient.
  if params['share_Wemb']:
    modelEval.pop('Wemb')
  # A fixed Wemb is removed from the generator's update list as well.
  if params['fix_Wemb']:
    upListGen.remove('Wemb')
  
  # Only the generator parameters named in the update list are trained.
  modelGenUpD =  OrderedDict()
  for k in upListGen:
   modelGenUpD[k] = modelGen[k]
  # costs[0] drives the evaluator parameters, costs[1] the generator parameters.
  gradsEval = tensor.grad(costs[0], wrt=modelEval.values(),add_names=True)
  gradsGen = tensor.grad(costs[1], wrt=modelGenUpD.values(), add_names=True)
 
  lrEval = tensor.scalar(name='lrEval',dtype=config.floatX)
  f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval, ud_eval= solver.build_solver_model(lrEval, modelEval, gradsEval,
                                      comb_inp_list, costs[0], params)
  
  lrGen = tensor.scalar(name='lrGen',dtype=config.floatX)
  f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen, ud_gen = solver.build_solver_model(lrGen, modelGenUpD, gradsGen,
                                      comb_inp_list, costs[1], params)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(modelGen.keys())

  # calculate how many iterations we need.
  # NOTE(review): despite its name, num_sentences_total is requested with ofwhat = 'images'.
  num_sentences_total = dp.getSplitSize('train', ofwhat = 'images')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  # Per outer iteration: half an epoch of evaluator updates, a quarter epoch of generator updates.
  iters_eval= num_iters_one_epoch//2
  iters_gen = num_iters_one_epoch//4

  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1
  smooth_train_ppl2 = 0.5 # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)
  t_print_sec = 60
  ## Initialize the model parameters from the checkpoint file if we are resuming training
  # model_init_from / checkpoint_init come from outside this function (see docstring note).
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from,modelGen)
    #zipp(rg_init,rgGen)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
      checkpoint_init['perplexity']))
  
  pos_samp = np.arange(batch_size,dtype=np.int32)
  print batch_size

  ##############################################################
  # Define signal handler to catch ctl-c or kills so that we can save the model trained till that point
  # The handler's first parameter shadows the `signal` module inside the handler body only.
  # It reads `it` and `val_ppl2` from the enclosing scope; `it` exists only once the loop below has started.
  def signal_handler(signal, frame):
    print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!')
    filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (params['dataset'], host, params['fappend'], val_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
    sys.exit(0)
  signal.signal(signal.SIGINT, signal_handler)
  ##############################################################

  # NOTE(review): the outer loop runs max_epochs times (not max_iters); `epoch` is computed but unused.
  for it in xrange(max_epochs):
    epoch = it * 1.0 / num_iters_one_epoch
    # Enable using dropout in training 
    use_dropout_eval.set_value(1.)
    # ---- Phase 1: update the evaluator on costs[0] ----
    for it2 in xrange(iters_eval): 
        t0 = time.time()
        # fetch a batch of data
        batch,_ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
        real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], rand_negs = params['rand_negs'])
        
        # evaluate cost, gradient and perform parameter update
        cost = f_grad_comp_eval(*real_inp_list)
        f_param_update_eval(params['learning_rate_eval'])
        dt = time.time() - t0
        # Track training statistics
        train_ppl2 = (np.e**(-cost)) #step_struct['stats']['ppl2']
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
        # The first step of each phase restarts both moving averages.
        if it2 == 0: smooth_train_ppl2 = train_ppl2 
        if it2 == 0: smooth_train_cost = cost
        else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost 
        
        tnow = time.time()
        if tnow > last_status_write_time + t_print_sec*1: # every now and then lets write a report
          print 'Eval Cnn in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' % (it, it2, iters_eval, dt, \
	    	smooth_train_cost,smooth_train_ppl2)
          last_status_write_time = tnow
    
    print 'Done training the descriminative model for now. Switching to Genereative model'
    print 'Eval N/W in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost,smooth_train_ppl2)

    
    # Checkpoint after the evaluator phase.
    filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_EVOnly.p' % (params['dataset'], host, params['fappend'],it, smooth_train_ppl2)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
    
    
    # Disable Cnn dropout while training gen network
    use_dropout_eval.set_value(0.)
    # ---- Phase 2: update the generator on costs[1] ----
    for it2 in xrange(iters_gen): 
        t0 = time.time()
        # fetch a batch of data
        batch,_ = dp.sampPosNegSentSamps(params['eval_batch_size'] - params['rand_negs'])
        real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], maxlen=params['maxlen'], pos_samp=pos_samp, prep_for=params['eval_model'], rand_negs = params['rand_negs'])
        #import pdb; pdb.set_trace()

        # evaluate cost, gradient and perform parameter update
        #if any([np.isnan(modelGen[m].get_value()).any() for m in modelGen]):
        #    print 'Somebodys NAN!!!'
        #    break;
        #asd = f_gen_only(real_inp_list[2],real_inp_list[3])
        
        #print it2,asd[-1].shape, real_inp_list[0].shape

        #if asd[-1].shape[0] > real_inp_list[0].shape[0]:
        #   import pdb; pdb.set_trace()


        cost = f_grad_comp_gen(*real_inp_list)

        #print it2,cost
        
        #if any([np.isnan(zg_gen[i].get_value()).any() for i in xrange(len(zg_gen))]):
        #    print 'Somebody zg is NAN!!!'
        #    break;
        #if any([np.isnan(rg_gen[i].get_value()).any() for i in xrange(len(rg_gen))]) or any([(rg_gen[i].get_value()<0).any() for i in xrange(len(rg_gen))]):
        #    print 'Somebody rg is NAN!!!'
        #    break;
        
        f_param_update_gen(params['learning_rate_gen'])
        dt = time.time() - t0
        # print training statistics
        train_ppl2 = (np.e**(-cost)) #step_struct['stats']['ppl2']
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
        if it2 == 0: smooth_train_ppl2 = train_ppl2 
        if it2 == 0: smooth_train_cost = cost
        else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * cost 
        
        tnow = time.time()
        if tnow > last_status_write_time + t_print_sec*1: # every now and then lets write a report
          print 'Gen Lstm in epoch %d: %d/%d sample done in %.3fs. Cost now is %.3f Pplx is %.3f' % (it, it2, iters_gen, dt, \
	    	smooth_train_cost,smooth_train_ppl2)
          last_status_write_time = tnow
    
    print 'Done training the generative model for now. Switching to Genereative model. Final Stats are:'
    print 'Gen Lstm in epoch %d: Cost now is %.3f Pplx is %.3f' % (it, smooth_train_cost,smooth_train_ppl2)
    
    ## save a model checkpoint after the generator phase
    # NOTE: is_last_iter is immediately forced to 1, so the condition below always holds.
    is_last_iter = (it+1) == max_iters
    is_last_iter = 1
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Validation-set evaluation is disabled (kept below as commented-out code).
      # Disable using dropout in validation 
     # use_dropout.set_value(0.)

     # val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set
     # 
     # if it - params['lr_decay_st_epoch'] >= 0:
     #   params['learning_rate'] = params['learning_rate'] * params['lr_decay']
     #   params['lr_decay_st_epoch'] += 1
     # 
     # print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
     # if params['sample_by_len'] == 1:
     #   print len_hist
        
      # The smoothed training value stands in for the validation perplexity.
      val_ppl2 = smooth_train_ppl2
      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      # top_val_ppl2 stays at -1 (its update is commented out), so this outer test always passes.
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          #top_val_ppl2 = val_ppl2
          filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (params['dataset'], host, params['fappend'],it, smooth_train_ppl2)
          dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2)
    def build_eval_other_sent(self, tparams, options, model_npy):
        """Build Theano functions that score given sentences under this model.

        Copies ``model_npy`` into ``self.model_th`` (via ``zipp``), runs the
        image embedding followed by the word embeddings through
        ``self.lstm_layer`` and accumulates, with ``theano.scan`` over time,
        the masked log-probability of the given words for every sample.

        Side effects: sets ``self.f_pred_prob_other`` and
        ``self.f_eval_other``.

        Args:
            tparams: dict of Theano shared parameters; reads 'Wemb', 'WIemb',
                'b_Img', 'Wd' and 'bd'.
            options: model options dict ('word_encoding_size',
                'image_encoding_size', 'generator', optional 'en_aux_inp').
            model_npy: parameter values to load into ``self.model_th``.

        Returns:
            Tuple ``(use_noise, inp_list, self.f_pred_prob_other, cost, p,
            updatesLSTM)`` where ``cost`` is the symbolic summed masked
            log-probability (natural log, not negated) and ``p`` the
            pre-softmax word scores with the first timestep removed.
        """
        zipp(model_npy, self.model_th)

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)
        # xW is indexed as (timestep, sample) throughout this function.
        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])
        xI = tensor.matrix('xI', dtype=config.floatX)
        xAux = tensor.matrix('xAux', dtype=config.floatX)

        # The image embedding is prepended as the first sequence element.
        embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
            [1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        # Only the first n_timesteps elements are fed, i.e. the last word
        # embedding is dropped after prepending the image.
        rval, updatesLSTM = self.lstm_layer(tparams,
                                            emb[:n_timesteps, :, :],
                                            xAux,
                                            use_noise,
                                            options,
                                            prefix=options['generator'],
                                            mask=mask)
        p = rval[0]

        p = tensor.dot(p, tparams['Wd']) + tparams['bd']

        # Leave out the first prediction, which was made for the image input.
        p = p[1:, :, :]

        def accumCost(pred, xW, m, c_sum, ppl_sum):
            # One scan step: add this timestep's masked log-probability
            # (natural log) to c_sum and its negative log2 to ppl_sum.
            pred = tensor.nnet.softmax(pred)
            c_sum += (tensor.log(pred[tensor.arange(n_samples), xW] + 1e-20) *
                      m)
            ppl_sum += -(
                tensor.log2(pred[tensor.arange(n_samples), xW] + 1e-10) * m)
            return c_sum, ppl_sum

        # The scan's own update dictionary is not used.
        sums, _ = theano.scan(fn=accumCost,
                              outputs_info=[
                                  tensor.alloc(numpy_floatX(0.), 1,
                                               n_samples),
                                  tensor.alloc(numpy_floatX(0.), 1,
                                               n_samples)
                              ],
                              sequences=[p, xW[1:, :], mask[1:, :]])

        # sums[0] is the running c_sum; its last entry is the total over all
        # timesteps. sums[1] (the log2 accumulator) is not returned.
        cost = sums[0][-1]

        inp_list = [xW, xI, mask]

        if options.get('en_aux_inp', 0):
            inp_list.append(xAux)

        # Compiled once; the original built the identical function twice and
        # discarded the first copy.
        self.f_pred_prob_other = theano.function(inp_list,
                                                 p,
                                                 name='f_pred_prob',
                                                 updates=updatesLSTM)

        self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

        return use_noise, inp_list, self.f_pred_prob_other, cost, p, updatesLSTM
def main(params):
    """Build the generator and evaluator graphs and train them adversarially.

    The function obtains a vocabulary (built from the training sentences, or
    taken from ``checkpoint_init['misc']`` when resuming), compiles the Theano
    functions of a ``decodeGenerator`` and a ``decodeEvaluator`` and then runs
    ``max_epochs * (train split size / batch_size)`` outer iterations. Each
    outer iteration first updates the evaluator in an inner loop and then
    performs one generator update (plus an optional teacher-forced MLE
    update). Costs are logged to an ``.npz`` file under ``logs/`` and
    checkpoints are written with ``dumpCheckpoint``.

    Args:
        params: dict of run options. It is read for the model, solver and
            training settings and is updated in place with
            ``aux_inp_size``, ``image_feat_size``, ``vocabulary_size`` and
            ``output_size``.

    Notes:
        * ``host``, ``checkpoint_init``, ``model_init_gen_from``,
          ``model_init_eval_from`` and ``bias_init_inter_class`` are not
          assigned anywhere in this function; presumably they are
          module-level names set by the caller script -- TODO confirm.
        * ``generator``, ``f_gen_only`` and ``f_grad_comp_gen`` only exist
          when ``params['t_eval_only'] == 0``, yet the code after the graph
          construction uses them unconditionally, so this function can only
          run to completion in that mode.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']

    # fetch the data provider
    dp = getDataProvider(params)

    # Initialize the optimizer
    solver = Solver(params['solver'])

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size

    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    # stores various misc items that need to be passed around the framework
    misc = {}

    if params['checkpoint_file_name'] == 'None':
        # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
        # at least word_count_threshold number of times
        (misc['wordtoix'], misc['ixtoword'],
         bias_init_vector) = preProBuildWordVocab(dp.iterSentences('train'),
                                                  word_count_threshold)
    else:
        # Load Vocabulary from the checkpoint
        misc = checkpoint_init['misc']

    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though

    # This initializes the generator model parameters and does matrix initializations
    if params['t_eval_only'] == 0:
        generator = decodeGenerator(params)

        # Defaults used when no auxiliary encoder supplies these inputs.
        # Setting them up front also covers the case where the encoder is
        # enabled without 'encode_gt_sentences', which previously left xI and
        # imgFeatEnc_inp unbound.
        auxFeatEnc_inp = []
        imgFeatEnc_inp = []
        xAux = None
        xI = None

        # Build the computational graph
        if params['use_encoder_for'] & 2:
            if params['encode_gt_sentences']:
                aux_enc_inp = generator.model_th['Wemb']
            else:
                aux_enc_inp = dp.aux_inputs.T
            hid_size = params['featenc_hidden_size']
            auxFeatEncoder = RecurrentFeatEncoder(
                hid_size,
                params['image_encoding_size'],
                params,
                mdl_prefix='aux_enc_',
                features=aux_enc_inp)
            mdlLen = len(generator.model_th.keys())
            generator.model_th.update(auxFeatEncoder.model_th)
            # The encoder parameters must not overwrite generator parameters
            assert (len(generator.model_th.keys()) == (
                mdlLen + len(auxFeatEncoder.model_th.keys())))
            (auxenc_use_dropout, auxFeatEnc_inp, xAux,
             updatesLSTMAuxFeat) = auxFeatEncoder.build_model(
                 generator.model_th, params)

            if params['encode_gt_sentences']:
                # Reshape it to size (batch_size, n_gt, hidden_size)
                xAux = xAux.reshape((-1, params['n_encgt_sent'],
                                     params['featenc_hidden_size']))
                # Convert it to size (batch_size, n_gt*hidden_size)
                xAux = xAux.flatten(2)
                xI = tensor.zeros((batch_size, params['image_encoding_size']))

        (gen_inp_list, predLogProb, predIdx, predCand, gen_out, updatesLstm,
         seq_lengths) = generator.build_prediction_model(generator.model_th,
                                                         params,
                                                         xI=xI,
                                                         xAux=xAux)
        gen_inp_list = imgFeatEnc_inp + auxFeatEnc_inp + gen_inp_list
        gen_out = gen_out.reshape([
            gen_out.shape[0], -1, params['n_gen_samples'],
            params['vocabulary_size']
        ])
        # convert updates lstm to a list of pairs, this is to help merge it with grad updates
        updatesLstm = list(updatesLstm.items())
        f_gen_only = theano.function(
            gen_inp_list, [predLogProb, predIdx, gen_out, seq_lengths],
            name='f_pred',
            updates=updatesLstm)

        modelGen = generator.model_th
        upListGen = generator.update_list

        if params['use_mle_train']:
            # Teacher-forced (MLE) cost and solver for the generator
            (use_dropout_genTF, inp_list_genTF, _, cost_genTF, _,
             updatesLSTM_genTF) = generator.build_model(
                 generator.model_th, params)
            f_eval_genTF = theano.function(inp_list_genTF,
                                           cost_genTF,
                                           name='f_eval')
            grads_genTF = tensor.grad(cost_genTF[0],
                                      wrt=modelGen.values(),
                                      add_names=True)
            lr_genTF = tensor.scalar(name='lr', dtype=config.floatX)
            (f_grad_genTF, f_update_genTF, zg_genTF, rg_genTF,
             ud_genTF) = solver.build_solver_model(lr_genTF, modelGen,
                                                   grads_genTF,
                                                   inp_list_genTF, cost_genTF,
                                                   params)
    else:
        modelGen = []
        updatesLstm = []

    if params['met_to_track'] != []:
        trackMetargs = {'eval_metric': params['met_to_track']}
        refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
        trackMetargs['refToks'] = refToks
        trackMetargs['scr_info'] = scr_info

    # Initialize the evaluator model
    if params['share_Wemb']:
        evaluator = decodeEvaluator(params, modelGen['Wemb'])
    else:
        evaluator = decodeEvaluator(params)
    modelEval = evaluator.model_th

    if params['t_eval_only'] == 0:
        # Build the evaluator graph to evaluate reference and generated captions
        if params.get('upd_eval_ref', 0):
            (refeval_inp_list, ref_f_pred_fns, ref_costs, ref_predTh,
             ref_modelEval) = evaluator.build_advers_eval(modelEval, params)
        (eval_inp_list, f_pred_fns, costs, predTh,
         modelEval) = evaluator.build_advers_eval(modelEval, params,
                                                  gen_inp_list, gen_out,
                                                  updatesLstm, seq_lengths)
    else:
        # Build the evaluator graph to evaluate only reference captions
        (eval_inp_list, f_pred_fns, costs, predTh,
         modelEval) = evaluator.build_advers_eval(modelEval, params)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    if params['t_eval_only'] == 0 and params['checkpoint_file_name'] == 'None':
        modelGen['bd'].set_value(bias_init_vector.astype(config.floatX))
        if params.get('class_out_factoring', 0) == 1:
            # NOTE(review): bias_init_inter_class is never assigned in this
            # function; it has to come from module scope -- confirm.
            modelGen['bdCls'].set_value(
                bias_init_inter_class.astype(config.floatX))

    # comb_inp_list aliases eval_inp_list, so the appends below extend both
    comb_inp_list = eval_inp_list
    if params['t_eval_only'] == 0:
        for inp in gen_inp_list:
            if inp not in comb_inp_list:
                comb_inp_list.append(inp)

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation or debug purposes
    if params['t_eval_only'] == 0:
        f_eval = theano.function(comb_inp_list,
                                 costs[:1],
                                 name='f_eval',
                                 updates=updatesLstm)
    else:
        f_eval = theano.function(comb_inp_list, costs[:1], name='f_eval')

    if params['share_Wemb']:
        modelEval.pop('Wemb')
    if params['fix_Wemb']:
        upListGen.remove('Wemb')

    #-------------------------------------------------------------------------------------------------------------------------
    # Now let's build a gradient computation graph and update mechanism
    #-------------------------------------------------------------------------------------------------------------------------
    # First compute gradient on the evaluator params w.r.t cost
    if params.get('upd_eval_ref', 0):
        gradsEval_ref = tensor.grad(ref_costs[0],
                                    wrt=modelEval.values(),
                                    add_names=True)
    gradsEval = tensor.grad(costs[0], wrt=modelEval.values(), add_names=True)

    # Update functions for the evaluator
    lrEval = tensor.scalar(name='lrEval', dtype=config.floatX)
    if params.get('upd_eval_ref', 0):
        (f_grad_comp_eval_ref, f_param_update_eval_ref, _, _,
         _) = solver.build_solver_model(lrEval,
                                        modelEval,
                                        gradsEval_ref,
                                        refeval_inp_list,
                                        ref_costs[0],
                                        params,
                                        w_clip=params['eval_w_clip'])
    (f_grad_comp_eval, f_param_update_eval, zg_eval, rg_eval,
     ud_eval) = solver.build_solver_model(lrEval,
                                          modelEval,
                                          gradsEval,
                                          comb_inp_list,
                                          costs[:1],
                                          params,
                                          updatesLstm,
                                          w_clip=params['eval_w_clip'])

    # Now compute gradient on the generator params w.r.t the cost
    if params['t_eval_only'] == 0:
        gradsGen = tensor.grad(costs[1], wrt=modelGen.values(), add_names=True)
        lrGen = tensor.scalar(name='lrGen', dtype=config.floatX)
        # Update functions for the generator. The last combined input is
        # dropped unless 'gen_feature_matching' is 1.
        n_gen_inps = len(comb_inp_list) - 1 + params['gen_feature_matching']
        (f_grad_comp_gen, f_param_update_gen, zg_gen, rg_gen,
         ud_gen) = solver.build_solver_model(lrGen, modelGen, gradsGen,
                                             comb_inp_list[:n_gen_inps],
                                             costs[1], params, updatesLstm)

    #-------------------------------------------------------------------------------------------------------------------------
    # If we want to track some metrics during the training, initialize stuff for that now
    #-------------------------------------------------------------------------------------------------------------------------
    print('model init done.')
    if params['t_eval_only'] == 0:
        print('Gen model has keys: ' + ', '.join(modelGen.keys()))
    print('Eval model has keys: ' + ', '.join(modelEval.keys()))

    # calculate how many iterations we need. The split size is requested
    # with ofwhat='images', so one epoch here is one pass over the images.
    num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    skip_first = 20
    iters_eval = 5

    cost_eval_iter = []
    cost_gen_iter = []
    trackSc_array = []

    smooth_train_cost = 0.0
    smooth_train_cost_gen = 1.0
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # time of the last progress report
    iter_out_file = os.path.join(
        'logs', 'advmodel_checkpoint_%s_%s_%s_log.npz' %
        (params['dataset'], host, params['fappend']))

    t_print_sec = 30
    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != 'None':
        if params['t_eval_only'] != 1:
            print('\n Now initing gen Model:')
            zipp(model_init_gen_from, modelGen)
        if 'trackers' in checkpoint_init:
            trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print('\n Now initing Eval Model:')
        zipp(model_init_eval_from, modelEval)
        #zipp(rg_init,rgGen)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
          checkpoint_init['perplexity']))

    # Bound up front so the final checkpoint (and the signal handler) can
    # refer to it even when the training loop runs zero iterations.
    it = 0

    ##############################################################
    # Define signal handler to catch ctl-c or kills so that we can save the model trained till that point
    # (currently not registered, see the commented-out call below)
    def signal_handler(signum, frame):
        print('You pressed Ctrl+C! Saving Checkpoint Now before exiting!')
        filename = 'advmodel_checkpoint_%s_%s_%s_%.2f_INT.p' % (
            params['dataset'], host, params['fappend'], val_ppl2)
        dumpCheckpoint(filename, params, modelGen, modelEval, misc, it,
                       val_ppl2)
        sys.exit(0)

    #signal.signal(signal.SIGINT, signal_handler)
    ##############################################################

    def _sample_adv_inputs(s_probs):
        # Sample an adversarial batch and convert it into the evaluator
        # inputs and the feature-encoder inputs.
        batch = dp.sampAdversBatch(batch_size,
                                   n_sent=params['n_gen_samples'],
                                   probs=s_probs)
        cnn_inps = prepare_adv_data(batch,
                                    misc['wordtoix'],
                                    maxlen=params['maxlen'],
                                    prep_for=params['eval_model'])
        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            maxlen=params['maxlen'],
            use_shared_mem=params['use_shared_mem_enc'],
            enc_gt_sent=params['encode_gt_sentences'],
            n_enc_sent=params['n_encgt_sent'],
            wordtoix=misc['wordtoix'])
        return cnn_inps, enc_inp_list

    def _track_scores(iteration):
        # Score generated samples and append (iteration, {metric_stat: value})
        # to trackSc_array.
        tsc_max, tsc_mean, tsc_min = eval_gen_samps(f_gen_only, dp, params,
                                                    misc, params['rev_eval'],
                                                    **trackMetargs)
        scores = {}
        for i, evm in enumerate(params['met_to_track']):
            scores[evm + '_max'] = tsc_max[i]
            scores[evm + '_mean'] = tsc_mean[i]
            scores[evm + '_min'] = tsc_min[i]
        trackSc_array.append((iteration, scores))

    def _save_iter_log():
        # Persist the per-iteration cost curves and tracked scores.
        np.savez(iter_out_file,
                 eval_cost=np.array(cost_eval_iter),
                 gen_cost=np.array(cost_gen_iter),
                 tracksc=np.array(trackSc_array))

    # NOTE(review): the original comments talk about disabling sampling for
    # testing and re-enabling it afterwards, but both calls set usegumbel to 1.
    generator.usegumbel.set_value(1)
    if params['met_to_track'] != []:
        _track_scores(0)

    disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5)
    evaluator.use_noise.set_value(1.)
    eval_acc, gen_acc = eval_discrm_gen('val', dp, params, f_pred_fns[0], misc)
    # Re-enable sampling
    generator.usegumbel.set_value(1)

    _save_iter_log()
    smooth_train_cost = 0.0

    print('###################### NOW BEGINNING TRAINING #################################')

    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        evaluator.use_noise.set_value(1.)
        it2 = 0
        # Update the evaluator until it is accurate enough on the validation
        # check and at least iters_eval * skip_first updates have been made.
        while eval_acc <= 60. or gen_acc >= 45. or it2 < iters_eval * skip_first:
            # fetch a batch of data
            t1 = time.time()

            if params['eval_loss'] == 'contrastive':
                s_probs = [0.6, 0.4, 0.0]
            else:
                s_probs = [1.0, 0.0, 0.0]
            cnn_inps, enc_inp_list = _sample_adv_inputs(s_probs)
            eval_cost = f_grad_comp_eval(*(cnn_inps + enc_inp_list))

            if np.isnan(eval_cost[0]):
                # Debugging hook: drop into the debugger on a NaN cost
                import pdb
                pdb.set_trace()
            f_param_update_eval(params['learning_rate_eval'])

            # Track training statistics
            if it > 0:
                smooth_train_cost = (0.99 * smooth_train_cost +
                                     0.01 * eval_cost[0])
            else:
                smooth_train_cost = eval_cost[0]
            dt2 = time.time() - t1
            if it2 % 500 == 499:
                print('Iter %d/%d Eval Only Iter %d/%d, done. in %.3fs. Eval Cost is %.6f' % (
                    it, max_iters, it2, iters_eval * skip_first, dt2,
                    smooth_train_cost))
            if it2 % 100 == 99:
                eval_acc, gen_acc = eval_discrm_gen('val',
                                                    dp,
                                                    params,
                                                    f_pred_fns[0],
                                                    misc,
                                                    n_eval=500)
            it2 += 1

        evaluator.use_noise.set_value(1.)

        # Only the very first outer iteration uses the long evaluator warm-up
        skip_first = 1

        # One generator update on a freshly sampled batch
        cnn_inps, enc_inp_list = _sample_adv_inputs([1.0, 0.0, 0.0])
        n_cnn_inps = len(cnn_inps) - 1 + params['gen_feature_matching']
        gen_cost = f_grad_comp_gen(*(cnn_inps[:n_cnn_inps] + enc_inp_list))
        f_param_update_gen(params['learning_rate_gen'])

        if params['use_mle_train']:
            # Additional teacher-forced update with a 50x smaller learning rate
            generator.usegumbel.set_value(0)
            batch, _ = dp.getRandBatchByLen(batch_size)
            mle_inp_list, _ = prepare_data(batch, misc['wordtoix'],
                                           params['maxlen'])
            f_grad_genTF(*mle_inp_list)
            f_update_genTF(np.float32(params['learning_rate_gen'] / 50.0))
            generator.usegumbel.set_value(1)

        dt = time.time() - t0
        # print training statistics
        if it == 0:
            smooth_train_cost_gen = gen_cost
        else:
            smooth_train_cost_gen = (0.99 * smooth_train_cost_gen +
                                     0.01 * gen_cost)

        tnow = time.time()
        if tnow > last_status_write_time + t_print_sec * 1:  # every now and then lets write a report
            gb = 0.  #modelGen['gumb_temp'].get_value() if params['use_gumbel_mse'] == 1 else 0
            print('Iter %d/%d done. in %.3fs. Eval Cost is %.6f, Gen Cost is %.6f, temp: %.4f' % (it, max_iters, dt, \
             smooth_train_cost, smooth_train_cost_gen, gb))
            last_status_write_time = tnow

        cost_eval_iter.append(smooth_train_cost)
        cost_gen_iter.append(smooth_train_cost_gen)

        if it % 500 == 499:
            # Run the generator on the validation set and compute some metrics
            generator.usegumbel.set_value(1)
            if params['met_to_track'] != []:
                _track_scores(it)

            disp_some_gen_samps(f_gen_only, dp, params, misc, n_samp=5)
            generator.usegumbel.set_value(1)

            eval_acc, gen_acc = eval_discrm_gen('val',
                                                dp,
                                                params,
                                                f_pred_fns[0],
                                                misc,
                                                n_eval=500)
            # Every second evaluation round (it % 1000 == 999) also writes a
            # checkpoint named after the current generator accuracy.
            if it % 1000 == 999:
                filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_genacc.p' % (
                    params['dataset'], host, params['fappend'], it, gen_acc)
                dumpCheckpoint(filename, params, modelGen, modelEval, misc,
                               it, gen_acc)
            _save_iter_log()

    # Final checkpoint. The original referenced an undefined name 'g_acc'
    # here; the latest generator accuracy is held in gen_acc.
    filename = 'advmodel_checkpoint_%s_%s_%s_%d_%.2f_GenDone.p' % (
        params['dataset'], host, params['fappend'], it, gen_acc)
    dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, gen_acc)
def main(params):
    """Generate captions for a list of images and dump them to a JSON file.

    A pickled checkpoint is loaded from ``params['checkpoint_path']``, the
    image list is read from ``params['imgList']`` (one entry per line,
    optionally followed by ``,<index>`` or ``,(<index>, <aux index>)``), a
    ``decodeGenerator`` predictor is compiled and run once per image, and the
    top prediction plus all remaining candidates are collected into
    ``result_struct_<fname_append>.json`` under ``params['root_path']``.

    Args:
        params: dict of command-line options. When ``use_label_file`` is 1,
            the entries ``poolmethod``, ``labels``, ``featfromlbl`` and
            ``uselabel`` that are ``None`` are filled in from the checkpoint
            parameters; otherwise ``uselabel`` is set to 0.
    """
    # load the checkpoint
    checkpoint_path = params['checkpoint_path']
    print('loading checkpoint %s' % (checkpoint_path, ))
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # checkpoints that come from a trusted source.
    with open(checkpoint_path, 'rb') as cp_file:
        checkpoint = pickle.load(cp_file)
    cp_params = checkpoint['params']

    if params['gen_model'] is None:
        if 'model' in checkpoint:
            model_npy = checkpoint['model']
        else:
            model_npy = checkpoint['modelGen']
    else:
        # Generator weights come from a separate checkpoint file
        with open(params['gen_model'], 'rb') as gen_file:
            gen_cp = pickle.load(gen_file)
        model_npy = gen_cp.get('model', {})

    cp_params['use_theano'] = 1
    if params['dobeamsearch']:
        cp_params['advers_gen'] = 0

    if params['use_label_file'] == 1:
        # Command-line values win; fall back to the checkpoint otherwise
        for key in ('poolmethod', 'labels', 'featfromlbl', 'uselabel'):
            if params[key] is None:
                params[key] = cp_params[key]
    else:
        params['uselabel'] = 0
    print('parsed parameters:')
    print(json.dumps(params, indent=2))

    if 'image_feat_size' not in cp_params:
        cp_params['image_feat_size'] = 4096

    if 'misc' in checkpoint:
        misc = checkpoint['misc']
        ixtoword = misc['ixtoword']
    else:
        # Checkpoints without 'misc' keep the vocabulary at the top level
        misc = {}
        ixtoword = checkpoint['ixtoword']
        misc['wordtoix'] = checkpoint['wordtoix']

    # Decoding options always come from the command line
    cp_params['softmax_smooth_factor'] = params['softmax_smooth_factor']
    cp_params['softmax_propogate'] = params['softmax_propogate']
    cp_params['computelogprob'] = params['computelogprob']
    cp_params['greedy'] = params['greedy']
    cp_params['gen_input_noise'] = 0

    if cp_params.get('sched_sampling_mode', None) is not None:
        cp_params['sched_sampling_mode'] = None

    # load the image list file and set up feature loading
    root_path = params['root_path']

    with open(params['imgList'], 'r') as list_file:
        img_names_list = list_file.read().splitlines()
    auxidxes = []

    img_names = [x.rsplit(',')[0] for x in img_names_list]

    if len(img_names_list[0].split(',', 1)) > 1:
        # Everything after the first comma is a Python literal: either a
        # single index or an (index, aux index) tuple. The first line decides
        # which form is expected.
        parsed = [
            ast.literal_eval(x.split(',', 1)[1].strip())
            for x in img_names_list
        ]
        if isinstance(parsed[0], tuple):
            idxes = [entry[0] for entry in parsed]
            auxidxes = [entry[1] for entry in parsed]
        else:
            idxes = parsed
    else:
        idxes = xrange(len(img_names_list))

    if cp_params.get('swap_aux') == 0 or auxidxes == []:
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(
            params, idxes, auxidxes=auxidxes)
    else:
        # Swap the roles of the feature and auxiliary indices
        features, aux_inp, feat_idx, aux_idx = loadArbitraryFeatures(
            params, auxidxes, auxidxes=idxes)

    ##-------------------------------- Setup the models --------------------------###########
    if cp_params.get('use_encoder_for', 0) & 1:
        imgFeatEncoder = RecurrentFeatEncoder(cp_params['image_feat_size'],
                                              cp_params['word_encoding_size'],
                                              cp_params,
                                              mdl_prefix='img_enc_',
                                              features=features.T)

        zipp(model_npy, imgFeatEncoder.model_th)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(
             imgFeatEncoder.model_th, cp_params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if cp_params.get('use_encoder_for', 0) & 2:
        auxFeatEncoder = RecurrentFeatEncoder(cp_params['aux_inp_size'],
                                              cp_params['image_encoding_size'],
                                              cp_params,
                                              mdl_prefix='aux_enc_',
                                              features=aux_inp.T)
        zipp(model_npy, auxFeatEncoder.model_th)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux,
         updatesLSTMAuxFeat) = auxFeatEncoder.build_model(
             auxFeatEncoder.model_th, cp_params)
    else:
        auxFeatEnc_inp = []
        xAux = None

    # Testing to see if diversity can be achieved by weighing words
    if params['word_freq_w'] is not None:
        with open(params['word_freq_w'], 'r') as freq_file:
            w_freq = json.load(freq_file)
        w_logw = np.zeros(len(misc['wordtoix']), dtype=np.float32)
        for w in w_freq:
            if w in misc['wordtoix']:
                w_logw[misc['wordtoix'][w]] = w_freq[w]
        # Normalise by the smallest weight outside index 0, give index 0 the
        # largest weight, then turn the weights into scaled negative logs.
        w_logw = w_logw / w_logw[1:].min()
        w_logw[0] = w_logw.max()
        w_logw = -params['word_freq_sc'] * np.log(w_logw)
    else:
        w_logw = None

    BatchGenerator = decodeGenerator(cp_params)
    # Compile and init the theano predictor
    BatchGenerator.prepPredictor(model_npy,
                                 cp_params,
                                 params['beam_size'],
                                 xI,
                                 xAux,
                                 imgFeatEnc_inp + auxFeatEnc_inp,
                                 per_word_logweight=w_logw)
    if params['greedy']:
        BatchGenerator.usegumbel.set_value(0)

    # output blob which we will dump to JSON for visualizing the results
    blob = {}
    blob['params'] = params
    blob['checkpoint_params'] = copy(cp_params)
    if cp_params.get('class_out_factoring', 0) == 1:
        blob['checkpoint_params'].pop('ixtoclsinfo')
    blob['imgblobs'] = []

    N = len(img_names)

    # iterate over all images and predict sentences
    print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'], \
      checkpoint['perplexity']))

    jsonFname = 'result_struct_%s.json' % (params['fname_append'])
    save_file = os.path.join(root_path, jsonFname)

    def _write_blob():
        # Write everything collected so far; the file is closed right away so
        # the data is flushed to disk.
        print('writing predictions to %s...' % (save_file, ))
        with open(save_file, 'w') as out_file:
            json.dump(blob, out_file)

    def _decode(word_ixs):
        # Map word indices to a sentence; ix 0 is the END token, skip that
        return ' '.join([ixtoword[int(ix)] for ix in word_ixs if ix > 0])

    for n in xrange(N):
        print('image %d/%d:' % (n, N))

        # encode the image
        img = {}
        img['feat'] = features[:, feat_idx[n]].T
        img['img_idx'] = feat_idx[n]
        if cp_params.get('en_aux_inp', 0):
            # NOTE(review): aux_inp is called like a function here although
            # aux_inp.T is used above when building the aux encoder; confirm
            # what loadArbitraryFeatures returns before changing this.
            img['aux_inp'] = aux_inp(
                aux_idx[n]) if aux_inp != [] else np.zeros(
                    cp_params['aux_inp_size'], dtype=np.float32)
            img['aux_idx'] = aux_idx[n] if aux_inp != [] else []
        img['local_file_path'] = img_names[n]
        # perform the work. heavy lifting happens inside
        enc_inp_list = prepare_seq_features(
            [{
                'image': img
            }],
            use_enc_for=cp_params.get('use_encoder_for', 0),
            use_shared_mem=cp_params.get('use_shared_mem_enc', 0))
        Ys, _ = BatchGenerator.predict([{
            'image': img
        }],
                                       cp_params,
                                       ext_inp=enc_inp_list)

        # build up the output
        img_blob = {}
        img_blob['img_path'] = img['local_file_path']

        # take predictions for the first (and only) image we passed in
        if params['rescoreByLen'] == 0:
            top_predictions = Ys[0]
        else:
            top_predictions = rescoreProbByLen(Ys[0])
        # sort with the highest score on top
        top_predictions = sorted(top_predictions,
                                 key=lambda aa: aa[0],
                                 reverse=True)

        # encode the top prediction
        top_prediction = top_predictions[0]
        if cp_params.get('reverse_sentence', 0) == 0:
            candidate = _decode(top_prediction[1])
        else:
            candidate = _decode(reversed(top_prediction[1]))
        if params['rescoreByLen'] == 0:
            print('PRED: (%f) %s' % (float(top_prediction[0]), candidate))
        else:
            print('PRED: (%f, %f) %s' % (float(
                top_prediction[0]), float(top_prediction[2]), candidate))
        img_blob['candidate'] = {
            'text': candidate,
            'logprob': float(top_prediction[0])
        }

        # Save all the other candidates.
        # NOTE(review): unlike the top prediction, these are not reversed
        # when 'reverse_sentence' is set -- confirm whether that is intended.
        candlist = []
        for prediction in top_predictions[1:]:
            candlist.append({
                'text': _decode(prediction[1]),
                'logprob': float(prediction[0])
            })

        img_blob['candidatelist'] = candlist
        blob['imgblobs'].append(img_blob)
        # Intermediate dump so partial results survive a crash
        if (n % 5000) == 1:
            _write_blob()

    # dump result struct to file
    _write_blob()
Beispiel #17
0
def main(params):
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])

    params['image_feat_size'] = dp.img_feat_size
    params['aux_inp_size'] = dp.aux_inp_size

    misc = {
    }  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc[
        'ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)

    if params['fine_tune'] == 1:
        params['mode'] = 'multi_choice_mode' if params[
            'mc_mode'] == 1 else 'multimodal_lstm'
        if params['checkpoint_file_name'] != None:
            #params['batch_size'] = dp.dataset['batchsize']
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    #----------------- If we are using feature encoders -----------------------
    if params['use_encoder_for'] & 1:
        imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                              params['sent_encoding_size'],
                                              params,
                                              mdl_prefix='img_enc_',
                                              features=dp.features.T)
        mdlLen = len(model.keys())
        model.update(imgFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen +
                                      len(imgFeatEncoder.model_th.keys())))
        #misc['update'].extend(imgFeatEncoder.update_list)
        misc['regularize'].extend(imgFeatEncoder.regularize)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost funtion.
    (use_dropout, inp_list_eval, miscOuts, cost, predTh,
     model) = evalModel.build_model(model,
                                    params,
                                    xI=xI,
                                    prior_inp_list=imgFeatEnc_inp)

    inp_list = imgFeatEnc_inp + inp_list_eval

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Add the regularization cost. Since this is specific to trainig and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
            reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list,
                                                    cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())

    # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
    # Hence in case of coco/flickr this will 5* no of images
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters / inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    smooth_train_cost = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    smooth_error_rate = 100.
    error_rate = 0.
    prev_it = -1
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
          checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] != None:
        # Initialize word vecs and image emb from generative model file
        rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print(
            "\n Initialized Word embedding and Image embeddings from gen mode %s"
            % (params['init_from_imagernn']))

    write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']

    use_dropout.set_value(1.)
    #################### Main Loop ############################################
    for it in xrange(max_iters):
        t0 = time.time()

        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))

        # fetch a batch of data
        cost_inner = np.zeros((inner_loop, ), dtype=np.float32)
        if params['sim_minibatch'] > 0:
            for i_l in xrange(inner_loop):
                batch, pos_samp_sent = dp.sampPosNegSentSamps(
                    params['batch_size'], params['mode'], thresh=0.3)
                eval_inp_list, lenS = prepare_data(
                    batch,
                    misc['wordtoix'],
                    maxlen=params['maxlen'],
                    pos_samp=pos_samp,
                    prep_for=params['eval_model'],
                    use_enc_for=params['use_encoder_for'])
                if params['fine_tune'] == 1:
                    eval_inp_list.append(pos_samp_sent)
                cost_inner[i_l] = f_grad_accum(*eval_inp_list)
        else:
            batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],
                                                          params['mode'],
                                                          thresh=0.3)
            enc_inp_list = prepare_seq_features(
                batch,
                use_enc_for=params['use_encoder_for'],
                use_shared_mem=params['use_shared_mem_enc'])
            eval_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                maxlen=params['maxlen'],
                pos_samp=pos_samp,
                prep_for=params['eval_model'],
                use_enc_for=params['use_encoder_for'])
            if params['fine_tune'] == 1:
                eval_inp_list.append(pos_samp_sent)

        real_inp_list = enc_inp_list + eval_inp_list

        # Enable using dropout in training
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # Reset accumulated gradients to 0
        if params['sim_minibatch'] > 0:
            f_clr()
        #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model])
        #print 'rg: ' +' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))])
        #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))])
        #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])
        #import pdb; pdb.set_trace()
        #print 'udAft: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])

        # print training statistics
        epoch = it * inner_loop * 1.0 / num_iters_one_epoch
        total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() *
                      (params['sim_minibatch'] > 0)) / (
                          1 + params['sim_minibatch'])
        #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
        #         train_ppl2, smooth_train_cost)
        if it == 0: smooth_train_cost = total_cost
        else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost
        error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size

        margin_strength = cost[2].sum()
        smooth_error_rate = 0.99 * smooth_error_rate + 0.01 * 100.0 * (
            float(cost[1]) / batch_size) if it > 0 else 100.0 * (
                float(cost[1]) / batch_size)

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '\
                    'rate is %.3f%%, Margin %.2f, negMarg=%.2f' % (it, max_iters, dt, \
                    epoch, smooth_train_cost, smooth_error_rate,
                    margin_strength, error_rate/(it-prev_it))
            error_rate = 0.
            prev_it = it
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            #import pdb; pdb.set_trace()
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (
                    status_file, )
                print e

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)

            val_ppl2 = eval_split_theano(
                'val', dp, model, params, misc,
                f_eval)  # perform the evaluation on VAL set
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print 'validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate'])
            #if params['sample_by_len'] == 1:
            #  print len_hist

            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
                        params['eval_model'], params['dataset'], host,
                        params['fappend'], smooth_error_rate, val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (
                            filepath, )
                        print e

            use_dropout.set_value(1.)
def main(params):
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)
  # Initialize the optimizer 
  solver = Solver(params['solver'])

  params['image_feat_size'] = dp.img_feat_size

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)
  params['use_dropout'] = 1 

  if params['fine_tune'] == 1:
    params['mode'] = 'multimodal_lstm' if params['multimodal_lstm'] == 0 else 'multimodal_lstm'
    if params['checkpoint_file_name'] != None:
        params['batch_size'] = dp.dataset['batchsize']
        misc['wordtoix'] = checkpoint_init['wordtoix']
        misc['ixtoword'] = checkpoint_init['ixtoword']
    batch_size = 1
    num_sentences_total = dp.getSplitSize('train', ofwhat = 'images')
  else:
    params['mode'] = 'batchtrain'
    batch_size = params['batch_size']
    num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences')
  
  params['vocabulary_size'] = len(misc['wordtoix'])
  pos_samp = np.arange(batch_size,dtype=np.int32)

  # This initializes the model parameters and does matrix initializations  
  evalModel = decodeEvaluator(params)
  model, misc['update'], misc['regularize'] = (evalModel.model_th, evalModel.updateP, evalModel.regularize)
  
  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost funtion. 
  (use_dropout, inp_list,
     miscOuts, cost, predTh, model) = evalModel.build_model(model, params)

  # Add the regularization cost. Since this is specific to trainig and doesn't get included when we 
  # evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
      reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
      reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
      reg_cost = 0.
      for p in misc['regularize']:
        reg_cost += (model[p] ** 2).sum()
        reg_cost *= 0.5 * reg_c 
      cost[0] += (reg_cost /params['batch_size'])
    
  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval= theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost, wrt=model.values())
  lr = tensor.scalar(name='lr',dtype=config.floatX)
  if params['sim_minibatch'] > 0:
    f_grad_accum, f_clr, ag = solver.accumGrads(model,grads,inp_list,cost, params['sim_minibatch'])
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, ag,
                                      inp_list, cost, params)
  else: 
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(lr, model, grads,
                                      inp_list, cost, params)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())

  # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
  # Hence in case of coco/flickr this will 5* no of images
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  inner_loop =   params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
  max_iters = max_iters / inner_loop 
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs/ inner_loop))
  top_val_ppl2 = -1
  smooth_train_cost = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)
  
  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != None:
    zipp(model_init_from,model)
    zipp(rg_init,rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
      checkpoint_init['perplexity']))
  elif params['init_from_imagernn'] != None:
    # Initialize word vecs and image emb from generative model file
    rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
    model['Wemb'].set_value(rnnCv['model']['Wemb'])
    model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
    misc['wordtoix'] = rnnCv['wordtoix']
    misc['ixtoword'] = rnnCv['ixtoword']
    print("\n Initialized Word embedding and Image embeddings from gen mode %s" % (params['init_from_imagernn']))


  use_dropout.set_value(1.)
  #################### Main Loop ############################################
  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data
    cost_inner = np.zeros((inner_loop,),dtype=np.float32)
    if params['sim_minibatch'] > 0:
        for i_l in xrange(inner_loop):
            batch,pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],params['mode'],thresh=0.3) 
            real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],maxlen=params['maxlen'],pos_samp=pos_samp,prep_for=params['eval_model'])
            if params['fine_tune'] == 1:
               real_inp_list.append(pos_samp_sent)
            cost_inner[i_l] = f_grad_accum(*real_inp_list)
    else:
        batch,pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],params['mode'],thresh=0.3)
        real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],maxlen=params['maxlen'],pos_samp=pos_samp,prep_for=params['eval_model'])
        if params['fine_tune'] == 1:
           real_inp_list.append(pos_samp_sent)
    # Enable using dropout in training 
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0
   
    # Reset accumulated gradients to 0
    if params['sim_minibatch'] > 0:
        f_clr()
    #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model])
    #print 'rg: ' +' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))])
    #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))])
    #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])
    #import pdb; pdb.set_trace()
    #print 'udAft: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])

    # print training statistics
    epoch = it*inner_loop * 1.0 / num_iters_one_epoch
    total_cost = (np.e**-cost + (np.e**(-cost_inner)).sum()*(params['sim_minibatch'] > 0))/ (1 + params['sim_minibatch'])
    #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
    #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
    #         train_ppl2, smooth_train_cost)
    if it == 0: smooth_train_cost = total_cost 
    else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

    tnow = time.time()
    if tnow > last_status_write_time + 60*1: # every now and then lets write a report
      print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (it, max_iters, dt, \
		epoch, smooth_train_cost)
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      #import pdb; pdb.set_trace()
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e: # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e
    
    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it+1) == max_iters
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation 
      use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1
      
      print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
      if params['sample_by_len'] == 1:
        print len_hist

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          #top_val_ppl2 = val_ppl2
          filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (params['eval_model'], params['dataset'], host, params['fappend'],val_ppl2,smooth_train_cost)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          model_npy = unzip(model)
          rgrads_npy = unzip(rg)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model_npy
          checkpoint['rgrads'] = rgrads_npy
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e: # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e

      use_dropout.set_value(1.)
def main(params):
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname() # get computer hostname

  # fetch the data provider
  dp = getDataProvider(params)

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size

  print 'Image feature size is %d, and aux input size is %d'%(params['image_feat_size'],params['aux_inp_size'])

  misc = {} # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
  # at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(dp.iterSentences('train'), word_count_threshold)
  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword']) # these should match though
  params['use_dropout'] = 1 

  # This initializes the model parameters and does matrix initializations  
  lstmGenerator = LSTMGenerator(params)
  model, misc['update'], misc['regularize'] = (lstmGenerator.model_th, lstmGenerator.update, lstmGenerator.regularize)
  
  # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
  # This is a bit of a hack, not happy about it
  model['bd'].set_value(bias_init_vector.astype(config.floatX))

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost funtion. 
  (use_dropout, inp_list,
     f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params)

  # Add the regularization cost. Since this is specific to trainig and doesn't get included when we 
  # evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
      reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
      reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
      reg_cost = 0.
      for p in misc['regularize']:
        reg_cost += (model[p] ** 2).sum()
        reg_cost *= 0.5 * reg_c 
      cost[0] += (reg_cost /params['batch_size'])
    
  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval= theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost[0], wrt=model.values())
  lr = tensor.scalar(name='lr',dtype=config.floatX)
  f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(lr, model, grads,
                                      inp_list, cost, params)

  print 'model init done.'
  print 'model has keys: ' + ', '.join(model.keys())
  #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
  #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
  #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

  # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
  # Hence in case of coco/flickr this will 5* no of images
  num_sentences_total = dp.getSplitSize('train', ofwhat = 'sentences')
  num_iters_one_epoch = num_sentences_total / batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1
  smooth_train_ppl2 = len(misc['ixtoword']) # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0 # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)
  
  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from,model)
    zipp(rg_init,rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
      checkpoint_init['perplexity']))
  
  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data
    if params['sample_by_len'] == 0:
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
    else: 
        batch,l = dp.getRandBatchByLen(batch_size)
        len_hist[l] += 1

    if params['use_pos_tag'] != 'None':
        real_inp_list, lenS = prepare_data(batch,misc['wordtoix'],None,sentTagMap,misc['ixtoword'])
    else:    
        real_inp_list, lenS = prepare_data(batch,misc['wordtoix'])
    
    # Enable using dropout in training 
    use_dropout.set_value(1.)

    # evaluate cost, gradient and perform parameter update
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # print training statistics
    train_ppl2 = (2**(cost[1]/lenS)) #step_struct['stats']['ppl2']
    smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average
    if it == 0: smooth_train_ppl2 = train_ppl2 # start out where we start out
    epoch = it * 1.0 / num_iters_one_epoch
    total_cost = cost[0]
    #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
    #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
    #         train_ppl2, smooth_train_ppl2)

    tnow = time.time()
    if tnow > last_status_write_time + 60*1: # every now and then lets write a report
      print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (it, max_iters, dt, \
		epoch, total_cost, smooth_train_ppl2)
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2 # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      #import pdb; pdb.set_trace()
      try:
        json.dump(json_worker_status, open(status_file, 'w'))
      except Exception, e: # todo be more clever here
        print 'tried to write worker status into %s but got error:' % (status_file, )
        print e
    
    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it+1) == max_iters
    if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation 
      use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc,f_eval) # perform the evaluation on VAL set
      
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1
      
      print 'validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate'])
      if params['sample_by_len'] == 1:
        print len_hist

        
      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          top_val_ppl2 = val_ppl2
          filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (params['dataset'], host, params['fappend'], val_ppl2)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          model_npy = unzip(model)
          rgrads_npy = unzip(rg)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = model_npy
          checkpoint['rgrads'] = rgrads_npy
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          try:
            pickle.dump(checkpoint, open(filepath, "wb"))
            print 'saved checkpoint in %s' % (filepath, )
          except Exception, e: # todo be more clever here
            print 'tried to write checkpoint into %s but got error: ' % (filepath, )
            print e
def _load_checkpoint_file(checkpoint_path):
  """Unpickle and return the checkpoint stored at ``checkpoint_path``."""
  # NOTE(security): unpickling can execute arbitrary code; only load
  # checkpoints that come from a trusted source.
  with open(checkpoint_path, 'rb') as f:
    return pickle.load(f)


def _checkpoint_vocab(checkpoint):
  """Return ``(misc, ixtoword)`` for either checkpoint layout.

  A checkpoint either carries a ``'misc'`` dict holding the vocabulary, or
  stores ``'ixtoword'`` / ``'wordtoix'`` directly at the top level.
  """
  if 'misc' in checkpoint:
    misc = checkpoint['misc']
    return misc, misc['ixtoword']
  ixtoword = checkpoint['ixtoword']
  return {'wordtoix': checkpoint['wordtoix']}, ixtoword


def _params_for_blob(checkpoint_params):
  """Return a shallow copy of ``checkpoint_params`` for the JSON result blob.

  With class-factored outputs the ``'ixtoclsinfo'`` entry is dropped from the
  copy; in the single-model path it is a numpy array, which ``json.dump``
  cannot serialize.
  """
  blob_params = copy(checkpoint_params)
  if blob_params.get('class_out_factoring', 0) == 1:
    blob_params.pop('ixtoclsinfo', None)
  return blob_params


def _decode_word_ids(ixtoword, word_ids, reverse):
  """Join the words for ``word_ids`` into a sentence, skipping ids <= 0.

  Index 0 is the END token. When ``reverse`` is true the ids are read
  back-to-front.
  """
  if reverse:
    word_ids = reversed(word_ids)
  return ' '.join([ixtoword[int(ix)] for ix in word_ids if ix > 0])


def _dump_json(obj, path):
  """Write ``obj`` as JSON to ``path``, closing the file afterwards."""
  with open(path, 'w') as f:
    json.dump(obj, f)


def main(params):
  """Predict captions for a list of images and write JSON + HTML results.

  Keys read from ``params``: ``multi_model``, ``checkpoint_path`` (a path, or
  a list of paths when ``multi_model`` is non-zero), ``softmax_smooth_factor``
  and ``softmax_propogate`` (single-model only), ``beam_size``, ``nmodels``
  (multi-model only), ``root_path``, ``imgList`` and ``fname_append``.

  Each line of the ``imgList`` file is ``name``, ``name,idx`` or
  ``name,idx,auxidx``; the layout is decided from the first line.

  Outputs, both written under ``root_path``:
    * ``result_struct_<fname_append>.json`` -- params plus, per image, the
      best candidate and the remaining candidates with their log-probs
      (also dumped periodically while predicting).
    * ``result_<fname_append>.html`` -- one image tag and best caption per
      image.

  Fixes relative to the previous version: the multi-model path no longer
  calls ``.get`` on a list of parameter dicts or uses an undefined
  ``ixtoword``; alternate candidates honour ``reverse_sentence`` like the
  top candidate does; all files are closed deterministically.
  """
  multi_model = params['multi_model'] != 0

  # ---------------------------------------------------------------- models
  if not multi_model:
    checkpoint_path = params['checkpoint_path']
    print('loading checkpoint %s' % (checkpoint_path, ))
    checkpoint = _load_checkpoint_file(checkpoint_path)
    checkpoint_params = checkpoint['params']
    model_npy = checkpoint['model']
    checkpoint_params['use_theano'] = 1
    if 'image_feat_size' not in checkpoint_params:
      checkpoint_params['image_feat_size'] = 4096

    misc, ixtoword = _checkpoint_vocab(checkpoint)

    checkpoint_params['softmax_smooth_factor'] = params['softmax_smooth_factor']
    checkpoint_params['softmax_propogate'] = params['softmax_propogate']
    if checkpoint_params.get('class_out_factoring', 0) == 1:
      # Scatter columns 1:3 of misc['ixtoclsinfo'] into a dense
      # (nClasses, 2) table indexed by the values in its column 0.
      ixtoclsinfo = misc['ixtoclsinfo']
      checkpoint_params['ixtoclsinfo'] = np.zeros(
          (checkpoint_params['nClasses'], 2), dtype=np.int32)
      checkpoint_params['ixtoclsinfo'][ixtoclsinfo[:, 0]] = ixtoclsinfo[:, 1:3]

    # Scheduled sampling is switched off for prediction.
    if checkpoint_params.get('sched_sampling_mode', None) is not None:
      checkpoint_params['sched_sampling_mode'] = None

    BatchGenerator = decodeGenerator(checkpoint_params)
    # Compile and init the theano predictor
    BatchGenerator.prepPredictor(model_npy, checkpoint_params, params['beam_size'])
    model = BatchGenerator.model_th
    primary_params = checkpoint_params
  else:
    BatchGenerator = []
    model_npy = []
    modelTh = []
    checkpoint_params = []
    ixtoword = None
    for i, checkpoint_path in enumerate(params['checkpoint_path']):
      checkpoint = _load_checkpoint_file(checkpoint_path)
      if i == 0:
        # NOTE(review): the vocabulary of the first checkpoint is used for
        # decoding -- assumes all ensemble members share it; confirm.
        _, ixtoword = _checkpoint_vocab(checkpoint)
      model_npy.append(checkpoint['model'])
      checkpoint_params.append(checkpoint['params'])
      checkpoint_params[i]['use_theano'] = 1
      BatchGenerator.append(decodeGenerator(checkpoint_params[i]))
      zipp(model_npy[i], BatchGenerator[i].model_th)
      modelTh.append(BatchGenerator[i].model_th)
      modelTh[i]['comb_weight'] = 1.0 / params['nmodels']

    BatchGenerator[0].prepMultiPredictor(modelTh, checkpoint_params,
                                         params['beam_size'], params['nmodels'])
    # checkpoint_params is a list here, so the shared options below
    # (swap_aux, reverse_sentence) are read from the first model.
    # NOTE(review): assumes these options agree across models; confirm.
    primary_params = checkpoint_params[0]

  # ---------------------------------------------------------- result blob
  # output blob which we will dump to JSON for visualizing the results
  blob = {}
  blob['params'] = params
  if multi_model:
    blob['checkpoint_params'] = [_params_for_blob(p) for p in checkpoint_params]
  else:
    blob['checkpoint_params'] = _params_for_blob(checkpoint_params)
  blob['imgblobs'] = []

  # ----------------------------------------------------------- image list
  root_path = params['root_path']
  with open(params['imgList'], 'r') as f:
    img_names_list = f.read().splitlines()
  auxidxes = []

  # The number of comma-separated fields on the first line selects the layout.
  n_fields = len(img_names_list[0].rsplit(','))
  if n_fields > 1:
    rows = [x.rsplit(',') for x in img_names_list]
    img_names = [r[0] for r in rows]
    idxes = [int(r[1]) for r in rows]
    if n_fields > 2:
      auxidxes = [int(r[2]) for r in rows]
  else:
    img_names = img_names_list
    idxes = xrange(len(img_names_list))

  # load the features for all images
  # NOTE(review): a missing 'swap_aux' key yields None, which is != 0, so the
  # index lists are swapped whenever aux indices are present -- confirm that
  # this default is intended.
  if primary_params.get('swap_aux') == 0 or auxidxes == []:
    features, aux_inp = loadArbitraryFeatures(params, idxes, auxidxes=auxidxes)
  else:
    features, aux_inp = loadArbitraryFeatures(params, auxidxes, auxidxes=idxes)

  N = len(img_names)

  # `checkpoint` is the only (or, for multi-model, the last) one loaded above.
  print("\nUsing model run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint['epoch'], \
    checkpoint['perplexity']))

  kwparams = {'beam_size': params['beam_size']}
  if multi_model:
    kwparams['nmodels'] = params['nmodels']

  jsonFname = 'result_struct_%s.json' % (params['fname_append'] )
  save_file = os.path.join(root_path, jsonFname)

  # Same test the top candidate used before: anything other than 0 reverses.
  reverse_sentence = primary_params.get('reverse_sentence', 0) != 0

  # ------------------------------------------------------------ prediction
  for n in xrange(N):
    print('image %d/%d:' % (n, N))

    # encode the image
    if not multi_model:
      img = {'feat': features[:, n]}
      if checkpoint_params.get('en_aux_inp', 0):
        img['aux_inp'] = aux_inp[:, n]
      img['local_file_path'] = img_names[n]
      # perform the work. heavy lifting happens inside
      Ys = BatchGenerator.predict([{'image': img}], model, checkpoint_params, **kwparams)
    else:
      # One input record per model, each reading its own feature matrix.
      batch = []
      for i in xrange(params['nmodels']):
        img = {'feat': features[i][:, n]}
        if checkpoint_params[i].get('en_aux_inp', 0):
          img['aux_inp'] = aux_inp[i][:, n]
        img['local_file_path'] = img_names[n]
        batch.append({'image': img})
      Ys = BatchGenerator[0].predictMulti(batch, checkpoint_params, **kwparams)

    # build up the output
    img_blob = {}
    img_blob['img_path'] = img_names[n]

    # Ys[0]: predictions for the first (and only) image we passed in, sorted
    # with the highest on top; each prediction is (logprob, word ids).
    top_predictions = Ys[0]
    decoded = [{'text': _decode_word_ids(ixtoword, pred[1], reverse_sentence),
                'logprob': float(pred[0])}
               for pred in top_predictions]

    best = decoded[0]
    print('PRED: (%f) %s' % (best['logprob'], best['text']))
    img_blob['candidate'] = best
    img_blob['candidatelist'] = decoded[1:]
    blob['imgblobs'].append(img_blob)

    # Periodic dump so partial results survive an interrupted run.
    if (n % 5000) == 1:
      print('writing predictions to %s...' % (save_file, ))
      _dump_json(blob, save_file)

  # dump result struct to file
  print('writing predictions to %s...' % (save_file, ))
  _dump_json(blob, save_file)

  # dump output html
  html = ''.join(
      '<img src="%s" height="400"><br>' % (entry['img_path'], ) +
      '(%f) %s <br><br>' % (entry['candidate']['logprob'], entry['candidate']['text'])
      for entry in blob['imgblobs'])

  html_file = 'result_%s.html' % (params['fname_append'])
  html_file = os.path.join(root_path, html_file)
  print('writing html result file to %s...' % (html_file, ))
  with open(html_file, 'w') as f:
    f.write(html)
Beispiel #21
0
def main(params):
    """Train the LSTM caption generator configured by ``params``.

    Steps, in order:
      1. Fetch the data provider and derive feature sizes.
      2. Build the vocabulary (or take it from ``checkpoint_init``).
      3. Create the generator, optional feature encoders and attention
         network, and merge their parameters into one ``model`` dict.
      4. Build the cost graph, the evaluation function and the solver.
      5. Run ``max_epochs`` worth of mini-batch updates; periodically write
         a JSON status file, evaluate on the 'val' split, decay the learning
         rate and pickle a checkpoint when the validation score improves.

    ``params`` is mutated (sizes, vocabulary info, learning rate, ...) and is
    stored inside every checkpoint.

    NOTE(review): ``checkpoint_init``, ``model_init_from``, ``rg_init`` and
    ``sentTagMap`` are read but never assigned in this function -- presumably
    module-level globals set by the launching script; confirm.

    Fixes relative to the previous version:
      * The L2 penalty is scaled by ``0.5 * regc`` once, after summing all
        squared parameters. The scaling used to sit inside the summation
        loop, so earlier terms were rescaled on every later iteration.
      * The image-encoder dropout switch is only toggled when the image
        encoder was actually built (it is not when ``encode_gt_sentences``
        is set), instead of raising NameError.
      * Status and checkpoint files are closed deterministically.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # goes into status / checkpoint file names

    #--------------------------------- Init data provider and load data+features #---------------------------------#
    dp = getDataProvider(params)

    if params['encode_gt_sentences']:
        params['aux_inp_size'] = (params['featenc_hidden_size'] *
                                  params['n_encgt_sent'])
    else:
        params['aux_inp_size'] = dp.aux_inp_size
        params['featenc_hidden_size'] = params['aux_inp_size']

    params['image_feat_size'] = dp.img_feat_size
    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    #--------------------------------- Preprocess sentences and build Vocabulary #---------------------------------#
    # misc stores various items that need to be passed around the framework
    misc = {}
    fresh_start = params['checkpoint_file_name'] == 'None'
    if fresh_start:
        # go over all training sentences and find the vocabulary we want to
        # use, i.e. the words that occur at least word_count_threshold times
        if params['class_out_factoring'] == 0:
            (misc['wordtoix'], misc['ixtoword'],
             bias_init_vector) = preProBuildWordVocab(
                 dp.iterSentences('train'), word_count_threshold)
        else:
            ([misc['wordtoix'], misc['classes']],
             [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo']],
             [bias_init_vector, bias_init_inter_class
              ]) = preProBuildWordVocab(dp.iterSentences('train'),
                                        word_count_threshold, params)
            params['nClasses'] = bias_init_inter_class.shape[0]
            params['ixtoclsinfo'] = misc['ixtoclsinfo']
    else:
        # Resuming: reuse the vocabulary stored with the checkpoint.
        misc = checkpoint_init['misc']
        params['nClasses'] = checkpoint_init['params']['nClasses']
        if 'ixtoclsinfo' in misc:
            params['ixtoclsinfo'] = misc['ixtoclsinfo']

    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    print('%d %d' % (params['vocabulary_size'], params['output_size']))

    #------------------------------ Initialize the solver/generator and build forward path #-----------------------#
    # Initialize the optimizer
    solver = Solver(params['solver'])
    # This initializes the model parameters and does matrix initializations
    lstmGenerator = decodeGenerator(params)
    model = lstmGenerator.model_th
    misc['update'] = lstmGenerator.update_list
    misc['regularize'] = lstmGenerator.regularize

    # force overwrite here. The bias to the softmax is initialized to reflect
    # word frequencies. This is a bit of a hack
    if fresh_start:
        model['bd'].set_value(bias_init_vector.astype(config.floatX))
        if params['class_out_factoring'] == 1:
            model['bdCls'].set_value(
                bias_init_inter_class.astype(config.floatX))

    #----------------- If we are using feature encoders -----------------------
    # This mode can now also be used for encoding GT sentences.
    # The encoder dropout switches stay None unless the encoder is built.
    imgenc_use_dropout = None
    auxenc_use_dropout = None

    if params['use_encoder_for'] & 1:
        if params['encode_gt_sentences']:
            # No image encoder in this mode: feed an all-zero image input.
            xI = tensor.zeros((batch_size, params['image_encoding_size']))
            imgFeatEnc_inp = []
        else:
            imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                                  params['word_encoding_size'],
                                                  params,
                                                  mdl_prefix='img_enc_',
                                                  features=dp.features.T)
            mdlLen = len(model.keys())
            model.update(imgFeatEncoder.model_th)
            # Guards against parameter-name clashes between sub-models.
            assert (len(model.keys()) == (mdlLen +
                                          len(imgFeatEncoder.model_th.keys())))
            misc['update'].extend(imgFeatEncoder.update_list)
            misc['regularize'].extend(imgFeatEncoder.regularize)
            (imgenc_use_dropout, imgFeatEnc_inp, xI,
             updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if params['use_encoder_for'] & 2:
        if params['encode_gt_sentences']:
            aux_enc_inp = model['Wemb']
        else:
            aux_enc_inp = dp.aux_inputs.T
        hid_size = params['featenc_hidden_size']
        auxFeatEncoder = RecurrentFeatEncoder(hid_size,
                                              params['image_encoding_size'],
                                              params,
                                              mdl_prefix='aux_enc_',
                                              features=aux_enc_inp)
        mdlLen = len(model.keys())
        model.update(auxFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen +
                                      len(auxFeatEncoder.model_th.keys())))
        misc['update'].extend(auxFeatEncoder.update_list)
        misc['regularize'].extend(auxFeatEncoder.regularize)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux,
         updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)

        if params['encode_gt_sentences']:
            # Reshape it to size (batch_size, n_gt, hidden_size)
            xAux = xAux.reshape(
                (-1, params['n_encgt_sent'], params['featenc_hidden_size']))
            # Convert it to size (batch_size, n_gt*hidden_size)
            xAux = xAux.flatten(2)
    else:
        auxFeatEnc_inp = []
        xAux = None

    #--------------------------------- Initialize the Attention Network #-------------------------------#
    if params['use_attn'] is not None:
        attnModel = AttentionNetwork(params['image_feat_size'],
                                     params['hidden_size'],
                                     params,
                                     mdl_prefix='attn_mlp_')
        mdlLen = len(model.keys())
        model.update(attnModel.model_th)
        assert (len(model.keys()) == (mdlLen + len(attnModel.model_th.keys())))
        misc['update'].extend(attnModel.update_list)
        misc['regularize'].extend(attnModel.regularize)
        attn_nw_func = attnModel.build_model
    else:
        attn_nw_func = None

    #--------------------------------- Build the language model graph #---------------------------------#
    # Define the computational graph relating the input image features and
    # word indices to the log probability cost function.
    (use_dropout, inp_list_gen, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model,
                                              params,
                                              xI,
                                              xAux,
                                              attn_nw=attn_nw_func)

    inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen

    # Every dropout switch that exists, toggled together for train / val.
    dropout_switches = [use_dropout]
    if imgenc_use_dropout is not None:
        dropout_switches.append(imgenc_use_dropout)
    if auxenc_use_dropout is not None:
        dropout_switches.append(auxenc_use_dropout)

    #--------------------------------- Cost function and gradient computations setup #---------------------------------#
    costGrad = cost[0]
    # Add the regularization cost. Since this is specific to training and
    # doesn't get included when we evaluate the cost on test or validation
    # data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # Scale once, after the sum, so every parameter gets the same weight.
        reg_cost *= 0.5 * reg_c
        costGrad += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and the update mechanism
    grads = tensor.grad(costGrad, wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    #-------------------------------- Intialize the prediction path if needed by evaluator ----------------------------#
    evalKwargs = {
        'eval_metric': params['eval_metric'],
        'f_gen': lstmGenerator.predict,
        'beamsize': params['eval_beamsize']
    }
    if params['eval_metric'] != 'perplex':
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric'])
        evalKwargs['refToks'] = refToks
        evalKwargs['scr_info'] = scr_info
        # Any metric other than perplexity is treated as higher-is-better.
        valMetOp = operator.gt
    else:
        valMetOp = operator.lt

    if params['met_to_track'] != []:
        trackMetargs = {
            'eval_metric': params['met_to_track'],
            'f_gen': lstmGenerator.predict,
            'beamsize': params['eval_beamsize']
        }
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
        trackMetargs['refToks'] = refToks
        trackMetargs['scr_info'] = scr_info

    #--------------------------------- Iterations and Logging intializations ------------------------------------------#
    # One epoch is one pass over all the sentences (not images); hence in
    # case of coco/flickr this will be 5 * no of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_sc = -1  # negative means "no checkpoint score recorded yet"
    # initially size of dictionary of confusion
    smooth_train_ppl2 = len(misc['ixtoword'])
    val_sc = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['history'] = []
    len_hist = defaultdict(int)

    # Track the perplexity of train and val against iterations.
    train_perplex = []
    val_perplex = []
    trackSc_array = []

    #-------------------------------------- Load previously saved model ------------------------------------------------#
    #- Initialize the model parameters from the checkpoint file if we are resuming training
    if not fresh_start:
        zipp(model_init_from, model)
        if params['restore_grads'] == 1:
            zipp(rg_init, rg)
        # Copy trackers from previous checkpoint
        if 'trackers' in checkpoint_init:
            train_perplex = checkpoint_init['trackers']['train_perplex']
            val_perplex = checkpoint_init['trackers']['val_perplex']
            trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print(
            """\nContinuing training from previous model\n. Already run for %0.2f epochs with
            validation perplx at %0.3f\n""" %
            (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    #--------------------------------------  MAIN LOOP ----------------------------------------------------------------#
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        train_dropout = float(params['use_dropout'])
        for switch in dropout_switches:
            switch.set_value(train_dropout)

        epoch = it * 1.0 / num_iters_one_epoch
        #-------------------------------------- Prepare batch-------------------------------------------#
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            maxlen=params['maxlen'],
            use_shared_mem=params['use_shared_mem_enc'],
            enc_gt_sent=params['encode_gt_sentences'],
            n_enc_sent=params['n_encgt_sent'],
            wordtoix=misc['wordtoix'])

        # With POS tags enabled two extra positional arguments are passed.
        data_args = [batch, misc['wordtoix'], params['maxlen']]
        if params['use_pos_tag'] != 'None':
            data_args.extend([sentTagMap, misc['ixtoword']])
        gen_inp_list, lenS = prepare_data(
            *data_args,
            rev_sents=params['reverse_sentence'],
            use_enc_for=params['use_encoder_for'],
            use_unk_token=params['use_unk_token'])

        if params['sched_sampling_mode'] is not None:
            gen_inp_list.append(epoch)

        real_inp_list = enc_inp_list + gen_inp_list

        #---------------------------------- Compute cost and apply gradients ---------------------------#
        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # training statistics
        train_ppl2 = (2**(cost[1] / lenS))
        # smooth exponentially decaying moving average
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out

        total_cost = cost[0]
        if it == 0:
            smooth_cost = total_cost  # start out where we start out
        smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost

        #---------------------------------- Write a report into a json file ---------------------------#
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # at most once a minute
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' \
                    % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_sc'] = val_sc  # just write the last available one
            jstatus['val_metric'] = params['eval_metric']
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            # Best effort: a failed status write must not stop training.
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        #--------------------------------- VALIDATION ---------------------------#
        #- evaluate on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            for switch in dropout_switches:
                switch.set_value(0.)

            # perform the evaluation on VAL set
            val_sc = eval_split_theano('val', dp, model, params, misc, f_eval,
                                       **evalKwargs)
            val_sc = val_sc[0]
            val_perplex.append((it, val_sc))
            train_perplex.append((it, smooth_train_ppl2))

            if params['met_to_track'] != []:
                track_sc = eval_split_theano('val', dp, model, params, misc,
                                             f_eval, **trackMetargs)
                trackSc_array.append((it, {
                    evm: track_sc[i]
                    for i, evm in enumerate(params['met_to_track'])
                }))

            # Once the start epoch is reached, decay the learning rate and
            # push the threshold one epoch further.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation %s = %f, lr = %f' % (
                params['eval_metric'], val_sc, params['learning_rate']))

            #----------------------------- SAVE THE MODEL -------------------#
            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if valMetOp(val_sc, top_val_sc) or top_val_sc < 0:
                if valMetOp(val_sc, write_checkpoint_ppl_threshold
                            ) or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_sc = val_sc
                    filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % (
                        params['dataset'], host, params['fappend'],
                        params['eval_metric'][:3], val_sc)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    checkpoint = {
                        'it': it,
                        'epoch': epoch,
                        'model': unzip(model),
                        'rgrads': unzip(rg),
                        'params': params,
                        # key name kept for compatibility even though the
                        # score may be another metric
                        'perplexity': val_sc,
                        'misc': misc,
                        'trackers': {
                            'train_perplex': train_perplex,
                            'val_perplex': val_perplex,
                            'trackScores': trackSc_array
                        },
                    }
                    # Best effort: report a failed write and keep training.
                    try:
                        with open(filepath, "wb") as ckpt_fh:
                            pickle.dump(checkpoint, ckpt_fh)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # todo be more clever here
                        print('tried to write checkpoint into %s but got error: ' % (
                            filepath, ))
                        print(e)