def dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2):
    """Pickle the generator and evaluator models into a single checkpoint file.

    The checkpoint is written to
    ``params['checkpoint_output_directory']/filename``.  Saving is
    best-effort: any exception raised while opening the file or pickling is
    printed and swallowed, so a failed save does not abort the caller.

    Args:
        filename: Name of the checkpoint file inside the output directory.
        params: Run options; must contain ``'checkpoint_output_directory'``.
            Stored verbatim under the ``'params'`` key.
        modelGen: Generator model; passed through ``unzip`` and stored under
            ``'modelGen'``.
        modelEval: Evaluator model; passed through ``unzip`` and stored under
            ``'modelEval'``.
        misc: Stored verbatim under the ``'misc'`` key.
        it: Stored under the ``'epoch'`` key.
        val_ppl2: Stored under the ``'perplexity'`` key.
    """
    filepath = os.path.join(params['checkpoint_output_directory'], filename)
    checkpoint = {
        'epoch': it,
        'modelGen': unzip(modelGen),
        'modelEval': unzip(modelEval),
        'params': params,
        'perplexity': val_ppl2,
        'misc': misc,
    }
    try:
        # The context manager closes (and therefore flushes) the file even if
        # pickling fails part-way; a bare open() would leak the handle.
        with open(filepath, "wb") as checkpoint_file:
            pickle.dump(checkpoint, checkpoint_file)
        print('saved checkpoint in %s' % (filepath, ))
    except Exception as e:  # best-effort: report the failure and carry on
        print('tried to write checkpoint into %s but got error: ' % (
            filepath, ))
        print(e)
def dumpCheckpoint(filename, params, modelGen, modelEval, misc, it, val_ppl2):
  """Pickle the generator and evaluator models into a single checkpoint file.

  The checkpoint is written to
  ``params['checkpoint_output_directory']/filename``.  Saving is best-effort:
  any exception raised while opening the file or pickling is printed and
  swallowed, so a failed save does not abort the caller.

  Args:
    filename: Name of the checkpoint file inside the output directory.
    params: Run options; must contain ``'checkpoint_output_directory'``.
      Stored verbatim under the ``'params'`` key.
    modelGen: Generator model; passed through ``unzip`` and stored under
      ``'modelGen'``.
    modelEval: Evaluator model; passed through ``unzip`` and stored under
      ``'modelEval'``.
    misc: Mapping that must contain ``'wordtoix'`` and ``'ixtoword'``; only
      those two entries are copied into the checkpoint.
    it: Stored under the ``'epoch'`` key.
    val_ppl2: Stored under the ``'perplexity'`` key.

  Raises:
    KeyError: If ``misc`` lacks ``'wordtoix'`` or ``'ixtoword'`` (raised
      before the best-effort ``try`` block is entered).
  """
  filepath = os.path.join(params['checkpoint_output_directory'], filename)
  checkpoint = {
      'epoch': it,
      'modelGen': unzip(modelGen),
      'modelEval': unzip(modelEval),
      'params': params,
      'perplexity': val_ppl2,
      'wordtoix': misc['wordtoix'],
      'ixtoword': misc['ixtoword'],
  }
  try:
    # The context manager closes (and therefore flushes) the file even if
    # pickling fails part-way; a bare open() would leak the handle.
    with open(filepath, "wb") as checkpoint_file:
      pickle.dump(checkpoint, checkpoint_file)
    print('saved checkpoint in %s' % (filepath, ))
  except Exception as e:  # best-effort: report the failure and carry on
    print('tried to write checkpoint into %s but got error: ' % (filepath, ))
    print(e)
def main(params):
  """Train the evaluator model described by ``params`` and checkpoint it.

  Steps, in order:
    1. Build the vocabulary from the training sentences.
    2. Create the model via ``decodeEvaluator`` and compile the Theano
       evaluation, gradient and update functions via ``Solver``.
    3. Optionally initialise parameters from a checkpoint or from a pickled
       generative model (``params['init_from_imagernn']``).
    4. Run the optimisation loop.  At most once every 60 seconds a JSON status
       report is written; every ``eval_period`` epochs (and on the last
       iteration) the validation split is scored and a checkpoint is pickled.

  Args:
    params: dict of run options.  It is read throughout and mutated in place:
      ``image_feat_size``, ``use_dropout``, ``mode``, ``vocabulary_size``,
      ``learning_rate``, ``lr_decay_st_epoch`` and (when fine-tuning from a
      checkpoint) ``batch_size`` are written.

  Note:
    ``checkpoint_init``, ``model_init_from`` and ``rg_init`` are not defined
    in this function; they are presumably module-level globals filled in when
    a checkpoint is loaded -- TODO confirm.
  """
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # embedded in status and checkpoint file names

  # fetch the data provider
  dp = getDataProvider(params)
  # Initialize the optimizer
  solver = Solver(params['solver'])

  params['image_feat_size'] = dp.img_feat_size

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that
  # occur at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)
  params['use_dropout'] = 1

  if params['fine_tune'] == 1:
    # NOTE(review): both arms of this conditional yield 'multimodal_lstm', so
    # the value of params['multimodal_lstm'] has no effect -- confirm intent.
    params['mode'] = 'multimodal_lstm' if params['multimodal_lstm'] == 0 else 'multimodal_lstm'
    if params['checkpoint_file_name'] is not None:
      params['batch_size'] = dp.dataset['batchsize']
      # Reuse the vocabulary stored alongside the checkpoint.
      misc['wordtoix'] = checkpoint_init['wordtoix']
      misc['ixtoword'] = checkpoint_init['ixtoword']
    batch_size = 1
    num_sentences_total = dp.getSplitSize('train', ofwhat='images')
  else:
    params['mode'] = 'batchtrain'
    batch_size = params['batch_size']
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

  params['vocabulary_size'] = len(misc['wordtoix'])
  pos_samp = np.arange(batch_size, dtype=np.int32)

  # This initializes the model parameters and does matrix initializations
  evalModel = decodeEvaluator(params)
  model, misc['update'], misc['regularize'] = (
      evalModel.model_th, evalModel.updateP, evalModel.regularize)

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout, inp_list,
   miscOuts, cost, predTh, model) = evalModel.build_model(model, params)

  # Add the regularization cost. Since this is specific to training and doesn't get included when
  # we evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
    reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
    reg_cost = 0.
    for p in misc['regularize']:
      reg_cost += (model[p] ** 2).sum()
    # Scale once, after summing every parameter.  Scaling inside the loop
    # would multiply the earlier terms by (0.5 * regc) again on each pass.
    reg_cost *= 0.5 * reg_c
    cost[0] += (reg_cost / params['batch_size'])

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval = theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  # NOTE(review): `cost` is indexed as cost[0] above but passed whole to
  # tensor.grad here -- confirm what evalModel.build_model returns.
  grads = tensor.grad(cost, wrt=model.values())
  lr = tensor.scalar(name='lr', dtype=config.floatX)
  if params['sim_minibatch'] > 0:
    # Gradients are accumulated over several batches before each update.
    f_grad_accum, f_clr, ag = solver.accumGrads(
        model, grads, inp_list, cost, params['sim_minibatch'])
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
        lr, model, ag, inp_list, cost, params)
  else:
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
        lr, model, grads, inp_list, cost, params)

  print('model init done.')
  print('model has keys: ' + ', '.join(model.keys()))

  # calculate how many iterations we need, One epoch is considered once going through all the
  # sentences and not images. Hence in case of coco/flickr this will 5* no of images
  # `//` makes the integer division explicit (same result as `/` on Python 2 ints).
  num_iters_one_epoch = num_sentences_total // batch_size
  max_iters = max_epochs * num_iters_one_epoch
  inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
  max_iters = max_iters // inner_loop
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
  # NOTE(review): top_val_ppl2 is never updated in this function, so the
  # "beat the previous record" test below is always true.
  top_val_ppl2 = -1
  smooth_train_cost = len(misc['ixtoword'])  # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  if params['checkpoint_file_name'] is not None:
    zipp(model_init_from, model)
    zipp(rg_init, rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
        checkpoint_init['epoch'], checkpoint_init['perplexity']))
  elif params['init_from_imagernn'] is not None:
    # Initialize word vecs and image emb from generative model file.
    # pickle.load can execute arbitrary code: only use trusted model files.
    with open(params['init_from_imagernn'], 'rb') as gen_model_file:
      rnnCv = pickle.load(gen_model_file)
    model['Wemb'].set_value(rnnCv['model']['Wemb'])
    model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
    misc['wordtoix'] = rnnCv['wordtoix']
    misc['ixtoword'] = rnnCv['ixtoword']
    print("\n Initialized Word embedding and Image embeddings from gen mode %s" % (params['init_from_imagernn']))

  # Enable using dropout in training
  use_dropout.set_value(1.)
  #################### Main Loop ############################################
  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data
    cost_inner = np.zeros((inner_loop,), dtype=np.float32)
    if params['sim_minibatch'] > 0:
      for i_l in xrange(inner_loop):
        batch, pos_samp_sent = dp.sampPosNegSentSamps(
            params['batch_size'], params['mode'], thresh=0.3)
        real_inp_list, lenS = prepare_data(
            batch, misc['wordtoix'], maxlen=params['maxlen'],
            pos_samp=pos_samp, prep_for=params['eval_model'])
        if params['fine_tune'] == 1:
          real_inp_list.append(pos_samp_sent)
        cost_inner[i_l] = f_grad_accum(*real_inp_list)
    else:
      batch, pos_samp_sent = dp.sampPosNegSentSamps(
          params['batch_size'], params['mode'], thresh=0.3)
      real_inp_list, lenS = prepare_data(
          batch, misc['wordtoix'], maxlen=params['maxlen'],
          pos_samp=pos_samp, prep_for=params['eval_model'])
      if params['fine_tune'] == 1:
        real_inp_list.append(pos_samp_sent)

    # evaluate cost, gradient and perform parameter update; with simulated
    # minibatches this reuses the last batch drawn by the inner loop
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # Reset accumulated gradients to 0
    if params['sim_minibatch'] > 0:
      f_clr()

    # training statistics
    epoch = it * inner_loop * 1.0 / num_iters_one_epoch
    total_cost = (np.e**-cost + (np.e**(-cost_inner)).sum() * (params['sim_minibatch'] > 0)) / (1 + params['sim_minibatch'])
    if it == 0:
      smooth_train_cost = total_cost  # start out where we start out
    else:
      smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost

    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
      print('%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.3f' % (
          it, max_iters, dt, epoch, smooth_train_cost))
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['val_ppl2'] = val_ppl2  # just write the last available one
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        # `with` guarantees the report is flushed and the handle released.
        with open(status_file, 'w') as status_fh:
          json.dump(json_worker_status, status_fh)
      except Exception as e:  # best-effort: a failed report must not stop training
        print('tried to write worker status into %s but got error:' % (status_file, ))
        print(e)

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # perform the evaluation on VAL set
      # Decay the learning rate once per epoch after lr_decay_st_epoch.
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1

      print('validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']))
      if params['sample_by_len'] == 1:
        print(len_hist)

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
              params['eval_model'], params['dataset'], host, params['fappend'],
              val_ppl2, smooth_train_cost)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = unzip(model)
          checkpoint['rgrads'] = unzip(rg)
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          try:
            with open(filepath, "wb") as checkpoint_file:
              pickle.dump(checkpoint, checkpoint_file)
            print('saved checkpoint in %s' % (filepath, ))
          except Exception as e:  # best-effort: report and keep training
            print('tried to write checkpoint into %s but got error: ' % (filepath, ))
            print(e)

      # Re-enable dropout for the following training iterations
      use_dropout.set_value(1.)
def main(params):
    """Train the ``LSTMGenerator`` model described by ``params``.

    Steps, in order:
      1. Build the vocabulary from the training sentences.
      2. Create the model via ``LSTMGenerator`` and compile the Theano
         evaluation, gradient and rmsprop update functions.
      3. Optionally restore parameters when resuming from a checkpoint.
      4. Run the optimisation loop.  At most once every 60 seconds a JSON
         status report is written; every ``eval_period`` epochs (and on the
         last iteration) the validation split is scored, and a checkpoint is
         pickled whenever the validation perplexity improves.

    Args:
        params: dict of run options.  It is read throughout and mutated in
            place: ``aux_inp_size``, ``image_feat_size``, ``vocabulary_size``,
            ``output_size``, ``use_dropout``, ``learning_rate`` and
            ``lr_decay_st_epoch`` are written.

    Note:
        ``checkpoint_init``, ``model_init_from``, ``rg_init`` and
        ``sentTagMap`` are not defined in this function; they are presumably
        module-level globals -- TODO confirm.
    """
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # embedded in status and checkpoint file names

    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = dp.aux_inp_size
    params['image_feat_size'] = dp.img_feat_size

    print('Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size']))

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that
    # occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)
    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    params['use_dropout'] = 1

    # This initializes the model parameters and does matrix initializations
    lstmGenerator = LSTMGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update,
                                                 lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    # This is a bit of a hack, not happy about it
    model['bd'].set_value(bias_init_vector.astype(config.floatX))

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost function.
    (use_dropout, inp_list, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model, params)

    # Add the regularization cost. Since this is specific to training and doesn't get included
    # when we evaluate the cost on test or validation data, we leave it here outside the model
    # definition
    if params['regc'] > 0.:
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
        # Scale once, after summing every parameter.  Scaling inside the loop
        # would multiply the earlier terms by (0.5 * regc) again on each pass.
        reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(
        lr, model, grads, inp_list, cost, params)

    print('model init done.')
    print('model has keys: ' + ', '.join(model.keys()))

    # calculate how many iterations we need, One epoch is considered once going through all the
    # sentences and not images. Hence in case of coco/flickr this will 5* no of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    # `//` makes the integer division explicit (same result as `/` on Python 2 ints).
    num_iters_one_epoch = num_sentences_total // batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_ppl2 = -1  # negative means "no checkpoint written yet"
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)  # how often each sentence length was sampled

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    # The option is compared against the *string* 'None' here.
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
            checkpoint_init['epoch'], checkpoint_init['perplexity']))

    for it in xrange(max_iters):
        t0 = time.time()
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        if params['use_pos_tag'] != 'None':
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'], None,
                                               sentTagMap, misc['ixtoword'])
        else:
            real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

        # Enable using dropout in training (validation below switches it off)
        use_dropout.set_value(1.)

        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # training statistics
        train_ppl2 = (2**(cost[1] / lenS))
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        else:
            # smooth exponentially decaying moving average
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        epoch = it * 1.0 / num_iters_one_epoch
        total_cost = cost[0]

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
                it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            try:
                # `with` guarantees the report is flushed and the handle released.
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # best-effort: a failed report must not stop training
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)

            val_ppl2 = eval_split_theano(
                'val', dp, model, params, misc,
                f_eval)  # perform the evaluation on VAL set

            # Decay the learning rate once per epoch after lr_decay_st_epoch.
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print('validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate']))
            if params['sample_by_len'] == 1:
                print(len_hist)

            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                        params['dataset'], host, params['fappend'], val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = unzip(model)
                    checkpoint['rgrads'] = unzip(rg)
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        with open(filepath, "wb") as checkpoint_file:
                            pickle.dump(checkpoint, checkpoint_file)
                        print('saved checkpoint in %s' % (filepath, ))
                    except Exception as e:  # best-effort: report and keep training
                        print('tried to write checkpoint into %s but got error: ' % (
                            filepath, ))
                        print(e)
def main(params):
  """Train the ``LSTMGenerator`` model described by ``params``.

  Steps, in order:
    1. Build the vocabulary from the training sentences.
    2. Create the model via ``LSTMGenerator`` and compile the Theano
       evaluation, gradient and rmsprop update functions.
    3. Optionally restore parameters when resuming from a checkpoint.
    4. Run the optimisation loop.  At most once every 60 seconds a JSON status
       report is written; every ``eval_period`` epochs (and on the last
       iteration) the validation split is scored, and a checkpoint is pickled
       whenever the validation perplexity improves.

  Args:
    params: dict of run options.  It is read throughout and mutated in place:
      ``aux_inp_size``, ``image_feat_size``, ``vocabulary_size``,
      ``output_size``, ``use_dropout``, ``learning_rate`` and
      ``lr_decay_st_epoch`` are written.

  Note:
    ``checkpoint_init``, ``model_init_from``, ``rg_init`` and ``sentTagMap``
    are not defined in this function; they are presumably module-level
    globals -- TODO confirm.
  """
  batch_size = params['batch_size']
  word_count_threshold = params['word_count_threshold']
  max_epochs = params['max_epochs']
  host = socket.gethostname()  # embedded in status and checkpoint file names

  # fetch the data provider
  dp = getDataProvider(params)

  params['aux_inp_size'] = dp.aux_inp_size
  params['image_feat_size'] = dp.img_feat_size

  print('Image feature size is %d, and aux input size is %d' % (
      params['image_feat_size'], params['aux_inp_size']))

  misc = {}  # stores various misc items that need to be passed around the framework

  # go over all training sentences and find the vocabulary we want to use, i.e. the words that
  # occur at least word_count_threshold number of times
  misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
      dp.iterSentences('train'), word_count_threshold)
  params['vocabulary_size'] = len(misc['wordtoix'])
  params['output_size'] = len(misc['ixtoword'])  # these should match though
  params['use_dropout'] = 1

  # This initializes the model parameters and does matrix initializations
  lstmGenerator = LSTMGenerator(params)
  model, misc['update'], misc['regularize'] = (
      lstmGenerator.model_th, lstmGenerator.update, lstmGenerator.regularize)

  # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
  # This is a bit of a hack, not happy about it
  model['bd'].set_value(bias_init_vector.astype(config.floatX))

  # Define the computational graph for relating the input image features and word indices to the
  # log probability cost function.
  (use_dropout, inp_list,
   f_pred_prob, cost, predTh, updatesLSTM) = lstmGenerator.build_model(model, params)

  # Add the regularization cost. Since this is specific to training and doesn't get included when
  # we evaluate the cost on test or validation data, we leave it here outside the model definition
  if params['regc'] > 0.:
    reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']), name='reg_c')
    reg_cost = 0.
    for p in misc['regularize']:
      reg_cost += (model[p] ** 2).sum()
    # Scale once, after summing every parameter.  Scaling inside the loop
    # would multiply the earlier terms by (0.5 * regc) again on each pass.
    reg_cost *= 0.5 * reg_c
    cost[0] += (reg_cost / params['batch_size'])

  # Compile an evaluation function.. Doesn't include gradients
  # To be used for validation set evaluation
  f_eval = theano.function(inp_list, cost, name='f_eval')

  # Now let's build a gradient computation graph and rmsprop update mechanism
  grads = tensor.grad(cost[0], wrt=model.values())
  lr = tensor.scalar(name='lr', dtype=config.floatX)
  f_grad_shared, f_update, zg, rg, ud = lstmGenerator.rmsprop(
      lr, model, grads, inp_list, cost, params)

  print('model init done.')
  print('model has keys: ' + ', '.join(model.keys()))

  # calculate how many iterations we need, One epoch is considered once going through all the
  # sentences and not images. Hence in case of coco/flickr this will 5* no of images
  num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
  # `//` makes the integer division explicit (same result as `/` on Python 2 ints).
  num_iters_one_epoch = num_sentences_total // batch_size
  max_iters = max_epochs * num_iters_one_epoch
  eval_period_in_epochs = params['eval_period']
  eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
  top_val_ppl2 = -1  # negative means "no checkpoint written yet"
  smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
  val_ppl2 = len(misc['ixtoword'])
  last_status_write_time = 0  # for writing worker job status reports
  json_worker_status = {}
  json_worker_status['params'] = params
  json_worker_status['history'] = []

  len_hist = defaultdict(int)  # how often each sentence length was sampled

  ## Initialize the model parameters from the checkpoint file if we are resuming training
  # The option is compared against the *string* 'None' here.
  if params['checkpoint_file_name'] != 'None':
    zipp(model_init_from, model)
    zipp(rg_init, rg)
    print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (
        checkpoint_init['epoch'], checkpoint_init['perplexity']))

  for it in xrange(max_iters):
    t0 = time.time()
    # fetch a batch of data
    if params['sample_by_len'] == 0:
      batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]
    else:
      batch, l = dp.getRandBatchByLen(batch_size)
      len_hist[l] += 1

    if params['use_pos_tag'] != 'None':
      real_inp_list, lenS = prepare_data(
          batch, misc['wordtoix'], None, sentTagMap, misc['ixtoword'])
    else:
      real_inp_list, lenS = prepare_data(batch, misc['wordtoix'])

    # Enable using dropout in training (validation below switches it off)
    use_dropout.set_value(1.)

    # evaluate cost, gradient and perform parameter update
    cost = f_grad_shared(*real_inp_list)
    f_update(params['learning_rate'])
    dt = time.time() - t0

    # training statistics
    train_ppl2 = (2**(cost[1] / lenS))
    if it == 0:
      smooth_train_ppl2 = train_ppl2  # start out where we start out
    else:
      # smooth exponentially decaying moving average
      smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
    epoch = it * 1.0 / num_iters_one_epoch
    total_cost = cost[0]

    tnow = time.time()
    if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
      print('%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' % (
          it, max_iters, dt, epoch, total_cost, smooth_train_ppl2))
      last_status_write_time = tnow
      jstatus = {}
      jstatus['time'] = datetime.datetime.now().isoformat()
      jstatus['iter'] = (it, max_iters)
      jstatus['epoch'] = (epoch, max_epochs)
      jstatus['time_per_batch'] = dt
      jstatus['smooth_train_ppl2'] = smooth_train_ppl2
      jstatus['val_ppl2'] = val_ppl2  # just write the last available one
      jstatus['train_ppl2'] = train_ppl2
      json_worker_status['history'].append(jstatus)
      status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
      try:
        # `with` guarantees the report is flushed and the handle released.
        with open(status_file, 'w') as status_fh:
          json.dump(json_worker_status, status_fh)
      except Exception as e:  # best-effort: a failed report must not stop training
        print('tried to write worker status into %s but got error:' % (status_file, ))
        print(e)

    ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
    is_last_iter = (it + 1) == max_iters
    if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
      # Disable using dropout in validation
      use_dropout.set_value(0.)

      val_ppl2 = eval_split_theano('val', dp, model, params, misc, f_eval)  # perform the evaluation on VAL set

      # Decay the learning rate once per epoch after lr_decay_st_epoch.
      if epoch - params['lr_decay_st_epoch'] >= 0:
        params['learning_rate'] = params['learning_rate'] * params['lr_decay']
        params['lr_decay_st_epoch'] += 1

      print('validation perplexity = %f, lr = %f' % (val_ppl2, params['learning_rate']))
      if params['sample_by_len'] == 1:
        print(len_hist)

      write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
      if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
          # if we beat a previous record or if this is the first time
          # AND we also beat the user-defined threshold or it doesnt exist
          top_val_ppl2 = val_ppl2
          filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
              params['dataset'], host, params['fappend'], val_ppl2)
          filepath = os.path.join(params['checkpoint_output_directory'], filename)
          checkpoint = {}
          checkpoint['it'] = it
          checkpoint['epoch'] = epoch
          checkpoint['model'] = unzip(model)
          checkpoint['rgrads'] = unzip(rg)
          checkpoint['params'] = params
          checkpoint['perplexity'] = val_ppl2
          checkpoint['wordtoix'] = misc['wordtoix']
          checkpoint['ixtoword'] = misc['ixtoword']
          try:
            with open(filepath, "wb") as checkpoint_file:
              pickle.dump(checkpoint, checkpoint_file)
            print('saved checkpoint in %s' % (filepath, ))
          except Exception as e:  # best-effort: report and keep training
            print('tried to write checkpoint into %s but got error: ' % (filepath, ))
            print(e)
# Example #6
# 0
def main(params):
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(params)
    # Initialize the optimizer
    solver = Solver(params['solver'])

    params['image_feat_size'] = dp.img_feat_size
    params['aux_inp_size'] = dp.aux_inp_size

    misc = {
    }  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    misc['wordtoix'], misc[
        'ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)

    if params['fine_tune'] == 1:
        params['mode'] = 'multi_choice_mode' if params[
            'mc_mode'] == 1 else 'multimodal_lstm'
        if params['checkpoint_file_name'] != None:
            #params['batch_size'] = dp.dataset['batchsize']
            misc['wordtoix'] = checkpoint_init['wordtoix']
            misc['ixtoword'] = checkpoint_init['ixtoword']
        batch_size = 1
        num_sentences_total = dp.getSplitSize('train', ofwhat='images')
    else:
        params['mode'] = 'batchtrain'
        batch_size = params['batch_size']
        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')

    params['vocabulary_size'] = len(misc['wordtoix'])
    pos_samp = np.arange(batch_size, dtype=np.int32)

    # This initializes the model parameters and does matrix initializations
    evalModel = decodeEvaluator(params)
    model, misc['update'], misc['regularize'] = (evalModel.model_th,
                                                 evalModel.updateP,
                                                 evalModel.regularize)

    #----------------- If we are using feature encoders -----------------------
    if params['use_encoder_for'] & 1:
        imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                              params['sent_encoding_size'],
                                              params,
                                              mdl_prefix='img_enc_',
                                              features=dp.features.T)
        mdlLen = len(model.keys())
        model.update(imgFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen +
                                      len(imgFeatEncoder.model_th.keys())))
        #misc['update'].extend(imgFeatEncoder.update_list)
        misc['regularize'].extend(imgFeatEncoder.regularize)
        (imgenc_use_dropout, imgFeatEnc_inp, xI,
         updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost funtion.
    (use_dropout, inp_list_eval, miscOuts, cost, predTh,
     model) = evalModel.build_model(model,
                                    params,
                                    xI=xI,
                                    prior_inp_list=imgFeatEnc_inp)

    inp_list = imgFeatEnc_inp + inp_list_eval

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Add the regularization cost. Since this is specific to trainig and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
            reg_cost *= 0.5 * reg_c
        cost[0] += (reg_cost / params['batch_size'])

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(cost[0], wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    if params['sim_minibatch'] > 0:
        f_grad_accum, f_clr, ag = solver.accumGrads(model, grads, inp_list,
                                                    cost,
                                                    params['sim_minibatch'])
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, ag, inp_list, cost, params)
    else:
        f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
            lr, model, grads, inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())

    # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
    # Hence in case of coco/flickr this will 5* no of images
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    inner_loop = params['sim_minibatch'] if params['sim_minibatch'] > 0 else 1
    max_iters = max_iters / inner_loop
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs / inner_loop))
    top_val_ppl2 = -1
    smooth_train_cost = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    smooth_error_rate = 100.
    error_rate = 0.
    prev_it = -1
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []

    len_hist = defaultdict(int)

    ## Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != None:
        zipp(model_init_from, model)
        zipp(rg_init, rg)
        print("\nContinuing training from previous model\n. Already run for %0.2f epochs with validation perplx at %0.3f\n" % (checkpoint_init['epoch'], \
          checkpoint_init['perplexity']))
    elif params['init_from_imagernn'] != None:
        # Initialize word vecs and image emb from generative model file
        rnnCv = pickle.load(open(params['init_from_imagernn'], 'rb'))
        model['Wemb'].set_value(rnnCv['model']['Wemb'])
        model['WIemb'].set_value(rnnCv['model']['WIemb_aux'])
        misc['wordtoix'] = rnnCv['wordtoix']
        misc['ixtoword'] = rnnCv['ixtoword']
        print(
            "\n Initialized Word embedding and Image embeddings from gen mode %s"
            % (params['init_from_imagernn']))

    write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']

    use_dropout.set_value(1.)
    #################### Main Loop ############################################
    for it in xrange(max_iters):
        t0 = time.time()

        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))

        # fetch a batch of data
        cost_inner = np.zeros((inner_loop, ), dtype=np.float32)
        if params['sim_minibatch'] > 0:
            for i_l in xrange(inner_loop):
                batch, pos_samp_sent = dp.sampPosNegSentSamps(
                    params['batch_size'], params['mode'], thresh=0.3)
                eval_inp_list, lenS = prepare_data(
                    batch,
                    misc['wordtoix'],
                    maxlen=params['maxlen'],
                    pos_samp=pos_samp,
                    prep_for=params['eval_model'],
                    use_enc_for=params['use_encoder_for'])
                if params['fine_tune'] == 1:
                    eval_inp_list.append(pos_samp_sent)
                cost_inner[i_l] = f_grad_accum(*eval_inp_list)
        else:
            batch, pos_samp_sent = dp.sampPosNegSentSamps(params['batch_size'],
                                                          params['mode'],
                                                          thresh=0.3)
            enc_inp_list = prepare_seq_features(
                batch,
                use_enc_for=params['use_encoder_for'],
                use_shared_mem=params['use_shared_mem_enc'])
            eval_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                maxlen=params['maxlen'],
                pos_samp=pos_samp,
                prep_for=params['eval_model'],
                use_enc_for=params['use_encoder_for'])
            if params['fine_tune'] == 1:
                eval_inp_list.append(pos_samp_sent)

        real_inp_list = enc_inp_list + eval_inp_list

        # Enable using dropout in training
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # Reset accumulated gradients to 0
        if params['sim_minibatch'] > 0:
            f_clr()
        #print 'model: ' + ' '.join([str(np.isnan(model[m].get_value()).any()) for m in model])
        #print 'rg: ' +' '.join([str(np.isnan(rg[i].get_value()).any()) for i in xrange(len(rg))])
        #print 'zg: ' + ' '.join([str(np.isnan(zg[i].get_value()).any()) for i in xrange(len(zg))])
        #print 'ud: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])
        #import pdb; pdb.set_trace()
        #print 'udAft: ' + ' '.join([str(np.isnan(ud[i].get_value()).any()) for i in xrange(len(ud))])

        # print training statistics
        epoch = it * inner_loop * 1.0 / num_iters_one_epoch
        total_cost = (np.e**(-cost[0]) + (np.e**(-cost_inner)).sum() *
                      (params['sim_minibatch'] > 0)) / (
                          1 + params['sim_minibatch'])
        #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
        #         train_ppl2, smooth_train_cost)
        if it == 0: smooth_train_cost = total_cost
        else: smooth_train_cost = 0.99 * smooth_train_cost + 0.01 * total_cost
        error_rate += 100.0 * float((cost[2] < 0.).sum()) / batch_size

        margin_strength = cost[2].sum()
        smooth_error_rate = 0.99 * smooth_error_rate + 0.01 * 100.0 * (
            float(cost[1]) / batch_size) if it > 0 else 100.0 * (
                float(cost[1]) / batch_size)

        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Prob now is %.4f, Error '\
                    'rate is %.3f%%, Margin %.2f, negMarg=%.2f' % (it, max_iters, dt, \
                    epoch, smooth_train_cost, smooth_error_rate,
                    margin_strength, error_rate/(it-prev_it))
            error_rate = 0.
            prev_it = it
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['val_ppl2'] = val_ppl2  # just write the last available one
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            #import pdb; pdb.set_trace()
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (
                    status_file, )
                print e

        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)

            val_ppl2 = eval_split_theano(
                'val', dp, model, params, misc,
                f_eval)  # perform the evaluation on VAL set
            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print 'validation perplexity = %f, lr = %f' % (
                val_ppl2, params['learning_rate'])
            #if params['sample_by_len'] == 1:
            #  print len_hist

            if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
                if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
                    # if we beat a previous record or if this is the first time
                    # AND we also beat the user-defined threshold or it doesnt exist
                    top_val_ppl2 = val_ppl2
                    filename = '%s_checkpoint_%s_%s_%s_%.2f_%.2f.p' % (
                        params['eval_model'], params['dataset'], host,
                        params['fappend'], smooth_error_rate, val_ppl2)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_ppl2
                    checkpoint['wordtoix'] = misc['wordtoix']
                    checkpoint['ixtoword'] = misc['ixtoword']
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (
                            filepath, )
                        print e

            use_dropout.set_value(1.)
def main(params):
    batch_size = params['batch_size']
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    #--------------------------------- Init data provider and load data+features #---------------------------------#
    # fetch the data provider
    dp = getDataProvider(params)

    params['aux_inp_size'] = params['featenc_hidden_size'] * params[
        'n_encgt_sent'] if params['encode_gt_sentences'] else dp.aux_inp_size
    params['featenc_hidden_size'] = params['featenc_hidden_size'] if params[
        'encode_gt_sentences'] else params['aux_inp_size']

    params['image_feat_size'] = dp.img_feat_size
    print 'Image feature size is %d, and aux input size is %d' % (
        params['image_feat_size'], params['aux_inp_size'])

    #--------------------------------- Preprocess sentences and build Vocabulary #---------------------------------#
    misc = {
    }  # stores various misc items that need to be passed around the framework
    # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur
    # at least word_count_threshold number of times
    if params['checkpoint_file_name'] == 'None':
        if params['class_out_factoring'] == 0:
            misc['wordtoix'], misc[
                'ixtoword'], bias_init_vector = preProBuildWordVocab(
                    dp.iterSentences('train'), word_count_threshold)
        else:
            [misc['wordtoix'], misc['classes']
             ], [misc['ixtoword'], misc['clstotree'], misc['ixtoclsinfo']
                 ], [bias_init_vector, bias_init_inter_class
                     ] = preProBuildWordVocab(dp.iterSentences('train'),
                                              word_count_threshold, params)
            params['nClasses'] = bias_init_inter_class.shape[0]
            params['ixtoclsinfo'] = misc['ixtoclsinfo']
    else:
        misc = checkpoint_init['misc']
        params['nClasses'] = checkpoint_init['params']['nClasses']
        if 'ixtoclsinfo' in misc:
            params['ixtoclsinfo'] = misc['ixtoclsinfo']

    params['vocabulary_size'] = len(misc['wordtoix'])
    params['output_size'] = len(misc['ixtoword'])  # these should match though
    print len(misc['wordtoix']), len(misc['ixtoword'])

    #------------------------------ Initialize the solver/generator and build forward path #-----------------------#
    # Initialize the optimizer
    solver = Solver(params['solver'])
    # This initializes the model parameters and does matrix initializations
    lstmGenerator = decodeGenerator(params)
    model, misc['update'], misc['regularize'] = (lstmGenerator.model_th,
                                                 lstmGenerator.update_list,
                                                 lstmGenerator.regularize)

    # force overwrite here. The bias to the softmax is initialized to reflect word frequencies
    # This is a bit of a hack
    if params['checkpoint_file_name'] == 'None':
        model['bd'].set_value(bias_init_vector.astype(config.floatX))
        if params['class_out_factoring'] == 1:
            model['bdCls'].set_value(
                bias_init_inter_class.astype(config.floatX))

    #----------------- If we are using feature encoders -----------------------
    # This mode can now also be used for encoding GT sentences.
    if params['use_encoder_for'] & 1:
        if params['encode_gt_sentences']:
            xI = tensor.zeros((batch_size, params['image_encoding_size']))
            imgFeatEnc_inp = []
        else:
            imgFeatEncoder = RecurrentFeatEncoder(params['image_feat_size'],
                                                  params['word_encoding_size'],
                                                  params,
                                                  mdl_prefix='img_enc_',
                                                  features=dp.features.T)
            mdlLen = len(model.keys())
            model.update(imgFeatEncoder.model_th)
            assert (len(model.keys()) == (mdlLen +
                                          len(imgFeatEncoder.model_th.keys())))
            misc['update'].extend(imgFeatEncoder.update_list)
            misc['regularize'].extend(imgFeatEncoder.regularize)
            (imgenc_use_dropout, imgFeatEnc_inp, xI,
             updatesLSTMImgFeat) = imgFeatEncoder.build_model(model, params)
    else:
        xI = None
        imgFeatEnc_inp = []

    if params['use_encoder_for'] & 2:
        aux_enc_inp = model['Wemb'] if params[
            'encode_gt_sentences'] else dp.aux_inputs.T
        hid_size = params['featenc_hidden_size']
        auxFeatEncoder = RecurrentFeatEncoder(hid_size,
                                              params['image_encoding_size'],
                                              params,
                                              mdl_prefix='aux_enc_',
                                              features=aux_enc_inp)
        mdlLen = len(model.keys())
        model.update(auxFeatEncoder.model_th)
        assert (len(model.keys()) == (mdlLen +
                                      len(auxFeatEncoder.model_th.keys())))
        misc['update'].extend(auxFeatEncoder.update_list)
        misc['regularize'].extend(auxFeatEncoder.regularize)
        (auxenc_use_dropout, auxFeatEnc_inp, xAux,
         updatesLSTMAuxFeat) = auxFeatEncoder.build_model(model, params)

        if params['encode_gt_sentences']:
            # Reshape it size(batch_size, n_gt, hidden_size)
            xAux = xAux.reshape(
                (-1, params['n_encgt_sent'], params['featenc_hidden_size']))
            # Convert it to size (batch_size, n_gt*hidden_size
            xAux = xAux.flatten(2)

    else:
        auxFeatEnc_inp = []
        xAux = None

    #--------------------------------- Initialize the Attention Network #-------------------------------#
    if params['use_attn'] != None:
        attnModel = AttentionNetwork(params['image_feat_size'],
                                     params['hidden_size'],
                                     params,
                                     mdl_prefix='attn_mlp_')
        mdlLen = len(model.keys())
        model.update(attnModel.model_th)
        assert (len(model.keys()) == (mdlLen + len(attnModel.model_th.keys())))
        misc['update'].extend(attnModel.update_list)
        misc['regularize'].extend(attnModel.regularize)
        attn_nw_func = attnModel.build_model
    else:
        attn_nw_func = None

    #--------------------------------- Build the language model graph #---------------------------------#
    # Define the computational graph for relating the input image features and word indices to the
    # log probability cost funtion.
    (use_dropout, inp_list_gen, f_pred_prob, cost, predTh,
     updatesLSTM) = lstmGenerator.build_model(model,
                                              params,
                                              xI,
                                              xAux,
                                              attn_nw=attn_nw_func)

    inp_list = imgFeatEnc_inp + auxFeatEnc_inp + inp_list_gen
    #--------------------------------- Cost function and gradient computations setup #---------------------------------#
    costGrad = cost[0]
    # Add class uncertainity to final cost
    #if params['class_out_factoring'] == 1:
    #  costGrad += cost[2]
    # Add the regularization cost. Since this is specific to trainig and doesn't get included when we
    # evaluate the cost on test or validation data, we leave it here outside the model definition
    if params['regc'] > 0.:
        reg_cost = theano.shared(numpy_floatX(0.), name='reg_c')
        reg_c = tensor.as_tensor_variable(numpy_floatX(params['regc']),
                                          name='reg_c')
        reg_cost = 0.
        for p in misc['regularize']:
            reg_cost += (model[p]**2).sum()
            reg_cost *= 0.5 * reg_c
        costGrad += (reg_cost / params['batch_size'])

    # Compile an evaluation function.. Doesn't include gradients
    # To be used for validation set evaluation
    f_eval = theano.function(inp_list, cost, name='f_eval')

    # Now let's build a gradient computation graph and rmsprop update mechanism
    grads = tensor.grad(costGrad, wrt=model.values())
    lr = tensor.scalar(name='lr', dtype=config.floatX)
    f_grad_shared, f_update, zg, rg, ud = solver.build_solver_model(
        lr, model, grads, inp_list, cost, params)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['update'])
    #print 'updating: ' + ', '.join( '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1]) for k in misc['regularize'])
    #print 'number of learnable parameters total: %d' % (sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

    #-------------------------------- Intialize the prediction path if needed by evaluator ----------------------------#
    evalKwargs = {
        'eval_metric': params['eval_metric'],
        'f_gen': lstmGenerator.predict,
        'beamsize': params['eval_beamsize']
    }
    if params['eval_metric'] != 'perplex':
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['eval_metric'])
        evalKwargs['refToks'] = refToks
        evalKwargs['scr_info'] = scr_info
        valMetOp = operator.gt
    else:
        valMetOp = operator.lt

    if params['met_to_track'] != []:
        trackMetargs = {
            'eval_metric': params['met_to_track'],
            'f_gen': lstmGenerator.predict,
            'beamsize': params['eval_beamsize']
        }
        lstmGenerator.prepPredictor(None, params, params['eval_beamsize'])
        refToks, scr_info = eval_prep_refs('val', dp, params['met_to_track'])
        trackMetargs['refToks'] = refToks
        trackMetargs['scr_info'] = scr_info

    #--------------------------------- Iterations and Logging intializations ------------------------------------------#
    # calculate how many iterations we need, One epoch is considered once going through all the sentences and not images
    # Hence in case of coco/flickr this will 5* no of images
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(
        1, int(num_iters_one_epoch * eval_period_in_epochs))
    top_val_sc = -1
    smooth_train_ppl2 = len(
        misc['ixtoword'])  # initially size of dictionary of confusion
    val_sc = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    #json_worker_status['params'] = params
    json_worker_status['history'] = []
    len_hist = defaultdict(int)

    #Initialize Tracking the perplexity of train and val, with iters.
    train_perplex = []
    val_perplex = []
    trackSc_array = []

    #-------------------------------------- Load previously saved model ------------------------------------------------#
    #- Initialize the model parameters from the checkpoint file if we are resuming training
    if params['checkpoint_file_name'] != 'None':
        zipp(model_init_from, model)
        if params['restore_grads'] == 1:
            zipp(rg_init, rg)
        #Copy trackers from previous checkpoint
        if 'trackers' in checkpoint_init:
            train_perplex = checkpoint_init['trackers']['train_perplex']
            val_perplex = checkpoint_init['trackers']['val_perplex']
            trackSc_array = checkpoint_init['trackers'].get('trackScores', [])
        print(
            """\nContinuing training from previous model\n. Already run for %0.2f epochs with
            validation perplx at %0.3f\n""" %
            (checkpoint_init['epoch'], checkpoint_init['perplexity']))

    #--------------------------------------  MAIN LOOP ----------------------------------------------------------------#
    for it in xrange(max_iters):
        t0 = time.time()
        # Enable using dropout in training
        use_dropout.set_value(float(params['use_dropout']))
        if params['use_encoder_for'] & 1:
            imgenc_use_dropout.set_value(float(params['use_dropout']))
        if params['use_encoder_for'] & 2:
            auxenc_use_dropout.set_value(float(params['use_dropout']))

        epoch = it * 1.0 / num_iters_one_epoch
        #-------------------------------------- Prepare batch-------------------------------------------#
        # fetch a batch of data
        if params['sample_by_len'] == 0:
            batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        else:
            batch, l = dp.getRandBatchByLen(batch_size)
            len_hist[l] += 1

        enc_inp_list = prepare_seq_features(
            batch,
            use_enc_for=params['use_encoder_for'],
            maxlen=params['maxlen'],
            use_shared_mem=params['use_shared_mem_enc'],
            enc_gt_sent=params['encode_gt_sentences'],
            n_enc_sent=params['n_encgt_sent'],
            wordtoix=misc['wordtoix'])

        if params['use_pos_tag'] != 'None':
            gen_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                params['maxlen'],
                sentTagMap,
                misc['ixtoword'],
                rev_sents=params['reverse_sentence'],
                use_enc_for=params['use_encoder_for'],
                use_unk_token=params['use_unk_token'])
        else:
            gen_inp_list, lenS = prepare_data(
                batch,
                misc['wordtoix'],
                params['maxlen'],
                rev_sents=params['reverse_sentence'],
                use_enc_for=params['use_encoder_for'],
                use_unk_token=params['use_unk_token'])

        if params['sched_sampling_mode'] != None:
            gen_inp_list.append(epoch)

        real_inp_list = enc_inp_list + gen_inp_list

        #import ipdb; ipdb.set_trace()
        #---------------------------------- Compute cost and apply gradients ---------------------------#
        # evaluate cost, gradient and perform parameter update
        cost = f_grad_shared(*real_inp_list)
        f_update(params['learning_rate'])
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = (2**(cost[1] / lenS))  #step_struct['stats']['ppl2']
        # smooth exponentially decaying moving average
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out

        total_cost = cost[0]
        if it == 0: smooth_cost = total_cost  # start out where we start out
        smooth_cost = 0.99 * smooth_cost + 0.01 * total_cost

        #print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
        #      % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'], \
        #         train_ppl2, smooth_train_ppl2)

        #---------------------------------- Write a report into a json file ---------------------------#
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            print '%d/%d batch done in %.3fs. at epoch %.2f. Cost now is %.3f and pplx is %.3f' \
                    % (it, max_iters, dt, epoch, smooth_cost, smooth_train_ppl2)
            last_status_write_time = tnow
            jstatus = {}
            jstatus['time'] = datetime.datetime.now().isoformat()
            jstatus['iter'] = (it, max_iters)
            jstatus['epoch'] = (epoch, max_epochs)
            jstatus['time_per_batch'] = dt
            jstatus['smooth_train_ppl2'] = smooth_train_ppl2
            jstatus['val_sc'] = val_sc  # just write the last available one
            jstatus['val_metric'] = params[
                'eval_metric']  # just write the last available one
            jstatus['train_ppl2'] = train_ppl2
            #if params['class_out_factoring'] == 1:
            #  jstatus['class_cost'] = float(cost[2])
            json_worker_status['history'].append(jstatus)
            status_file = os.path.join(
                params['worker_status_output_directory'],
                host + '_status.json')
            #import pdb; pdb.set_trace()
            try:
                json.dump(json_worker_status, open(status_file, 'w'))
            except Exception, e:  # todo be more clever here
                print 'tried to write worker status into %s but got error:' % (
                    status_file, )
                print e

        #--------------------------------- VALIDATION ---------------------------#
        #- perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if (((it + 1) % eval_period_in_iters) == 0
                and it < max_iters - 5) or is_last_iter:
            # Disable using dropout in validation
            use_dropout.set_value(0.)
            if params['use_encoder_for'] & 1:
                imgenc_use_dropout.set_value(0.)
            if params['use_encoder_for'] & 2:
                auxenc_use_dropout.set_value(0.)

            # perform the evaluation on VAL set
            val_sc = eval_split_theano('val', dp, model, params, misc, f_eval,
                                       **evalKwargs)
            val_sc = val_sc[0]
            val_perplex.append((it, val_sc))
            train_perplex.append((it, smooth_train_ppl2))

            if params['met_to_track'] != []:
                track_sc = eval_split_theano('val', dp, model, params, misc,
                                             f_eval, **trackMetargs)
                trackSc_array.append((it, {
                    evm: track_sc[i]
                    for i, evm in enumerate(params['met_to_track'])
                }))

            if epoch - params['lr_decay_st_epoch'] >= 0:
                params['learning_rate'] = params['learning_rate'] * params[
                    'lr_decay']
                params['lr_decay_st_epoch'] += 1

            print 'validation %s = %f, lr = %f' % (
                params['eval_metric'], val_sc, params['learning_rate'])
            #if params['sample_by_len'] == 1:
            #  print len_hist

            #----------------------------- SAVE THE MODEL -------------------#
            write_checkpoint_ppl_threshold = params[
                'write_checkpoint_ppl_threshold']
            if valMetOp(val_sc, top_val_sc) or top_val_sc < 0:
                if valMetOp(val_sc, write_checkpoint_ppl_threshold
                            ) or write_checkpoint_ppl_threshold < 0:
                    # if we beat the previous record, or there is no record yet (top_val_sc < 0),
                    # AND we also beat the user-defined threshold, or the threshold is disabled (negative)
                    top_val_sc = val_sc
                    filename = 'model_checkpoint_%s_%s_%s_%s%.2f.p' % (
                        params['dataset'], host, params['fappend'],
                        params['eval_metric'][:3], val_sc)
                    filepath = os.path.join(
                        params['checkpoint_output_directory'], filename)
                    model_npy = unzip(model)
                    rgrads_npy = unzip(rg)
                    checkpoint = {}
                    checkpoint['it'] = it
                    checkpoint['epoch'] = epoch
                    checkpoint['model'] = model_npy
                    checkpoint['rgrads'] = rgrads_npy
                    checkpoint['params'] = params
                    checkpoint['perplexity'] = val_sc
                    checkpoint['misc'] = misc
                    checkpoint['trackers'] = {
                        'train_perplex': train_perplex,
                        'val_perplex': val_perplex,
                        'trackScores': trackSc_array
                    }
                    try:
                        pickle.dump(checkpoint, open(filepath, "wb"))
                        print 'saved checkpoint in %s' % (filepath, )
                    except Exception, e:  # todo be more clever here
                        print 'tried to write checkpoint into %s but got error: ' % (
                            filepath, )
                        print e