def main(params):
    """Build the generator model and run a single training step on CPU.

    Reduced variant of the training driver: it forces ``params['mode']`` to
    ``'CPU'``, builds the vocabulary and the model, performs exactly one
    ``solver.step`` on a freshly sampled batch and prints the training
    statistics. Gradient checking, loss-explosion detection, JSON status
    reports, validation and checkpointing are not performed here.
    ``cuda.close()`` is always called on the way out, even when a step fails.

    Args:
        params: dict of settings. Reads ``batch_size``, ``dataset`` and
            ``word_count_threshold``; ``params['mode']`` is overwritten.
            The dict is also handed to the generator and the cost function
            and expanded as keyword arguments into ``solver.step``.
    """
    batch_size = params['batch_size']
    dataset = params['dataset']
    word_count_threshold = params['word_count_threshold']
    params['mode'] = 'CPU'

    try:
        # fetch the data provider
        dp = getDataProvider(dataset)

        # stores various misc items that need to be passed around the framework
        misc = {}

        # go over all training sentences and find the vocabulary we want to
        # use, i.e. the words that occur at least word_count_threshold times
        misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
            dp.iterSentences('train'), word_count_threshold)

        # delegate the initialization of the model to the Generator class
        BatchGenerator = decodeGenerator(params)
        init_struct = BatchGenerator.init(params, misc)
        model = init_struct['model']
        misc['update'] = init_struct['update']
        misc['regularize'] = init_struct['regularize']

        # force overwrite of the decoder bias with the vocabulary-derived
        # vector. This is a bit of a hack, not happy about it
        bd_init = bias_init_vector.reshape(1, bias_init_vector.size)
        if params['mode'] == 'GPU':
            model['bd'] = gp.garray(bd_init)
        else:
            model['bd'] = bd_init
        print('model init done.')

        def describe(keys):
            # "name [rows x cols]" for every listed model matrix
            return ', '.join(
                '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                for k in keys)

        print('model has keys: ' + ', '.join(model.keys()))
        print('updating: ' + describe(misc['update']))
        # this line used to be labelled 'updating: ' as well
        print('regularizing: ' + describe(misc['regularize']))
        print('number of learnable parameters total: %d' % (
            sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ))

        # initialize the Solver and the cost function
        solver = Solver()

        def costfun(batch, model):
            # wrap the cost function to abstract some things away from the Solver
            return RNNGenCost(batch, model, params, misc)

        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
        num_iters_one_epoch = num_sentences_total / batch_size

        # NOTE: this variant is hard-wired to one iteration; the
        # max_epochs-based schedule is not used.
        max_iters = 1

        # initially size of dictionary of confusion
        smooth_train_ppl2 = len(misc['ixtoword'])

        for it in xrange(max_iters):
            t0 = time.time()

            # fetch a batch of data
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]

            # evaluate cost, gradient and perform parameter update
            step_struct = solver.step(batch, model, costfun, **params)
            cost = step_struct['cost']
            dt = time.time() - t0

            # print training statistics
            train_ppl2 = step_struct['stats']['ppl2']
            if it == 0:
                # start out where we start out
                smooth_train_ppl2 = train_ppl2
            else:
                # smooth exponentially decaying moving average
                smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
            epoch = it * 1.0 / num_iters_one_epoch
            print('%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)'
                  % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],
                     train_ppl2, smooth_train_ppl2))
    finally:
        # release the CUDA context even if setup or the training step raised
        cuda.close()
def main(params):
    """Build the generator model and run a single training step on CPU.

    Reduced variant of the training driver: ``params['mode']`` is forced to
    ``'CPU'``, the vocabulary and model are built, one ``solver.step`` is
    executed on a sampled batch and its statistics are printed. No gradient
    check, loss-explosion detection, status report, validation or checkpoint
    is performed. ``cuda.close()`` runs on every exit path.

    Args:
        params: dict of settings. Reads ``batch_size``, ``dataset`` and
            ``word_count_threshold``; ``params['mode']`` is overwritten.
            The dict is also passed to the generator and the cost function
            and expanded as keyword arguments into ``solver.step``.
    """
    batch_size = params['batch_size']
    dataset = params['dataset']
    word_count_threshold = params['word_count_threshold']
    params['mode'] = 'CPU'

    try:
        # fetch the data provider
        dp = getDataProvider(dataset)

        # stores various misc items that need to be passed around the framework
        misc = {}

        # vocabulary = words that occur at least word_count_threshold times
        # in the training sentences
        vocab = preProBuildWordVocab(dp.iterSentences('train'),
                                     word_count_threshold)
        misc['wordtoix'], misc['ixtoword'], bias_init_vector = vocab

        # delegate the initialization of the model to the Generator class
        BatchGenerator = decodeGenerator(params)
        init_struct = BatchGenerator.init(params, misc)
        model = init_struct['model']
        misc['update'] = init_struct['update']
        misc['regularize'] = init_struct['regularize']

        # force overwrite of the decoder bias. This is a bit of a hack
        bd_init = bias_init_vector.reshape(1, bias_init_vector.size)
        if params['mode'] == 'GPU':
            model['bd'] = gp.garray(bd_init)
        else:
            model['bd'] = bd_init
        print('model init done.')

        def shape_summary(keys):
            # "name [rows x cols]" for every listed model matrix
            return ', '.join(
                '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                for k in keys)

        print('model has keys: ' + ', '.join(model.keys()))
        print('updating: ' + shape_summary(misc['update']))
        # previously mislabelled as 'updating: '
        print('regularizing: ' + shape_summary(misc['regularize']))
        num_learnable = sum(model[k].shape[0] * model[k].shape[1]
                            for k in misc['update'])
        print('number of learnable parameters total: %d' % (num_learnable, ))

        # initialize the Solver and the cost function
        solver = Solver()

        def costfun(batch, model):
            # wrap the cost function to abstract some things away from the Solver
            return RNNGenCost(batch, model, params, misc)

        num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
        num_iters_one_epoch = num_sentences_total / batch_size

        # NOTE: hard-wired to a single iteration in this variant; the
        # max_epochs-based iteration count is not used.
        max_iters = 1

        # initially size of dictionary of confusion
        smooth_train_ppl2 = len(misc['ixtoword'])

        for it in xrange(max_iters):
            t0 = time.time()

            # fetch a batch of data
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]

            # evaluate cost, gradient and perform parameter update
            step_struct = solver.step(batch, model, costfun, **params)
            cost = step_struct['cost']
            dt = time.time() - t0

            # print training statistics
            train_ppl2 = step_struct['stats']['ppl2']
            if it == 0:
                # start out where we start out
                smooth_train_ppl2 = train_ppl2
            else:
                # smooth exponentially decaying moving average
                smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
            epoch = it * 1.0 / num_iters_one_epoch
            print('%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)'
                  % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],
                     train_ppl2, smooth_train_ppl2))
    finally:
        # release the CUDA context even if setup or the training step raised
        cuda.close()
def main(params):
    """Train the generator with per-word weights and checkpoint once at the end.

    Builds the vocabulary, obtains the word weights, initialises the model
    through ``GenericBatchGenerator`` (optionally replacing it with the model
    stored in ``params['init_model_from']``) and runs
    ``max_epochs * (num_sentences / batch_size)`` solver steps. Training is
    stopped early when the total cost exceeds twice the cost of the first
    iteration. A checkpoint is pickled only when the last scheduled
    iteration is reached.

    Args:
        params: dict of settings. Reads ``batch_size``, ``dataset``,
            ``word_count_threshold``, ``max_epochs``, ``fappend``,
            ``checkpoint_output_directory`` and, optionally,
            ``init_model_from``. The dict is also expanded as keyword
            arguments into ``solver.step`` and stored in the checkpoint.
    """
    batch_size = params['batch_size']
    dataset = params['dataset']  # name of the dataset flickr8k, flickr30k..
    word_count_threshold = params['word_count_threshold']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname

    # fetch the data provider
    dp = getDataProvider(dataset)
    completeData = dp.getData('train')

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    # calculate weights of all unique words in vocab
    # NOTE(review): the result of calculateWeights is replaced by
    # getWeightsMethod2() two lines below; the call is kept because it may
    # have side effects the later helpers rely on -- confirm.
    weightComputedData = calculateWeights(misc['wordtoix'], misc['ixtoword'],
                                          completeData)
    weightCalculationMethodSec()
    weightComputedData = getWeightsMethod2()
    print('Done:')

    # delegate the initialization of the model (encoder and decoder weight
    # matrices) to the Generator class
    BatchGenerator = GenericBatchGenerator()
    init_struct = BatchGenerator.init(params, misc)
    model = init_struct['model']
    misc['update'] = init_struct['update']
    misc['regularize'] = init_struct['regularize']

    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)
    print('model init done.')

    def describe(keys):
        # "name [rows x cols]" for every listed model matrix
        return ', '.join(
            '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
            for k in keys)

    print('model has keys: ' + ', '.join(model.keys()))
    print('updating: ' + describe(misc['update']))
    # this line used to be labelled 'updating: ' as well
    print('regularizing: ' + describe(misc['regularize']))
    print('number of learnable parameters total: %d' % (
        sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ))

    if params.get('init_model_from', ''):
        # load checkpoint and overwrite the freshly initialised model.
        # SECURITY: unpickling executes arbitrary code; only point
        # init_model_from at checkpoint files you trust.
        with open(params['init_model_from'], 'rb') as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)
        model = checkpoint['model']

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc, weightComputedData)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch

    abort = False
    # No validation pass is run in this variant, so the "perplexity" that
    # ends up in the checkpoint is just this placeholder (vocabulary size).
    val_ppl2 = len(misc['ixtoword'])

    for it in xrange(max_iters):
        if abort:
            break
        t0 = time.time()

        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]

        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct['cost']
        dt = time.time() - t0

        # print training statistics
        epoch = it * 1.0 / num_iters_one_epoch
        print('%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f'
              % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost']))

        # detect if loss is exploding and stop the job if so
        total_cost = cost['total_cost']
        if it == 0:
            total_cost0 = total_cost  # store the initial cost as reference
        if total_cost > total_cost0 * 2:
            print('Aborting, cost seems to be exploding. ')
            abort = True  # takes effect at the top of the next iteration

        # save the model once, after the last scheduled iteration
        if (it + 1) == max_iters:
            filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (
                dataset, host, params['fappend'], val_ppl2)
            filepath = os.path.join(params['checkpoint_output_directory'],
                                    filename)
            checkpoint = {
                'it': it,
                'epoch': epoch,
                'model': model,
                'params': params,
                'perplexity': val_ppl2,
                'wordtoix': misc['wordtoix'],
                'ixtoword': misc['ixtoword'],
            }
            try:
                with open(filepath, 'wb') as checkpoint_file:
                    pickle.dump(checkpoint, checkpoint_file)
                print('saved checkpoint in %s' % (filepath, ))
            except Exception as e:
                # best effort: report the failure but do not crash the run
                print('tried to write checkpoint into %s but got error: ' % (
                    filepath, ))
                print(e)
def main(params):
    """Train the generator, logging to CSV and scoring saved checkpoints.

    Full training driver. Per iteration it performs one solver step and
    appends the training statistics to ``<outdir>/<generator>.csv``.
    At most once a minute it writes a JSON status report. Every
    ``eval_period`` epochs (and on the last iteration) it evaluates
    validation perplexity, possibly aborts, and pickles a checkpoint when
    the perplexity beats both the previous best and the configured
    threshold. Sentence-prediction scores go to
    ``<outdir>/<generator>_val.csv``. Both CSV files are closed on every
    exit path, including the ``sys.exit()`` after a gradient check.

    Args:
        params: dict of settings. Reads ``batch_size``, ``dataset``,
            ``word_count_threshold``, ``do_grad_check``, ``max_epochs``,
            ``eval_period``, ``outdir``, ``generator``, ``fappend``,
            ``worker_status_output_directory``, ``min_ppl_or_abort``,
            ``write_checkpoint_ppl_threshold`` and, optionally,
            ``init_model_from``. ``drop_prob_encoder`` and
            ``drop_prob_decoder`` are set to 0 before a gradient check.
            The dict is also expanded as keyword arguments into
            ``solver.step`` and stored in status reports and checkpoints.
    """
    import csv
    import socket

    batch_size = params['batch_size']
    dataset = params['dataset']
    word_count_threshold = params['word_count_threshold']
    do_grad_check = params['do_grad_check']
    max_epochs = params['max_epochs']
    # `host` was referenced below without being assigned in this function;
    # define it the same way the sibling drivers do.
    host = socket.gethostname()

    # fetch the data provider
    dp = getDataProvider(dataset)

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    # delegate the initialization of the model to the Generator class
    BatchGenerator = decodeGenerator(params)
    init_struct = BatchGenerator.init(params, misc)
    model = init_struct['model']
    misc['update'] = init_struct['update']
    misc['regularize'] = init_struct['regularize']

    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)
    print('model init done.')

    def describe(keys):
        # "name [rows x cols]" for every listed model matrix
        return ', '.join(
            '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
            for k in keys)

    print('model has keys: ' + ', '.join(model.keys()))
    print('updating: ' + describe(misc['update']))
    # this line used to be labelled 'updating: ' as well
    print('regularizing: ' + describe(misc['regularize']))
    print('number of learnable parameters total: %d' % (
        sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ))

    if params.get('init_model_from', ''):
        # load checkpoint and overwrite the freshly initialised model.
        # SECURITY: unpickling executes arbitrary code; only point
        # init_model_from at checkpoint files you trust.
        with open(params['init_model_from'], 'rb') as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)
        model = checkpoint['model']

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))

    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {'params': params, 'history': []}

    train_csv_path = os.path.join(params['outdir'], params['generator'] + '.csv')
    val_csv_path = os.path.join(params['outdir'], params['generator'] + '_val.csv')

    def validation_row(scores):
        # one row of the validation CSV; seven score columns are expected
        return [it, max_iters, dt, epoch, val_ppl2] + [scores[i] for i in xrange(7)]

    with open(train_csv_path, 'wb') as csvfile, open(val_csv_path, 'wb') as csv_val_file:
        csvout = csv.writer(csvfile, delimiter=',', quotechar='"')
        csv_val_out = csv.writer(csv_val_file, delimiter=',', quotechar='"')

        for it in xrange(max_iters):
            if abort:
                break
            t0 = time.time()

            # fetch a batch of data
            batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]

            # evaluate cost, gradient and perform parameter update
            step_struct = solver.step(batch, model, costfun, **params)
            cost = step_struct['cost']
            dt = time.time() - t0

            # print training statistics
            train_ppl2 = step_struct['stats']['ppl2']
            if it == 0:
                # start out where we start out
                smooth_train_ppl2 = train_ppl2
            else:
                # smooth exponentially decaying moving average
                smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
            epoch = it * 1.0 / num_iters_one_epoch
            print('%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)'
                  % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],
                     train_ppl2, smooth_train_ppl2))
            csvout.writerow([it, max_iters, dt, epoch, cost['loss_cost'],
                             cost['reg_cost'], train_ppl2, smooth_train_ppl2])
            csvfile.flush()
            if host != 'oliver-Aurora-R4':
                sys.stdout.flush()

            # perform gradient check if desired, with a bit of a burnin time (10 iterations)
            if it == 10 and do_grad_check:
                print('disabling dropout for gradient check...')
                params['drop_prob_encoder'] = 0
                params['drop_prob_decoder'] = 0
                solver.gradCheck(batch, model, costfun)
                print('done gradcheck, exitting.')
                sys.exit()  # hmmm. probably should exit here

            # detect if loss is exploding and kill the job if so
            total_cost = cost['total_cost']
            if it == 0:
                total_cost0 = total_cost  # store this initial cost
            if total_cost > total_cost0 * 2:
                print('Aboring, cost seems to be exploding. Run gradcheck? Lower the learning rate?')
                abort = True  # set the abort flag, we'll break out

            # logging: write JSON files for visual inspection of the training
            tnow = time.time()
            if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
                last_status_write_time = tnow
                json_worker_status['history'].append({
                    'time': datetime.datetime.now().isoformat(),
                    'iter': (it, max_iters),
                    'epoch': (epoch, max_epochs),
                    'time_per_batch': dt,
                    'smooth_train_ppl2': smooth_train_ppl2,
                    'val_ppl2': val_ppl2,  # just write the last available one
                    'train_ppl2': train_ppl2,
                })
                status_file = os.path.join(
                    params['worker_status_output_directory'], host + '_status.json')
                try:
                    with open(status_file, 'w') as status_fh:
                        json.dump(json_worker_status, status_fh)
                except Exception as e:  # todo be more clever here
                    print('tried to write worker status into %s but got error:' % (status_file, ))
                    print(e)

            # perform perplexity evaluation on the validation set and save a
            # model checkpoint if it's good
            is_last_iter = (it + 1) == max_iters
            if not ((((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5)
                    or is_last_iter):
                continue

            val_ppl2 = eval_split('val', dp, model, params, misc)  # perform the evaluation on VAL set
            print('validation perplexity = %f' % (val_ppl2, ))

            if is_last_iter:
                # always score the final model, whether or not it gets saved
                # NOTE(review): the mail notification is assumed to belong to
                # this last-iteration branch -- confirm.
                cp_pred = {
                    'it': it,
                    'epoch': epoch,
                    'model': model,
                    'params': params,
                    'perplexity': val_ppl2,
                    'wordtoix': misc['wordtoix'],
                    'ixtoword': misc['ixtoword'],
                    'algorithm': params['generator'],
                    'outdir': params['outdir'],
                }
                scores = eval_sentence_predictions.run(cp_pred)
                csv_val_out.writerow(validation_row(scores))
                csv_val_file.flush()
                omail.send('job finished' + params['generator'], 'done')

            # abort training if the perplexity is no good
            min_ppl_or_abort = params['min_ppl_or_abort']
            if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
                # the message used to print '<' although the test is '>'
                print('aborting job because validation perplexity %f > %f' % (val_ppl2, min_ppl_or_abort))
                abort = True  # abort the job

            # save only if we beat a previous record (or this is the first
            # time) AND we also beat the user-defined threshold (or it is
            # disabled with a negative value)
            write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
            beats_record = val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0
            beats_threshold = (val_ppl2 < write_checkpoint_ppl_threshold
                               or write_checkpoint_ppl_threshold < 0)
            if beats_record and beats_threshold:
                top_val_ppl2 = val_ppl2
                filename = 'model_%s_checkpoint_%s_%s_%s_%.2f.p' % (
                    params['generator'], dataset, host, params['fappend'], val_ppl2)
                filepath = os.path.join(params['outdir'], filename)
                checkpoint = {
                    'it': it,
                    'epoch': epoch,
                    'model': model,
                    'params': params,
                    'perplexity': val_ppl2,
                    'wordtoix': misc['wordtoix'],
                    'ixtoword': misc['ixtoword'],
                    'algorithm': params['generator'],
                    'outdir': params['outdir'],
                }
                try:
                    with open(filepath, 'wb') as checkpoint_file:
                        pickle.dump(checkpoint, checkpoint_file)
                    print('saved checkpoint in %s' % (filepath, ))
                except Exception as e:  # todo be more clever here
                    # was `filepat`, which raised NameError inside the handler
                    print('tried to write checkpoint into %s but got error: ' % (filepath, ))
                    print(e)
                # NOTE(review): scoring is assumed to run once per saved
                # checkpoint (inside this branch) -- confirm.
                scores = eval_sentence_predictions.run(checkpoint)
                csv_val_out.writerow(validation_row(scores))
                csv_val_file.flush()
def main(params, split):
    """Train the generator on one dataset split and checkpoint on improvement.

    Per iteration the driver performs one solver step and prints the
    training statistics. At most once a minute it writes a JSON status
    report. Every ``eval_period`` epochs (and on the last iteration) it
    evaluates validation perplexity, possibly aborts, and pickles a
    checkpoint when the perplexity beats both the previous best and the
    configured threshold. No checkpoint file is written when
    ``params['fappend'] == 'test'``.

    Args:
        params: dict of settings. Reads ``batch_size``, ``dataset``,
            ``feature_file``, ``class_count_threshold``, ``do_grad_check``,
            ``max_epochs``, ``eval_period``, ``fappend``, ``alpha``,
            ``beta``, ``worker_status_output_directory``,
            ``checkpoint_output_directory``, ``min_ppl_or_abort``,
            ``write_checkpoint_ppl_threshold`` and, optionally,
            ``init_model_from``. ``drop_prob_encoder`` and
            ``drop_prob_decoder`` are set to 0 before a gradient check.
            The dict is also expanded as keyword arguments into
            ``solver.step`` and stored in status reports and checkpoints.
        split: integer split index; selects the dataset JSON file and is
            embedded in the checkpoint file name.
    """
    batch_size = params['batch_size']
    dataset = params['dataset']
    feature_file = params['feature_file']
    class_count_threshold = params['class_count_threshold']
    do_grad_check = params['do_grad_check']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname
    json_file = 'dataset_mmdb_book_fps_30_samplesize_25_split_%d.json' % (split)

    # fetch the data provider
    dp = getDataProvider(dataset, feature_file, json_file)

    # stores various misc items that need to be passed around the framework
    misc = {}

    # go over all training classes and find the vocabulary we want to use,
    # i.e. the classes that occur at least class_count_threshold number of times
    misc['classtoix'], misc['ixtoclass'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), class_count_threshold)

    # delegate the initialization of the model to the Generator class
    BatchGenerator = decodeGenerator(params)
    init_struct = BatchGenerator.init(params, misc)
    model = init_struct['model']
    misc['update'] = init_struct['update']
    misc['regularize'] = init_struct['regularize']

    # force overwrite here. This is a bit of a hack, not happy about it
    model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)
    print('model init done.')

    def describe(keys):
        # "name [rows x cols]" for every listed model matrix
        return ', '.join(
            '%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
            for k in keys)

    print('model has keys: ' + ', '.join(model.keys()))
    print('updating: ' + describe(misc['update']))
    # this line used to be labelled 'updating: ' as well
    print('regularizing: ' + describe(misc['regularize']))
    print('number of learnable parameters total: %d' % (
        sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), ))

    if params.get('init_model_from', ''):
        # load checkpoint and overwrite the freshly initialised model.
        # SECURITY: unpickling executes arbitrary code; only point
        # init_model_from at checkpoint files you trust.
        with open(params['init_model_from'], 'rb') as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)
        model = checkpoint['model']

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))

    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc['ixtoclass'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoclass'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {'params': params, 'history': []}
    lastsavedcheckpoint = ''

    for it in xrange(max_iters):
        if abort:
            break
        t0 = time.time()

        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for _ in xrange(batch_size)]

        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct['cost']
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = step_struct['stats']['ppl2']
        if it == 0:
            # start out where we start out
            smooth_train_ppl2 = train_ppl2
        else:
            # smooth exponentially decaying moving average
            smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        epoch = it * 1.0 / num_iters_one_epoch
        print('%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)'
              % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],
                 train_ppl2, smooth_train_ppl2))
        print('last saved checkpoint in %s' % (lastsavedcheckpoint, ))

        # perform gradient check if desired, with a bit of a burnin time (10 iterations)
        if it == 10 and do_grad_check:
            print('disabling dropout for gradient check...')
            params['drop_prob_encoder'] = 0
            params['drop_prob_decoder'] = 0
            solver.gradCheck(batch, model, costfun)
            print('done gradcheck, exitting.')
            sys.exit()  # hmmm. probably should exit here

        # detect if loss is exploding and kill the job if so
        total_cost = cost['total_cost']
        if it == 0:
            total_cost0 = total_cost  # store this initial cost
        if total_cost > total_cost0 * 2:
            print('Aboring, cost seems to be exploding. Run gradcheck? Lower the learning rate?')
            abort = True  # set the abort flag, we'll break out

        # logging: write JSON files for visual inspection of the training
        tnow = time.time()
        if tnow > last_status_write_time + 60 * 1:  # every now and then lets write a report
            last_status_write_time = tnow
            json_worker_status['history'].append({
                'time': datetime.datetime.now().isoformat(),
                'iter': (it, max_iters),
                'epoch': (epoch, max_epochs),
                'time_per_batch': dt,
                'smooth_train_ppl2': smooth_train_ppl2,
                'val_ppl2': val_ppl2,  # just write the last available one
                'train_ppl2': train_ppl2,
            })
            status_file = os.path.join(
                params['worker_status_output_directory'], host + '_status.json')
            try:
                with open(status_file, 'w') as status_fh:
                    json.dump(json_worker_status, status_fh)
            except Exception as e:  # todo be more clever here
                print('tried to write worker status into %s but got error:' % (
                    status_file, ))
                print(e)

        # perform perplexity evaluation on the validation set and save a
        # model checkpoint if it's good
        is_last_iter = (it + 1) == max_iters
        if not ((((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5)
                or is_last_iter):
            continue

        val_ppl2 = eval_split('val', dp, model, params, misc)  # perform the evaluation on VAL set
        print('validation perplexity = %f' % (val_ppl2, ))

        # abort training if the perplexity is no good
        min_ppl_or_abort = params['min_ppl_or_abort']
        if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0:
            # the message used to print '<' although the test is '>'
            print('aborting job because validation perplexity %f > %f' % (
                val_ppl2, min_ppl_or_abort))
            abort = True  # abort the job

        # save only if we beat a previous record (or this is the first time)
        # AND we also beat the user-defined threshold (or it is disabled
        # with a negative value)
        write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
        beats_record = val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0
        beats_threshold = (val_ppl2 < write_checkpoint_ppl_threshold
                           or write_checkpoint_ppl_threshold < 0)
        if beats_record and beats_threshold:
            top_val_ppl2 = val_ppl2
            filename = 'model_checkpoint_%s_%s_%s_alpha_%2.2f_beta_%2.2f_split_%d.p' % (
                dataset, host, params['fappend'], params['alpha'],
                params['beta'], split)
            filepath = os.path.join(params['checkpoint_output_directory'],
                                    filename)
            checkpoint = {
                'it': it,
                'epoch': epoch,
                'model': model,
                'params': params,
                'perplexity': val_ppl2,
                'classtoix': misc['classtoix'],
                'ixtoclass': misc['ixtoclass'],
                'json_file': json_file,
            }
            try:
                # runs tagged 'test' never write checkpoint files
                # NOTE(review): the 'saved checkpoint' message and the
                # lastsavedcheckpoint update are assumed to sit under this
                # guard together with the dump -- confirm.
                if not (params['fappend'] == 'test'):
                    with open(filepath, 'wb') as checkpoint_file:
                        pickle.dump(checkpoint, checkpoint_file)
                    print('saved checkpoint in %s' % (filepath, ))
                    lastsavedcheckpoint = filepath
            except Exception as e:  # todo be more clever here
                print('tried to write checkpoint into %s but got error: ' % (
                    filepath, ))
                print(e)
def main(params): batch_size = params["batch_size"] dataset = params["dataset"] word_count_threshold = params["word_count_threshold"] do_grad_check = params["do_grad_check"] max_epochs = params["max_epochs"] host = socket.gethostname() # get computer hostname # fetch the data provider dp = getDataProvider(dataset) misc = {} # stores various misc items that need to be passed around the framework # go over all training sentences and find the vocabulary we want to use, i.e. the words that occur # at least word_count_threshold number of times misc["wordtoix"], misc["ixtoword"], bias_init_vector = preProBuildWordVocab( dp.iterSentences("train"), word_count_threshold ) # delegate the initialization of the model to the Generator class BatchGenerator = decodeGenerator(params) init_struct = BatchGenerator.init(params, misc) model, misc["update"], misc["regularize"] = (init_struct["model"], init_struct["update"], init_struct["regularize"]) # force overwrite here. This is a bit of a hack, not happy about it model["bd"] = bias_init_vector.reshape(1, bias_init_vector.size) print "model init done." 
print "model has keys: " + ", ".join(model.keys()) print "updating: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["update"]) print "updating: " + ", ".join("%s [%dx%d]" % (k, model[k].shape[0], model[k].shape[1]) for k in misc["regularize"]) print "number of learnable parameters total: %d" % ( sum(model[k].shape[0] * model[k].shape[1] for k in misc["update"]), ) if params.get("init_model_from", ""): # load checkpoint checkpoint = pickle.load(open(params["init_model_from"], "rb")) model = checkpoint["model"] # overwrite the model print checkpoint["model"] # initialize the Solver and the cost function solver = Solver() def costfun(batch, model): # wrap the cost function to abstract some things away from the Solver return RNNGenCost(batch, model, params, misc) # calculate how many iterations we need num_sentences_total = dp.getSplitSize("train", ofwhat="sentences") num_iters_one_epoch = num_sentences_total / batch_size max_iters = max_epochs * num_iters_one_epoch eval_period_in_epochs = params["eval_period"] eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs)) abort = False top_val_ppl2 = -1 smooth_train_ppl2 = len(misc["ixtoword"]) # initially size of dictionary of confusion val_ppl2 = len(misc["ixtoword"]) last_status_write_time = 0 # for writing worker job status reports json_worker_status = {} json_worker_status["params"] = params json_worker_status["history"] = [] for it in xrange(max_iters): if abort: break t0 = time.time() # fetch a batch of data batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)] # evaluate cost, gradient and perform parameter update step_struct = solver.step(batch, model, costfun, **params) cost = step_struct["cost"] dt = time.time() - t0 # print training statistics train_ppl2 = step_struct["stats"]["ppl2"] smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2 # smooth exponentially decaying moving average if it == 0: smooth_train_ppl2 = train_ppl2 # 
start out where we start out epoch = it * 1.0 / num_iters_one_epoch print "%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)" % ( it, max_iters, dt, epoch, cost["loss_cost"], cost["reg_cost"], train_ppl2, smooth_train_ppl2, ) # perform gradient check if desired, with a bit of a burnin time (10 iterations) if it == 10 and do_grad_check: print "disabling dropout for gradient check..." params["drop_prob_encoder"] = 0 params["drop_prob_decoder"] = 0 solver.gradCheck(batch, model, costfun) print "done gradcheck, exitting." sys.exit() # hmmm. probably should exit here # detect if loss is exploding and kill the job if so total_cost = cost["total_cost"] if it == 0: total_cost0 = total_cost # store this initial cost if total_cost > total_cost0 * 2: print "Aboring, cost seems to be exploding. Run gradcheck? Lower the learning rate?" abort = True # set the abort flag, we'll break out # logging: write JSON files for visual inspection of the training tnow = time.time() if tnow > last_status_write_time + 60 * 1: # every now and then lets write a report last_status_write_time = tnow jstatus = {} jstatus["time"] = datetime.datetime.now().isoformat() jstatus["iter"] = (it, max_iters) jstatus["epoch"] = (epoch, max_epochs) jstatus["time_per_batch"] = dt jstatus["smooth_train_ppl2"] = smooth_train_ppl2 jstatus["val_ppl2"] = val_ppl2 # just write the last available one jstatus["train_ppl2"] = train_ppl2 json_worker_status["history"].append(jstatus) status_file = os.path.join(params["worker_status_output_directory"], host + "_status.json") try: json.dump(json_worker_status, open(status_file, "w")) except Exception, e: # todo be more clever here print "tried to write worker status into %s but got error:" % (status_file,) print e # perform perplexity evaluation on the validation set and save a model checkpoint if it's good is_last_iter = (it + 1) == max_iters if (((it + 1) % eval_period_in_iters) == 0 and it < max_iters - 5) or 
is_last_iter: val_ppl2 = eval_split("val", dp, model, params, misc) # perform the evaluation on VAL set print "validation perplexity = %f" % (val_ppl2,) # abort training if the perplexity is no good min_ppl_or_abort = params["min_ppl_or_abort"] if val_ppl2 > min_ppl_or_abort and min_ppl_or_abort > 0: print "aborting job because validation perplexity %f < %f" % (val_ppl2, min_ppl_or_abort) abort = True # abort the job write_checkpoint_ppl_threshold = params["write_checkpoint_ppl_threshold"] if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0: if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0: # if we beat a previous record or if this is the first time # AND we also beat the user-defined threshold or it doesnt exist top_val_ppl2 = val_ppl2 filename = "model_checkpoint_%s_%s_%s_%.2f.p" % (dataset, host, params["fappend"], val_ppl2) filepath = os.path.join(params["checkpoint_output_directory"], filename) checkpoint = {} checkpoint["it"] = it checkpoint["epoch"] = epoch checkpoint["model"] = model checkpoint["params"] = params checkpoint["perplexity"] = val_ppl2 checkpoint["wordtoix"] = misc["wordtoix"] checkpoint["ixtoword"] = misc["ixtoword"] try: pickle.dump(checkpoint, open(filepath, "wb")) print "saved checkpoint in %s" % (filepath,) except Exception, e: # todo be more clever here print "tried to write checkpoint into %s but got error: " % (filepat,) print e