def run_training( cfg ):
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input( cfg, is_training=False, use_filename_queue=True )
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training
        start_time = time.time()
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False, use_filename_queue=True )
        training_runners[ 'threads' ] = data_prefetch_init_fn(
            training_runners[ 'sess' ], training_runners[ 'coord' ] )
        try:
            # This just returns the input as output. It is for testing data
            # input only.
            for step in range( inputs[ 'max_steps' ] ):
                input_batch, target_batch, data_idx = training_runners['sess'].run(
                    [ model['input_batch'], model['target_batch'], model[ 'data_idxs' ] ] )

                if training_runners['coord'].should_stop():
                    break
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
        # else:  # Use tf.slim
        #     train_log_dir = os.path.join( cfg['log_dir'], 'slim-train' )
        #     # When ready to use a model, use the code below
        #     train( model[ 'train_op' ],
        #            train_log_dir,
        #            get_data_prefetch_threads_init_fn( inputs, cfg ),
        #            global_step=model[ 'global_step' ],
        #            number_of_steps=inputs[ 'max_steps' ],
        #            init_fn=model[ 'init_fn' ],
        #            save_summaries_secs=300,
        #            save_interval_secs=600,
        #            saver=model[ 'saver_op' ] )

        end_train_time = time.time() - start_time
        print('time to train %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time/(60*60)))
        print('avg time per epoch: %.3f hrs' % ( (end_train_time/(60*60)) / cfg['num_epochs']) )
def run_extract_representations(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg, is_training=False, use_filename_queue=False)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        # set up metrics to evaluate
        # names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, inputs['max_steps'], is_training=False)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return

            utils.print_start_info(cfg, inputs['max_steps'], is_training=False)

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            representations, data_idx = training_runners['sess'].run(
                [inputs['input_batch'], inputs['data_idxs']])
            filenames.extend(data_idx)
            if type(representations) == list:
                representations = representations[0]
            representations = representations.reshape((-1, np.prod(cfg['input_dim'])))
            print('Got first batch representation with size: {0}'.format(representations.shape))

            # run the remaining examples
            for step in range(inputs['max_steps'] - 1):
                # for step in range( 10 ):
                if step % 100 == 0:
                    print('Step {0} of {1}'.format(step, inputs['max_steps'] - 1))

                # This is just for GAN, for the LEO meeting
                encoder_output, data_idx = training_runners['sess'].run(
                    [inputs['input_batch'], inputs['data_idxs']])
                if type(encoder_output) == list:
                    encoder_output = encoder_output[0]
                representations = np.append(
                    representations,
                    encoder_output.reshape((-1, np.prod(cfg['input_dim']))),
                    axis=0)
                filenames.extend(data_idx)

                if training_runners['coord'].should_stop():
                    break

            print('The size of representations is %s while we expect it to run for %d steps with batchsize %d'
                  % (representations.shape, inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time

            save_path = os.path.join(
                save_dir,
                '{task}_{split}_representations.pkl'.format(task='pixels', split=args.data_split))
            with open(save_path, 'wb') as f:
                pickle.dump({'file_indexes': filenames, 'representations': representations}, f)

            copy_to = None
            if args.out_dir:
                os.system("sudo cp {fp} {out}/".format(fp=save_path, out=args.out_dir))
                copy_to = args.out_dir
            else:
                if transfer:
                    os.system("sudo cp {fp} /home/ubuntu/s3/model_log/representations_transfer/".format(fp=save_path))
                    copy_to = '/home/ubuntu/s3/model_log/representations_transfer/'
                else:
                    os.system("sudo cp {fp} /home/ubuntu/s3/model_log/representations/".format(fp=save_path))
                    copy_to = '/home/ubuntu/s3/model_log/representations/'

            print('saved representations to {0}'.format(save_path))
            print('copied representations to {0}'.format(copy_to))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
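
# A minimal sketch (not part of the original pipeline) of reading back the
# representations pickle written above. The keys 'file_indexes' and
# 'representations' mirror the dict dumped by run_extract_representations;
# the helper name and its use are illustrative only.
def load_representations(pkl_path):
    """Return (file_indexes, representations) from a saved *_representations.pkl."""
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)
    reps = np.asarray(data['representations'])
    return data['file_indexes'], reps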
def run_extract_losses(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg, is_training=False, use_filename_queue=False)
        # RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        # RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=False)
        loss_names, loss_ops = get_extractable_losses(cfg, model)
        if 'l1_loss' in loss_names:
            display_loss = 'l1_loss'
        elif 'l2_loss' in loss_names:
            display_loss = 'l2_loss'
        elif 'xentropy' in loss_names:
            display_loss = 'xentropy'
        elif 'metric_loss' in loss_names:
            display_loss = 'metric_loss'
        elif 'cycle_loss' in loss_names:
            display_loss = 'cycle_loss'
        else:
            display_loss = 'total_loss'

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Attention, model_path is ', cfg['model_path'])
            model['saver_op'].restore(training_runners['sess'], cfg['model_path'])
            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners['sess'].run(var))

            utils.print_start_info(cfg, max_steps, is_training=False)

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            loss_names_to_vals = {name: [] for name in loss_names}
            results = training_runners['sess'].run(
                [inputs['data_idxs'], inputs['target_batch'], inputs['mask_batch'], *loss_ops])
            data_idx = results[0]
            target_input = results[1]
            mask_input = results[2]
            losses = results[3:]
            for i, name in enumerate(loss_names):
                loss_names_to_vals[name].append(losses[i])
            filenames.extend(data_idx)
            print("Step number: {}".format(1), data_idx)
            # print(target_input, target_input.sum())

            # run the remaining examples
            start = time.perf_counter()
            for step in range(max_steps - 1):
                results = training_runners['sess'].run([
                    inputs['data_idxs'],
                    # [v for v in tf.global_variables() if "transfer/rep_conv_1/weights" in v.name][0],
                    # model['model'].encoder_endpoints['net1_1_output'],
                    # model['model'].encoder_endpoints['net1_2_output'],
                    *loss_ops])
                data_idx = results[0]
                losses = results[1:]
                # p, t, m = results[1], results[2], results[3]
                # losses = results[4:]
                for i, name in enumerate(loss_names):
                    loss_names_to_vals[name].append(losses[i])
                filenames.extend(data_idx)
                stats.push(loss_names_to_vals[display_loss][-1])
                # baseline_loss = get_xentropy_loss(p, t, m)
                # tf_loss = loss_names_to_vals[display_loss][-1]
                # print('tf {} | ours {}'.format(tf_loss, baseline_loss))

                if step % print_every == 0 and step > 0:
                    print('Step {0} of {1}: ({5} loss: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'.format(
                        step, max_steps - 1, stats.mean(), np.sqrt(stats.variance()),
                        (time.perf_counter() - start) / print_every, display_loss))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            print('The size of losses is %s while we expect it to run for %d steps with batchsize %d'
                  % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                out_name = '{task}_{split}_losses.pkl'.format(task=given_task, split=args.data_split)
            save_path = os.path.join(save_dir, out_name)

            with open(save_path, 'wb') as f:
                loss_names_to_vals['file_indexes'] = filenames
                loss_names_to_vals['global_step'] = 0
                pickle.dump(loss_names_to_vals, f)

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo cp {fp} {out}/".format(fp=save_path, out=args.out_dir))
            else:
                if transfer:
                    copy_to = cfg['log_root']
                else:
                    copy_to = os.path.join(cfg['log_root'], given_task)
                os.system("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
                print("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
            # if transfer:
            #     os.makedirs('/home/ubuntu/s3/model_log/losses_transfer/', exist_ok=True)
            #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses_transfer/".format(fp=save_path))
            # else:
            #     os.makedirs('/home/ubuntu/s3/model_log/losses/', exist_ok=True)
            #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses/".format(fp=save_path))

            print('saved losses to {0}'.format(save_path))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
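
# A small hedged example (not part of the original code) of how the losses
# pickle written by run_extract_losses can be consumed: each loss name maps
# to a list with one entry per evaluated batch, plus the 'file_indexes' and
# 'global_step' bookkeeping keys added above. The function name is illustrative.
def summarize_losses(pkl_path, loss_name='l1_loss'):
    """Print mean/std of one loss list from a saved *_losses.pkl (illustrative only)."""
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)
    vals = np.asarray(data[loss_name], dtype=np.float64)
    print('{}: n={} mean={:.4f} std={:.4f}'.format(loss_name, vals.size, vals.mean(), vals.std()))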
def run_extract_losses( args, cfg, save_dir, given_task ):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer_imagenet
        setup_input_fn = utils.setup_input_transfer_imagenet
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity( tf.logging.ERROR )
    stats = Statistics()
    top5_stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn( cfg, is_training=False, use_filename_queue=False )
        # RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        # RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs[ 'max_steps' ], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training
        start_time = time.time()
        utils.print_start_info( cfg, max_steps, is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Attention, model_path is ', cfg['model_path'])
            model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )
            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners[ 'sess' ].run(var))

            utils.print_start_info( cfg, max_steps, is_training=False )

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False )
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=( training_runners[ 'sess' ], training_runners[ 'coord' ] ))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            accuracies = []
            if transfer:
                accuracy_op = model['model'].decoder.accuracy
                final_output = model['model'].decoder.final_output
            else:
                accuracy_op = model['model'].accuracy
                final_output = model['model'].final_output

            results = training_runners['sess'].run(
                [ inputs[ 'data_idxs' ], model['model'].global_step, accuracy_op ] )
            gs = results[1]
            data_idx = results[0]
            accuracy = results[2]
            filenames.extend(data_idx)
            accuracies.append(accuracy)
            print("Step number: {}".format(gs))

            # run the remaining examples
            start = time.perf_counter()
            for step in range( max_steps - 1 ):
                results = training_runners['sess'].run(
                    [ inputs[ 'data_idxs' ], final_output, inputs['target_batch'], accuracy_op ] )
                data_idx = results[0]
                accuracy = results[-1]
                logits = results[1]
                gt = results[2]
                sorted_top5 = np.argsort(logits[0])[::-1][:5]
                sorted_gt = np.argsort(gt[0])[::-1][0]
                top5 = 0.
                if sorted_gt in sorted_top5:
                    top5 = 1.
                filenames.extend(data_idx)
                accuracies.append(accuracy)
                stats.push(accuracy)
                top5_stats.push(top5)

                if step % print_every == 0 and step > 0:
                    print('Step {0} of {1}: ({5}: {2:.3f} || Top 5: {3:.3f} :: ({4:.2f} secs/step)'.format(
                        step, max_steps - 1, stats.mean(), top5_stats.mean(),
                        (time.perf_counter() - start) / print_every, 'accuracy'))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            os.system("sudo touch /home/ubuntu/s3/imagenet_accuracy/{}_{}_{}.txt".format(
                given_task,
                int(stats.mean() * 1000) / 10.,
                int(top5_stats.mean() * 1000) / 10.))

            print('The size of losses is %s while we expect it to run for %d steps with batchsize %d'
                  % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                out_name = '{task}_{split}_imagenet_accuracy.pkl'.format(task=given_task, split=args.data_split)
            save_path = os.path.join( save_dir, out_name )

            val_accuracy = {}
            with open( save_path, 'wb' ) as f:
                val_accuracy['file_indexes'] = filenames
                val_accuracy['global_step'] = gs
                val_accuracy['accuracy'] = accuracies
                pickle.dump( val_accuracy, f )

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo cp {fp} {out}/".format(fp=save_path, out=args.out_dir))
            else:
                if transfer:
                    copy_to = cfg['log_root']
                else:
                    copy_to = os.path.join(cfg['log_root'], given_task)
                os.system("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
                print("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
            # if transfer:
            #     os.makedirs('/home/ubuntu/s3/model_log/losses_transfer/', exist_ok=True)
            #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses_transfer/".format(fp=save_path))
            # else:
            #     os.makedirs('/home/ubuntu/s3/model_log/losses/', exist_ok=True)
            #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses/".format(fp=save_path))

            print( 'saved losses to {0}'.format( save_path ))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time/(60*60)))
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
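
# Standalone sketch (illustrative, not from the original file) of the top-5
# check performed inside the loop above: an example counts as correct when the
# ground-truth class is among the five highest-scoring logits.
def top5_correct(logits, gt_onehot):
    """Return 1.0 if the argmax of gt_onehot is in the top 5 of logits, else 0.0."""
    sorted_top5 = np.argsort(logits)[::-1][:5]
    gt_class = np.argmax(gt_onehot)
    return 1.0 if gt_class in sorted_top5 else 0.0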
def run_rand_baseline( args, cfg, given_task ):
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        tf.logging.set_verbosity( tf.logging.INFO )
        inputs = utils.setup_input( cfg, is_training=False, use_filename_queue=False )
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training
        start_time = time.time()
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

            data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False )
            # training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=( training_runners[ 'sess' ], training_runners[ 'coord' ] ))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            targets = training_runners['sess'].run( inputs[ 'target_batch' ] )

            # run the remaining examples
            for step in range( inputs[ 'max_steps' ] - 1 ):
                # for step in range( 10 ):
                if step % 100 == 0:
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))

                target = training_runners['sess'].run( inputs[ 'target_batch' ] )
                targets = np.append( targets, target, axis=0 )

                if training_runners['coord'].should_stop():
                    break

            # pair each target with a randomly drawn target and evaluate the loss
            rand_idx = [ random.randint(0, targets.shape[0] - 1) for i in range(targets.shape[0]) ]
            rand_target = [ targets[i] for i in rand_idx ]
            rand_target = np.vstack( rand_target )

            counter = 0
            loss_sum = 0
            for step in range( inputs[ 'max_steps' ] - 1 ):
                # for step in range( 10 ):
                if step % 100 == 0:
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))

                tar = targets[ step * cfg['batch_size']:(step + 1) * cfg['batch_size'] ]
                rand = rand_target[ step * cfg['batch_size']:(step + 1) * cfg['batch_size'] ]
                losses = training_runners['sess'].run(
                    model['model'].losses,
                    feed_dict={ inputs['target_batch']: tar, model['model'].final_output: rand } )
                loss_sum += losses[0]
                counter += 1

                if training_runners['coord'].should_stop():
                    break

            print(loss_sum)
            print(counter)
            print('random_baseline has loss: {loss}'.format(loss=loss_sum / counter))
            end_train_time = time.time() - start_time
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
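
# A hedged NumPy-only sketch of the same random-pairing idea: score each
# target against a randomly drawn target under a simple L1 distance. This is
# only an illustration of the baseline; the function above uses the graph's
# model['model'].losses op rather than a plain L1.
def rand_baseline_l1(targets, seed=0):
    """Mean L1 distance between targets and a random resampling of themselves."""
    rng = np.random.RandomState(seed)
    rand_targets = targets[rng.randint(0, targets.shape[0], size=targets.shape[0])]
    return np.mean(np.abs(targets - rand_targets))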
def run_training(cfg):
    # set up logging
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input(cfg, is_training=False, use_filename_queue=True)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = setup_model(inputs, cfg, is_training=False)

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, inputs['max_steps'], is_training=False)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        if cfg['model_path'] is None:
            print('Please specify a checkpoint directory')
            return

        cfg['randomize'] = False
        model['saver_op'].restore(training_runners['sess'], cfg['model_path'])

        utils.print_start_info(cfg, inputs['max_steps'], is_training=False)
        data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False, use_filename_queue=True)
        training_runners['threads'] = data_prefetch_init_fn(
            training_runners['sess'], training_runners['coord'])

        # run one example so that we can calculate some statistics about the representations
        representations, input_batch, target_batch, data_idx, mask_batch = training_runners['sess'].run([
            model['model'].encoder_output, inputs['input_batch'],
            inputs['target_batch'], inputs['data_idxs'], inputs['mask_batch']])
        print('Got first batch representation with size: %s' % (representations.shape,))

        # run the remaining examples
        for step in range(inputs['max_steps'] - 1):
            encoder_output, input_batch, target_batch, data_idx, mask_batch = training_runners['sess'].run([
                model['model'].encoder_output, inputs['input_batch'],
                inputs['target_batch'], inputs['data_idxs'], inputs['mask_batch']])
            representations = np.append(representations, encoder_output, axis=0)

            if training_runners['coord'].should_stop():
                break

        print('The size of representations is %s while we expect it to run for %d steps with batchsize %d'
              % (representations.shape, inputs['max_steps'], cfg['batch_size']))

        utils.request_data_loading_end(training_runners)
        utils.end_data_loading_and_sess(training_runners)

        end_train_time = time.time() - start_time
        print('time to train %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
        print('avg time per epoch: %.3f hrs' % ((end_train_time / (60 * 60)) / cfg['num_epochs']))
def run_training(cfg, cfg_dir):
    # set up logging
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input_transfer(cfg, is_training=True)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=True)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, inputs['max_steps'], is_training=True)

        if cfg['model_type'] == 'empty':
            # Can't use tf.slim because there are no trainable variables
            training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
            data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=True)
            training_runners['threads'] = data_prefetch_init_fn(
                training_runners['sess'], training_runners['coord'])
            try:
                # This just returns the input as output. It is for testing data
                # input only.
                for step in range(inputs['max_steps']):
                    input_batch, target_batch, data_idx = training_runners['sess'].run(
                        [model['input_batch'], model['target_batch'], model['data_idxs']])

                    if training_runners['coord'].should_stop():
                        break
            finally:
                utils.request_data_loading_end(training_runners)
                utils.end_data_loading_and_sess(training_runners)
        else:  # Use tf.slim
            train_log_dir = os.path.join(cfg['log_dir'], 'slim-train')
            permanent_checkpoint_dir = os.path.join(cfg['log_dir'], 'checkpoints')

            session_config = tf.ConfigProto()
            session_config.gpu_options.allow_growth = True

            # max_to_keep = cfg['num_epochs'] * 2
            max_to_keep = 10
            if 'max_ckpts_to_keep' in cfg:
                max_to_keep = cfg['max_ckpts_to_keep']

            # When ready to use a model, use the code below
            train(
                model['train_op'],
                train_log_dir,
                utils.get_data_prefetch_threads_init_fn_transfer(inputs, cfg, is_training=True),
                train_step_fn=model['train_step_fn'],
                train_step_kwargs=model['train_step_kwargs'],
                global_step=model['global_step'],
                number_of_steps=inputs['max_steps'],
                number_of_epochs=cfg['num_epochs'],
                init_fn=model['init_fn'],
                save_checkpoint_every=max(inputs['max_steps'] // max_to_keep, 500),
                cfg_dir=cfg_dir,  # RuntimeDeterminedEnviromentVars.steps_per_epoch,
                permanent_checkpoint_dir=permanent_checkpoint_dir,
                save_summaries_secs=cfg['summary_save_every_secs'],
                save_interval_secs=cfg['checkpoint_save_every_secs'],
                saver=model['saver_op'],
                return_accuracy='return_accuracy' in cfg and cfg['return_accuracy'],
                session_config=session_config)

        end_train_time = time.time() - start_time
        print('time to train %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
        print('avg time per epoch: %.3f hrs' % ((end_train_time / (60 * 60)) / cfg['num_epochs']))
def run_training(cfg, cfg_dir, args):
    if args.stat_type == "mean":
        statistic = MeanMeter(cfg)
    elif args.stat_type == 'median':
        statistic = MedianMeter(cfg)
    elif args.stat_type == 'marginal':
        statistic = DiscreteDistributionMeter(cfg, args.not_one_hot)
    elif args.stat_type == 'dense_marginal':
        statistic = DenseDiscreteDistributionMeter(cfg)
    elif args.stat_type == 'moments':
        statistic = MomentsMeter(cfg)
    else:
        raise NotImplementedError("No average defined for type: {}".format(args.stat_type))

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input(cfg, is_training=False)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # execute training
        start_time = time.time()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        utils.print_start_info(cfg, max_steps, is_training=False)

        data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False)
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        prefetch_threads = threading.Thread(
            target=data_prefetch_threads_init_fn,
            args=(training_runners['sess'], training_runners['coord']))
        prefetch_threads.start()

        target_batch = training_runners['sess'].run(inputs['target_batch'])
        # training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
        try:
            # Accumulate the chosen statistic over target batches.
            start_time = time.time()
            batch_time = time.time()
            k = int(args.print_every)
            for step in range(max_steps):
                target_batch, mask_batch = training_runners['sess'].run(
                    [inputs['target_batch'], inputs['mask_batch']])
                target_batch = map_to_img(target_batch.mean(axis=0), cfg)
                if len(mask_batch.shape) > 1:
                    mask_batch = mask_batch.mean(axis=0)
                else:
                    mask_batch = 1
                statistic.update(target_batch, mask_batch)

                if (step + 1) % k == 0:
                    print('Step %d/%d: %.2f s/step ' %
                          (step + 1, max_steps, (time.time() - batch_time) / k))
                    batch_time = time.time()
                    # print(statistic.get())
                    # break
                if training_runners['coord'].should_stop():
                    break

            end_train_time = time.time() - start_time
            print('time to train %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
            print('avg time per epoch: %.3f hrs' % ((end_train_time / (60 * 60)) / cfg['num_epochs']))

            if args.stat_type == 'moments':
                save_moments(statistic, cfg, args)
            else:
                save_data(statistic, cfg, args)
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
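
# The Statistics() objects used throughout this file expose push()/mean()/variance().
# Below is a minimal Welford-style stand-in with the same interface. It is an
# assumption about the real class, included only to make the running-statistics
# pattern in the loss-extraction loops concrete.
class RunningStats(object):
    """Online mean/variance via Welford's algorithm."""

    def __init__(self):
        self.n = 0
        self._mean = 0.0
        self._m2 = 0.0

    def push(self, x):
        self.n += 1
        delta = x - self._mean
        self._mean += delta / self.n
        self._m2 += delta * (x - self._mean)

    def mean(self):
        return self._mean

    def variance(self):
        # unbiased sample variance; 0 until at least two values are pushed
        return self._m2 / (self.n - 1) if self.n > 1 else 0.0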
def run_extract_losses(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
        if given_task == 'pixels':
            get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer_imagenet
            setup_input_fn = utils.setup_input_transfer_imagenet
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg, is_training=True, use_filename_queue=False)
        # RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        # RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=True)

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)
        train_step_fn = model['train_step_fn']

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=True)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Attention, model_path is ', cfg['model_path'])

            restore_ckpt = not args.from_scratch
            if restore_ckpt:
                # restore everything except the Adam slots and, optionally,
                # the encoder variables (when the encoder is fine-tuned)
                non_encoder_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                adams = []
                for v in tuple(non_encoder_var):
                    if 'Adam' in v.name:
                        non_encoder_var.remove(v)
                        adams.append(v)
                        continue
                    if 'finetune_encoder_imagenet' in cfg:
                        for x in model['model'].encoder_vars:
                            if v.name == x.name:
                                non_encoder_var.remove(v)
                if not args.metric_only:
                    saver_for_transfer = tf.train.Saver(non_encoder_var)
                    saver_for_transfer.restore(training_runners['sess'], cfg['model_path'])
                else:
                    training_runners['sess'].run(tf.variables_initializer(non_encoder_var))
                training_runners['sess'].run(tf.variables_initializer(adams))
                print('Loading Source Encoder:...')
                if 'finetune_encoder_imagenet' in cfg:
                    model['init_fn'](training_runners['sess'])
                print('Starting Training:..')
            else:
                init_op = tf.global_variables_initializer()
                training_runners['sess'].run(init_op)

            assign_op = model['global_step'].assign(0)
            training_runners['sess'].run(assign_op)
            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners['sess'].run(var))

            utils.print_start_info(cfg, max_steps, is_training=True)

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=True, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            start = time.perf_counter()
            saver = tf.train.Saver()
            save_ckpt_name = 'places'
            if args.from_scratch:
                save_ckpt_name = 'places_scratch_{}_{}'.format(args.layers, args.data_used)
            if args.train_encoder:
                save_ckpt_name = 'places_encoder'

            for step in range(max_steps // 2 - 1):
                # for step in range(10):
                total_loss, should_stop = train_step_fn(
                    training_runners['sess'], model['train_op'], model['global_step'],
                    train_step_kwargs=model['train_step_kwargs'])
                stats.push(total_loss)

                if step % print_every == 0 and step > 0:
                    print('Step {0} of {1}: ({5}: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'.format(
                        step, max_steps - 1, stats.mean(), np.sqrt(stats.variance()),
                        (time.perf_counter() - start) / print_every, 'Total_loss'))
                    start = time.perf_counter()

                if step % 3000 == 2999:
                    saver.save(
                        training_runners['sess'],
                        os.path.join(cfg['log_root'], given_task,
                                     '{}_{}'.format(save_ckpt_name, step)))

                if training_runners['coord'].should_stop():
                    break

            saver.save(
                training_runners['sess'],
                os.path.join(cfg['log_root'], given_task, save_ckpt_name))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
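
# The restore logic above loads a checkpoint that predates the Adam optimizer
# variables now present in the graph, so the Adam slots are excluded from the
# Saver and initialized separately. A condensed sketch of that pattern (the
# helper name is illustrative; it assumes variables already exist in the
# default graph):
def build_saver_without_adam():
    """Return (saver, adam_vars): the saver restores everything except Adam slots."""
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    adam_vars = [v for v in all_vars if 'Adam' in v.name]
    restore_vars = [v for v in all_vars if 'Adam' not in v.name]
    return tf.train.Saver(restore_vars), adam_vars
# After saver.restore(sess, ckpt), the excluded slots still need
# sess.run(tf.variables_initializer(adam_vars)) before training resumes.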
def run_extract_losses_5_steps(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg, is_training=False, use_filename_queue=False)
        # RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        # RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=False)
        loss_names, loss_ops = get_extractable_losses(cfg, model)
        if 'l1_loss' in loss_names:
            display_loss = 'l1_loss'
        elif 'l2_loss' in loss_names:
            display_loss = 'l2_loss'
        elif 'xentropy' in loss_names:
            display_loss = 'xentropy'
        elif 'metric_loss' in loss_names:
            display_loss = 'metric_loss'
        elif 'cycle_loss' in loss_names:
            display_loss = 'cycle_loss'
        else:
            display_loss = 'total_loss'

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        if cfg['model_path'] is None:
            print('Please specify a checkpoint directory')
            return
        print('Attention, model_path is ', cfg['model_path'])
        model['saver_op'].restore(training_runners['sess'], cfg['model_path'])

        utils.print_start_info(cfg, max_steps, is_training=False)

        data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False, use_filename_queue=False)
        prefetch_threads = threading.Thread(
            target=data_prefetch_init_fn,
            args=(training_runners['sess'], training_runners['coord']))
        prefetch_threads.start()

        # run a few batches and accumulate the first extractable loss
        # results = training_runners['sess'].run( [ *loss_ops ] )
        # losses = results[0]
        x = 0
        for step in range(3):
            results = training_runners['sess'].run([*loss_ops])
            x = x + results[0]
            if training_runners['coord'].should_stop():
                break

    tf.reset_default_graph()
    utils.request_data_loading_end(training_runners)
    utils.end_data_loading_and_sess(training_runners)
    return x / 2.
def run_extract_losses(avg_img, args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    stats = Statistics()

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg, is_training=False, use_filename_queue=False)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        # model = utils.setup_model( inputs, cfg, is_training=False )
        loss_names = [avg_img_to_loss_type(args.avg_type, given_task)]  # keep format the same as extract_losses.py
        loss_fn = get_loss_op(loss_names[0])

        # execute training
        start_time = time.time()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        try:
            utils.print_start_info(cfg, max_steps, is_training=False)

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            # training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # compute the loss of the average image against every target
            filenames = []
            loss_names_to_vals = {name: [] for name in loss_names}
            start = time.perf_counter()
            print_every = int(args.print_every)

            for step in range(max_steps):
                data_idx, target, mask = training_runners['sess'].run(
                    [inputs['data_idxs'], inputs['target_batch'], inputs['mask_batch']])
                loss = loss_fn(avg_img, target, mask)
                # print(loss)
                assert np.isfinite(loss) and loss >= 0.0
                loss_names_to_vals[loss_names[0]].append(loss)
                filenames.extend(data_idx)
                stats.push(loss)

                if step % print_every == 0 and step > 0:
                    print('Step {0} of {1}: (Mean {5}: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'.format(
                        step, max_steps - 1, stats.mean(), np.sqrt(stats.variance()),
                        (time.perf_counter() - start) / print_every, loss_names[0]))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            print('The size of losses is %s while we expect it to run for %d steps with batchsize %d'
                  % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time

            if args.out_name:
                out_name = args.out_name
            else:
                if args.data_split == "val":
                    split_name = "train"
                elif args.data_split == "test":
                    split_name = "val"
                else:
                    raise ValueError("Cannot adequately name output for data split {}".format(args.data_split))
                out_name = '{avg_type}__{task}_{split}_losses.pkl'.format(
                    task=given_task,
                    split=split_name,
                    avg_type="marginal" if args.avg_type == 'dense_marginal' else args.avg_type)
            save_path = os.path.join(save_dir, out_name)

            with open(save_path, 'wb') as f:
                loss_names_to_vals['file_indexes'] = filenames
                loss_names_to_vals['global_step'] = 0
                if 'dense_xentropy_loss' in loss_names_to_vals:
                    loss_names_to_vals['xentropy_loss'] = loss_names_to_vals['dense_xentropy_loss']
                    del loss_names_to_vals['dense_xentropy_loss']
                pickle.dump(loss_names_to_vals, f)

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo mv {fp} {out}/".format(fp=save_path, out=args.out_dir))
            else:
                if transfer:
                    os.makedirs('/home/ubuntu/s3/model_log/losses_transfer/', exist_ok=True)
                    os.system("sudo mv {fp} /home/ubuntu/s3/model_log/losses_transfer/".format(fp=save_path))
                else:
                    os.makedirs('/home/ubuntu/s3/model_log/losses/', exist_ok=True)
                    os.system("sudo mv {fp} /home/ubuntu/s3/model_log/losses/".format(fp=save_path))

            print('saved losses to {0}'.format(save_path))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
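
# A hedged sketch of what an L1-style loss_fn above might compute for the
# average-image baseline: mean absolute error between the dataset-average
# prediction and a target, weighted by the mask. The real get_loss_op
# implementation may differ; this is only to make the baseline concrete.
def masked_l1_loss(avg_img, target, mask):
    """Mask-weighted mean absolute error between avg_img and target."""
    diff = np.abs(avg_img - target)
    mask = np.broadcast_to(mask, diff.shape)
    denom = np.sum(mask)
    return float(np.sum(diff * mask) / denom) if denom > 0 else 0.0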
def run_extract_representations( args, cfg ):
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        cfg['randomize'] = False
        cfg['num_epochs'] = 1
        # cfg['num_read_threads'] = 5
        # cfg['batch_size'] = 2
        # if cfg['model_path'] is None:
        #     cfg['model_path'] = tf.train.latest_checkpoint( os.path.join( args.cfg_dir, "logs/slim-train/" ) )
        cfg['model_path'] = os.path.join( args.cfg_dir, "logs/slim-train/model.ckpt-59690" )

        # create ops and placeholders
        tf.logging.set_verbosity( tf.logging.INFO )
        inputs = utils.setup_input( cfg, is_training=False, use_filename_queue=True )
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training
        start_time = time.time()
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )

            utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

            data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=True )
            training_runners[ 'threads' ] = data_prefetch_init_fn(
                training_runners[ 'sess' ], training_runners[ 'coord' ] )

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            representations, data_idx = training_runners['sess'].run(
                [ model['model'].encoder_output, inputs[ 'data_idxs' ] ] )
            filenames += [ inputs[ 'filepaths_list' ][ i ] for i in data_idx ]
            print( 'Got first batch representation with size: {0}'.format( representations.shape ) )

            # run the remaining examples
            for step in range( inputs[ 'max_steps' ] - 1 ):
                if step % 100 == 0:
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))

                encoder_output, data_idx = training_runners['sess'].run(
                    [ model['model'].encoder_output, inputs[ 'data_idxs' ] ] )
                representations = np.append( representations, encoder_output, axis=0 )
                filenames += [ inputs[ 'filepaths_list' ][ i ] for i in data_idx ]

                if training_runners['coord'].should_stop():
                    break

            print('The size of representations is %s while we expect it to run for %d steps with batchsize %d'
                  % (representations.shape, inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time

            save_path = os.path.join( args.cfg_dir, '../representations.pkl' )
            with open( save_path, 'wb' ) as f:
                pickle.dump( { 'filenames': filenames, 'representations': representations }, f )
            print( 'saved representations to {0}'.format( save_path ))
            print('time to train %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time/(60*60)))
            print('avg time per epoch: %.3f hrs' % ( (end_train_time/(60*60)) / cfg['num_epochs']) )
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
def run_extract_representations( args, cfg, file_to_process ):
    setup_input_fn = utils.setup_input

    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        tf.logging.set_verbosity( tf.logging.INFO )
        inputs = {}
        inputs['input_batch'] = tf.placeholder( tf.float32, shape=[1, 224, 224, 3], name='input_placeholder' )
        inputs['target_batch'] = tf.placeholder( tf.float32, shape=[1, 1000], name='target_placeholder' )
        inputs['mask_batch'] = tf.placeholder( tf.float32, shape=[1], name='mask_placeholder' )
        inputs['data_idxs'] = tf.placeholder( tf.int32, shape=[1], name='data_idx_placeholder' )
        inputs['num_samples_epoch'] = len(file_to_process)
        inputs['max_steps'] = len(file_to_process)

        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )
        m = model['model']

        # execute training
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return

            # restore everything except the global step
            to_restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            for v in tuple(to_restore):
                if 'global_step' in v.name:
                    to_restore.remove(v)
            saver_for_kd = tf.train.Saver(to_restore)
            saver_for_kd.restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )
            # model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )

            for step, filename in enumerate(file_to_process):
                start_time = time.time()
                if step % 100 == 0:
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))

                model_name, p, v = filename.decode('UTF-8').split('/')
                print(filename)
                img_name = '/home/ubuntu/s3/{}/rgb/point_{}_view_{}_domain_rgb.png'.format(model_name, p, v)
                sfm_dir = 's3://taskonomy-unpacked-oregon/{}/softmax_1000'.format(model_name)
                os.system('sudo mkdir -p /home/ubuntu/s3/{}/softmax_1000/'.format(model_name))
                os.system('mkdir -p /home/ubuntu/temp/{}/'.format(model_name))
                npy_name = 'point_{}_view_{}.npy'.format(p, v)
                if os.path.isfile('/home/ubuntu/s3/{}/softmax_1000/{}'.format(model_name, npy_name)):
                    continue
                if not os.path.isfile(img_name):
                    continue

                img = skimage.io.imread(img_name, as_grey=False)
                img = resize_rescale_imagenet(img, new_dims=(224, 224))
                img = np.reshape(img, (1, 224, 224, 3))
                feed_dict = {inputs['input_batch']: img}
                predicted = training_runners['sess'].run(
                    model['model'].encoder_output, feed_dict=feed_dict)
                # maxs = np.amax(predicted, axis=-1)
                # softmax = np.exp(predicted - np.expand_dims(maxs, axis=-1))
                # sums = np.sum(softmax, axis=-1)
                # softmax = softmax / np.expand_dims(sums, -1)
                # print(softmax)

                local_npy = os.path.join('/home/ubuntu/temp/{}'.format(model_name), npy_name)
                with open(local_npy, 'wb') as fp:
                    np.save(fp, predicted)
                os.system('aws s3 mv {} {}/'.format(local_npy, sfm_dir))

                if training_runners['coord'].should_stop():
                    break

            end_train_time = time.time() - start_time
            print('time to extract %.3f ' % (end_train_time))
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
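
# The commented-out block above sketches a numerically stable softmax over the
# saved logits. A standalone version of that calculation (illustrative only;
# the loop currently saves the raw encoder output rather than the softmax):
def stable_softmax(logits):
    """Softmax over the last axis, shifted by the per-row max for numerical stability."""
    shifted = logits - np.amax(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)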
def run_extract_losses(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg, is_training=False, use_filename_queue=False)
        # RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        # RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=False)

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {'sess': tf.Session(), 'coord': tf.train.Coordinator()}
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Attention, model_path is ', cfg['model_path'])

            if given_task == 'rgb2depth_sota':
                # restore everything except the global step and the Adam slots
                non_encoder_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
                for v in tuple(non_encoder_var):
                    if 'global_step' in v.name:
                        non_encoder_var.remove(v)
                    if 'Adam' in v.name:
                        non_encoder_var.remove(v)
                saver_sota = tf.train.Saver(non_encoder_var)
                saver_sota.restore(training_runners['sess'], cfg['model_path'])
            else:
                model['saver_op'].restore(training_runners['sess'], cfg['model_path'])
            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners['sess'].run(var))

            utils.print_start_info(cfg, max_steps, is_training=False)

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            m = model['model']

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            losses = []
            results = training_runners['sess'].run(
                [inputs['data_idxs'], m.target_images, m.masks, m.decoder_output])
            data_idx = results[0]
            target = results[1]
            mask = results[2]
            predicted = results[3]
            normalized_loss = get_norm_loss(target, mask, predicted)
            filenames.extend(data_idx)
            losses.append(normalized_loss)
            print("Step number: {}".format(0), data_idx)

            # run the remaining examples
            start = time.perf_counter()
            max_steps = 3000
            for step in range(max_steps - 1):
                results = training_runners['sess'].run(
                    [inputs['data_idxs'], m.target_images, m.masks, m.decoder_output])
                data_idx = results[0]
                target = results[1]
                mask = results[2]
                predicted = results[3]
                normalized_loss = get_norm_loss(target, mask, predicted)
                filenames.extend(data_idx)
                losses.append(normalized_loss)
                stats.push(normalized_loss)

                if step % print_every == 0 and step > 0:
                    print('Step {0} of {1}: ({5} loss: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'.format(
                        step, max_steps - 1, stats.mean(), np.sqrt(stats.variance()),
                        (time.perf_counter() - start) / print_every, 'norm_depth'))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            print('The size of losses is %s while we expect it to run for %d steps with batchsize %d'
                  % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                out_name = '{task}_{split}_losses_normed.pkl'.format(task=given_task, split=args.data_split)
            save_path = os.path.join(save_dir, out_name)

            loss_names_to_vals = {}
            with open(save_path, 'wb') as f:
                loss_names_to_vals['file_indexes'] = filenames
                loss_names_to_vals['global_step'] = 0
                loss_names_to_vals['l1_loss'] = losses
                pickle.dump(loss_names_to_vals, f)

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo cp {fp} {out}/".format(fp=save_path, out=args.out_dir))
            else:
                if transfer:
                    copy_to = cfg['log_root']
                else:
                    copy_to = os.path.join(cfg['log_root'], given_task)
                os.system("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
                print("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))

            print('saved losses to {0}'.format(save_path))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)