Example #1
def run_training( cfg ):
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input( cfg, is_training=False, use_filename_queue=True )
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training 
        start_time = time.time()
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn( inputs, cfg, is_training=False, use_filename_queue=True )
        training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
        try:
            # This just returns the input as output. It is for testing the
            #  data input only.
            for step in range( inputs[ 'max_steps' ] ):
                input_batch, target_batch, data_idx = training_runners['sess'].run( [ 
                        model['input_batch'],  model['target_batch'], model[ 'data_idxs' ] ] )

                if training_runners['coord'].should_stop():
                    break
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
        # else: # Use tf.slim
        #     train_log_dir = os.path.join( cfg['log_dir'], 'slim-train' )

        #     # When ready to use a model, use the code below
        #     train(  model[ 'train_op' ],
        #             train_log_dir,
        #             get_data_prefetch_threads_init_fn( inputs, cfg ), 
        #             global_step=model[ 'global_step' ],
        #             number_of_steps=inputs[ 'max_steps' ],
        #             init_fn=model[ 'init_fn' ],
        #             save_summaries_secs=300,
        #             save_interval_secs=600,
        #             saver=model[ 'saver_op' ] ) 

        end_train_time = time.time() - start_time
        print('time to train %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time/(60*60)))
        print('avg time per epoch: %.3f hrs' % ( (end_train_time/(60*60)) / cfg['num_epochs']) )
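
# Minimal, hedged sketch of the Session/Coordinator lifecycle the examples in
# this file rely on. It assumes plain TensorFlow 1.x and uses a toy
# tf.train.range_input_producer queue instead of this codebase's utils
# prefetch-thread helpers.
import tensorflow as tf

with tf.Graph().as_default():
    queue = tf.train.range_input_producer(limit=10, num_epochs=1, shuffle=False)
    item = queue.dequeue()

    sess = tf.Session()
    sess.run(tf.local_variables_initializer())      # num_epochs uses a local counter
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            print(sess.run(item))
    except tf.errors.OutOfRangeError:
        pass                                        # epoch limit reached
    finally:
        coord.request_stop()                        # ask the loader threads to exit
        coord.join(threads)                         # and wait for them
        sess.close()
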
def run_extract_representations(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        tf.logging.set_verbosity(tf.logging.INFO)
        inputs = setup_input_fn(cfg,
                                is_training=False,
                                use_filename_queue=False)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # no model is built here: the raw input batch is used as the representation

        # set up metrics to evaluate
        # names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, inputs['max_steps'], is_training=False)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return

            utils.print_start_info(cfg, inputs['max_steps'], is_training=False)
            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            representations, data_idx = training_runners['sess'].run(
                [inputs['input_batch'], inputs['data_idxs']])

            filenames.extend(data_idx)
            if isinstance(representations, list):
                representations = representations[0]
            representations = representations.reshape(
                (-1, np.prod(cfg['input_dim'])))
            print('Got first batch representation with size: {0}'.format(
                representations.shape))

            # run the remaining examples
            for step in range(inputs['max_steps'] - 1):
                #for step in range( 10 ):
                if step % 100 == 0:
                    print('Step {0} of {1}'.format(step,
                                                   inputs['max_steps'] - 1))

                # This is just for GAN, for the LEO meeting
                encoder_output, data_idx = training_runners['sess'].run(
                    [inputs['input_batch'], inputs['data_idxs']])
                if isinstance(encoder_output, list):
                    encoder_output = encoder_output[0]
                representations = np.append(representations,
                                            encoder_output.reshape(
                                                (-1,
                                                 np.prod(cfg['input_dim']))),
                                            axis=0)
                filenames.extend(data_idx)

                if training_runners['coord'].should_stop():
                    break

            print(
                'Extracted representations have shape %s; expected to run for %d steps with batch size %d'
                % (representations.shape, inputs['max_steps'],
                   cfg['batch_size']))

            end_train_time = time.time() - start_time
            save_path = os.path.join(
                save_dir, '{task}_{split}_representations.pkl'.format(
                    task='pixels', split=args.data_split))

            with open(save_path, 'wb') as f:
                pickle.dump(
                    {
                        'file_indexes': filenames,
                        'representations': representations
                    }, f)

            copy_to = None
            if args.out_dir:
                os.system("sudo cp {fp} {out}/".format(fp=save_path,
                                                       out=args.out_dir))
                copy_to = args.out_dir
            else:
                if transfer:
                    os.system(
                        "sudo cp {fp} /home/ubuntu/s3/model_log/representations_transfer/"
                        .format(fp=save_path))
                    copy_to = '/home/ubuntu/s3/model_log/representations_transfer/'
                else:
                    os.system(
                        "sudo cp {fp} /home/ubuntu/s3/model_log/representations/"
                        .format(fp=save_path))
                    copy_to = "/home/ubuntu/s3/model_log/representations/"

            print('saved representations to {0}'.format(save_path))
            print('copied representations to {0}'.format(copy_to))
            print('time to extract %d epochs: %.3f hrs' %
                  (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
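
# Hedged usage sketch: reading back the representations pickle written above.
# The file name is hypothetical; it follows the '{task}_{split}_representations.pkl'
# pattern used in run_extract_representations.
import pickle
import numpy as np

with open('pixels_val_representations.pkl', 'rb') as f:
    saved = pickle.load(f)
reps = np.asarray(saved['representations'])   # (num_examples, prod(input_dim))
idxs = saved['file_indexes']                  # data indexes in extraction order
assert reps.shape[0] == len(idxs)
print('loaded {} representations of dim {}'.format(reps.shape[0], reps.shape[1]))
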
Example #3
def run_extract_losses(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg,
                                is_training=False,
                                use_filename_queue=False)
        #RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        #RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        # pdb.set_trace()
        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=False)
        loss_names, loss_ops = get_extractable_losses(cfg, model)
        if 'l1_loss' in loss_names:
            display_loss = 'l1_loss'
        elif 'l2_loss' in loss_names:
            display_loss = 'l2_loss'
        elif 'xentropy' in loss_names:
            display_loss = 'xentropy'
        elif 'metric_loss' in loss_names:
            display_loss = 'metric_loss'
        elif 'cycle_loss' in loss_names:
            display_loss = 'cycle_loss'
        else:
            display_loss = 'total_loss'

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Note: restoring model from', cfg['model_path'])
            model['saver_op'].restore(training_runners['sess'],
                                      cfg['model_path'])

            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners[ 'sess' ].run(var))

            utils.print_start_info(cfg, max_steps, is_training=False)
            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            loss_names_to_vals = {name: [] for name in loss_names}
            results = training_runners['sess'].run([
                inputs['data_idxs'], inputs['target_batch'],
                inputs['mask_batch'], *loss_ops
            ])
            #gs = results[1]
            data_idx = results[0]
            losses = results[3:]
            target_input = results[1]
            mask_input = results[2]
            for i, name in enumerate(loss_names):
                loss_names_to_vals[name].append(losses[i])
            filenames.extend(data_idx)
            print("Step number: {}".format(1), (data_idx))
            # print(target_input, target_input.sum())
            # return
            # training_runners['sess'].run([v for v in tf.global_variables() if "transfer/rep_conv_1/weights" in v.name][0])
            # run the remaining examples
            start = time.perf_counter()
            for step in range(max_steps - 1):
                results = training_runners['sess'].run([
                    inputs['data_idxs'],
                    # [v for v in tf.global_variables() if "transfer/rep_conv_1/weights/(weights)" in v.name][0],
                    # model['model'].encoder_endpoints['net1_1_output'],
                    # model['model'].encoder_endpoints['net1_2_output'],
                    *loss_ops
                ])
                data_idx = results[0]
                # print(data_idx)
                losses = results[1:]
                # p, t, m = results[1], results[2], results[3]
                # losses = results[4:]

                # print(p.mean(), t)
                for i, name in enumerate(loss_names):
                    loss_names_to_vals[name].append(losses[i])
                filenames.extend(data_idx)
                stats.push(loss_names_to_vals[display_loss][-1])

                # baseline_loss = get_xentropy_loss(p, t, m)
                # tf_loss = loss_names_to_vals[display_loss][-1]
                # print('tf {} | ours {}'.format(tf_loss, baseline_loss))
                # pdb.set_trace()

                if step % print_every == 0 and step > 0:
                    print(
                        'Step {0} of {1}: ({5} loss: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'
                        .format(
                            step,
                            max_steps - 1,
                            stats.mean(),
                            np.sqrt(stats.variance()),
                            # stats.variance(),
                            (time.perf_counter() - start) / print_every,
                            display_loss))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            print(
                'Extracted losses for %d examples; expected to run for %d steps with batch size %d'
                % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                out_name = '{task}_{split}_losses.pkl'.format(
                    task=given_task, split=args.data_split)
            save_path = os.path.join(save_dir, out_name)

            with open(save_path, 'wb') as f:
                loss_names_to_vals['file_indexes'] = filenames
                loss_names_to_vals['global_step'] = 0
                pickle.dump(loss_names_to_vals, f)

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo cp {fp} {out}/".format(fp=save_path,
                                                       out=args.out_dir))
            else:
                if transfer:
                    copy_to = cfg['log_root']
                else:
                    copy_to = os.path.join(cfg['log_root'], given_task)
                os.system("sudo mv {fp} {dst}/".format(fp=save_path,
                                                       dst=copy_to))
                print("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
                # if transfer:
                #     os.makedirs('/home/ubuntu/s3/model_log/losses_transfer/', exist_ok=True)
                #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses_transfer/".format(fp=save_path))
                # else:
                #     os.makedirs('/home/ubuntu/s3/model_log/losses/', exist_ok=True)
                #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses/".format(fp=save_path))

            print('saved losses to {0}'.format(save_path))
            print('time to extract %d epochs: %.3f hrs' %
                  (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
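
# Hedged usage sketch: summarizing the per-example losses pickle written by
# run_extract_losses. The file name below is hypothetical and follows the
# '{task}_{split}_losses.pkl' pattern used above.
import pickle
import numpy as np

with open('autoencoder_val_losses.pkl', 'rb') as f:
    losses = pickle.load(f)
losses.pop('file_indexes')
losses.pop('global_step')
for name, vals in losses.items():
    vals = np.asarray(vals)
    print('{}: mean {:.4f} | std {:.4f} | n {}'.format(
        name, vals.mean(), vals.std(), len(vals)))
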
def run_extract_losses( args, cfg, save_dir, given_task ):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer_imagenet
        setup_input_fn = utils.setup_input_transfer_imagenet
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity( tf.logging.ERROR )
    stats = Statistics()
    top5_stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn( cfg, is_training=False, use_filename_queue=False )
        #RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        #RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs[ 'max_steps' ], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training 
        start_time = time.time()
        utils.print_start_info( cfg, max_steps, is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return	
            print('Note: restoring model from', cfg['model_path'])
            model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )

            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners[ 'sess' ].run(var))

            utils.print_start_info( cfg, max_steps, is_training=False )
            data_prefetch_init_fn = get_data_prefetch_threads_init_fn( inputs, cfg, 
                is_training=False, use_filename_queue=False )
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=( training_runners[ 'sess' ], training_runners[ 'coord' ] ))
            prefetch_threads.start()
            
            # run one example so that we can calculate some statistics about the representations
            filenames = []
            accuracies = []
            if transfer:
                accuracy_op = model['model'].decoder.accuracy
                final_output = model['model'].decoder.final_output 
            else:
                accuracy_op = model['model'].accuracy
                final_output = model['model'].final_output 
            results = training_runners['sess'].run( [ 
                    inputs[ 'data_idxs' ], model['model'].global_step,
                    accuracy_op ] )       
            gs = results[1] 
            data_idx = results[0]
            accuracy = results[2]
            filenames.extend(data_idx)
            accuracies.append(accuracy)
            print("Step number: {}".format(gs))
            # print(loss_names_to_vals, data_idx)
            # return

            # run the remaining examples
            start = time.perf_counter()
            for step in range( max_steps - 1 ):
                results = training_runners['sess'].run( [
                        inputs[ 'data_idxs' ], 
                        final_output,
                        inputs['target_batch'],
                        accuracy_op ] )    
                data_idx = results[0]
                accuracy = results[-1]
                logits = results[1]
                gt = results[2]
                sorted_top5 = np.argsort(logits[0])[::-1][:5]
                sorted_gt = np.argsort(gt[0])[::-1][0]
                top5 = 0.
                if sorted_gt in sorted_top5:
                    top5 = 1.
                filenames.extend(data_idx)
                accuracies.append(accuracy)
                stats.push(accuracy)
                top5_stats.push(top5)
                if step % print_every == 0 and step > 0: 
                    print( 'Step {0} of {1}: ({5}: {2:.3f} || Top 5: {3:.3f} :: ({4:.2f} secs/step)'.format( 
                        step, max_steps - 1,
                        stats.mean(), 
                        top5_stats.mean(),
                        # stats.variance(),
                        (time.perf_counter() - start) / print_every,
                        'accuracy'
                        ))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            os.system("sudo touch /home/ubuntu/s3/imagenet_accuracy/{}_{}_{}.txt".format(
                given_task, int(stats.mean() * 1000) / 10., int(top5_stats.mean() * 1000) / 10.))
            print('Extracted accuracy for %d examples; expected to run for %d steps with batch size %d' % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                out_name = '{task}_{split}_imagenet_accuracy.pkl'.format(task=given_task, split=args.data_split)
            save_path = os.path.join( save_dir, out_name )
            
            val_accuracy = {}
            with open( save_path, 'wb' ) as f:
                val_accuracy['file_indexes'] = filenames
                val_accuracy['global_step'] = gs
                val_accuracy['accuracy'] = accuracies
                pickle.dump( val_accuracy, f )
            
            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo cp {fp} {out}/".format(fp=save_path, out=args.out_dir))
            else:
                if transfer:
                    copy_to = cfg['log_root']
                else:
                    copy_to = os.path.join(cfg['log_root'], given_task)
                os.system("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
                print("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))
                # if transfer:
                #     os.makedirs('/home/ubuntu/s3/model_log/losses_transfer/', exist_ok=True)
                #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses_transfer/".format(fp=save_path))
                # else:
                #     os.makedirs('/home/ubuntu/s3/model_log/losses/', exist_ok=True)
                #     os.system("sudo cp {fp} /home/ubuntu/s3/model_log/losses/".format(fp=save_path))

            print( 'saved accuracies to {0}'.format( save_path ))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time/(60*60)))
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
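
# Standalone numpy sketch of the top-5 check used above (batch size 1):
# ground truth is one-hot, logits are unnormalized class scores. All names
# here are illustrative, not part of the original code.
import numpy as np

def top5_hit(logits, one_hot_gt):
    top5 = np.argsort(logits)[::-1][:5]       # indexes of the five largest logits
    gt_class = int(np.argmax(one_hot_gt))
    return 1.0 if gt_class in top5 else 0.0

logits = np.random.randn(1000)
gt = np.zeros(1000)
gt[123] = 1.0
print('top-5 hit:', top5_hit(logits, gt))
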
def run_rand_baseline( args, cfg, given_task ):
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        tf.logging.set_verbosity( tf.logging.INFO )
        inputs = utils.setup_input( cfg, is_training=False, use_filename_queue=False )
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()
        
        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training 
        start_time = time.time()
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            
            utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

            data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn( inputs, cfg, is_training=False, use_filename_queue=False )
            #training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=( training_runners[ 'sess' ], training_runners[ 'coord' ] ))
            prefetch_threads.start()
            
            # run one example so that we can calculate some statistics about the representations
            targets = training_runners['sess'].run( inputs[ 'target_batch' ] )         
       
            # run the remaining examples
            for step in range( inputs[ 'max_steps' ] - 1 ):
            #for step in range( 10 ):
                if step % 100 == 0: 
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))
               
                target = training_runners['sess'].run( inputs[ 'target_batch' ] )  
                targets = np.append( targets, target, axis=0)

                if training_runners['coord'].should_stop():
                    break

            rand_idx = [random.randint(0, targets.shape[0] - 1) for i in range(targets.shape[0])] 
            rand_target = [targets[i] for i in rand_idx]
            rand_target = np.vstack(rand_target)

            counter = 0
            loss_sum = 0
            for step in range( inputs[ 'max_steps' ] - 1 ):
            #for step in range( 10 ):
                if step % 100 == 0: 
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))
               
                tar = targets[step*cfg['batch_size']:(step+1)*cfg['batch_size']]
                rand = rand_target[step*cfg['batch_size']:(step+1)*cfg['batch_size']]

                losses = training_runners['sess'].run( model['model'].losses, feed_dict={
                    inputs['target_batch']: tar, model['model'].final_output:rand})
                loss_sum += losses[0]
                counter += 1
                
                if training_runners['coord'].should_stop():
                    break

            print(loss_sum)
            print(counter)
            print('random_baseline has loss: {loss}'.format(loss=loss_sum/counter))
            end_train_time = time.time() - start_time
            
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
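
# Minimal numpy sketch of the random-baseline idea above: pair every target with
# a randomly drawn target and measure the mean discrepancy (L2 here). The real
# example instead feeds the shuffled targets through model['model'].losses.
import numpy as np

targets = np.random.rand(256, 16)                               # stand-in target batch
rand_idx = np.random.randint(0, targets.shape[0], size=targets.shape[0])
rand_targets = targets[rand_idx]
baseline_l2 = np.mean(np.sum((targets - rand_targets) ** 2, axis=-1))
print('random-baseline L2: {:.4f}'.format(baseline_l2))
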
Example #6
def run_training(cfg):
    # set up logging
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input(cfg,
                                   is_training=False,
                                   use_filename_queue=True)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        model = setup_model(inputs, cfg, is_training=False)

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, inputs['max_steps'], is_training=False)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        if cfg['model_path'] is None:
            print('Please specify a checkpoint directory')
            return
        cfg['randomize'] = False
        model['saver_op'].restore(training_runners['sess'], cfg['model_path'])

        utils.print_start_info(cfg,
                               inputs['max_steps'],
                               is_training=False)

        data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False, use_filename_queue=True)
        training_runners['threads'] = data_prefetch_init_fn(
            training_runners['sess'], training_runners['coord'])

        representations, input_batch, target_batch, data_idx, mask_batch = training_runners[
            'sess'].run([
                model['model'].encoder_output, inputs['input_batch'],
                inputs['target_batch'], inputs['data_idxs'],
                inputs['mask_batch']
            ])

        print('Got first batch representation with size:%s' %
              (representations.shape))
        for step in range(inputs['max_steps'] - 1):
            encoder_output, input_batch, target_batch, data_idx, mask_batch = training_runners[
                'sess'].run([
                    model['model'].encoder_output, inputs['input_batch'],
                    inputs['target_batch'], inputs['data_idxs'],
                    inputs['mask_batch']
                ])
            representations = np.append(representations,
                                        encoder_output,
                                        axis=0)

            if training_runners['coord'].should_stop():
                break

        print(
            'Extracted representations have shape %s; expected to run for %d steps with batch size %d'
            % (representations.shape, inputs['max_steps'], cfg['batch_size']))

        utils.request_data_loading_end(training_runners)
        utils.end_data_loading_and_sess(training_runners)

        end_train_time = time.time() - start_time
        print('time to train %d epochs: %.3f hrs' %
              (cfg['num_epochs'], end_train_time / (60 * 60)))
        print('avg time per epoch: %.3f hrs' %
              ((end_train_time / (60 * 60)) / cfg['num_epochs']))
Example #7
def run_training(cfg, cfg_dir):
    # set up logging
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input_transfer(cfg, is_training=True)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()
        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=True)
        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, inputs['max_steps'], is_training=True)
        if cfg['model_type'] == 'empty':  # Can't use tf.slim because there are no trainable variables
            training_runners = {
                'sess': tf.Session(),
                'coord': tf.train.Coordinator()
            }
            data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=True)
            training_runners['threads'] = data_prefetch_init_fn(
                training_runners['sess'], training_runners['coord'])
            try:
                # This just returns the input as output. It is for testing the
                #  data input only.
                for step in range(inputs['max_steps']):
                    input_batch, target_batch, data_idx = training_runners[
                        'sess'].run([
                            model['input_batch'], model['target_batch'],
                            model['data_idxs']
                        ])

                    if training_runners['coord'].should_stop():
                        break
            finally:
                utils.request_data_loading_end(training_runners)
                utils.end_data_loading_and_sess(training_runners)
        else:  # Use tf.slim
            train_log_dir = os.path.join(cfg['log_dir'], 'slim-train')
            permanent_checkpoint_dir = os.path.join(cfg['log_dir'],
                                                    'checkpoints')

            session_config = tf.ConfigProto()
            session_config.gpu_options.allow_growth = True
            #max_to_keep = cfg['num_epochs'] * 2
            max_to_keep = 10
            if 'max_ckpts_to_keep' in cfg:
                max_to_keep = cfg['max_ckpts_to_keep']
            # When ready to use a model, use the code below
            train(
                model['train_op'],
                train_log_dir,
                utils.get_data_prefetch_threads_init_fn_transfer(
                    inputs, cfg, is_training=True),
                train_step_fn=model['train_step_fn'],
                train_step_kwargs=model['train_step_kwargs'],
                global_step=model['global_step'],
                number_of_steps=inputs['max_steps'],
                number_of_epochs=cfg['num_epochs'],
                init_fn=model['init_fn'],
                save_checkpoint_every=max(inputs['max_steps'] // (max_to_keep),
                                          500),
                cfg_dir=cfg_dir,
                #RuntimeDeterminedEnviromentVars.steps_per_epoch,
                permanent_checkpoint_dir=permanent_checkpoint_dir,
                save_summaries_secs=cfg['summary_save_every_secs'],
                save_interval_secs=cfg['checkpoint_save_every_secs'],
                saver=model['saver_op'],
                return_accuracy='return_accuracy' in cfg
                and cfg['return_accuracy'],
                session_config=session_config)

        end_train_time = time.time() - start_time
        print('time to train %d epochs: %.3f hrs' %
              (cfg['num_epochs'], end_train_time / (60 * 60)))
        print('avg time per epoch: %.3f hrs' %
              ((end_train_time / (60 * 60)) / cfg['num_epochs']))
Example #8
def run_training(cfg, cfg_dir, args):
    if args.stat_type == "mean":
        statistic = MeanMeter(cfg)
    elif args.stat_type == 'median':
        statistic = MedianMeter(cfg)
    elif args.stat_type == 'marginal':
        statistic = DiscreteDistributionMeter(cfg, args.not_one_hot)
    elif args.stat_type == 'dense_marginal':
        statistic = DenseDiscreteDistributionMeter(cfg)
    elif args.stat_type == 'moments':
        statistic = MomentsMeter(cfg)
    else:
        raise NotImplementedError("No average defined for type: {}".format(
            args.stat_type))

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = utils.setup_input(cfg, is_training=False)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # execute training
        start_time = time.time()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        utils.print_start_info(cfg, max_steps, is_training=False)
        data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False)
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }

        prefetch_threads = threading.Thread(
            target=data_prefetch_threads_init_fn,
            args=(training_runners['sess'], training_runners['coord']))
        prefetch_threads.start()

        target_batch = training_runners['sess'].run(inputs['target_batch'])
        # training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
        try:
            # This just returns the input as output. It is for testing the
            #  data input only.
            start_time = time.time()
            batch_time = time.time()
            k = int(args.print_every)
            for step in range(max_steps):
                target_batch, mask_batch = training_runners['sess'].run(
                    [inputs['target_batch'], inputs['mask_batch']])
                target_batch = map_to_img(target_batch.mean(axis=0), cfg)
                if len(mask_batch.shape) > 1:
                    mask_batch = mask_batch.mean(axis=0)
                else:
                    mask_batch = 1

                statistic.update(target_batch, mask_batch)
                if (step + 1) % k == 0:
                    print('Step %d/%d: %.2f s/step ' %
                          (step + 1, max_steps,
                           (time.time() - batch_time) / k))
                    batch_time = time.time()
                    # print(statistic.get())
                    # break
                if training_runners['coord'].should_stop():
                    break

            end_train_time = time.time() - start_time
            print('time to train %d epochs: %.3f hrs' %
                  (cfg['num_epochs'], end_train_time / (60 * 60)))
            print('avg time per epoch: %.3f hrs' %
                  ((end_train_time / (60 * 60)) / cfg['num_epochs']))
            if args.stat_type == 'moments':
                save_moments(statistic, cfg, args)
            else:
                save_data(statistic, cfg, args)
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
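
# Hedged sketch: the MomentsMeter used above is defined elsewhere in this
# codebase; a streaming mean/variance accumulator in the spirit of Welford's
# algorithm could look like this (illustrative only, not the project's class).
import numpy as np

class RunningMoments:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)

    def update(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def get(self):
        var = self.m2 / max(self.n - 1, 1)
        return self.mean, var

meter = RunningMoments(shape=(256, 256, 3))
for _ in range(10):
    meter.update(np.random.rand(256, 256, 3))
mean, var = meter.get()
print(mean.mean(), var.mean())
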
def run_extract_losses(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
        if given_task == 'pixels':
            get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer_imagenet
            setup_input_fn = utils.setup_input_transfer_imagenet
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg,
                                is_training=True,
                                use_filename_queue=False)
        #RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        #RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=True)

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)
        train_step_fn = model['train_step_fn']
        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=True)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Note: restoring model from', cfg['model_path'])
            restore_ckpt = not args.from_scratch
            if restore_ckpt:
                non_encoder_var = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES)
                adams = []
                for v in tuple(non_encoder_var):
                    if 'Adam' in v.name:
                        non_encoder_var.remove(v)
                        adams.append(v)
                        continue
                    if 'finetune_encoder_imagenet' in cfg:
                        for x in model['model'].encoder_vars:
                            if v.name == x.name:
                                non_encoder_var.remove(v)
                if not args.metric_only:
                    saver_for_transfer = tf.train.Saver(non_encoder_var)
                    #training_runners['sess'].run(saver_for_transfer)
                    saver_for_transfer.restore(training_runners['sess'],
                                               cfg['model_path'])
                else:
                    #saver_for_transfer = tf.train.Saver(non_encoder_var)
                    training_runners['sess'].run(
                        tf.variables_initializer(non_encoder_var))
                training_runners['sess'].run(tf.variables_initializer(adams))
                print('Loading source encoder...')
                if 'finetune_encoder_imagenet' in cfg:
                    model['init_fn'](training_runners['sess'])
                print('Starting training...')
            else:
                init_op = tf.global_variables_initializer()
                training_runners['sess'].run(init_op)
            assign_op = model['global_step'].assign(0)
            training_runners['sess'].run(assign_op)
            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners[ 'sess' ].run(var))

            utils.print_start_info(cfg, max_steps, is_training=True)
            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=True, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # set up checkpoint saving and run the training loop
            start = time.perf_counter()
            saver = tf.train.Saver()
            save_ckpt_name = 'places'
            if args.from_scratch:
                save_ckpt_name = 'places_scratch_{}_{}'.format(
                    args.layers, args.data_used)
            if args.train_encoder:
                save_ckpt_name = 'places_encoder'
            for step in range(max_steps // 2 - 1):
                #for step in range(10):
                total_loss, should_stop = train_step_fn(
                    training_runners['sess'],
                    model['train_op'],
                    model['global_step'],
                    train_step_kwargs=model['train_step_kwargs'])
                # print(data_idx)
                # print(p.mean(), t)
                stats.push(total_loss)

                if step % print_every == 0 and step > 0:
                    print(
                        'Step {0} of {1}: ({5}: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'
                        .format(
                            step,
                            max_steps - 1,
                            stats.mean(),
                            np.sqrt(stats.variance()),
                            # stats.variance(),
                            (time.perf_counter() - start) / print_every,
                            'Total_loss'))
                    start = time.perf_counter()
                if step % 3000 == 2999:
                    saver.save(
                        training_runners['sess'],
                        os.path.join(cfg['log_root'], given_task,
                                     '{}_{}'.format(save_ckpt_name, step)))

                if training_runners['coord'].should_stop():
                    break

            #print('Heressss')
            saver.save(
                training_runners['sess'],
                os.path.join(cfg['log_root'], given_task, save_ckpt_name))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
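
# Hedged sketch (toy graph, plain TF 1.x) of the selective-restore pattern above:
# build a Saver over every global variable except the Adam slot variables, so a
# checkpoint written without optimizer state can still be restored, then
# re-initialize the Adam variables separately.
import tensorflow as tf

with tf.Graph().as_default():
    w = tf.get_variable('w', shape=[4], initializer=tf.zeros_initializer())
    loss = tf.reduce_sum(tf.square(w))
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)   # adds '*/Adam' slot vars

    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    restore_vars = [v for v in all_vars if 'Adam' not in v.name]
    adam_vars = [v for v in all_vars if 'Adam' in v.name]

    saver_for_transfer = tf.train.Saver(restore_vars)         # restore these from a checkpoint
    init_adam_op = tf.variables_initializer(adam_vars)        # initialize the rest from scratch
    print([v.name for v in restore_vars], [v.name for v in adam_vars])
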
Example #10
def run_extract_losses_5_steps(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg,
                                is_training=False,
                                use_filename_queue=False)
        #RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        #RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)

        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=False)
        loss_names, loss_ops = get_extractable_losses(cfg, model)
        if 'l1_loss' in loss_names:
            display_loss = 'l1_loss'
        elif 'l2_loss' in loss_names:
            display_loss = 'l2_loss'
        elif 'xentropy' in loss_names:
            display_loss = 'xentropy'
        elif 'metric_loss' in loss_names:
            display_loss = 'metric_loss'
        elif 'cycle_loss' in loss_names:
            display_loss = 'cycle_loss'
        else:
            display_loss = 'total_loss'

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # execute training
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        if cfg['model_path'] is None:
            print('Please specify a checkpoint directory')
            return
        print('Note: restoring model from', cfg['model_path'])
        model['saver_op'].restore(training_runners['sess'], cfg['model_path'])

        utils.print_start_info(cfg, max_steps, is_training=False)
        data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
            inputs, cfg, is_training=False, use_filename_queue=False)
        prefetch_threads = threading.Thread(target=data_prefetch_init_fn,
                                            args=(training_runners['sess'],
                                                  training_runners['coord']))
        prefetch_threads.start()

        # run one example so that we can calculate some statistics about the representations
        # results = training_runners['sess'].run( [ *loss_ops ] )
        # losses = results[0]
        x = 0
        for step in range(3):
            results = training_runners['sess'].run([*loss_ops])
            x = x + results[0]
            if training_runners['coord'].should_stop():
                break

    tf.reset_default_graph()
    utils.request_data_loading_end(training_runners)
    utils.end_data_loading_and_sess(training_runners)
    return x / 2.
def run_extract_losses(avg_img, args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    stats = Statistics()

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg,
                                is_training=False,
                                use_filename_queue=False)
        RuntimeDeterminedEnviromentVars.load_dynamic_variables(inputs, cfg)
        RuntimeDeterminedEnviromentVars.populate_registered_variables()

        # build model (and losses and train_op)
        # model = utils.setup_model( inputs, cfg, is_training=False )
        loss_names = [avg_img_to_loss_type(args.avg_type, given_task)
                      ]  # Keep format the same as extract_losses.py
        loss_fn = get_loss_op(loss_names[0])

        # execute training
        start_time = time.time()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        try:
            utils.print_start_info(cfg, max_steps, is_training=False)

            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            #training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()

            # run one example so that we can calculate some statistics about the representations
            filenames = []
            loss_names_to_vals = {name: [] for name in loss_names}
            start = time.perf_counter()

            print_every = int(args.print_every)
            # run the remaining examples
            for step in range(max_steps):
                data_idx, target, mask = training_runners['sess'].run([
                    inputs['data_idxs'], inputs['target_batch'],
                    inputs['mask_batch']
                ])
                loss = loss_fn(avg_img, target, mask)
                # print(loss)
                assert np.isfinite(loss) and loss >= 0.0
                loss_names_to_vals[loss_names[0]].append(loss)
                filenames.extend(data_idx)
                stats.push(loss)

                if step % print_every == 0 and step > 0:
                    print(
                        'Step {0} of {1}: (Mean {5}: {2:.3f} || stddev: {3:.3f} :: ({4:.2f} secs/step)'
                        .format(step, max_steps - 1, stats.mean(),
                                np.sqrt(stats.variance()),
                                (time.perf_counter() - start) / print_every,
                                loss_names[0]))
                    start = time.perf_counter()
                if training_runners['coord'].should_stop():
                    break

            print(
                'Extracted losses for %d examples; expected to run for %d steps with batch size %d'
                % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                if args.data_split == "val":
                    split_name = "train"
                if args.data_split == "test":
                    split_name = "val"
                else:
                    raise ValueError(
                        "Cannot adequately name output for data split {}".
                        format(args.data_split))
                out_name = '{avg_type}__{task}_{split}_losses.pkl'.format(
                    task=given_task,
                    split=split_name,
                    avg_type="marginal"
                    if args.avg_type == 'dense_marginal' else args.avg_type)
            save_path = os.path.join(save_dir, out_name)

            with open(save_path, 'wb') as f:
                loss_names_to_vals['file_indexes'] = filenames
                loss_names_to_vals['global_step'] = 0
                if 'dense_xentropy_loss' in loss_names_to_vals:
                    loss_names_to_vals['xentropy_loss'] = loss_names_to_vals[
                        'dense_xentropy_loss']
                    del loss_names_to_vals['dense_xentropy_loss']
                pickle.dump(loss_names_to_vals, f)

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo mv {fp} {out}/".format(fp=save_path,
                                                       out=args.out_dir))
            else:
                if transfer:
                    os.makedirs('/home/ubuntu/s3/model_log/losses_transfer/',
                                exist_ok=True)
                    os.system(
                        "sudo mv {fp} /home/ubuntu/s3/model_log/losses_transfer/"
                        .format(fp=save_path))
                else:
                    os.makedirs('/home/ubuntu/s3/model_log/losses/',
                                exist_ok=True)
                    os.system("sudo mv {fp} /home/ubuntu/s3/model_log/losses/".
                              format(fp=save_path))

            print('saved losses to {0}'.format(save_path))
            print('time to extract %d epochs: %.3f hrs' %
                  (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
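
# Hedged sketch of what a masked average-image loss like the `loss_fn(avg_img,
# target, mask)` call above could compute; the real get_loss_op and
# avg_img_to_loss_type are defined elsewhere, so treat this masked L2 as an
# illustration only.
import numpy as np

def masked_l2_loss(avg_img, target, mask):
    diff = (target - avg_img) ** 2                   # broadcast the average over the batch
    mask = np.broadcast_to(mask, diff.shape)
    return float((diff * mask).sum() / np.maximum(mask.sum(), 1.0))

target = np.random.rand(4, 8, 8, 3)
avg_img = target.mean(axis=0)
mask = np.ones_like(target)
print('masked L2 vs. average image: {:.4f}'.format(masked_l2_loss(avg_img, target, mask)))
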
Example #12
def run_extract_representations( args, cfg ):
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        cfg['randomize'] = False
        cfg['num_epochs'] = 1
        # cfg['num_read_threads'] = 5
        # cfg['batch_size']=2
        #if cfg['model_path'] is None:
        #    cfg['model_path'] = tf.train.latest_checkpoint( os.path.join( args.cfg_dir, "logs/slim-train/" ) )
        cfg['model_path'] = os.path.join( args.cfg_dir, "logs/slim-train/model.ckpt-59690")
        # create ops and placeholders
        tf.logging.set_verbosity( tf.logging.INFO )
        inputs = utils.setup_input( cfg, is_training=False, use_filename_queue=True )
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()
        
        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )

        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics( inputs, model, cfg )

        # execute training 
        start_time = time.time()
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return	
            
            model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )
            
            utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

            data_prefetch_init_fn = utils.get_data_prefetch_threads_init_fn( inputs, cfg, is_training=False, use_filename_queue=True )
            training_runners[ 'threads' ] = data_prefetch_init_fn( training_runners[ 'sess' ], training_runners[ 'coord' ] )
            
            # run one example so that we can calculate some statistics about the representations
            filenames = []
            representations, data_idx = training_runners['sess'].run( [ 
                    model['model'].encoder_output, inputs[ 'data_idxs' ] ] )        
            filenames += [ inputs[ 'filepaths_list'][ i ] for i in data_idx ]
            print( 'Got first batch representation with size: {0}'.format( representations.shape ) )

            # run the remaining examples
            for step in range( inputs[ 'max_steps' ] - 1 ):
                if step % 100 == 0: 
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))
                encoder_output, data_idx = training_runners['sess'].run( [
                        model['model'].encoder_output, inputs[ 'data_idxs' ] ] )        
                representations = np.append(representations, encoder_output, axis=0)
                filenames += [ inputs[ 'filepaths_list'][ i ] for i in data_idx ]

                if training_runners['coord'].should_stop():
                    break

            print('Extracted representations have shape %s; expected to run for %d steps with batch size %d' % (representations.shape, inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            save_path = os.path.join( args.cfg_dir, '../representations.pkl' )
            with open( save_path, 'wb' ) as f:
                pickle.dump( { 'filenames': filenames, 'representations': representations }, f )
            print( 'saved representations to {0}'.format( save_path ))
            print('time to extract %d epochs: %.3f hrs' % (cfg['num_epochs'], end_train_time / (60 * 60)))
            print('avg time per epoch: %.3f hrs' % ((end_train_time / (60 * 60)) / cfg['num_epochs']))
        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
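# A minimal usage sketch (not part of the original pipeline) for reading back the pickle
# written by run_extract_representations above; the keys match the dict dumped there and
# the path is whatever save_path was used.
import pickle

def load_representations(save_path):
    # Returns (filenames, representations) exactly as saved above.
    with open(save_path, 'rb') as f:
        data = pickle.load(f)
    return data['filenames'], data['representations']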
def run_extract_representations( args, cfg, file_to_process):
    setup_input_fn = utils.setup_input
    # set up logging
    tf.logging.set_verbosity( tf.logging.INFO )

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        tf.logging.set_verbosity( tf.logging.INFO )
        inputs = {}
        inputs['input_batch'] = tf.placeholder( tf.float32, shape=[1,224,224,3], name='input_placeholder')
        inputs['target_batch'] = tf.placeholder( tf.float32, shape=[1,1000], name='target_placeholder' )
        inputs['mask_batch'] = tf.placeholder( tf.float32, shape=[1], name='mask_placeholder' )
        inputs['data_idxs'] = tf.placeholder( tf.int32, shape=[1], name='data_idx_placeholder')
        inputs['num_samples_epoch'] = len(file_to_process) 
        inputs['max_steps'] = len(file_to_process) 
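        # Unlike the queue-based variant above, inputs here are plain placeholders for a
        # single 224x224 RGB image; only input_batch is actually fed in the loop below.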
        
        RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        RuntimeDeterminedEnviromentVars.populate_registered_variables()
        
        # build model (and losses and train_op)
        model = utils.setup_model( inputs, cfg, is_training=False )
        m = model['model']

        # run the extraction pass
        utils.print_start_info( cfg, inputs[ 'max_steps' ], is_training=False )

        # start session and restore model
        training_runners = { 'sess': tf.Session(), 'coord': tf.train.Coordinator() }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return	
            
            to_restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)  
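            # build a saver over every global variable except global_step, then restore
            # the remaining weights from the checkpoint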
            for v in tuple(to_restore):     
                if 'global_step' in v.name:
                    to_restore.remove(v)
                            
            saver_for_kd = tf.train.Saver(to_restore)
            saver_for_kd.restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )
            #model[ 'saver_op' ].restore( training_runners[ 'sess' ], cfg[ 'model_path' ] )

            for step, filename in enumerate(file_to_process):
                start_time = time.time()
                if step % 100 == 0: 
                    print( 'Step {0} of {1}'.format( step, inputs[ 'max_steps' ] - 1 ))
                building, point, view = filename.decode('UTF-8').split('/')
                print(filename)
                img_name = '/home/ubuntu/s3/{}/rgb/point_{}_view_{}_domain_rgb.png'.format(building, point, view)
                sfm_dir = 's3://taskonomy-unpacked-oregon/{}/softmax_1000'.format(building)
                os.system('sudo mkdir -p /home/ubuntu/s3/{}/softmax_1000/'.format(building))
                os.system('mkdir -p /home/ubuntu/temp/{}/'.format(building))
                npy_name = 'point_{}_view_{}.npy'.format(point, view)
                if os.path.isfile('/home/ubuntu/s3/{}/softmax_1000/{}'.format(building, npy_name)):
                    continue
                if not os.path.isfile(img_name):
                    continue
                img = skimage.io.imread(img_name, as_grey=False)
                img = resize_rescale_imagenet(img, new_dims=(224,224))
                img = np.reshape(img, (1,224,224,3))
                feed_dict = {inputs['input_batch'] : img}
                predicted = training_runners['sess'].run( model['model'].encoder_output, feed_dict=feed_dict )
                # maxs = np.amax(predicted, axis=-1)
                # softmax = np.exp(predicted - np.expand_dims(maxs, axis=-1))
                # sums = np.sum(softmax, axis=-1)
                # softmax = softmax / np.expand_dims(sums, -1)
                # print(softmax)
                # pdb.set_trace()
                local_npy = os.path.join('/home/ubuntu/temp/{}'.format(building), npy_name)
                with open(local_npy, 'wb') as fp:
                    np.save(fp, predicted)
                os.system('aws s3 mv {} {}/'.format(local_npy, sfm_dir))
                if training_runners['coord'].should_stop():
                    break
                end_train_time = time.time() - start_time
                print('time to extract: %.3f secs' % end_train_time)

        finally:
            utils.request_data_loading_end( training_runners )
            utils.end_data_loading_and_sess( training_runners )
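# A small helper sketch (assumption, not original code): the .npy files saved above hold the
# raw encoder outputs; the commented-out block inside the loop suggests they were meant to be
# converted to a softmax_1000 distribution, which the function below does in a numerically
# stable way over the last axis.
import numpy as np

def logits_to_softmax(logits):
    # subtract the per-row max before exponentiating to avoid overflow
    shifted = logits - np.amax(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)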
def run_extract_losses(args, cfg, save_dir, given_task):
    transfer = (cfg['model_type'] == architectures.TransferNet)
    if transfer:
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn_transfer
        setup_input_fn = utils.setup_input_transfer
    else:
        setup_input_fn = utils.setup_input
        get_data_prefetch_threads_init_fn = utils.get_data_prefetch_threads_init_fn

    # set up logging
    tf.logging.set_verbosity(tf.logging.ERROR)
    stats = Statistics()
    print_every = int(args.print_every)

    with tf.Graph().as_default() as g:
        # create ops and placeholders
        inputs = setup_input_fn(cfg,
                                is_training=False,
                                use_filename_queue=False)
        #RuntimeDeterminedEnviromentVars.load_dynamic_variables( inputs, cfg )
        #RuntimeDeterminedEnviromentVars.populate_registered_variables()
        max_steps = get_max_steps(inputs['max_steps'], args.data_split)
        # pdb.set_trace()
        # build model (and losses and train_op)
        model = utils.setup_model(inputs, cfg, is_training=False)
        # set up metrics to evaluate
        names_to_values, names_to_updates = setup_metrics(inputs, model, cfg)

        # run the loss-extraction pass
        start_time = time.time()
        utils.print_start_info(cfg, max_steps, is_training=False)

        # start session and restore model
        training_runners = {
            'sess': tf.Session(),
            'coord': tf.train.Coordinator()
        }
        try:
            if cfg['model_path'] is None:
                print('Please specify a checkpoint directory')
                return
            print('Restoring from model_path:', cfg['model_path'])

            if given_task == 'rgb2depth_sota':
                non_encoder_var = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES)
                for v in tuple(non_encoder_var):
                    if 'global_step' in v.name or 'Adam' in v.name:
                        non_encoder_var.remove(v)
                saver_sota = tf.train.Saver(non_encoder_var)
                saver_sota.restore(training_runners['sess'], cfg['model_path'])
            else:
                model['saver_op'].restore(training_runners['sess'],
                                          cfg['model_path'])

            # var = [v for v in tf.global_variables() if 'decoder' in v.name][0]
            # print(training_runners[ 'sess' ].run(var))

            utils.print_start_info(cfg, max_steps, is_training=False)
            data_prefetch_init_fn = get_data_prefetch_threads_init_fn(
                inputs, cfg, is_training=False, use_filename_queue=False)
            prefetch_threads = threading.Thread(
                target=data_prefetch_init_fn,
                args=(training_runners['sess'], training_runners['coord']))
            prefetch_threads.start()
            m = model['model']

            # run one batch to compute the first loss and seed the running statistics
            filenames = []
            losses = []
            results = training_runners['sess'].run([
                inputs['data_idxs'], m.target_images, m.masks, m.decoder_output
            ])
            data_idx = results[0]
            target = results[1]
            mask = results[2]
            predicted = results[3]

            normalized_loss = get_norm_loss(target, mask, predicted)

            filenames.extend(data_idx)
            losses.append(normalized_loss)
            print("Step number: {}".format(0), (data_idx))
            # print(target_input, target_input.sum())
            # return
            # training_runners['sess'].run([v for v in tf.global_variables() if "transfer/rep_conv_1/weights" in v.name][0])
            # run the remaining examples
            start = time.perf_counter()
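            # NOTE: max_steps is overridden here, so the loop below is capped at 3000
            # batches regardless of the value computed from the dataset above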
            max_steps = 3000
            for step in range(max_steps - 1):
                results = training_runners['sess'].run([
                    inputs['data_idxs'], m.target_images, m.masks,
                    m.decoder_output
                ])
                data_idx = results[0]
                target = results[1]
                mask = results[2]
                predicted = results[3]
                # p, t, m = results[1], results[2], results[3]
                # losses = results[4:]
                normalized_loss = get_norm_loss(target, mask, predicted)
                # print(p.mean(), t)
                filenames.extend(data_idx)
                losses.append(normalized_loss)

                stats.push(normalized_loss)

                if step % print_every == 0 and step > 0:
                    print(
                        'Step {0} of {1}: {5} loss: {2:.3f} || stddev: {3:.3f} ({4:.2f} secs/step)'
                        .format(
                            step,
                            max_steps - 1,
                            stats.mean(),
                            np.sqrt(stats.variance()),
                            (time.perf_counter() - start) / print_every,
                            'norm_depth'))
                    start = time.perf_counter()

                if training_runners['coord'].should_stop():
                    break

            print(
                'Collected losses for %d examples; expected %d steps with batch size %d'
                % (len(filenames), inputs['max_steps'], cfg['batch_size']))

            end_train_time = time.time() - start_time
            if args.out_name:
                out_name = args.out_name
            else:
                out_name = '{task}_{split}_losses_normed.pkl'.format(
                    task=given_task, split=args.data_split)
            save_path = os.path.join(save_dir, out_name)

            loss_names_to_vals = {}
            with open(save_path, 'wb') as f:
                loss_names_to_vals['file_indexes'] = filenames
                loss_names_to_vals['global_step'] = 0
                loss_names_to_vals['l1_loss'] = losses
                pickle.dump(loss_names_to_vals, f)

            if args.out_dir:
                os.makedirs(args.out_dir, exist_ok=True)
                os.system("sudo cp {fp} {out}/".format(fp=save_path,
                                                       out=args.out_dir))
            else:
                if transfer:
                    copy_to = cfg['log_root']
                else:
                    copy_to = os.path.join(cfg['log_root'], given_task)
                os.system("sudo mv {fp} {dst}/".format(fp=save_path,
                                                       dst=copy_to))
                print("sudo mv {fp} {dst}/".format(fp=save_path, dst=copy_to))

            print('saved losses to {0}'.format(save_path))
            print('time to extract %d epochs: %.3f hrs' %
                  (cfg['num_epochs'], end_train_time / (60 * 60)))
        finally:
            utils.request_data_loading_end(training_runners)
            utils.end_data_loading_and_sess(training_runners)
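# The two helpers below are referenced in run_extract_losses but not defined in this listing.
# These are minimal sketches under assumed semantics, not the original implementations:
# get_norm_loss is assumed to be a mask-weighted L1 loss averaged over the valid entries
# (matching the 'l1_loss' key used when saving), and Statistics is assumed to be a running
# mean/variance accumulator with the push()/mean()/variance() interface used above
# (Welford's online algorithm).
import numpy as np

def get_norm_loss(target, mask, predicted):
    # mask-weighted L1 distance, normalized by the number of valid entries
    # (assumes mask broadcasts against target/predicted)
    abs_err = np.abs(predicted - target) * mask
    denom = max(float(np.sum(mask)), 1e-8)
    return float(np.sum(abs_err) / denom)

class Statistics:
    # running mean and variance via Welford's online algorithm
    def __init__(self):
        self.n = 0
        self._mean = 0.0
        self._m2 = 0.0

    def push(self, x):
        self.n += 1
        delta = x - self._mean
        self._mean += delta / self.n
        self._m2 += delta * (x - self._mean)

    def mean(self):
        return self._mean

    def variance(self):
        # sample variance; 0.0 until at least two values have been pushed
        return self._m2 / (self.n - 1) if self.n > 1 else 0.0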