def convert_to_tf_record(data_directory: str):
    """Convert the TF MNIST splits with ``convert_to`` and record their sizes.

    The validation, train (10 shards) and test splits are converted in that
    order. The image dimensions and per-split example counts are then saved
    to ``dataset_params.json`` inside ``data_directory``.

    Args:
        data_directory: The directory where the TFRecord files should be stored
    """
    mnist = input_data.read_data_sets("/tmp/tensorflow/mnist/input_data",
                                      reshape=False)

    # Each conversion returns (example count, rows, cols, depth). Only the
    # dimensions reported by the last (test) conversion end up in the JSON.
    vali_count, _, _, _ = convert_to(mnist.validation, 'validation',
                                     data_directory)
    train_count, _, _, _ = convert_to(mnist.train, 'train', data_directory,
                                      num_shards=10)
    test_count, height, width, channels = convert_to(mnist.test, 'test',
                                                     data_directory)

    # Save datasets properties in json file
    params_path = os.path.join(data_directory, 'dataset_params.json')
    dataset_properties = dict(
        height=height,
        width=width,
        depth=channels,
        vali_size=vali_count,
        train_size=train_count,
        test_size=test_count,
    )
    save_dict_to_json(dataset_properties, params_path)
Exemple #2
0
def evaluate(model_spec, model_dir, params, restore_from):
    """Restore saved weights, evaluate the model and save the metrics as JSON.

    Args:
        model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                This function reads: eval_size, batch_size
        restore_from: (string) directory or file containing weights to restore the graph,
                      given relative to `model_dir`
    """
    checkpoint_saver = tf.train.Saver()

    with tf.Session() as sess:
        # Run the variable initialisation op from the model spec
        sess.run(model_spec['variable_init_op'])

        # A directory means "use its most recent checkpoint"
        checkpoint_path = os.path.join(model_dir, restore_from)
        if os.path.isdir(checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
        checkpoint_saver.restore(sess, checkpoint_path)

        # Number of batches needed to cover the evaluation set (rounded up)
        batch_size = params.batch_size
        num_steps = (params.eval_size + batch_size - 1) // batch_size
        metrics = evaluate_sess(sess, model_spec, num_steps)

        # Flatten the restore path into a file-name friendly suffix
        metrics_name = restore_from.replace('/', '_')
        metrics_file = "metrics_test_{}.json".format(metrics_name)
        save_dict_to_json(metrics, os.path.join(model_dir, metrics_file))
Exemple #3
0
def evaluate(model_spec, pipeline, model_dir, params, restore_from):
  """
  Evaluate the model on the test dataset

  Loads the generator and discriminator weights from ``restore_from``, runs
  ``evaluate_session`` with a summary writer rooted at
  ``<model_dir>/test_summaries`` and stores the returned metrics in
  ``<model_dir>/metrics_test_best.json``.

  :param model_spec: (Dictionary), structure that contains the graph operations or nodes needed for testing
  :param pipeline: (DataLoader), Testing input pipeline
  :param model_dir: (String), directory containing config, weights and logs
  :param params: (Params), contains hyper-parameters of the model. Must define: num_epochs, batch_size, save_summary_steps, ... etc
  :param restore_from: (String), Directory of file containing weights to restore the graph
  :raises FileNotFoundError: if ``restore_from`` does not exist
  """
  # Validate the checkpoint path before creating the summary writer, so a
  # missing file does not leave an open writer behind.
  if not os.path.exists(restore_from):
    raise FileNotFoundError("File doesn't exist {}".format(restore_from))

  # Reload weights from the saved file
  checkpoint = torch.load(restore_from, map_location='cpu')
  model_spec['models']['model_G'].load_state_dict(checkpoint['G_state_dict'])
  model_spec['models']['model_D'].load_state_dict(checkpoint['D_state_dict'])

  # Inference; the writer is closed even if evaluation raises
  test_writer = SummaryWriter(os.path.join(model_dir, 'test_summaries'))
  try:
    test_metrics = evaluate_session(model_spec, pipeline, test_writer, params)
    test_writer.flush()
  finally:
    test_writer.close()

  save_path = os.path.join(model_dir, "metrics_test_best.json")
  save_dict_to_json(test_metrics, save_path)
def main():
    """Convert every fold of the selected datasets and save their properties.

    For each dataset folder and each fold (1..5) the 'vali', 'test' and
    'train' splits are converted, in that order, through ``convert``. The
    resulting sizes are written to ``dataset_params.json`` inside
    ``TF_RANK_DATA/<dataset>/<fold>``.
    """
    # Other options: 'MQ2007', 'MSLR-WEB10K', 'MSLR-WEB30K'
    dataset_names = ['OHSUMED']
    num_folds = 5

    for dataset_name in dataset_names:
        for fold_index in range(1, num_folds + 1):
            output_dir = os.path.join(TF_RANK_DATA, dataset_name,
                                      str(fold_index))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            # use eval in the write part of tfrecords for now
            # Each call returns (size, feature_dim, doc_count); only the
            # feature dimension of the train split is stored.
            vali_size, _, vali_docs = convert(dataset_name, 'vali',
                                              fold_index)
            test_size, _, test_docs = convert(dataset_name, 'test',
                                              fold_index)
            train_size, feature_dim, train_docs = convert(
                dataset_name, 'train', fold_index)

            # Save datasets properties in json file
            params_path = os.path.join(output_dir, 'dataset_params.json')
            dataset_properties = dict(
                feature_dim=feature_dim,
                train_size=train_size,
                train_doc_count=train_docs,
                eval_size=vali_size,
                eval_doc_count=vali_docs,
                test_size=test_size,
                test_doc_count=test_docs,
            )
            save_dict_to_json(dataset_properties, params_path)
def predict(model_spec, model_save_dir, params,
            restore_from):  #TODO: return or save a prediction as json
    """Restore a saved model, run the test session and save its metrics.

    The metrics returned by ``_test_sess`` are written to
    ``<model_save_dir>/metrics_test_<restore_from with '/' replaced by '_'>.json``.

    Args:
        model_spec: (dict) must provide 'variable_init_op'; it is also passed to `_test_sess`
        model_save_dir: (str) the directory where the config, weights and log are stored
        params: (Params) must define: val_size, batch_size
        restore_from: (str) the directory or ckpt where the weights are stored to restore the graph,
                      joined onto `model_save_dir` when restoring

    Raises:
        AssertionError: if `model_save_dir` is not a directory or `restore_from` does not exist
    """
    # NOTE(review): `restore_from` is checked as given (relative to the
    # current working directory) but restored relative to `model_save_dir`
    # below — confirm which one is intended.
    assert (os.path.isdir(model_save_dir) and os.path.exists(restore_from)
            ), "the saver dir/file does not exits at '{}/{}'".format(
                model_save_dir, restore_from)

    saver = tf.train.Saver()

    with tf.compat.v1.Session() as sess:
        sess.run(model_spec['variable_init_op'])

        # reload the weights
        save_path = os.path.join(model_save_dir, restore_from)

        if os.path.isdir(save_path):
            save_path = tf.train.latest_checkpoint(
                save_path
            )  # If restore_from is a directory, get the latest ckpt
        saver.restore(sess, save_path)

        # Number of batches needed to cover `val_size` examples (rounded up)
        num_steps = (params.val_size + params.batch_size -
                     1) // params.batch_size
        metrics = _test_sess(sess, model_spec, num_steps)
        metrics_name = '_'.join(restore_from.split('/'))
        # The metrics file lives next to the weights in `model_save_dir`
        save_path = os.path.join(model_save_dir,
                                 "metrics_test_{}.json".format(metrics_name))
        save_dict_to_json(metrics, save_path)
Exemple #6
0
def convert_to_tf_record(data_directory: str, dataset_name: str):
    """Download a dataset, convert its splits with ``convert_to`` and record their sizes.

    The dataset is fetched under ``/tmp/tensorflow``. Its train, validation
    and test splits are loaded, then converted in the order validation,
    train (10 shards), test. The image dimensions and per-split example
    counts are saved to ``dataset_params.json`` inside ``data_directory``.

    Args:
        data_directory: The directory where the TFRecord files should be stored
        dataset_name: Name of the dataset, forwarded to
            ``maybe_download_and_extract`` and ``get_data_set``
    """
    dataset_parent_path = "/tmp/tensorflow"
    maybe_download_and_extract(dataset_parent_path, dataset_name)

    # Load the three splits in the order train, validation, test
    train_split, validation_split, test_split = [
        get_data_set(dataset_parent_path, dataset_name, split_name)
        for split_name in ('train', 'validation', 'test')
    ]

    # Each conversion returns (example count, rows, cols, depth). Only the
    # dimensions reported by the last (test) conversion end up in the JSON.
    vali_count, _, _, _ = convert_to(validation_split, 'validation',
                                     data_directory)
    train_count, _, _, _ = convert_to(train_split, 'train', data_directory,
                                      num_shards=10)
    test_count, height, width, channels = convert_to(test_split, 'test',
                                                     data_directory)

    # Save datasets properties in json file
    params_path = os.path.join(data_directory, 'dataset_params.json')
    dataset_properties = dict(
        height=height,
        width=width,
        depth=channels,
        vali_size=vali_count,
        train_size=train_count,
        test_size=test_count,
    )
    save_dict_to_json(dataset_properties, params_path)
Exemple #7
0
def train():
    """Build the input pipeline and model, train, then export the results.

    Reads the module-level ``params``, ``vocab_path``, ``data_dir``,
    ``train_input_filename`` and ``train_context_filename``. After training,
    the parameters are saved to ``params.json`` and the matrix returned by
    ``train_and_evaluate`` is written as a tab-separated file, both inside
    ``params['model_dir']``.
    """
    # Set the logger and log the parameters
    set_logger(os.path.join(params['model_dir'], 'train.log'))
    logging.info(params)

    # Load vocabulary
    vocab = tf.contrib.lookup.index_table_from_file(vocab_path,
                                                    num_oov_buckets=1)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    input_words = load_dataset_from_text(data_dir, train_input_filename,
                                         vocab)
    context_words = load_dataset_from_text(data_dir, train_context_filename,
                                           vocab)

    # Both the train and eval iterators are built over the training text
    train_inputs = input_fn('train', input_words, context_words, params)
    eval_inputs = input_fn('eval', input_words, context_words, params)
    logging.info("- done")

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train', train_inputs, params,
                                reuse=tf.AUTO_REUSE)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info('- done.')

    # Train the model
    start_message = 'Starting training for {} epochs'.format(
        params['num_epochs'])
    logging.info(start_message)
    embedding_matrix = train_and_evaluate(train_model_spec, eval_model_spec,
                                          params)

    # Export the parameters and the embedding matrix
    save_dict_to_json(params, params['model_dir'] + '/params.json')
    embedding_frame = pd.DataFrame(embedding_matrix)
    embedding_path = os.path.join(params['model_dir'],
                                  'normalized_embedding_matrix.tsv')
    embedding_frame.to_csv(embedding_path,
                           index=False,
                           header=None,
                           sep='\t')
Exemple #8
0
def train_and_evaluation(train_model_spec, eval_model_spec, model_dir, params, restore_from=None):
    '''Evaluate the model once per epoch and checkpoint the best accuracy.

    Args:
        train_model_spec: (dict) contain graph and operation or nodes needed for training
        eval_model_spec: (dict) contain graph and operation or nodes needed for evaluation
        model_dir: path contain trained model
        params: (Params) hyperparameters; this function reads num_epochs,
                train_size and batches_size
        restore_from: (string) dir or file contain weights to restore the graph
    '''

    last_saver = tf.train.Saver()
    best_saver = tf.train.Saver(max_to_keep=1)
    begin_epoch = 0

    with tf.Session() as sess:
        sess.run(train_model_spec['variable_init_op'])
        if restore_from is not None:
            logging.info('Restore parameter from {}'.format(restore_from))
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                # Checkpoints are saved as '...-<epoch>'; resume from there
                begin_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summary'), sess.graph)
        eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summary'), sess.graph)

        best_eval_acc = 0.0
        for epoch in range(begin_epoch, begin_epoch + params.num_epochs):
            logging.info('Epoch {}/{}'.format(epoch+1, begin_epoch +params.num_epochs))
            # NOTE(review): no training step is run in this loop, and the
            # evaluation step count is derived from `train_size` and
            # `batches_size` — confirm both against the Params definition.
            num_steps = (params.train_size + params.batches_size - 1) // params.batches_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer)

            # Keep the checkpoint whenever accuracy matches or beats the best so far
            eval_acc = metrics['accuracy']
            if eval_acc >= best_eval_acc:
                best_eval_acc = eval_acc
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights', 'after_epoch')
                best_save_path = best_saver.save(sess, best_save_path, global_step=epoch+1)
                logging.info('- Found new best accuracy, saving in {}'.format(best_save_path))
                # save best eval metrics
                best_json_path = os.path.join(model_dir, 'metrics_eval_best_weights.json')
                save_dict_to_json(metrics, best_json_path)

            # Always record the metrics of the most recent epoch
            last_json_path = os.path.join(model_dir, 'metrics_eval_last_weights.json')
            save_dict_to_json(metrics, last_json_path)
Exemple #9
0
def evaluate(model_spec, model_dir, params, restore_from):
    """
    Evaluate the model
    --
    Restores the weights, runs the evaluation, and writes three JSON files
    into `model_dir`: test_metrics.json, wrong_res.json and right_res.json.

    Args:
        model_spec: (dict) contains the graph operations
                    or nodes needed for evaluation
        model_dir: (string) directory containing config,
                   weights and log
        params: (Params) contains hyperparameters of the model;
                this function reads valid_size and batch_size
        restore_from: (string) directory containing weights
                      to restore the graph (its latest checkpoint is
                      used), or a checkpoint path to restore directly
    """
    # Initialize tf.Saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(model_spec['variable_init_op'])

        # Reload weights: a directory resolves to its latest checkpoint,
        # anything else is treated as the checkpoint path itself so that
        # `save_path` is always bound.
        if os.path.isdir(restore_from):
            save_path = tf.train.latest_checkpoint(restore_from)
        else:
            save_path = restore_from
        saver.restore(sess, save_path)

        # Evaluate
        # NOTE(review): this rounds down, so a final partial batch is not
        # evaluated — confirm that is intended.
        num_steps = int(params.valid_size / params.batch_size)
        metrics = evaluate_sess(sess, model_spec, num_steps)
        wrong_res, right_res = results_dict(sess, model_spec, num_steps)

        # Keep only the last backslash-separated component of each file name
        wrong_res['filenames'] = [
            el.decode().split('\\')[-1] for el in wrong_res['filenames']
        ]
        right_res['filenames'] = [
            el.decode().split('\\')[-1] for el in right_res['filenames']
        ]
        save_dict_to_json(metrics, model_dir + '/test_metrics.json')
        save_dict_to_json(wrong_res, model_dir + '/wrong_res.json')
        save_dict_to_json(right_res, model_dir + '/right_res.json')
def train_and_evaluate(train_model_spec,
                       eval_model_spec,
                       model_dir,
                       params,
                       learner_id=0,
                       restore_from=None,
                       global_epoch=1):
    """Train the model and evaluate every epoch.

    After every epoch the latest weights are saved under
    `model_dir/last_weights`. Whenever `isSavingWeights` accepts the new
    ndcg_1/3/5/10 values, the weights are also saved under
    `model_dir/best_weights` together with the metrics and the learner id.
    Training stops early once `params.early_stoping_epochs` consecutive
    epochs have not produced a new best.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
                This function also reads: loss_fn, early_stoping_epochs
        learner_id: (int) id of the current learner; used in log messages, to
                build the list of variable scopes to restore, and written to
                `best_weights/learner.json`
        restore_from: (string) directory or file containing weights to restore the graph,
                given relative to `model_dir`
        global_epoch: (int) value used as `global_step` when saving
                checkpoints; incremented once per completed epoch

    Returns:
        The value of `global_epoch` after the last completed epoch.
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0
    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])
        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir,
                                      "metrics_eval_best_weights.json")

        best_eval_metric = 0.0  # ndcg_1
        # best_loss_metric = float('inf')
        second_eval_metric = 0.0  # ndcg_3
        third_eval_metric = 0.0  # ndcg_5
        forth_eval_metric = 0.0  # ndcg_10
        # global_epoch = 0
        # Reload weights from directory if specified
        # restore from the previous learner
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                # begin_at_epoch = int(restore_from.split('-')[-1])
            logging.info("Restoring parameters from {}".format(save_path))
            # last_saver = tf.train.import_meta_graph(save_path+".meta")
            # Only variables under these scopes are restored from the checkpoint
            pretrained_include = ['model/mlp']
            if params.loss_fn == 'urrank':
                pretrained_include = ['model/ur']
            # NOTE(review): every iteration appends the same
            # 'residual_mlp_{learner_id}' entry and the loop variable `i` is
            # unused — confirm whether format(i) was intended.
            for i in range(1, learner_id):
                pretrained_include.append('residual_mlp_{}'.format(learner_id))

            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars,
                                              name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
            # Resume the best-so-far metrics stored in the best-metrics JSON file
            best_eval_metric, second_eval_metric, third_eval_metric, forth_eval_metric = \
            load_best_ndcgs(best_json_path)
        # for each learner
        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Stop once the configured number of epochs passed without a new best
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at learner {}, epoch {}/{}".format(learner_id, epoch + 1, \
                    begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Learner {}, Epoch {}/{}".format(learner_id, epoch + 1, \
                begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size -
                         1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)
            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights',
                                          'after-epoch')
            # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
            last_saver.save(sess, last_save_path, global_step=global_epoch)
            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size -
                         1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps,
                                    eval_writer, params)
            # If best_eval, best_save_path
            # eval_metric = metrics['dcg']
            # loss_metric = metrics['loss']
            # Metrics are rounded to 6 decimals before being compared
            eval_metric = round(metrics['ndcg_1'], 6)
            eval_metric_2 = round(metrics['ndcg_3'], 6)
            eval_metric_3 = round(metrics['ndcg_5'], 6)
            eval_metric_4 = round(metrics['ndcg_10'], 6)
            # eval_metric = metrics['ndcg_1']
            # eval_metric_2 = metrics['ndcg_3']
            # eval_metric_3 = metrics['ndcg_5']
            # eval_metric_4 = metrics['ndcg_10']
            eval_metrics = [
                eval_metric, eval_metric_2, eval_metric_3, eval_metric_4
            ]
            best_eval_metrics = [best_eval_metric, second_eval_metric, third_eval_metric, \
            forth_eval_metric]
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # reset early_stopping_count
                early_stopping_count = 0
                # Store new best ndcg_1
                # this works better than eval_metric > best_eval_metric
                # and isSavingWeights
                best_eval_metric = eval_metric
                # loss_metric = best_loss_metric
                second_eval_metric = eval_metric_2
                third_eval_metric = eval_metric_3
                forth_eval_metric = eval_metric_4
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights',
                                              'after-epoch')
                # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
                best_save_path = best_saver.save(sess,
                                                 best_save_path,
                                                 global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
                # Record which learner produced the current best weights
                save_dict_to_json({'stopped_at_learner': learner_id}, \
                    os.path.join(model_dir, 'best_weights', 'learner.json'))
            else:
                early_stopping_count = early_stopping_count + 1
            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1
    return global_epoch
def train_and_evaluate(train_model_spec,
                       eval_model_spec,
                       model_dir,
                       params,
                       restore_from=None):
    """Train the model and evaluate every epoch.

    After every epoch the latest weights are saved under
    `model_dir/last_weights`. Whenever the evaluation loss is at least as
    low as the best seen so far, the weights are also saved under
    `model_dir/best_weights` and the metrics are written to
    `metrics_eval_best_weights.json`.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # Reload weights from directory if specified
        if restore_from is not None:
            logging.info("Restoring parameters from {}".format(restore_from))
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                # Checkpoints are saved as '...-<epoch>'; resume from there
                begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'eval_summaries'), sess.graph)

        # The tracked metric is a loss, so lower is better: start from +inf
        # and keep the minimum. Comparing with `>=` against 0.0 would keep
        # the checkpoint with the highest loss instead.
        best_eval_loss = float('inf')
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs))
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size -
                         1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)

            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights',
                                          'after-epoch')
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size -
                         1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps,
                                    eval_writer)

            # If best_eval, best_save_path (ties count as a new best)
            eval_loss = metrics['loss']
            if eval_loss <= best_eval_loss:
                # Store new best loss
                best_eval_loss = eval_loss
                # Save weights
                best_save_path = os.path.join(model_dir, 'best_weights',
                                              'after-epoch')
                best_save_path = best_saver.save(sess,
                                                 best_save_path,
                                                 global_step=epoch + 1)
                logging.info("- Found new best loss, saving in {}".format(
                    best_save_path))
                # Save best eval metrics in a json file in the model directory
                best_json_path = os.path.join(
                    model_dir, "metrics_eval_best_weights.json")
                save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
Exemple #12
0
def train_and_evaluate(train_model_spec,
                       eval_model_spec,
                       model_dir,
                       params,
                       learner_id=0,
                       restore_from=None,
                       global_epoch=1):
    """Train the model and evaluate every epoch.

    After every epoch the latest weights are saved under
    `model_dir/last_weights`. Whenever `isSavingWeights` accepts the new
    [accuracy, -loss] pair, the weights are also saved under
    `model_dir/best_weights` together with the metrics and the learner id.
    Training stops early once `params.early_stoping_epochs` consecutive
    epochs have not produced a new best.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
                This function reads: num_epochs, train_size, batch_size,
                vali_size, loss_fn, finetune, early_stoping_epochs
        learner_id: (int) id of the current learner; used in log messages and
                written to `best_weights/learner.json`
        restore_from: (string) directory or file containing weights to restore the graph,
                given relative to `model_dir`
        global_epoch: (int) value used as `global_step` when saving
                checkpoints; incremented once per completed epoch. It is
                overwritten with the checkpoint's epoch number when
                `restore_from` resolves to a directory.

    Returns:
        The value of `global_epoch` after the last completed epoch.
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0
    with tf.Session() as sess:
        # Initialize model variables
        # tf.reset_default_graph()
        sess.run(train_model_spec['variable_init_op'])
        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir,
                                      "metrics_eval_best_weights.json")
        # Best-so-far [accuracy, -loss]; the loss is negated so that a larger
        # value is better for both entries
        best_eval_metrics = [0.0, -float('inf')]
        # global_epoch = 0
        # Reload weights from directory if specified
        # restore from the previous learner
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                # The checkpoint name ends in '-<number>'; resume counting from it
                begin_at_epoch = int(save_path.split('-')[-1])
                global_epoch = begin_at_epoch
            logging.info("Restoring parameters from {}".format(save_path))
            # last_saver = tf.train.import_meta_graph(save_path+".meta")
            # Only variables under these scopes are restored from the checkpoint
            if params.loss_fn == 'retrain_regu_mine':
                pretrained_include = ['model/cnn']
            elif params.loss_fn == 'cnn' and params.finetune:
                pretrained_include = ['model/cnn']
            else:
                pretrained_include = ['model/c_cnn']
                pretrained_include.append('model/cnn')
            # if params.loss_fn=='boost':
            #     pretrained_include = ['model/boost']
            # for i in range(1, learner_id):
            #     pretrained_include.append('residual_mlp_{}'.format(learner_id))
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars,
                                              name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
            # if params.num_learners > 1:
            #     best_eval_metrics = load_best_metric(best_json_path)
            #     best_eval_metrics = [best_eval_metrics['accuracy'], -best_eval_metrics['loss']]
        model_summary()
        # for each learner
        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            # Stop once the configured number of epochs passed without a new best
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at learner {}, epoch {}/{}".format(learner_id, epoch + 1, \
                    begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Learner {}, Epoch {}/{}".format(learner_id, epoch + 1, \
                begin_at_epoch + params.num_epochs))
            # logging.info(global_epoch)
            # Compute number of batches in one epoch (one full pass over the training set)
            num_steps = (params.train_size + params.batch_size -
                         1) // params.batch_size
            train_sess(sess, train_model_spec, num_steps, train_writer, params)
            # Save weights
            last_save_path = os.path.join(model_dir, 'last_weights',
                                          'after-epoch')
            # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
            last_saver.save(sess, last_save_path, global_step=global_epoch)
            # Evaluate for one epoch on validation set
            num_steps = (params.vali_size + params.batch_size -
                         1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, num_steps,
                                    eval_writer, params)
            # If best_eval, best_save_path
            # Metrics are rounded to 6 decimals before being compared
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            # save_batch()
            eval_metrics = [accuracy_metric, loss_metric]
            # logging.info('global_epoch: {}, best_eval_metrics: {}, \
            #     eval_metric: {}', global_epoch, best_eval_metrics, eval_metric)
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # reset early_stopping_count
                early_stopping_count = 0
                # and isSavingWeights
                best_eval_metrics = eval_metrics
                # Save weights
                # trainalbe_vars = {v.name: v for v in tf.trainable_variables() if 'model' in v.name}
                # print(trainalbe_vars.keys())
                # NOTE(review): the restore branch above checks
                # 'retrain_regu_mine' while this one checks 'retrain_regu' —
                # confirm the two names are meant to differ.
                if params.loss_fn == 'cnn' or params.loss_fn == 'retrain_regu':
                    cnn_vars = [
                        v for v in tf.trainable_variables()
                        if 'model/cnn' in v.name
                    ]
                    # c_cnn_vars=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/c_cnn')
                    c_cnn_vars = [
                        v for v in tf.trainable_variables()
                        if 'model/c_cnn' in v.name
                    ]
                    # Copy the current 'model/cnn' weights into the paired
                    # 'model/c_cnn' variables (paired by position in the lists).
                    # NOTE(review): new assign ops are created every time this
                    # branch runs — confirm the graph growth is acceptable.
                    update_weights = [tf.assign(c, old) for (c, old) in \
                    zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)
                # The string literal below is disabled code kept for reference;
                # as a bare expression statement it has no effect.
                '''
                if params.loss_fn == 'boost':

                    cnn_vars=[v for v in tf.trainable_variables() if 'model/cnn' in v.name]
                    c_cnn_vars=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/c_cnn')
                    update_weights = [tf.assign(c, old) for (c, old) in \
                    zip(c_cnn_vars, cnn_vars)]
                    sess.run(update_weights)

                    features = train_model_spec['features']
                    labels = train_model_spec['labels']
                    predicted_scores, _ = retrain_lenet(features, params, var_scope='model/c_cnn')
                    residuals = get_residual(labels, predicted_scores)
                    train_model_spec['old_predicted_scores'] = predicted_scores
                    train_model_spec['residuals'] = residuals

                    features = eval_model_spec['features']
                    labels = eval_model_spec['labels']
                    predicted_scores, _ = retrain_lenet(features, params, var_scope='model/c_cnn')
                    residuals = get_residual(labels, predicted_scores)
                    eval_model_spec['old_predicted_scores'] = predicted_scores
                    eval_model_spec['residuals'] = residuals
                    
                    sess.run(train_model_spec['old_predicted_scores'])
                    sess.run(train_model_spec['residuals'])

                    sess.run(eval_model_spec['old_predicted_scores'])
                    sess.run(eval_model_spec['residuals'])
                '''
                best_save_path = os.path.join(model_dir, 'best_weights',
                                              'after-epoch')
                # global_epoch = int(params.num_learners) * int(params.num_epochs) + epoch + 1
                best_save_path = best_saver.save(sess,
                                                 best_save_path,
                                                 global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
                # Record which learner produced the current best weights
                save_dict_to_json({'stopped_at_learner': learner_id}, \
                    os.path.join(model_dir, 'best_weights', 'learner.json'))
            else:
                early_stopping_count = early_stopping_count + 1
            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1
    return global_epoch
def fit(train_model_spec,
        test_model_spec,
        model_save_dir,
        params,
        config=None,
        restore_from=None):
    """Train the model, running the test pass after every epoch.

    After each epoch the weights are written to
    `<model_save_dir>/last_weights/after-epoch-<n>`. Whenever the test MSE
    improves they are also written to `<model_save_dir>/best_weights/` and
    the metrics are dumped to `metrics_eval_at_best_weights.json`. The metrics
    of the most recent epoch are always dumped to
    `metrics_eval_at_last_weights.json`.

    Args:
        train_model_spec: (dict) tensorflow ops and nodes needed for training;
            must contain 'variable_init_op'
        test_model_spec: (dict) tensorflow ops and nodes needed for testing
        model_save_dir: (str) directory receiving weights, summaries and
            metric json files
        params: (Params) must define num_epochs, train_size, test_size and
            batch_size
        config: (tf.ConfigProto) configuration of the tf.Session, most likely
            about GPU options
        restore_from: (str) checkpoint prefix, or directory whose latest
            checkpoint is restored before training; None trains from scratch

    Raises:
        ValueError: if `restore_from` is a directory without any checkpoint.
    """
    last_saver = tf.train.Saver()  # default max_to_keep: the 5 latest epochs
    best_saver = tf.train.Saver(max_to_keep=1)  # keep 1 best epoch

    with tf.Session(config=config) as sess:
        sess.run(train_model_spec['variable_init_op'])

        # Defined unconditionally: it used to be set only when nothing was
        # restored, which made every restore attempt fail with a NameError.
        begin_at_epoch = 0
        if restore_from is not None:
            logging.info("Restoring parameters from {}".format(restore_from))
            checkpoint_path = restore_from
            if os.path.isdir(checkpoint_path):
                checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
                if checkpoint_path is None:
                    raise ValueError(
                        "No checkpoint found in {}".format(restore_from))
                # Checkpoints are saved as ".../after-epoch-<n>", so the
                # suffix is the number of epochs already completed.
                begin_at_epoch = int(checkpoint_path.split('-')[-1])
            last_saver.restore(sess, checkpoint_path)

        train_writer = tf.compat.v1.summary.FileWriter(
            os.path.join(model_save_dir, 'train_summaries'), sess.graph)
        test_writer = tf.compat.v1.summary.FileWriter(
            os.path.join(model_save_dir, 'test_summaries'), sess.graph)

        # Number of batches per epoch (ceiling division), loop-invariant.
        num_train_steps = (params.train_size + params.batch_size -
                           1) // params.batch_size
        num_test_steps = (params.test_size + params.batch_size -
                          1) // params.batch_size

        last_saver_path = os.path.join(model_save_dir, 'last_weights',
                                       'after-epoch')
        best_saver_path = os.path.join(model_save_dir, 'best_weights',
                                       'after-epoch')
        best_json_path = os.path.join(model_save_dir,
                                      "metrics_eval_at_best_weights.json")
        last_json_path = os.path.join(model_save_dir,
                                      "metrics_eval_at_last_weights.json")

        # +inf so that the first epoch always counts as an improvement,
        # whatever the scale of the MSE.
        best_test_mse = float('inf')
        end_epoch = begin_at_epoch + params.num_epochs
        try:
            for epoch in range(begin_at_epoch, end_epoch):
                print("Epoch {}/{}".format(epoch + 1, end_epoch))

                ##### for train ####################################
                _train_sess(sess, train_model_spec, num_train_steps,
                            train_writer, params)
                # save as ".../last_weights/after-epoch-{epoch+1}"
                last_saver.save(sess, last_saver_path, global_step=epoch + 1)
                ##### (end) ########################################

                ##### for test #####################################
                metrics_eval = _test_sess(sess, test_model_spec,
                                          num_test_steps, test_writer)

                test_mse = metrics_eval['MSE']
                if test_mse < best_test_mse:
                    best_test_mse = test_mse
                    # save as ".../best_weights/after-epoch-{epoch+1}"
                    best_saver.save(sess,
                                    best_saver_path,
                                    global_step=epoch + 1)
                    print("--> Found a new best MSE, saving in {}-{}".format(
                        best_saver_path, epoch + 1))
                    save_dict_to_json(metrics_eval, best_json_path)
                ##### (end) #########################################

                save_dict_to_json(metrics_eval, last_json_path)
        finally:
            # Flush pending summaries even if an epoch raises.
            train_writer.close()
            test_writer.close()
Exemple #14
0
def train_and_evaluate(train_model_spec,
                       eval_model_spec,
                       model_dir,
                       params,
                       restore_from=None,
                       global_epoch=1):
    """Train the model in two phases and evaluate after every epoch.

    Phase 1 runs up to `params.num_epochs` epochs and stops early once
    `params.early_stoping_epochs` consecutive epochs bring no improvement.
    Phase 2 runs `params.num_epochs2` more epochs without early stopping and,
    after each one, passes the `batch_loss` value returned by `train_sess` to
    `rl`, which maintains one selection counter per training step. When both
    phases are done the most frequently selected step indices are handed to
    `take_train_samples_sess`.

    After every epoch the weights go to `<model_dir>/last_weights`, and on a
    new best evaluation result also to `<model_dir>/best_weights`.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_clusters, train_size, vali_size, batch_size,
                sample_size, finetune, num_epochs, num_epochs2,
                early_stoping_epochs, loss_fn, consk (and use_kfac when
                loss_fn is 'cnn')
        restore_from: (string) directory or file, relative to `model_dir`,
                containing weights to restore the graph
        global_epoch: kept for interface compatibility. NOTE: the value passed
                in is ignored; the counter restarts at 0, or at the epoch
                number of the restored checkpoint directory.

    Returns:
        The global epoch counter after both phases.
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0
    # MAB weight sampling state, threaded through every train_sess call
    num_clusters = params.num_clusters
    weight_numbers_of_selections = [0] * num_clusters
    weight_sums_of_reward = [0] * num_clusters
    weight_arm_weights = [1] * num_clusters
    weight_max_upper_bound = 0
    old_index = 0
    old_loss_val = 0
    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])
        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        best_json_path = os.path.join(model_dir,
                                      "metrics_eval_best_weights.json")
        last_json_path = os.path.join(model_dir,
                                      "metrics_eval_last_weights.json")
        last_save_prefix = os.path.join(model_dir, 'last_weights',
                                        'after-epoch')
        best_save_prefix = os.path.join(model_dir, 'best_weights',
                                        'after-epoch')
        # [accuracy, -loss]; both entries are "higher is better"
        best_eval_metrics = [0.0, -float('inf')]
        global_epoch = 0
        # Reload weights from directory if specified
        # (restore from the previous learner)
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                begin_at_epoch = int(save_path.split('-')[-1])
                global_epoch = begin_at_epoch
            logging.info("Restoring parameters from {}".format(save_path))
            pretrained_include = get_pretrained_include(params)
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars,
                                              name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
            logging.info(best_eval_metrics)
        model_summary()

        # Batches per epoch (ceiling division). Both counts are computed up
        # front: phase 2 needs num_vali_steps even when the phase-1 loop body
        # never reaches its evaluation step.
        num_train_steps = int((params.train_size + params.batch_size - 1) //
                              params.batch_size)
        num_vali_steps = int((params.vali_size + params.batch_size - 1) //
                             params.batch_size)

        # Build the weight-update ops once instead of on every new best
        # epoch; creating ops inside the loop keeps growing the graph.
        # The update must run at best-epoch time (not after training),
        # otherwise weights other than the best ones would be copied.
        phase1_update_ops = None
        phase1_save_message = None
        phase2_update_ops = None
        if params.loss_fn == 'cnn' and not params.use_kfac:
            cnn_vars = [
                v for v in tf.trainable_variables() if 'model/cnn' in v.name
            ]
            c_cnn_vars = [
                v for v in tf.trainable_variables() if 'model/c_cnn' in v.name
            ]
            # Copy the model/cnn variables into their model/c_cnn twins.
            phase1_update_ops = [
                tf.assign(c, old) for (c, old) in zip(c_cnn_vars, cnn_vars)
            ]
            phase1_save_message = "- Make a copy of cnn vars, saving in {}"
            phase2_update_ops = phase1_update_ops
        elif params.loss_fn == 'retrain_regu_mine3':
            cnn_vars = [
                v for v in tf.trainable_variables() if 'model/cnn' in v.name
            ]
            mask_vars = [
                v for v in tf.trainable_variables() if 'model/mask' in v.name
            ]
            # Multiply each model/cnn variable by its model/mask variable.
            phase1_update_ops = [
                tf.assign(c, tf.multiply(mask, c))
                for (c, mask) in zip(cnn_vars, mask_vars)
            ]
            phase1_save_message = "- Updated cnn vars, saving in {}"

        if params.finetune:
            # initial rewards for all arms
            for i in range(num_clusters):
                old_index = i
                (_, _, weight_sums_of_reward, weight_arm_weights,
                 weight_max_upper_bound, old_loss_val) = train_initial_sess(
                     sess, train_model_spec, num_train_steps, train_writer,
                     params, old_index, weight_numbers_of_selections,
                     weight_sums_of_reward, weight_arm_weights,
                     weight_max_upper_bound, old_loss_val)

        # ---- Phase 1: training with early stopping ----
        early_stopping_count = 0
        for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
            if early_stopping_count == int(params.early_stoping_epochs):
                logging.info("Early stopping at epoch {}/{}".format(
                    epoch + 1, begin_at_epoch + params.num_epochs))
                break
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs))

            # MAB data sampling
            (_, old_index, weight_numbers_of_selections,
             weight_sums_of_reward, weight_arm_weights,
             weight_max_upper_bound, old_loss_val) = train_sess(
                 sess, train_model_spec, num_train_steps, train_writer,
                 params, old_index, weight_numbers_of_selections,
                 weight_sums_of_reward, weight_arm_weights,
                 weight_max_upper_bound, old_loss_val)

            # Save weights
            for var_name in ('weights1_1', 'weights1_2', 'weights3_1',
                             'weights3_2'):
                save_var(sess, var_name, epoch)
            last_saver.save(sess, last_save_prefix, global_step=global_epoch)

            # Evaluate for one epoch on validation set
            metrics = evaluate_sess(sess, eval_model_spec, num_vali_steps,
                                    eval_writer, params)
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            eval_metrics = [accuracy_metric, loss_metric]
            if isSavingWeights(eval_metrics, best_eval_metrics):
                # reset early_stopping_count
                early_stopping_count = 0
                best_eval_metrics = eval_metrics
                if phase1_update_ops is not None:
                    sess.run(phase1_update_ops)
                # One checkpoint write per best epoch; the same path used to
                # be written twice in a row for the branches above.
                best_save_path = best_saver.save(sess,
                                                 best_save_prefix,
                                                 global_step=global_epoch)
                if phase1_save_message is not None:
                    logging.info(phase1_save_message.format(best_save_path))
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
            else:
                early_stopping_count = early_stopping_count + 1
            # Save latest eval metrics in a json file in the model directory
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1

        # ---- Phase 2: further training plus per-step sample selection ----
        # Early stopping is not applied in this phase.
        begin_at_epoch = global_epoch
        sum_loss = [0] * num_train_steps
        numbers_of_selections = [0] * num_train_steps
        # UCB specific
        sums_of_reward = [0] * num_train_steps
        arm_weights = [1] * num_train_steps
        # UCB specific
        max_upper_bound = 0
        total_reward = 0
        for epoch in range(begin_at_epoch,
                           begin_at_epoch + params.num_epochs2):
            # Run one epoch
            logging.info("Epoch {}/{}".format(
                epoch + 1, begin_at_epoch + params.num_epochs2))

            # MAB data sampling
            (batch_loss, old_index, weight_numbers_of_selections,
             weight_sums_of_reward, weight_arm_weights,
             weight_max_upper_bound, old_loss_val) = train_sess(
                 sess, train_model_spec, num_train_steps, train_writer,
                 params, old_index, weight_numbers_of_selections,
                 weight_sums_of_reward, weight_arm_weights,
                 weight_max_upper_bound, old_loss_val)

            sum_loss = batch_loss
            consk = int(params.consk)

            # One bandit update per training step of this epoch.
            for i in range(num_train_steps):
                (index, reward, numbers_of_selections, sums_of_reward,
                 max_upper_bound) = rl(
                     params, sum_loss, numbers_of_selections, sums_of_reward,
                     max_upper_bound, (epoch - begin_at_epoch + 1) / consk,
                     arm_weights)
                total_reward += reward

            # Save weights
            last_saver.save(sess, last_save_prefix, global_step=global_epoch)
            metrics = evaluate_sess(sess, eval_model_spec, num_vali_steps,
                                    eval_writer, params)
            accuracy_metric = round(metrics['accuracy'], 6)
            loss_metric = -round(metrics['loss'], 6)
            eval_metrics = [accuracy_metric, loss_metric]
            if isSavingWeights(eval_metrics, best_eval_metrics):
                best_eval_metrics = eval_metrics
                if phase2_update_ops is not None:
                    sess.run(phase2_update_ops)
                best_save_path = best_saver.save(sess,
                                                 best_save_prefix,
                                                 global_step=global_epoch)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                # Save best eval metrics in a json file in the model directory
                save_dict_to_json(metrics, best_json_path)
            # Save latest eval metrics in a json file in the model directory
            save_dict_to_json(metrics, last_json_path)
            global_epoch += 1

        logging.info(
            'numbers_of_selections:\n {}'.format(numbers_of_selections))
        logging.info('weight_numbers_of_selections:\n {}'.format(
            weight_numbers_of_selections))

        # Rank training step indices by how often they were selected and keep
        # enough of the top ones to cover params.sample_size examples.
        sorted_index = sorted(range(num_train_steps),
                              key=lambda k: numbers_of_selections[k],
                              reverse=True)
        sample_batchs = (params.sample_size + params.batch_size -
                         1) // params.batch_size
        top_sorted_index = sorted_index[0:int(sample_batchs) + 1]
        logging.info('len(top_sorted_index) in training: {}'.format(
            len(top_sorted_index)))
        take_train_samples_sess(sess, eval_model_spec, num_train_steps, params,
                                top_sorted_index)
    return global_epoch
Exemple #15
0
def evaluate_on_train(eval_model_spec,
                      model_dir,
                      params,
                      restore_from,
                      global_epoch=0):
    """Evaluate the model once over the training set and checkpoint it.

    The metrics are written to `<model_dir>/metrics_eval_on_train.json` and
    the session is saved under `<model_dir>/best_weights/after-epoch-<n>`,
    where <n> is the returned global epoch. When `params.loss_fn` is 'cnn' or
    'retrain_regu', the 'model/cnn' variables are first copied into the
    'model/c_cnn' variables so that the checkpoint contains that copy.

    Args:
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: train_size, batch_size, loss_fn
        restore_from: (string) directory or file, relative to `model_dir`,
                containing weights to restore the graph; None skips restoring
        global_epoch: step number used for the checkpoint name when no
                checkpoint directory is restored

    Returns:
        The global epoch used to name the checkpoint: the restored epoch
        plus one when `restore_from` is a directory, else `global_epoch`.
    """
    begin_at_epoch = 0
    with tf.Session() as sess:
        # Initialize model variables
        sess.run(eval_model_spec['variable_init_op'])
        # Reload weights from directory if specified
        # (restore from the previous learner)
        if restore_from is not None:
            save_path = os.path.join(model_dir, restore_from)
            if os.path.isdir(save_path):
                save_path = tf.train.latest_checkpoint(save_path)
                begin_at_epoch = int(save_path.split('-')[-1])
                global_epoch = begin_at_epoch + 1
            logging.info("Restoring parameters from {}".format(save_path))
            pretrained_include = get_pretrained_include(params)
            pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                include=pretrained_include)
            pretrained_saver = tf.train.Saver(pretrained_vars,
                                              name="pretrained_saver")
            pretrained_saver.restore(sess, save_path)
        model_summary()
        # The only saver this function writes with. The two savers that used
        # to be built before the session were never used.
        best_saver = tf.train.Saver(max_to_keep=1)
        # Run one epoch
        logging.info("Epoch {}/{}".format(begin_at_epoch + 1,
                                          begin_at_epoch + 1))
        # Number of batches in one full pass over the training set
        num_steps = (params.train_size + params.batch_size -
                     1) // params.batch_size
        metrics = evaluate_on_train_sess(sess, eval_model_spec, num_steps,
                                         params)
        if params.loss_fn in ('cnn', 'retrain_regu'):
            cnn_vars = [
                v for v in tf.trainable_variables() if 'model/cnn' in v.name
            ]
            c_cnn_vars = [
                v for v in tf.trainable_variables() if 'model/c_cnn' in v.name
            ]
            # Copy each model/cnn variable into its model/c_cnn counterpart.
            update_weights = [
                tf.assign(c, old) for (c, old) in zip(c_cnn_vars, cnn_vars)
            ]
            sess.run(update_weights)
        # Save the metrics in a json file in the model directory
        eval_on_train_json_path = os.path.join(model_dir,
                                               "metrics_eval_on_train.json")
        save_dict_to_json(metrics, eval_on_train_json_path)
        best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch')
        best_save_path = best_saver.save(sess,
                                         best_save_path,
                                         global_step=global_epoch)
        logging.info("- Found new best metric score, saving in {}".format(
            best_save_path))
    return global_epoch
Exemple #16
0
def train_and_evaluate(train_model_spec,
                       eval_model_spec,
                       model_dir,
                       params,
                       restore_from=None):
    """Run the training loop, validating and checkpointing after each epoch.

    Every epoch the weights are saved under `<model_dir>/last_weights`. When
    `isSavingWeights` accepts the epoch's NDCG scores against the best ones
    seen so far, the weights are additionally saved under
    `<model_dir>/best_weights` and the metrics are written to
    `metrics_eval_best_weights.json`. Training stops early after 200
    consecutive epochs without such an improvement.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # One saver for the rolling "last" checkpoints, one for the single best.
    last_saver = tf.train.Saver()
    best_saver = tf.train.Saver(max_to_keep=1)
    first_epoch = 0

    with tf.Session() as sess:
        sess.run(train_model_spec['variable_init_op'])

        # Optionally resume from a checkpoint file or checkpoint directory.
        if restore_from is not None:
            logging.info("Restoring parameters from {}".format(restore_from))
            checkpoint = restore_from
            if os.path.isdir(checkpoint):
                checkpoint = tf.train.latest_checkpoint(checkpoint)
                # The checkpoint name ends in "-<epoch>".
                first_epoch = int(checkpoint.split('-')[-1])
            last_saver.restore(sess, checkpoint)

        # Tensorboard summary writers
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)

        # Scores compared between epochs, in priority order.
        ranked_keys = ('ndcg_1', 'ndcg_3', 'ndcg_5', 'ndcg_10')
        best_scores = (0.0, 0.0, 0.0, 0.0)
        stale_epochs = 0
        final_epoch = first_epoch + params.num_epochs

        for epoch in range(first_epoch, final_epoch):
            epoch_number = epoch + 1
            if stale_epochs == 200:
                logging.info("Early stopping at epoch {}/{}".format(
                    epoch_number, final_epoch))
                break
            logging.info("Epoch {}/{}".format(epoch_number, final_epoch))

            # One full pass over the training set (ceiling division).
            train_steps = (params.train_size + params.batch_size -
                           1) // params.batch_size
            train_sess(sess, train_model_spec, train_steps, train_writer,
                       params)

            last_saver.save(sess,
                            os.path.join(model_dir, 'last_weights',
                                         'after-epoch'),
                            global_step=epoch_number)

            # One full pass over the validation set.
            eval_steps = (params.eval_size + params.batch_size -
                          1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_spec, eval_steps,
                                    eval_writer)

            scores = tuple(metrics[key] for key in ranked_keys)
            # Fresh lists are handed over on every call, as before.
            if isSavingWeights(list(scores), list(best_scores)):
                stale_epochs = 0
                best_scores = scores
                best_save_path = best_saver.save(
                    sess,
                    os.path.join(model_dir, 'best_weights', 'after-epoch'),
                    global_step=epoch_number)
                logging.info(
                    "- Found new best metric score, saving in {}".format(
                        best_save_path))
                save_dict_to_json(
                    metrics,
                    os.path.join(model_dir, "metrics_eval_best_weights.json"))
            else:
                stale_epochs += 1

            # Always record the metrics of the most recent epoch.
            save_dict_to_json(
                metrics,
                os.path.join(model_dir, "metrics_eval_last_weights.json"))
def train_and_evaluate(train_model_spec,
                       eval_model_spec,
                       model_dir,
                       params,
                       restore_from=None,
                       global_epoch=1):
    """Train the model and evaluate it on the validation set every epoch.

    After each epoch the weights are saved under ``<model_dir>/last_weights``.
    When ``isSavingWeights`` accepts the validation metrics
    ``[accuracy, -loss]`` against the best ones seen so far, the weights are
    also saved under ``<model_dir>/best_weights`` and the metrics are written
    to ``metrics_eval_best_weights.json``. Training stops early once
    ``params.early_stoping_epochs`` epochs in a row were not accepted.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: (Params) contains hyperparameters of the model.
                Read directly here: num_epochs, train_size, vali_size, batch_size,
                early_stoping_epochs, loss_fn (and use_kfac when loss_fn == 'cnn').
                The helpers that receive params may require more.
        restore_from: (string) directory or file, joined onto model_dir, containing
                weights to restore the graph
        global_epoch: kept for backward compatibility only. The value passed in is
                overwritten before it is read: counting starts at 0, or at the step
                of the restored checkpoint when restore_from is a directory.

    Returns:
        (int) the epoch counter, incremented once per completed epoch.

    Raises:
        ValueError: if restore_from resolves to a directory without a checkpoint.
    """
    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0
    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])
        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'vali_summaries'), sess.graph)
        try:
            # Loop-invariant output locations
            last_save_path = os.path.join(model_dir, 'last_weights',
                                          'after-epoch')
            best_save_prefix = os.path.join(model_dir, 'best_weights',
                                            'after-epoch')
            best_json_path = os.path.join(model_dir,
                                          "metrics_eval_best_weights.json")
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            best_eval_metrics = [0.0, -float('inf')]
            # NOTE: this deliberately discards the `global_epoch` argument,
            # exactly as the function has always done.
            global_epoch = 0
            # Reload weights from directory if specified
            # (restore from the previous learner)
            if restore_from is not None:
                save_path = os.path.join(model_dir, restore_from)
                if os.path.isdir(save_path):
                    checkpoint_dir = save_path
                    save_path = tf.train.latest_checkpoint(checkpoint_dir)
                    if save_path is None:
                        # latest_checkpoint returns None when nothing is found;
                        # fail with a clear message instead of on `.split`.
                        raise ValueError(
                            "No checkpoint found in {}".format(checkpoint_dir))
                    begin_at_epoch = int(save_path.split('-')[-1])
                    global_epoch = begin_at_epoch
                logging.info("Restoring parameters from {}".format(save_path))
                pretrained_include = get_pretrained_include(params)
                pretrained_vars = tf.contrib.framework.get_variables_to_restore(
                    include=pretrained_include)
                pretrained_saver = tf.train.Saver(pretrained_vars,
                                                  name="pretrained_saver")
                pretrained_saver.restore(sess, save_path)
                # NOTE: the previous best metrics are not reloaded from
                # best_json_path; tracking restarts from the baseline above.
                logging.info(best_eval_metrics)
            model_summary()
            # Ops that sync auxiliary variables on a new best score. They are
            # built lazily and only once: creating fresh tf.assign ops on
            # every improvement would keep adding nodes to the graph.
            sync_ops = None
            sync_log_template = None
            # for each learner
            early_stopping_count = 0
            final_epoch = begin_at_epoch + params.num_epochs
            for epoch in range(begin_at_epoch, final_epoch):
                if early_stopping_count == int(params.early_stoping_epochs):
                    logging.info("Early stopping at epoch {}/{}".format(
                        epoch + 1, final_epoch))
                    break
                # Run one epoch
                logging.info("Epoch {}/{}".format(epoch + 1, final_epoch))
                # Compute number of batches in one epoch (one full pass over the training set)
                num_steps = (params.train_size + params.batch_size -
                             1) // params.batch_size
                train_sess(sess, train_model_spec, num_steps, train_writer,
                           params)
                # Save weights (labelled with the counter before it is incremented)
                last_saver.save(sess, last_save_path, global_step=global_epoch)
                # Evaluate for one epoch on validation set
                num_steps = (params.vali_size + params.batch_size -
                             1) // params.batch_size
                metrics = evaluate_sess(sess, eval_model_spec, num_steps,
                                        eval_writer, params)
                # Loss is negated so that, like accuracy, larger is better
                accuracy_metric = round(metrics['accuracy'], 6)
                loss_metric = -round(metrics['loss'], 6)
                eval_metrics = [accuracy_metric, loss_metric]
                if isSavingWeights(eval_metrics, best_eval_metrics):
                    # reset early_stopping_count
                    early_stopping_count = 0
                    best_eval_metrics = eval_metrics
                    if sync_ops is None:
                        sync_ops, sync_log_template = \
                            _build_best_weight_sync_ops(params)
                    if sync_ops:
                        sess.run(sync_ops)
                    # A single save is enough: it runs after the sync, so the
                    # checkpoint already contains the updated variables.
                    best_save_path = best_saver.save(sess,
                                                     best_save_prefix,
                                                     global_step=global_epoch)
                    if sync_log_template is not None:
                        logging.info(sync_log_template.format(best_save_path))
                    logging.info(
                        "- Found new best metric score, saving in {}".format(
                            best_save_path))
                    # Save best eval metrics in a json file in the model directory
                    save_dict_to_json(metrics, best_json_path)
                else:
                    early_stopping_count = early_stopping_count + 1
                # Save latest eval metrics in a json file in the model directory
                save_dict_to_json(metrics, last_json_path)
                global_epoch += 1
            # The variable sync is intentionally not repeated here: doing it
            # after the loop would copy the last weights, not the best ones.
        finally:
            # Flush pending summaries and release the event files
            train_writer.close()
            eval_writer.close()
    return global_epoch


def _build_best_weight_sync_ops(params):
    """Build the ops run when a new best score is found, depending on loss_fn.

    Args:
        params: (Params) hyperparameters; reads loss_fn, and use_kfac when
                loss_fn == 'cnn'.

    Returns:
        (ops, log_template): the list of assign ops and the log message
        template to format with the checkpoint path, or ([], None) when
        params.loss_fn requires no sync.
    """
    if params.loss_fn == 'cnn' and not params.use_kfac:
        # Copy every 'model/cnn' variable into its 'model/c_cnn' counterpart
        source_vars = [
            v for v in tf.trainable_variables() if 'model/cnn' in v.name
        ]
        copy_vars = [
            v for v in tf.trainable_variables() if 'model/c_cnn' in v.name
        ]
        ops = [
            tf.assign(copy, source)
            for (copy, source) in zip(copy_vars, source_vars)
        ]
        return ops, "- Make a copy of cnn vars, saving in {}"
    if params.loss_fn == 'retrain_regu_mine3':
        # Multiply every 'model/cnn' variable in place by its 'model/mask'
        weight_vars = [
            v for v in tf.trainable_variables() if 'model/cnn' in v.name
        ]
        mask_vars = [
            v for v in tf.trainable_variables() if 'model/mask' in v.name
        ]
        ops = [
            tf.assign(weight, tf.multiply(mask, weight))
            for (weight, mask) in zip(weight_vars, mask_vars)
        ]
        return ops, "- Updated cnn vars, saving in {}"
    return [], None
def train_and_evaluate(train_model_specs,
                       eval_model_specs,
                       model_dir,
                       params,
                       restore_from=None):
    """Train the model and evaluate every epoch.

    After each epoch the weights are saved under ``<model_dir>/last_weights``
    and the evaluation metrics are written to
    ``metrics_eval_last_weights.json``. Whenever the evaluation loss is lower
    than or equal to the best loss seen so far, the weights are also saved
    under ``<model_dir>/best_weights`` and the metrics are written to
    ``metrics_eval_best_weights.json``.

    Args:
        train_model_specs: (dict) contains the graph operations or nodes needed for training
        eval_model_specs: (dict) contains the graph operations or nodes needed for evaluation
        model_dir: (string) directory containing config, weights and log
        params: contains hyperparameters of the model.
                Read directly here: num_epochs, train_size, batch_size, eval_size.
                The helpers that receive params may require more.
        restore_from: (string) directory or file containing weights to restore the graph.

    Raises:
        ValueError: if restore_from is a directory without a checkpoint.
    """

    # Initialize tf.Saver() instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (based on eval)

    begin_at_epoch = 0
    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_specs['variable_init_op'])

        # Load the mobilenet pretrain weights
        train_model_specs['mobilenet_init_op'](sess)

        # Reload weights from directory if specified
        if restore_from is not None:
            if os.path.isdir(restore_from):
                checkpoint_dir = restore_from
                restore_from = tf.train.latest_checkpoint(checkpoint_dir)
                if restore_from is None:
                    # latest_checkpoint returns None when nothing is found;
                    # fail with a clear message instead of on `.split`.
                    raise ValueError(
                        "No checkpoint found in {}".format(checkpoint_dir))
                begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # Create summary writer for train and eval
        train_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(model_dir, 'eval_summaries'), sess.graph)

        # Loop-invariant output locations
        last_save_path = os.path.join(model_dir, 'last_weights',
                                      'after-epoch')
        best_save_prefix = os.path.join(model_dir, 'best_weights',
                                        'after-epoch')
        best_json_path = os.path.join(model_dir,
                                      "metrics_eval_best_weights.json")
        last_json_path = os.path.join(model_dir,
                                      "metrics_eval_last_weights.json")

        # Start from +inf so the first finite loss always counts as the best;
        # a fixed sentinel would never save a checkpoint for larger losses.
        best_eval_loss = float('inf')
        final_epoch = begin_at_epoch + params.num_epochs
        for epoch in range(begin_at_epoch, final_epoch):
            # Run one epoch
            logging.info("Epoch {}/{}".format(epoch + 1, final_epoch))
            # Number of batches in one full pass over the training set
            num_steps = (params.train_size + params.batch_size -
                         1) // params.batch_size
            train_sess(sess, train_model_specs, num_steps, params,
                       train_writer)

            # Save weights
            last_saver.save(sess, last_save_path, global_step=epoch + 1)

            # Evaluate for one epoch on validation set
            num_steps = (params.eval_size + params.batch_size -
                         1) // params.batch_size
            metrics = evaluate_sess(sess, eval_model_specs, num_steps)

            # If best_loss, best_save_path
            eval_loss = metrics['loss']
            if eval_loss <= best_eval_loss:
                # Store new best loss
                best_eval_loss = eval_loss
                # Save weights
                best_save_path = best_saver.save(sess,
                                                 best_save_prefix,
                                                 global_step=epoch + 1)
                # The selection criterion is the loss, so the message says so
                logging.info("- Found new best loss, saving in {}".format(
                    best_save_path))
                save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory
            save_dict_to_json(metrics, last_json_path)