def train_and_eval(model):
    for n in range(FLAGS.train_epochs):
        tf.compat.v1.logging.info('=' * 30 + ' START EPOCH {} '.format(n + 1) +
                                  '=' * 30 + '\n')
        train_data_list = list_files(FLAGS.train_data)  # dir to file list
        for f in train_data_list:
            t0 = time.time()
            tf.compat.v1.logging.info('<EPOCH {}>: Start training {}'.format(
                n + 1, f))
            model.train(input_fn=lambda: input_fn(
                f, FLAGS.image_train_data, 'train', FLAGS.batch_size),
                        hooks=None,
                        steps=None,
                        max_steps=None,
                        saving_listeners=None)
            tf.compat.v1.logging.info(
                '<EPOCH {}>: Finish training {}, take {} mins'.format(
                    n + 1, f, elapse_time(t0)))
            print('-' * 80)
            tf.compat.v1.logging.info('<EPOCH {}>: Start evaluating {}'.format(
                n + 1, FLAGS.eval_data))
            t0 = time.time()
            results = model.evaluate(
                input_fn=lambda: input_fn(FLAGS.eval_data, FLAGS.
                                          image_eval_data, 'eval', FLAGS.
                                          batch_size),
                steps=None,  # Number of steps for which to evaluate model.
                hooks=None,
                checkpoint_path=None,  # If None, the latest checkpoint in model_dir is used.
                name=None)
            tf.compat.v1.logging.info(
                '<EPOCH {}>: Finish evaluation {}, take {} mins'.format(
                    n + 1, FLAGS.eval_data, elapse_time(t0)))
            print('-' * 80)
            # Display evaluation metrics.
            for key in sorted(results):
                print('{}: {}'.format(key, results[key]))
        # Every epochs_per_eval epochs, test the model (use larger test dataset).
        if (n + 1) % FLAGS.epochs_per_eval == 0:
            tf.compat.v1.logging.info('<EPOCH {}>: Start testing {}'.format(
                n + 1, FLAGS.test_data))
            t0 = time.time()  # Reset the timer so the log covers only the test run.
            results = model.evaluate(
                input_fn=lambda: input_fn(FLAGS.test_data, FLAGS.
                                          image_test_data, 'pred', FLAGS.
                                          batch_size),
                steps=None,  # Number of steps for which to evaluate model.
                hooks=None,
                checkpoint_path=None,  # If None, the latest checkpoint in model_dir is used.
                name=None)
            tf.compat.v1.logging.info(
                '<EPOCH {}>: Finish testing {}, take {} mins'.format(
                    n + 1, FLAGS.test_data, elapse_time(t0)))
            print('-' * 80)
            # Display evaluation metrics.
            for key in sorted(results):
                print('{}: {}'.format(key, results[key]))
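# The helpers `list_files` and `elapse_time` are used throughout these scripts
# but defined elsewhere in the repo. A minimal sketch of the assumed behavior,
# inferred from the call sites (a hypothetical reconstruction, not the repo's
# actual implementation):
import os
import time


def list_files(data_path):
    """If `data_path` is a directory, return its files as a sorted list;
    otherwise treat it as a single data file."""
    if os.path.isdir(data_path):
        return sorted(
            os.path.join(data_path, f) for f in os.listdir(data_path))
    return [data_path]


def elapse_time(start_time):
    """Minutes elapsed since `start_time` (the logs report 'take {} mins')."""
    return round((time.time() - start_time) / 60, 2)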
def dynamic_train(model):
    """Dynamic train mode.

    For example:
        train_data_files: [0301, 0302, 0303, ...]
        train mode: first take 0301 as train data and 0302 as test data;
        then keep training, taking 0302 as train data and 0303 as test data; ...
    """
    data_files = list_files(FLAGS.train_data)
    data_files.sort()
    assert len(data_files) > 1, 'Dynamic train mode needs more than 1 data file'
    for i in range(len(data_files) - 1):
        train_data = data_files[i]
        test_data = data_files[i + 1]
        tf.compat.v1.logging.info(
            '=' * 30 + ' START TRAINING DATA: {} '.format(train_data) +
            '=' * 30 + '\n')
        for n in range(FLAGS.train_epochs):
            t0 = time.time()
            tf.compat.v1.logging.info(
                'START TRAIN DATA <{}> <EPOCH {}>'.format(train_data, n + 1))
            model.train(input_fn=lambda: input_fn(
                train_data, FLAGS.image_train_data, 'train', FLAGS.batch_size),
                        hooks=None,
                        steps=None,
                        max_steps=None,
                        saving_listeners=None)
            tf.compat.v1.logging.info(
                'FINISH TRAIN DATA <{}> <EPOCH {}>, take {} mins'.format(
                    train_data, n + 1, elapse_time(t0)))
            print('-' * 80)
            tf.compat.v1.logging.info(
                'START EVALUATE TEST DATA <{}> <EPOCH {}>'.format(
                    test_data, n + 1))
            t0 = time.time()
            results = model.evaluate(
                input_fn=lambda: input_fn(test_data, FLAGS.image_eval_data,
                                          'eval', FLAGS.batch_size),
                steps=None,  # Number of steps for which to evaluate model.
                hooks=None,
                checkpoint_path=None,  # If None, the latest checkpoint in model_dir is used.
                name=None)
            tf.compat.v1.logging.info(
                'FINISH EVALUATE TEST DATA <{}> <EPOCH {}>, take {} mins'.
                format(test_data, n + 1, elapse_time(t0)))
            print('-' * 80)
            # Display evaluation metrics.
            for key in sorted(results):
                print('{}: {}'.format(key, results[key]))
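# The index arithmetic in `dynamic_train` implements a sliding window over the
# sorted file list. The same (train, test) pairing can be expressed with `zip`,
# shown here as a standalone illustration rather than a drop-in change to the
# function above:
def sliding_pairs(data_files):
    """Yield consecutive (train_data, test_data) file pairs."""
    return zip(data_files, data_files[1:])


# e.g. list(sliding_pairs(['0301', '0302', '0303']))
# -> [('0301', '0302'), ('0302', '0303')]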
def main(unused_argv): print("Using TensorFlow version %s" % tf.__version__) assert "1.4" <= tf.__version__, "TensorFlow r1.4 or later is needed" # if FLAGS.is_distribution: # print("Using distribution tensoflow. Job_name:{} Task_index:{}" # .format(CONFIG.distribution["job_name"], CONFIG.distribution["task_index"])) print('Model type: {}'.format(FLAGS.model_type)) model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type) print('Model directory: {}'.format(model_dir)) model = build_estimator(model_dir, FLAGS.model_type) tf.logging.info('Build estimator: {}'.format(model)) checkpoint_path = FLAGS.checkpoint_path or model.latest_checkpoint() if checkpoint_path is None: raise ValueError( 'No model checkpoint found, please check the model dir.') tf.logging.info('Using model checkpoint: {}'.format(checkpoint_path)) print('\n') tf.logging.info('=' * 30 + ' START TESTING' + '=' * 30) s_time = time.time() results = model.evaluate( input_fn=lambda: input_fn(FLAGS.test_data, FLAGS.image_test_data, 'eval', FLAGS.batch_size), steps=None, # Number of steps for which to evaluate model. hooks=None, checkpoint_path=FLAGS. checkpoint_path, # If None, the latest checkpoint is used. name=None) tf.logging.info('=' * 30 + 'FINISH TESTING, TAKE {}'.format(elapse_time(s_time)) + '=' * 30) # Display evaluation metrics print('-' * 80) for key in sorted(results): print('%s: %s' % (key, results[key]))
def main(unused_argv): print("Using TensorFlow version %s" % tf.__version__) assert "1.4" <= tf.__version__, "TensorFlow r1.4 or later is needed" if FLAGS.data_dir is None: raise ValueError("Must specify prediction data_file by --data_dir") print('Model type: {}'.format(FLAGS.model_type)) model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type) print('Model directory: {}'.format(model_dir)) model = build_estimator(model_dir, FLAGS.model_type) tf.logging.info('Build estimator: {}'.format(model)) tf.logging.info('=' * 30 + 'START PREDICTION' + '=' * 30) t0 = time.time() predictions = model.predict(input_fn=lambda: input_fn( FLAGS.data_dir, FLAGS.image_data_dir, 'pred', FLAGS.batch_size), predict_keys=None, hooks=None, checkpoint_path=FLAGS.checkpoint_path ) # defaults None to use latest_checkpoint tf.logging.info('=' * 30 + 'FINISH PREDICTION, TAKE {} mins'.format(elapse_time(t0)) + '=' * 30) for pred_dict in predictions: # dict{probabilities, classes, class_ids} class_id = pred_dict['class_ids'][0] probability = pred_dict['probabilities'][class_id] print('\nPrediction is "{}" ({:.1f}%)'.format(class_id, 100 * probability))
def main(unused_argv): print("Using TensorFlow version %s" % tf.__version__) assert "1.4" <= tf.__version__, "TensorFlow r1.4 or later is needed" # if FLAGS.is_distribution: # print("Using distribution tensoflow. Job_name:{} Task_index:{}" # .format(CONFIG.distribution["job_name"], CONFIG.distribution["task_index"])) # model info print('Model type: {}'.format(FLAGS.model_type)) model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type) print('Model directory: {}'.format(model_dir)) model = build_estimator(model_dir, FLAGS.model_type) tf.logging.info('Build estimator: {}'.format(model)) checkpoint_path = FLAGS.checkpoint_path or model.latest_checkpoint() if checkpoint_path is None: raise ValueError('No model checkpoint found, please check the model dir.') tf.logging.info('Using model checkpoint: {}'.format(checkpoint_path)) print('-' * 80) tf.logging.info('='*30+' START PREDICTION'+'='*30) t0 = time.time() predictions = model.predict(input_fn=lambda: input_fn(FLAGS.data_dir, FLAGS.image_data_dir, 'pred', FLAGS.batch_size), predict_keys=None, hooks=None, checkpoint_path=checkpoint_path) # defaults None to use latest_checkpoint tf.logging.info('='*30+'FINISH PREDICTION, TAKE {} mins'.format(elapse_time(t0))+'='*30) for pred_dict in predictions: # dict{probabilities, classes, class_ids} class_id = pred_dict['class_ids'][0] probability = pred_dict['probabilities'][class_id] print('\nPrediction is "{}" ({:.1f}%)'.format(class_id, 100 * probability))
def main(unused_argv): print("Using TensorFlow version %s" % tf.__version__) # assert "1.4" <= tf.__version__, "TensorFlow r1.4 or later is needed" if FLAGS.data_dir is None: raise ValueError("Must specify prediction data_file by --data_dir") print('Model type: {}'.format(FLAGS.model_type)) model_dir = os.path.join(FLAGS.model_dir, FLAGS.model_type) print('Model directory: {}'.format(model_dir)) # model = build_estimator(model_dir, FLAGS.model_type) model = build_custom_estimator(model_dir, FLAGS.model_type) tf.compat.v1.logging.info('Build estimator: {}'.format(model)) # weights and other parameters (e.g. Adagrad) of the model name_ls = model.get_variable_names() print_shape = True total_linear_weights = 0 for name in name_ls: if print_shape: shape = model.get_variable_value(name).shape print(name, "\t", shape) if name[:6] == "linear" and \ (name[-7:] == "weights"or name[-4:] == "bias"): total_linear_weights += np.prod(shape) else: print(name) if print_shape: print("Total parameters in linear model: {}".format( total_linear_weights)) # embedding layer look up sample_embedding = model.get_variable_value( 'dnn/input_from_feature_columns/input_layer/ad_cates_embedding/embedding_weights' ) ids = [10, 20, 30] with tf.compat.v1.Session() as sess: lookup = tf.nn.embedding_lookup(params=sample_embedding, ids=ids).eval() print(lookup) # predictions tf.compat.v1.logging.info('=' * 30 + 'START PREDICTION' + '=' * 30) t0 = time.time() predictions = model.predict(input_fn=lambda: input_fn( FLAGS.data_dir, FLAGS.image_data_dir, 'pred', FLAGS.batch_size), predict_keys=None, hooks=None, checkpoint_path=FLAGS.checkpoint_path ) # defaults None to use latest_checkpoint for pred_dict in predictions: # dict{probabilities, classes, class_ids} class_id = pred_dict['class_ids'][0] probability = pred_dict['probabilities'][class_id] print('\nPrediction is "{}" ({:.1f}%)'.format(class_id, 100 * probability)) tf.compat.v1.logging.info( '=' * 30 + 'FINISH PREDICTION, TAKE {} mins'.format(elapse_time(t0)) + '=' * 30)
def train(model):
    for n in range(FLAGS.train_epochs):
        tf.logging.info('=' * 30 + ' START EPOCH {} '.format(n + 1) +
                        '=' * 30 + '\n')
        train_data_list = list_files(FLAGS.train_data)  # dir to file list
        for f in train_data_list:
            t0 = time.time()
            tf.logging.info('<EPOCH {}>: Start training {}'.format(n + 1, f))
            model.train(input_fn=lambda: input_fn(
                f, FLAGS.image_train_data, 'train', FLAGS.batch_size),
                        hooks=None,
                        steps=None,
                        max_steps=None,
                        saving_listeners=None)
            tf.logging.info(
                '<EPOCH {}>: Finish training {}, take {} mins'.format(
                    n + 1, f, elapse_time(t0)))
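# Every call site above assumes the same `input_fn(data_file, image_dir, mode,
# batch_size)` contract. Its real implementation lives elsewhere in the repo;
# this is only a hypothetical skeleton of the assumed interface:
def input_fn(data_file, image_data_dir, mode, batch_size):
    """Build the input pipeline for the Estimator.

    Args:
        data_file: path to a data file (or directory) to read.
        image_data_dir: directory holding the corresponding image data.
        mode: one of 'train', 'eval', or 'pred'; presumably controls
            shuffling/repeat and whether labels are produced.
        batch_size: number of examples per batch.

    Returns:
        A (features, labels) pair for 'train'/'eval', or features for 'pred'.
    """
    raise NotImplementedError('Defined elsewhere in the repo.')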