def main(_):
    """Train on the ImageNet subset named by ``FLAGS.subset``.

    ``FLAGS.train_dir`` is removed if it already exists and then recreated,
    so every invocation starts from an empty training directory.

    Args:
        _: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    imagenet = ImagenetData(subset=FLAGS.subset)
    assert imagenet.data_files()

    # Start from a clean slate: drop any previous contents, then recreate.
    train_dir_present = tf.gfile.Exists(FLAGS.train_dir)
    if train_dir_present:
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

    inception_train.train(imagenet)
def main(unused_argv=None):
    """Evaluate on the ImageNet subset named by ``FLAGS.subset``.

    ``FLAGS.eval_dir`` is deleted when present and recreated before
    ``inception_eval.evaluate`` runs.

    Args:
        unused_argv: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    imagenet = ImagenetData(subset=FLAGS.subset)
    assert imagenet.data_files()

    # Remove output from earlier evaluations before recreating the directory.
    eval_dir_present = tf.gfile.Exists(FLAGS.eval_dir)
    if eval_dir_present:
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    inception_eval.evaluate(imagenet)
def main(unused_args):
    """Run this process as one task of a distributed training job.

    The cluster is described by the comma-separated ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts``. A ``tf.train.Server`` is started for
    ``FLAGS.job_name`` / ``FLAGS.task_id`` using ``FLAGS.protocol``; a ``ps``
    task then joins the server, while a ``worker`` task trains on ImageNet.

    Args:
        unused_args: Ignored.

    Raises:
        AssertionError: If ``FLAGS.job_name`` is neither ``ps`` nor
            ``worker``, or if a worker's dataset reports no data files.
    """
    assert FLAGS.job_name in ('ps', 'worker'), 'job_name must be ps or worker'

    # Split the host flags into per-job host lists.
    parameter_servers = FLAGS.ps_hosts.split(',')
    workers = FLAGS.worker_hosts.split(',')
    tf.logging.info('PS hosts are: %s' % parameter_servers)
    tf.logging.info('Worker hosts are: %s' % workers)

    hosts_by_job = {'ps': parameter_servers, 'worker': workers}
    cluster_spec = tf.train.ClusterSpec(hosts_by_job)
    server = tf.train.Server(
        hosts_by_job,
        job_name=FLAGS.job_name,
        task_index=FLAGS.task_id,
        protocol=FLAGS.protocol)

    if FLAGS.job_name == 'ps':
        # Parameter servers only serve requests coming from the workers.
        server.join()
        return

    # From here on this process is a worker.
    imagenet = ImagenetData(subset=FLAGS.subset)
    assert imagenet.data_files()

    # Task 0 is the only one that creates train_dir when it is missing.
    if FLAGS.task_id == 0 and not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)

    inception_distributed_train.train(server.target, imagenet, cluster_spec)
def main(unused_argv):
    """Start a TensorFlow server for this task and, on workers, train.

    The cluster is built from the comma-separated ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts``. A ``ps`` task joins its server and then exits
    with status 0. A worker loads the ImageNet subset named by
    ``FLAGS.subset``, creates ``FLAGS.train_dir`` if needed, runs
    distributed training and finally prints a short summary.

    Args:
        unused_argv: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.worker_index)

    if FLAGS.job_name == "ps":
        server.join()
        sys.exit(0)

    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()

    # Only the chief checks for or creates train_dir.
    # NOTE(review): the server above is indexed by FLAGS.worker_index while
    # the chief test reads FLAGS.task_id -- confirm both flags are kept in
    # sync by the launcher.
    if FLAGS.task_id == 0 and not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)

    # Pass the ClusterSpec built above; the name `cluster_spec` is never
    # bound in this function.
    inception_distributed_train.train(server.target, dataset, cluster)

    num_workers = len(worker_hosts)
    worker_grpc_url = 'grpc://' + worker_hosts[0]
    print("Worker GRPC URL: %s" % worker_grpc_url)
    print("Worker index = %d" % FLAGS.worker_index)
    print("Number of workers = %d" % num_workers)
def main(unused_args):
    """Launch one ps or worker task of a distributed training job.

    Host lists come from the comma-separated ``FLAGS.ps_hosts`` and
    ``FLAGS.worker_hosts``. The task's ``tf.train.Server`` is identified by
    ``FLAGS.job_name`` and ``FLAGS.task_id``. A ``ps`` task joins its server;
    a ``worker`` task loads ImageNet and calls the distributed trainer.

    Args:
        unused_args: Ignored.

    Raises:
        AssertionError: If ``FLAGS.job_name`` is neither ``ps`` nor
            ``worker``, or if a worker's dataset reports no data files.
    """
    assert FLAGS.job_name in ('ps', 'worker'), 'job_name must be ps or worker'

    # Turn the two host flags into the job -> hosts mapping of the cluster.
    ps_list = FLAGS.ps_hosts.split(',')
    worker_list = FLAGS.worker_hosts.split(',')
    tf.logging.info('PS hosts are: %s' % ps_list)
    tf.logging.info('Worker hosts are: %s' % worker_list)

    job_hosts = {'ps': ps_list, 'worker': worker_list}
    cluster_spec = tf.train.ClusterSpec(job_hosts)
    server = tf.train.Server(
        job_hosts, job_name=FLAGS.job_name, task_index=FLAGS.task_id)

    running_as_ps = FLAGS.job_name == 'ps'
    if running_as_ps:
        # A ps task does nothing but serve the workers' requests.
        server.join()
        return

    # Worker path: load the data and train.
    imagenet = ImagenetData(subset=FLAGS.subset)
    assert imagenet.data_files()

    # Creating a missing train_dir is left to task 0 alone.
    if FLAGS.task_id == 0 and not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)

    inception_distributed_train.train(server.target, imagenet, cluster_spec)
def main(_):
    """Reset ``FLAGS.train_dir`` and train on the configured ImageNet subset.

    Args:
        _: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    data = ImagenetData(subset=FLAGS.subset)
    assert data.data_files()

    output_dir = FLAGS.train_dir
    # An existing directory is removed entirely before being recreated.
    if tf.gfile.Exists(output_dir):
        tf.gfile.DeleteRecursively(output_dir)
    tf.gfile.MakeDirs(output_dir)

    inception_train.train(data)
def main(unused_argv=None):
    """Reset ``FLAGS.eval_dir``, record dataset flags and run evaluation.

    Besides recreating the evaluation directory, this sets
    ``FLAGS.dataset_name`` to ``'imagenet'`` and ``FLAGS.num_examples`` to the
    dataset's examples-per-epoch count before calling
    ``inception_eval.evaluate``.

    Args:
        unused_argv: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    imagenet = ImagenetData(subset=FLAGS.subset)
    assert imagenet.data_files()

    # Discard results of previous evaluations.
    eval_dir_present = tf.gfile.Exists(FLAGS.eval_dir)
    if eval_dir_present:
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    # Publish dataset details through the flags object.
    FLAGS.dataset_name = 'imagenet'
    examples_per_epoch = imagenet.num_examples_per_epoch()
    FLAGS.num_examples = examples_per_epoch

    inception_eval.evaluate(imagenet)
def main(_):
    """Run the slim evaluation loop for a semisup model on ImageNet.

    Builds an input pipeline over the ``validation`` subset, wraps it in a
    ``semisup.SemisupModel``, registers a streaming accuracy metric as a
    summary and hands control to ``slim.evaluation.evaluation_loop``, which
    reads checkpoints from ``FLAGS.logdir``.

    Args:
        _: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    # Load dataset
    tf.app.flags.FLAGS.data_dir = '/work/haeusser/data/imagenet/shards'
    dataset = ImagenetData(subset='validation')
    assert dataset.data_files()

    num_labels = dataset.num_classes() + 1
    image_shape = [FLAGS.image_size, FLAGS.image_size, 3]

    graph = tf.Graph()
    with graph.as_default():
        # NOTE(review): the batch size here is the literal 32, whereas
        # num_batches below divides by FLAGS.eval_batch_size; train=True is
        # also passed although this is an evaluation run -- confirm both
        # are intended.
        images, labels = image_processing.batch_inputs(
            dataset, 32, train=True,
            num_preprocess_threads=16,
            num_readers=FLAGS.num_readers)

        # Set up semisup model.
        model = semisup.SemisupModel(semisup.architectures.inception_model,
                                     num_labels, image_shape, test_in=images)

        # Add moving average variables.
        for var in tf.get_collection('moving_vars'):
            tf.add_to_collection(tf.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
        for var in slim.get_model_variables():
            tf.add_to_collection(tf.GraphKeys.MOVING_AVERAGE_VARIABLES, var)

        # Get prediction tensor from semisup model.
        predictions = tf.argmax(model.test_logit, 1)

        # Accuracy metric for summaries.
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        })
        # .items() exists on both Python 2 and 3; .iteritems() is 2-only.
        for name, value in names_to_values.items():
            tf.summary.scalar(name, value)

        # Run the actual evaluation loop.
        num_batches = math.ceil(
            dataset.num_examples_per_epoch() / float(FLAGS.eval_batch_size))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        slim.evaluation.evaluation_loop(
            master=FLAGS.master,
            checkpoint_dir=FLAGS.logdir,
            logdir=FLAGS.logdir,
            num_evals=num_batches,
            # Materialise the update ops: on Python 3 .values() is a view,
            # not a list.
            eval_op=list(names_to_updates.values()),
            eval_interval_secs=FLAGS.eval_interval_secs,
            session_config=config)
def main_fun(argv, ctx):
    """Per-node entry point that evaluates on ImageNet.

    Installs ``argv`` as ``sys.argv``, parses the TensorFlow flags, recreates
    ``FLAGS.eval_dir``, starts the node's cluster server through ``TFNode``
    and then calls ``inception_eval.evaluate``.

    Args:
        argv: Argument list to install as ``sys.argv`` before flag parsing.
        ctx: Node context forwarded to ``TFNode.start_cluster_server``.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    # Imported inside the function rather than at module level.
    import tensorflow as tf
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv
    flags = tf.app.flags.FLAGS
    flags._parse_flags()
    print("FLAGS:", flags.__dict__['__flags'])

    imagenet = ImagenetData(subset=flags.subset)
    assert imagenet.data_files()

    # Throw away results of earlier evaluations.
    eval_dir = flags.eval_dir
    if tf.gfile.Exists(eval_dir):
        tf.gfile.DeleteRecursively(eval_dir)
    tf.gfile.MakeDirs(eval_dir)

    # Neither value is read again; they stay bound until evaluation returns.
    # NOTE(review): the literal 1 is presumably a GPU count -- confirm
    # against TFNode.start_cluster_server.
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, flags.rdma)

    inception_eval.evaluate(imagenet)
def main_fun(argv, ctx):
    """Per-node entry point that evaluates on ImageNet.

    Installs ``argv`` as ``sys.argv``, parses the TensorFlow flags, recreates
    ``FLAGS.eval_dir``, starts the node's cluster server through ``TFNode``
    (passing only ``ctx``) and then calls ``inception_eval.evaluate``.

    Args:
        argv: Argument list to install as ``sys.argv`` before flag parsing.
        ctx: Node context forwarded to ``TFNode.start_cluster_server``.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    # Imported inside the function rather than at module level.
    import tensorflow as tf
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv
    flags = tf.app.flags.FLAGS
    flags._parse_flags()
    print("FLAGS:", flags.__dict__['__flags'])

    imagenet = ImagenetData(subset=flags.subset)
    assert imagenet.data_files()

    # Recreate the evaluation directory from scratch.
    eval_dir_present = tf.gfile.Exists(flags.eval_dir)
    if eval_dir_present:
        tf.gfile.DeleteRecursively(flags.eval_dir)
    tf.gfile.MakeDirs(flags.eval_dir)

    # Neither value is read again; they stay bound until evaluation returns.
    cluster_spec, server = TFNode.start_cluster_server(ctx)

    inception_eval.evaluate(imagenet)
def main_fun(argv, ctx):
    """Per-node entry point for distributed training.

    Reads the node's role from ``ctx``, copies it into the TensorFlow flags,
    starts the cluster server through ``TFNode`` and then either joins the
    server (``ps``) or trains on ImageNet (``worker``).

    Args:
        argv: Argument list to install as ``sys.argv``.
        ctx: Node context providing ``worker_num``, ``job_name`` and
            ``task_index``; also forwarded to ``TFNode`` and to the trainer.

    Raises:
        AssertionError: If ``ctx.job_name`` is neither ``ps`` nor ``worker``,
            or if a worker's dataset reports no data files.
    """
    # Node metadata from ctx; worker_num is read but not used afterwards.
    worker_num, job_name, task_index = (
        ctx.worker_num, ctx.job_name, ctx.task_index)
    assert job_name in ('ps', 'worker'), 'job_name must be ps or worker'

    # Imported inside the function rather than at module level.
    from inception import inception_distributed_train
    from inception.imagenet_data import ImagenetData
    import tensorflow as tf

    # Rebuild the flags on this node from argv, then record the node's role.
    print("argv:", argv)
    sys.argv = argv
    FLAGS = tf.app.flags.FLAGS
    FLAGS.job_name = job_name
    FLAGS.task_id = task_index
    print("FLAGS:", FLAGS.__dict__['__flags'])

    # Obtain the cluster description and this node's server.
    cluster_spec, server = TFNode.start_cluster_server(
        ctx, FLAGS.num_gpus, FLAGS.rdma)

    if FLAGS.job_name == 'ps':
        # A ps node only serves the workers.
        server.join()
        return

    # Worker path.
    imagenet = ImagenetData(subset=FLAGS.subset)
    assert imagenet.data_files()

    # Creating a missing train_dir is left to task 0.
    if FLAGS.task_id == 0 and not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)

    inception_distributed_train.train(
        server.target, imagenet, cluster_spec, ctx)
def main(_):
    """Train a semisup model on the ImageNet ``train`` subset.

    The graph holds one preprocessed input stream that is re-batched twice
    with ``FLAGS.sup_batch_size``: once for the supervised branch and once
    for the unsupervised branch (whose labels are not used). The semisup
    loss and the logit loss are added to the model, and training runs via
    ``slim.learning.train`` with an exponentially decayed learning rate that
    is bounded below by ``FLAGS.minimum_learning_rate``.

    Args:
        _: Ignored.

    Raises:
        AssertionError: If the dataset reports no data files.
    """
    from inception.imagenet_data import ImagenetData
    from inception import image_processing

    imagenet = ImagenetData(subset='train')
    assert imagenet.data_files()

    label_count = imagenet.num_classes() + 1
    input_shape = [FLAGS.image_size, FLAGS.image_size, 3]

    graph = tf.Graph()
    with graph.as_default():
        model = semisup.SemisupModel(inception_model, label_count, input_shape)

        images, labels = image_processing.batch_inputs(
            imagenet, 32, train=True,
            num_preprocess_threads=FLAGS.num_readers,
            num_readers=FLAGS.num_readers)

        # Both branches re-batch the same stream with identical settings.
        rebatch_options = dict(
            batch_size=FLAGS.sup_batch_size,
            enqueue_many=True,
            num_threads=FLAGS.num_readers,
            capacity=1000 + 3 * FLAGS.sup_batch_size,
        )
        t_sup_images, t_sup_labels = tf.train.batch(
            [images, labels], **rebatch_options)
        t_unsup_images, _unused_labels = tf.train.batch(
            [images, labels], **rebatch_options)

        # Embeddings for both branches; logits for the supervised one only.
        t_sup_emb = model.image_to_embedding(t_sup_images)
        t_unsup_emb = model.image_to_embedding(t_unsup_images)
        t_sup_logit = model.embedding_to_logit(t_sup_emb)

        # Register the two loss terms on the model.
        model.add_semisup_loss(
            t_sup_emb, t_unsup_emb, t_sup_labels,
            visit_weight=FLAGS.visit_weight)
        model.add_logit_loss(t_sup_logit, t_sup_labels)

        # Staircase exponential decay, never below the configured minimum.
        decayed_rate = tf.train.exponential_decay(
            FLAGS.learning_rate,
            model.step,
            FLAGS.decay_steps,
            FLAGS.decay_factor,
            staircase=True)
        t_learning_rate = tf.maximum(decayed_rate, FLAGS.minimum_learning_rate)

        # Build the train op and enter the training loop.
        train_op = model.create_train_op(t_learning_rate)

        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True

        slim.learning.train(
            train_op,
            logdir=FLAGS.logdir,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            startup_delay_steps=(FLAGS.task * 20),
            log_every_n_steps=FLAGS.log_every_n_steps,
            session_config=session_config)