def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  evaluator.evaluate(
      create_input_dict_fn, model_fn, eval_config, categories,
      FLAGS.checkpoint_dir, FLAGS.eval_dir, graph_hook_fn=graph_rewriter_fn)
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  if FLAGS.gpu:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  categories = label_map_util.create_categories_from_labelmap(
      input_config.label_map_path)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir,
                     graph_hook_fn=graph_rewriter_fn)
def get_configs_from_multiple_files(model_config_path="",
                                    train_config_path="",
                                    train_input_config_path="",
                                    eval_config_path="",
                                    eval_input_config_path="",
                                    lstm_config_path=""):
  """Reads training configuration from multiple config files.

  Args:
    model_config_path: Path to model_pb2.DetectionModel.
    train_config_path: Path to train_pb2.TrainConfig.
    train_input_config_path: Path to input_reader_pb2.InputReader.
    eval_config_path: Path to eval_pb2.EvalConfig.
    eval_input_config_path: Path to input_reader_pb2.InputReader.
    lstm_config_path: Path to pipeline_pb2.LstmModel.

  Returns:
    Dictionary of configuration objects. Keys are `model`, `train_config`,
    `train_input_config`, `eval_config`, `eval_input_config`, `lstm_model`.
    Key/values are returned only for valid (non-empty) strings.
  """
  configs = config_util.get_configs_from_multiple_files(
      model_config_path=model_config_path,
      train_config_path=train_config_path,
      train_input_config_path=train_input_config_path,
      eval_config_path=eval_config_path,
      eval_input_config_path=eval_input_config_path)
  if lstm_config_path:
    lstm_config = internal_pipeline_pb2.LstmModel()
    with tf.gfile.GFile(lstm_config_path, "r") as f:
      text_format.Merge(f.read(), lstm_config)
    configs["lstm_model"] = lstm_config
  return configs
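# A minimal usage sketch for the wrapper above, assuming the listed config
# files already exist on disk; every path below is a hypothetical placeholder.
configs = get_configs_from_multiple_files(
    model_config_path='configs/model.config',
    train_config_path='configs/train.config',
    train_input_config_path='configs/train_input.config',
    eval_config_path='configs/eval.config',
    eval_input_config_path='configs/eval_input.config',
    lstm_config_path='configs/lstm.config')
lstm_config = configs['lstm_model']  # Present only when lstm_config_path is non-empty.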
def main(argv):
  del argv
  required_flags = ['input_config_path', 'eval_config_path', 'eval_dir']
  for flag_name in required_flags:
    if not getattr(FLAGS, flag_name):
      raise ValueError('Flag --{} is required'.format(flag_name))

  configs = config_util.get_configs_from_multiple_files(
      eval_input_config_path=FLAGS.input_config_path,
      eval_config_path=FLAGS.eval_config_path)
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']

  metrics, pr_values = read_data_and_evaluate(input_config, eval_config)

  # Save metrics.
  write_metrics(metrics, FLAGS.eval_dir)

  # Additionally, write the precision and recall arrays, one pair of files
  # per category.
  category_list = pr_values.keys()
  for cat in category_list:
    precisions = pr_values[cat]['precisions']
    recalls = pr_values[cat]['recalls']
    precision_file = os.path.join(FLAGS.eval_dir,
                                  '{}_{}'.format(cat, 'precisions.txt'))
    recall_file = os.path.join(FLAGS.eval_dir,
                               '{}_{}'.format(cat, 'recalls.txt'))
    with open(precision_file, 'w') as f:
      for item in precisions:
        f.write("%s\n" % item)
    with open(recall_file, 'w') as f:
      for item in recalls:
        f.write("%s\n" % item)
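# A small companion sketch reading back one of the per-category files the
# loop above writes (one value per line); 'person_precisions.txt' is a
# hypothetical example name and FLAGS.eval_dir is assumed to be set.
with open(os.path.join(FLAGS.eval_dir, 'person_precisions.txt')) as f:
  precisions = [float(line) for line in f if line.strip()]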
def main(unused_argv):
  # Uncomment the following lines to restrict this evaluation process to
  # only 30% of the GPU VRAM.
  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
  # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(
        FLAGS.pipeline_config_path,
        os.path.join(FLAGS.eval_dir, 'pipeline.config'),
        overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  categories = label_map_util.create_categories_from_labelmap(
      input_config.label_map_path)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  evaluator.evaluate(
      create_input_dict_fn, model_fn, eval_config, categories,
      FLAGS.checkpoint_dir, FLAGS.eval_dir, graph_hook_fn=graph_rewriter_fn)
def main(unused_argv):
  if FLAGS.omp > 0:
    if not os.environ.get("OMP_NUM_THREADS"):
      logging.info('OMP_NUM_THREADS value= %d', FLAGS.omp)
      os.environ["OMP_NUM_THREADS"] = str(FLAGS.omp)
    if not os.environ.get("KMP_BLOCKTIME"):
      logging.info('KMP_BLOCKTIME value= %d', FLAGS.blocktime)
      os.environ["KMP_BLOCKTIME"] = str(FLAGS.blocktime)
    if not os.environ.get("KMP_SETTINGS"):
      os.environ["KMP_SETTINGS"] = "1"
    # os.environ["KMP_AFFINITY"] = "granularity=fine,verbose,compact,1,0"

  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir,
                     intra_op=FLAGS.intra_op, inter_op=FLAGS.inter_op)
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  # evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
  #                    FLAGS.checkpoint_dir, FLAGS.eval_dir)
  evaluator.evaluate(FLAGS.report_filename, create_input_dict_fn, model_fn,
                     eval_config, categories, FLAGS.checkpoint_dir,
                     FLAGS.eval_dir)

  # Write the sorted report and best-model statistics.
  df = pd.read_csv(FLAGS.report_filename)
  maxval = df.loc[df['mean_ap'].idxmax()]
  df1 = df.sort_values('model-iter')
  names = FLAGS.report_filename.split('/')
  dir_path = ''
  for i in range(len(names) - 1):
    if i > 0:
      dir_path = dir_path + '/' + names[i]
  df1.to_csv(dir_path + '/sorted-' + names[-1])
  maxval.to_csv(dir_path + '/best' + names[-1])
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.mini_batch:
    input_config.shuffle = True

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1
  if FLAGS.mini_batch:
    eval_config.max_evals = 1
    eval_config.num_visualizations = 100
    eval_config.num_examples = 100
    eval_config.visualization_export_dir = os.path.join(
        FLAGS.eval_dir, 'images')
    os.makedirs(eval_config.visualization_export_dir, exist_ok=True)

  metrics = evaluator.evaluate(create_input_dict_fn, model_fn, eval_config,
                               categories, FLAGS.checkpoint_dir,
                               FLAGS.eval_dir)
  process_metrics(metrics)
def test_get_configs_from_multiple_files(self):
  """Tests that proto configs can be read from multiple files."""
  temp_dir = self.get_temp_dir()

  # Write model config file.
  model_config_path = os.path.join(temp_dir, "model.config")
  model = model_pb2.DetectionModel()
  model.faster_rcnn.num_classes = 10
  _write_config(model, model_config_path)

  # Write train config file.
  train_config_path = os.path.join(temp_dir, "train.config")
  train_config = train_pb2.TrainConfig()
  train_config.batch_size = 32
  _write_config(train_config, train_config_path)

  # Write train input config file.
  train_input_config_path = os.path.join(temp_dir, "train_input.config")
  train_input_config = input_reader_pb2.InputReader()
  train_input_config.label_map_path = "path/to/label_map"
  _write_config(train_input_config, train_input_config_path)

  # Write eval config file.
  eval_config_path = os.path.join(temp_dir, "eval.config")
  eval_config = eval_pb2.EvalConfig()
  eval_config.num_examples = 20
  _write_config(eval_config, eval_config_path)

  # Write eval input config file.
  eval_input_config_path = os.path.join(temp_dir, "eval_input.config")
  eval_input_config = input_reader_pb2.InputReader()
  eval_input_config.label_map_path = "path/to/another/label_map"
  _write_config(eval_input_config, eval_input_config_path)

  configs = config_util.get_configs_from_multiple_files(
      model_config_path=model_config_path,
      train_config_path=train_config_path,
      train_input_config_path=train_input_config_path,
      eval_config_path=eval_config_path,
      eval_input_config_path=eval_input_config_path)
  self.assertProtoEquals(model, configs["model"])
  self.assertProtoEquals(train_config, configs["train_config"])
  self.assertProtoEquals(train_input_config, configs["train_input_config"])
  self.assertProtoEquals(eval_config, configs["eval_config"])
  self.assertProtoEquals(eval_input_config, configs["eval_input_config"])
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']
  else:
    input_config = configs['eval_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  print(label_map)
  # max_num_classes = max([item.id for item in label_map.item])
  max_num_classes = 764
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir)
def test_get_configs_from_multiple_files(self):
  """Tests that proto configs can be read from multiple files."""
  temp_dir = self.get_temp_dir()

  # Write model config file.
  model_config_path = os.path.join(temp_dir, "model.config")
  model = model_pb2.DetectionModel()
  model.faster_rcnn.num_classes = 10
  _write_config(model, model_config_path)

  # Write train config file.
  train_config_path = os.path.join(temp_dir, "train.config")
  train_config = train_pb2.TrainConfig()
  train_config.batch_size = 32
  _write_config(train_config, train_config_path)

  # Write train input config file.
  train_input_config_path = os.path.join(temp_dir, "train_input.config")
  train_input_config = input_reader_pb2.InputReader()
  train_input_config.label_map_path = "path/to/label_map"
  _write_config(train_input_config, train_input_config_path)

  # Write eval config file.
  eval_config_path = os.path.join(temp_dir, "eval.config")
  eval_config = eval_pb2.EvalConfig()
  eval_config.num_examples = 20
  _write_config(eval_config, eval_config_path)

  # Write eval input config file.
  eval_input_config_path = os.path.join(temp_dir, "eval_input.config")
  eval_input_config = input_reader_pb2.InputReader()
  eval_input_config.label_map_path = "path/to/another/label_map"
  _write_config(eval_input_config, eval_input_config_path)

  configs = config_util.get_configs_from_multiple_files(
      model_config_path=model_config_path,
      train_config_path=train_config_path,
      train_input_config_path=train_input_config_path,
      eval_config_path=eval_config_path,
      eval_input_config_path=eval_input_config_path)
  self.assertProtoEquals(model, configs["model"])
  self.assertProtoEquals(train_config, configs["train_config"])
  self.assertProtoEquals(train_input_config, configs["train_input_config"])
  self.assertProtoEquals(eval_config, configs["eval_config"])
  self.assertProtoEquals(eval_input_config, configs["eval_input_config"])
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']
  else:
    input_config = configs['eval_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  categories = []
  categories.append({'id': 1, 'name': 'Daisy'})

  if FLAGS.run_once:
    eval_config.max_evals = 1

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir)
  print('Hello World!')
def main(argv):
  del argv
  required_flags = ['input_config_path', 'eval_config_path', 'eval_dir']
  for flag_name in required_flags:
    if not getattr(FLAGS, flag_name):
      raise ValueError('Flag --{} is required'.format(flag_name))

  configs = config_util.get_configs_from_multiple_files(
      eval_input_config_path=FLAGS.input_config_path,
      eval_config_path=FLAGS.eval_config_path)
  eval_config = configs['eval_config']
  input_config = configs['eval_input_configs'][0]

  metrics = read_data_and_evaluate(input_config, eval_config)

  # Save metrics.
  write_metrics(metrics, FLAGS.eval_dir)
def main(argv):
  del argv
  required_flags = ['input_config_path', 'eval_config_path', 'eval_dir']
  for flag_name in required_flags:
    if not getattr(FLAGS, flag_name):
      raise ValueError('Flag --{} is required'.format(flag_name))

  configs = config_util.get_configs_from_multiple_files(
      eval_input_config_path=FLAGS.input_config_path,
      eval_config_path=FLAGS.eval_config_path)
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']

  metrics = read_data_and_evaluate(input_config, eval_config)

  # Save metrics.
  write_metrics(metrics, FLAGS.eval_dir)
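# A minimal entry-point sketch for the offline-eval main() above, assuming the
# flags were declared earlier with tf.app.flags; tf.app.run() parses the
# command line and then invokes main(argv).
if __name__ == '__main__':
  tf.app.run(main)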
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
def main(pipeline_config_path, checkpoint_dir, eval_dir,
         eval_training_data=False, eval_config_path="", input_config_path="",
         model_config_path="", run_once=False):
  """
  DEFINE_boolean('eval_training_data', False,
                 'If training data should be evaluated for this job.')
  DEFINE_string('checkpoint_dir', '',
                'Directory containing checkpoints to evaluate, typically '
                'set to `train_dir` used in the training job.')
  DEFINE_string('eval_dir', '', 'Directory to write eval summaries to.')
  DEFINE_string('pipeline_config_path', '',
                'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
                'file. If provided, other configs are ignored')
  DEFINE_string('eval_config_path', '',
                'Path to an eval_pb2.EvalConfig config file.')
  DEFINE_string('input_config_path', '',
                'Path to an input_reader_pb2.InputReader config file.')
  DEFINE_string('model_config_path', '',
                'Path to a model_pb2.DetectionModel config file.')
  DEFINE_boolean('run_once', False, 'Option to only run a single pass of '
                 'evaluation. Overrides the `max_evals` parameter in the '
                 'provided config.')
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.gfile.MakeDirs(eval_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    tf.gfile.Copy(pipeline_config_path,
                  os.path.join(eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        eval_config_path=eval_config_path,
        eval_input_config_path=input_config_path)
    for name, config in [('model.config', model_config_path),
                         ('eval.config', eval_config_path),
                         ('input.config', input_config_path)]:
      tf.gfile.Copy(config, os.path.join(eval_dir, name), overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  evaluator.evaluate(
      create_input_dict_fn, model_fn, eval_config, categories,
      checkpoint_dir, eval_dir, graph_hook_fn=graph_rewriter_fn)
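# A hedged usage sketch for the flag-free main() above; the paths are
# hypothetical placeholders, and run_once=True limits evaluation to a single
# pass by overriding max_evals, exactly as the docstring describes.
main(pipeline_config_path='configs/pipeline.config',
     checkpoint_dir='train_dir/',
     eval_dir='eval_dir/',
     run_once=True)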
def main(unused_argv):
  # Uncomment the next lines on Linux to run the evaluation on the CPU.
  # config = tf.ConfigProto(
  #     device_count={'GPU': 0}
  # )
  # sess = tf.Session(config=config)

  # Uncomment the next line on Windows to run the evaluation on the CPU.
  # os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

  # Use the following lines to restrict this process to only 30% of the GPU
  # VRAM.
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
  sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir)
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  print("%s" % str(env))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)
  print("cluster_data %s" % str(cluster_data))

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    try:
      print("tf.train.Server")
      server = tf.train.Server(tf.train.ClusterSpec(cluster),
                               protocol='grpc',
                               job_name=task_info.type,
                               task_index=task_info.index)
    except KeyboardInterrupt:
      print("ctrl c END")
    if task_info.type == 'ps':
      print("ps")
      try:
        print("tf.Session")
        sess = tf.Session(server.target)
        print("create_done_queue: " + str(worker_replicas))
        queue = create_done_queue(task_info.index, worker_replicas, ps_tasks)
        # Wait until all workers are done.
        for i in range(worker_replicas):
          sess.run(queue.dequeue())
          print("ps %d received done %d" % (task_info.index, i))
        print("ps %d: quitting" % (task_info.index))
        # server.join()
        return
      except KeyboardInterrupt:
        print("ctrl c END")
    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target
  print("is_chief:" + str(is_chief))

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  try:
    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)
  except KeyboardInterrupt:
    print("ctrl c END1")
  finally:
    if worker_replicas >= 1 and ps_tasks > 0:
      print("tf.Session")
      sess = tf.Session(server.target)
      print("end create_done_queues:" + str(worker_replicas))
      for q in create_done_queues(worker_replicas, ps_tasks):
        print("enqueue")
        sess.run(q.enqueue(1))
def train_tensorflow_object_detection_api(base_config_path, to_save_path,
                                          dataset_path, num_steps):
  os.makedirs(to_save_path, exist_ok=True)
  if len(os.listdir(to_save_path)) == 0:
    do_xml_to_csv(to_save_path, dataset_path)
    generate_config(base_config_path, to_save_path, num_steps)
    generate_tfrecords(base_config_path, to_save_path, dataset_path)

  checkpoint_path = "{}/training".format(to_save_path)
  config_file_path = "{}/config/faster_rcnn_inception_v2_pets.config".format(
      to_save_path)

  tf.logging.set_verbosity(tf.logging.INFO)

  flags = tf.app.flags
  flags.DEFINE_string('f', '', 'kernel')
  flags.DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
  flags.DEFINE_integer('task', 0, 'task id')
  flags.DEFINE_integer('num_clones', 1,
                       'Number of clones to deploy per worker.')
  flags.DEFINE_boolean(
      'clone_on_cpu', False,
      'Force clones to be deployed on CPU. Note that even if '
      'set to False (allowing ops to run on gpu), some ops may '
      'still be run on the CPU if they have no GPU kernel.')
  flags.DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer '
                       'replicas.')
  flags.DEFINE_integer(
      'ps_tasks', 0,
      'Number of parameter server tasks. If None, does not use '
      'a parameter server.')
  flags.DEFINE_string(
      'train_dir', '{}'.format(checkpoint_path),
      'Directory to save the checkpoints and training summaries.')
  flags.DEFINE_string(
      'pipeline_config_path', '{}'.format(config_file_path),
      'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
      'file. If provided, other configs are ignored')
  flags.DEFINE_string('train_config_path', '',
                      'Path to a train_pb2.TrainConfig config file.')
  flags.DEFINE_string(
      'input_config_path', '',
      'Path to an input_reader_pb2.InputReader config file.')
  flags.DEFINE_string('model_config_path', '',
                      'Path to a model_pb2.DetectionModel config file.')
  FLAGS = flags.FLAGS

  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  # Get the TF logger.
  log = logging.getLogger('tensorflow')
  log.setLevel(logging.INFO)
  # Create a formatter and add it to the handlers.
  formatter = logging.Formatter(
      '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  # Create a file handler which logs even debug messages.
  fh = logging.FileHandler('{}training/training.log'.format(to_save_path))
  fh.setLevel(logging.INFO)
  fh.setFormatter(formatter)
  log.addHandler(fh)

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir,
                graph_hook_fn=graph_rewriter_fn)


# # tf.app.run(main=train_tensorflow_object_detection_api)
# base_config_path = '/home/aniruddh/lincode/product/livis-develop_v2/gpu_q/tf_training/'
# to_save_path = '/home/aniruddh/lincode/product/livis-develop_v2/experiments/tensorflow/t2/'
# dataset_path = '/home/aniruddh/lincode/product/images/'
# num_steps = 15000
# train_tensorflow_object_detection_api(base_config_path, to_save_path, dataset_path, num_steps)
# os.makedirs(to_save_path, exist_ok=True)
# do_xml_to_csv(to_save_path, dataset_path)
# config_path = generate_config(base_config_path, to_save_path, num_steps)
# generate_tfrecords(base_config_path, to_save_path, dataset_path)
# train_tf_model(base_config_path, to_save_path, dataset_path, num_steps)
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  # iterator = dataset_util.make_initializable_iterator(dataset_builder.build(input_config))
  datasetmy = dataset_builder.build(input_config)
  iterator = datasetmy.make_initializable_iterator()

  def get_next(config):
    return iterator.get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  data_augmentation_options = [
      preprocessor_builder.build(step)
      for step in train_config.data_augmentation_options]

  input_queue = trainer.create_input_queue(
      train_config.batch_size, create_input_dict_fn,
      train_config.batch_queue_capacity,
      train_config.num_batch_queue_threads,
      train_config.prefetch_queue_capacity, data_augmentation_options)

  tensors = input_queue.dequeue()
  # Print all tensors in the tfrecord.
  print(tensors)
  groundtruth_difficult = tensors[0]['groundtruth_difficult']
  groundtruth_group_of = tensors[0]['groundtruth_group_of']
  groundtruth_weights = tensors[0]['groundtruth_weights']
  groundtruth_is_crowd = tensors[0]['groundtruth_is_crowd']
  key = tensors[0]['key']
  groundtruth_boxes = tensors[0]['groundtruth_boxes']
  image = tensors[0]['image']
  groundtruth_area = tensors[0]['groundtruth_area']
  groundtruth_classes = tensors[0]['groundtruth_classes']
  filename = tensors[0]['filename']
  num_groundtruth_boxes = tensors[0]['num_groundtruth_boxes']
  source_id = tensors[0]['source_id']

  init_op = tf.initialize_all_variables()
  with tf.Session() as sess:
    sess.run(iterator.initializer)
    sess.run(tf.tables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    sess.run(init_op)
    for i in range(10):
      (groundtruth_weights_val, groundtruth_difficult_val,
       groundtruth_group_of_val, groundtruth_is_crowd_val, key_val,
       groundtruth_boxes_val, image_val, groundtruth_area_val,
       groundtruth_classes_val, filename_val, num_groundtruth_boxes_val,
       source_id_val) = sess.run(
           [groundtruth_weights, groundtruth_difficult, groundtruth_group_of,
            groundtruth_is_crowd, key, groundtruth_boxes, image,
            groundtruth_area, groundtruth_classes, filename,
            num_groundtruth_boxes, source_id])
      # print(groundtruth_weights_val)
      print(groundtruth_boxes_val)
      # print(groundtruth_difficult_val)
      # print(groundtruth_group_of_val)
      # print(groundtruth_is_crowd_val)
      # print(key_val)
      # print(image_val)
      # print(groundtruth_area_val)
      print(groundtruth_classes_val)
      print(filename_val)
      print(num_groundtruth_boxes_val)
      # print(source_id_val)
      image_val = image_val[0]
      image_val = image_val.astype(np.uint8)
      # cv2.imshow('image', image_val)
      # cv2.waitKey()
      # plt.imshow(image_val)
      # plt.show()
      print('finish')
      # Plot the bounding boxes on the image.
      plt.switch_backend("TkAgg")
      classes_val = groundtruth_classes_val
      boxes_val = groundtruth_boxes_val
      scores_val = [1.0] * num_groundtruth_boxes_val
      image_np = image_val
      image_np_origin = image_val.copy()
      NUM_CLASSES = 90
      IMAGE_SIZE = (12, 8)
      PATH_TO_LABELS = '../../data/mscoco_label_map.pbtxt'
      label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
      categories = label_map_util.convert_label_map_to_categories(
          label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
      category_index = label_map_util.create_category_index(categories)
      vis_util.visualize_boxes_and_labels_on_image_array(
          image_np,
          boxes_val,
          np.squeeze(classes_val).astype(np.int32),
          np.squeeze(scores_val),
          category_index,
          use_normalized_coordinates=True,
          line_thickness=8)
      plt.figure(figsize=IMAGE_SIZE)
      plt.subplot(121)
      plt.imshow(image_np)
      plt.subplot(122)
      plt.imshow(image_np_origin)
      plt.show()
      print('finish')
    coord.request_stop()
    coord.join(threads)
def main(train_dir, pipeline_config_path, train_config_path="",
         input_config_path="", model_config_path="", master="", task=0,
         num_clones=1, clone_on_cpu=False, worker_replicas=1, ps_tasks=0):
  """
  DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
  DEFINE_integer('task', 0, 'task id')
  DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.')
  DEFINE_boolean('clone_on_cpu', False,
                 'Force clones to be deployed on CPU. Note that even if '
                 'set to False (allowing ops to run on gpu), some ops may '
                 'still be run on the CPU if they have no GPU kernel.')
  DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer replicas.')
  DEFINE_integer('ps_tasks', 0,
                 'Number of parameter server tasks. If None, does not use '
                 'a parameter server.')
  DEFINE_string('train_dir', '',
                'Directory to save the checkpoints and training summaries.')
  DEFINE_string('pipeline_config_path', '',
                'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
                'file. If provided, other configs are ignored')
  DEFINE_string('train_config_path', '',
                'Path to a train_pb2.TrainConfig config file.')
  DEFINE_string('input_config_path', '',
                'Path to an input_reader_pb2.InputReader config file.')
  DEFINE_string('model_config_path', '',
                'Path to a model_pb2.DetectionModel config file.')
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  if task == 0:
    tf.gfile.MakeDirs(train_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    if task == 0:
      tf.gfile.Copy(pipeline_config_path,
                    os.path.join(train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        train_config_path=train_config_path,
        train_input_config_path=input_config_path)
    if task == 0:
      for name, config in [('model.config', model_config_path),
                           ('train.config', train_config_path),
                           ('input.config', input_config_path)]:
        tf.gfile.Copy(config, os.path.join(train_dir, name), overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  print("\n\n\n\n\nMADE IT HERE\n\n\n\n\n\n")
  trainer.train(
      create_input_dict_fn, model_fn, train_config, master, task,
      num_clones, worker_replicas, clone_on_cpu, ps_tasks,
      worker_job_name, is_chief, train_dir, graph_hook_fn=graph_rewriter_fn)
  print("MADE IT THERE")
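# A hedged usage sketch for the flag-free training main() above; the paths are
# hypothetical placeholders. The distributed-training parameters keep their
# single-worker defaults unless a TF_CONFIG cluster spec is present in the
# environment.
main(train_dir='train_dir/',
     pipeline_config_path='configs/pipeline.config')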
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)

  wait_time = 300
  while wait_time > 0:
    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_checkpoint:
      num_steps = latest_checkpoint.split('-')[-1]
      if int(num_steps) > 0:
        wait_time = 0
    if wait_time > 0:
      tf.logging.info("waiting for checkpoint...")
      time.sleep(wait_time)

  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  do_augmentation = False
  if input_config.WhichOneof('input_reader') == 'tf_record_input_reader':
    input_reader_config = input_config.tf_record_input_reader
    input_path = input_reader_config.input_path
    if not input_path or not input_path[0]:
      do_augmentation = True
      train_input_config = configs['train_input_config']
      train_input_reader_config = train_input_config.tf_record_input_reader
      input_reader_config.input_path[:] = (
          train_input_reader_config.input_path[:])

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir,
                     do_augmentation=do_augmentation)
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  categories = label_map_util.create_categories_from_labelmap(
      input_config.label_map_path)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  metrics_dict = evaluator.evaluate(
      create_input_dict_fn, model_fn, eval_config, categories,
      FLAGS.checkpoint_dir, FLAGS.eval_dir, graph_hook_fn=graph_rewriter_fn)

  with open(FLAGS.output_json_path, 'w') as op_json_file:
    temp_dict = {}
    for key, value in metrics_dict.items():
      temp_dict[key] = str(value)
    json.dump(temp_dict, op_json_file)
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  logging.basicConfig(level=logging.INFO)
  if FLAGS.evaluate_with_anchors:
    tflite_outputs = None
    if FLAGS.evaluate_with_run_tflite:
      assert FLAGS.tensorflow_dir, '`tensorflow_dir` is missing.'
      assert FLAGS.tflite_outputs, '`tflite_outputs` is missing.'
      qouts = FLAGS.tflite_outputs.split(',')
      # The original checked `len(qouts) % 4 > 0` and then re-asserted the
      # already-truthy flag, so a malformed size could never fire; assert on
      # the size directly instead.
      assert len(qouts) % 4 == 0, (
          '`tflite_outputs` is not given right with size.')
      tflite_outputs = [(qouts[i * 4], qouts[i * 4 + 1],
                         float(qouts[i * 4 + 2]), int(qouts[i * 4 + 3]))
                        for i in range(len(qouts) // 4)]
    evaluator.evaluate_with_anchors(
        create_input_dict_fn, model_fn, eval_config, categories,
        FLAGS.checkpoint_dir, FLAGS.eval_dir,
        evaluate_with_run_tflite=FLAGS.evaluate_with_run_tflite,
        quantize=FLAGS.quantize,
        tflite_outputs=tflite_outputs,
        tensorflow_dir=FLAGS.tensorflow_dir)
  else:
    evaluator.evaluate(create_input_dict_fn, model_fn, eval_config,
                       categories, FLAGS.checkpoint_dir, FLAGS.eval_dir)
def get_train_config(task, ps_tasks, train_dir, pipeline_config_path,
                     train_config_path, model_config_path, input_config_path,
                     worker_replicas, master):
  """Set variables for training."""
  # Create the folder where to store the models.
  if task == 0:
    tf.gfile.MakeDirs(train_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    if task == 0:
      tf.gfile.Copy(pipeline_config_path,
                    os.path.join(train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        train_config_path=train_config_path,
        train_input_config_path=input_config_path)
    if task == 0:
      for name, config in [('model.config', model_config_path),
                           ('train.config', train_config_path),
                           ('input.config', input_config_path)]:
        tf.gfile.Copy(config, os.path.join(train_dir, name), overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  worker_job_name = 'lonely_worker'
  is_chief = True

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  return (create_input_dict_fn, model_fn, train_config, master, task,
          worker_job_name, ps_tasks, worker_replicas, is_chief,
          graph_rewriter_fn)
def get_eval_config(eval_dir, pipeline_config_path, eval_config_path,
                    model_config_path, eval_input_config_path,
                    eval_training_data, run_once):
  """Sets up the variables needed for evaluation."""
  tf.gfile.MakeDirs(eval_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    tf.gfile.Copy(pipeline_config_path,
                  os.path.join(eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        eval_config_path=eval_config_path,
        eval_input_config_path=eval_input_config_path)
    for name, config in [('model.config', model_config_path),
                         ('eval.config', eval_config_path),
                         ('input.config', eval_input_config_path)]:
      tf.gfile.Copy(config, os.path.join(eval_dir, name), overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  # Optionally evaluate on the training data instead.
  if eval_training_data:
    input_config = configs['train_input_config']

  # Build the model for evaluation.
  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=False)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  # Input fn; `get_next` is expected to be defined at module level.
  create_input_dict_fn = functools.partial(get_next, input_config)

  # Run a single evaluation pass only.
  if run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  return (create_input_dict_fn, model_fn, eval_config, categories,
          graph_rewriter_fn)
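# Hedged usage sketch (not in the original source): how get_eval_config could
# be consumed by an eval entry point. The flag names are an assumption, chosen
# to match the ones used elsewhere in this file.
def eval_main(unused_argv):
  (create_input_dict_fn, model_fn, eval_config, categories,
   graph_rewriter_fn) = get_eval_config(
       FLAGS.eval_dir, FLAGS.pipeline_config_path, FLAGS.eval_config_path,
       FLAGS.model_config_path, FLAGS.input_config_path,
       FLAGS.eval_training_data, FLAGS.run_once)
  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir,
                     graph_hook_fn=graph_rewriter_fn)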
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # The total number of worker replicas includes the "worker" jobs and the
    # "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return
    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir,
                graph_hook_fn=graph_rewriter_fn)
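# Hedged example: a TF_CONFIG value of the shape the code above expects. The
# host names and ports follow the sample cluster that appeared in this
# script's debug notes and are illustrative only.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['instance-1:8000'],
        'ps': ['instance-2:8000'],
        'worker': ['instance-3:8000'],
    },
    'task': {'type': 'master', 'index': 0},
})
# With this setting, worker_replicas == 2 (one "worker" plus the "master")
# and ps_tasks == 1, so the distributed-training branch is taken.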
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  create_input_dict_fn = functools.partial(input_reader_builder.build,
                                           input_config)

  # Instead of parsing TF_CONFIG (or deriving the cluster from SLURM via
  # tf_config_from_slurm, as an earlier revision did), this variant hard-codes
  # a local cluster with one parameter server and three workers.
  parameter_servers = ['localhost:2232']
  workers = ['localhost:2233', 'localhost:2234', 'localhost:2235']
  cluster_data = {'ps': parameter_servers, 'worker': workers}

  if cluster_data and 'worker' in cluster_data:
    # With an explicit "worker" job list there is no separate "master", so
    # every entry counts as one replica.
    worker_replicas = len(cluster_data['worker'])
    print('Number of replicas: ', worker_replicas)
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])
    print('Number of ps tasks: ', ps_tasks)

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training. With the hard-coded cluster above this
    # branch is always taken, so the variables below are always bound.
    server = tf.train.Server(tf.train.ClusterSpec(cluster_data),
                             protocol='grpc', job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == 'ps':
      server.join()
      return
    worker_job_name = '%s/task:%d' % (FLAGS.job_name, FLAGS.task_index)
    task = FLAGS.task_index
    is_chief = (FLAGS.task_index == 0)
    master = server.target

  print('worker_job_name: ', worker_job_name)
  print('task: ', task)
  print('is_chief: ', is_chief)
  print('master: ', master)

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
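# Hedged example: inspecting the hard-coded cluster with tf.train.ClusterSpec
# before starting a server. num_tasks() reports the same counts the code
# above derives by hand from the dict.
spec = tf.train.ClusterSpec({'ps': ['localhost:2232'],
                             'worker': ['localhost:2233', 'localhost:2234',
                                        'localhost:2235']})
assert spec.num_tasks('worker') == 3
assert spec.num_tasks('ps') == 1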
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.export_model:
    assert FLAGS.pipeline_config_path, (
        '`pipeline_config_path` is required when exporting the model.')
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    with tf.gfile.GFile(FLAGS.pipeline_config_path, 'r') as f:
      text_format.Merge(f.read(), pipeline_config)
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # The total number of worker replicas includes the "worker" jobs and the
    # "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return
    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)

  if FLAGS.export_model:
    latest_ckpt = tf.train.latest_checkpoint(FLAGS.train_dir)
    exporter.export_inference_graph(FLAGS.input_type, pipeline_config,
                                    latest_ckpt,
                                    FLAGS.saved_model_output_dir,
                                    FLAGS.input_shape)
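# Hedged illustration: tf.train.latest_checkpoint reads the `checkpoint`
# state file in the given directory and returns the newest checkpoint prefix,
# e.g. '/tmp/train_dir/model.ckpt-50000' (the path is hypothetical). It
# returns None when no checkpoint has been written yet, so a guard before
# exporting is sensible.
latest_ckpt = tf.train.latest_checkpoint('/tmp/train_dir')
if latest_ckpt is None:
  raise RuntimeError('No checkpoint found; nothing to export.')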
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  def get_next():
    dataset = get_debug_dataset()
    # Eager iteration: return the tensors from the first batch only.
    for batch in tfe.Iterator(dataset):
      return (batch['image'], batch['groundtruth_boxes'],
              batch['groundtruth_classes'], None, None)

  create_input_dict_fn = get_next

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # The total number of worker replicas includes the "worker" jobs and the
    # "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return
    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  print(tf.test.is_gpu_available())  # Unexpectedly prints False on this setup.
  print(tf.test.gpu_device_name())
  # In eager execution the GPU may have to be selected manually, e.g. by
  # wrapping the training call in `with tf.device('/gpu:0'):`.
  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
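# Hedged sketch: the tfe.Iterator pattern used in get_next above, shown on a
# toy in-memory dataset. Assumes TF 1.7+ with tf.contrib.eager available and
# eager execution enabled at program startup.
import tensorflow as tf

tfe = tf.contrib.eager
tf.enable_eager_execution()

toy = tf.data.Dataset.from_tensor_slices({'x': [[1.0], [2.0], [3.0]]}).batch(2)
for batch in tfe.Iterator(toy):
  print(batch['x'])  # first batch: [[1.], [2.]]
  break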
FLAGS = flags.FLAGS

# Note: `train_dir` and `pipeline_config_path` are bare names here; this
# fragment assumes they were bound earlier (e.g. unpacked from FLAGS).
assert train_dir, '`train_dir` is missing.'
if FLAGS.task == 0:
  tf.gfile.MakeDirs(train_dir)
if pipeline_config_path:
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  if FLAGS.task == 0:
    tf.gfile.Copy(pipeline_config_path,
                  os.path.join(train_dir, 'pipeline.config'),
                  overwrite=True)
else:
  configs = config_util.get_configs_from_multiple_files(
      model_config_path=FLAGS.model_config_path,
      train_config_path=FLAGS.train_config_path,
      train_input_config_path=FLAGS.input_config_path)
  if FLAGS.task == 0:
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('train.config', FLAGS.train_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config, os.path.join(train_dir, name), overwrite=True)

model_config = configs['model']
train_config = configs['train_config']
input_config = configs['train_input_config']

model_fn = functools.partial(
    model_builder.build, model_config=model_config, is_training=True)
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build, model_config=model_config, is_training=True)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # The total number of worker replicas includes the "worker" jobs and the
    # "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training. The original code also passed
    # `config=config`, but `config` there was the leftover loop variable
    # holding a config file path (or undefined on the pipeline-config branch),
    # not a tf.ConfigProto, so the argument is dropped.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return
    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
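# Hedged sketch: the make_initializable_iterator pattern behind get_next
# above, on a toy dataset (TF 1.x graph mode). Initializable iterators must
# have their initializer run before get_next() can be evaluated; the
# dataset_util helper used above also appears to register the initializer in
# a collection so the training loop can run it.
import tensorflow as tf

dataset = tf.data.Dataset.range(3)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()
with tf.Session() as sess:
  sess.run(iterator.initializer)
  print(sess.run(next_element))  # 0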
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)  # Create the directory via tf.gfile.
  if FLAGS.pipeline_config_path:
    # Read the pipeline_config_path file and return a dict holding the
    # `model`, `train_config`, `train_input_config`, `eval_config` and
    # `eval_input_config` sections of the config.
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      # Copy the pipeline_config_path file into train_dir, renaming it to
      # pipeline.config.
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    # Read the configs from the model_config_path, train_config_path and
    # train_input_config_path files.
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  """The following line is the core of this script: functools.partial binds
  some of the required arguments and effectively "renames" the function,
  yielding a simpler callable with fewer, more flexible arguments. Here it
  gives model_builder.build its default arguments. The model_builder module
  in this directory contains the code that constructs the network models,
  covering ssd, faster_rcnn and many other architectures; part of it looks
  like this:

  def build(model_config, is_training):
    if not isinstance(model_config, model_pb2.DetectionModel):
      raise ValueError('model_config not of type model_pb2.DetectionModel.')
    # Read which model type the config specifies.
    meta_architecture = model_config.WhichOneof('model')
    # Dispatch to the concrete builder.
    if meta_architecture == 'ssd':
      return _build_ssd_model(model_config.ssd, is_training)
    if meta_architecture == 'faster_rcnn':
      return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
    raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))

  Taking faster_rcnn as an example, _build_faster_rcnn_model (still in
  model_builder.py) defines all of the faster_rcnn parameters, after which
  each sub-model is built in turn, e.g. via image_resizer_builder. A toy
  functools.partial example follows this function.
  """
  model_fn = functools.partial(model_builder.build,
                               model_config=model_config,
                               is_training=True)  # Second-stage parameters.

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  # Decode the TF_CONFIG JSON object.
  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # The total number of worker replicas includes the "worker" jobs and the
    # "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return
    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir,
                graph_hook_fn=graph_rewriter_fn)
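# Hedged toy example (referenced from the translated note in main above):
# functools.partial pre-binds keyword arguments, so the resulting callable
# can be invoked with no further arguments. The build() function below is a
# stand-in, not the real model_builder.build.
import functools

def build(model_config, is_training):
  return 'building %s (training=%s)' % (model_config, is_training)

model_fn = functools.partial(build, model_config='ssd', is_training=True)
print(model_fn())  # building ssd (training=True)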