Example #1
def main(_):
    #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = get_configs_from_pipeline_file(
        )
    else:
        model_config, train_config, input_config = get_configs_from_multiple_files(
        )

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
Example #2
 def start(self):
     if self.config is None:
         logger.error('No Config Found')
         return
     train_pipeline_file = self.config['pipeline_config_file']
     configs = self._get_configs_from_pipeline_file(train_pipeline_file)
     model_config = configs['model']
     train_config = configs['train_config']
     input_config = configs['train_input_config']
     logger.info('Building Model')
     model_fn = functools.partial(model_builder.build,
                                  model_config=model_config,
                                  is_training=True)
     logger.info('creating input dict')
     create_input_dict_fn = functools.partial(self.get_next, input_config)
     ps_tasks = 0
     worker_replicas = 1
     worker_job_name = 'obj_detection_trainer'
     task = 0
     is_chief = True
     master = ''
     num_clones = 1
     clone_on_cpu = False
     try:
         logger.info('Training Started')
         trainer.train(create_input_dict_fn, model_fn, train_config, master,
                       task, num_clones, worker_replicas, clone_on_cpu,
                       ps_tasks, worker_job_name, is_chief, self.config)
     except:
         logger.error('Cannot Start Training')
         traceback.print_exc(file=sys.stdout)
Example #3
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.pipeline_config_path:
    model_config, train_config, input_config = get_configs_from_pipeline_file()
  else:
    model_config, train_config, input_config = get_configs_from_multiple_files()

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks,
                worker_job_name, is_chief, FLAGS.train_dir)
Example #4
def train_process(model_config,
                  input_config,
                  train_config,
                  train_dir,
                  num_clones=1,
                  clone_on_cpu=False):
    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    # change_process_config(os.getpid())
    total_loss = trainer.train(create_input_dict_fn, model_fn, train_config,
                               master, task, num_clones, worker_replicas,
                               clone_on_cpu, ps_tasks, worker_job_name,
                               is_chief, train_dir)
    return total_loss
Example #5
    def test_configure_trainer_with_multiclass_scores_and_train_two_steps(
            self):
        train_config_text_proto = """
    optimizer {
      adam_optimizer {
        learning_rate {
          constant_learning_rate {
            learning_rate: 0.01
          }
        }
      }
    }
    data_augmentation_options {
      random_adjust_brightness {
        max_delta: 0.2
      }
    }
    data_augmentation_options {
      random_adjust_contrast {
        min_delta: 0.7
        max_delta: 1.1
      }
    }
    num_steps: 2
    use_multiclass_scores: true
    """
        train_config = train_pb2.TrainConfig()
        text_format.Merge(train_config_text_proto, train_config)

        train_dir = self.get_temp_dir()

        trainer.train(create_tensor_dict_fn=get_input_function,
                      create_model_fn=FakeDetectionModel,
                      train_config=train_config,
                      master='',
                      task=0,
                      num_clones=1,
                      worker_replicas=1,
                      clone_on_cpu=True,
                      ps_tasks=0,
                      worker_job_name='worker',
                      is_chief=True,
                      train_dir=train_dir)
Example #6
  def test_configure_trainer_with_multiclass_scores_and_train_two_steps(self):
    train_config_text_proto = """
    optimizer {
      adam_optimizer {
        learning_rate {
          constant_learning_rate {
            learning_rate: 0.01
          }
        }
      }
    }
    data_augmentation_options {
      random_adjust_brightness {
        max_delta: 0.2
      }
    }
    data_augmentation_options {
      random_adjust_contrast {
        min_delta: 0.7
        max_delta: 1.1
      }
    }
    num_steps: 2
    use_multiclass_scores: true
    """
    train_config = train_pb2.TrainConfig()
    text_format.Merge(train_config_text_proto, train_config)

    train_dir = self.get_temp_dir()

    trainer.train(create_tensor_dict_fn=get_input_function,
                  create_model_fn=FakeDetectionModel,
                  train_config=train_config,
                  master='',
                  task=0,
                  num_clones=1,
                  worker_replicas=1,
                  clone_on_cpu=True,
                  ps_tasks=0,
                  worker_job_name='worker',
                  is_chief=True,
                  train_dir=train_dir)
Example #7
 def train():
   return trainer.train(create_tensor_dict_fn=train_input_dict_fn,
                        create_model_fn=train_model_fn,
                        train_config=train_config, master=master, task=task,
                        num_clones=FLAGS.num_clones,
                        worker_replicas=worker_replicas,
                        clone_on_cpu=FLAGS.clone_on_cpu,
                        ps_tasks=parameter_server_tasks,
                        worker_job_name=worker_job_name,
                        is_chief=is_chief, train_dir=FLAGS.train_dir,
                        graph_hook_fn=train_graph_rewriter_fn)
Example #8
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    model_config, train_config, input_config = get_configs_from_pipeline_file()

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    total_num_steps = train_config.num_steps
    current_step = FLAGS.eval_every_n_steps
    print("Total number of training steps {}".format(train_config.num_steps))
    print("Evaluation will run every {} steps".format(
        FLAGS.eval_every_n_steps))
    train_config.num_steps = current_step
    while current_step <= total_num_steps:
        print("Training steps # {0}".format(current_step))
        trainer.train(create_input_dict_fn, model_fn, train_config, master,
                      task, FLAGS.num_clones, worker_replicas,
                      FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief,
                      FLAGS.train_dir)
        tf.reset_default_graph()
        evaluate_step()
        tf.reset_default_graph()
        current_step = current_step + FLAGS.eval_every_n_steps
        train_config.num_steps = current_step

    if current_step > FLAGS.eval_every_n_steps:
        train_config.num_steps = total_num_steps
        print("Training steps # {0}".format(train_config.num_steps))
        trainer.train(create_input_dict_fn, model_fn, train_config, master,
                      task, FLAGS.num_clones, worker_replicas,
                      FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief,
                      FLAGS.train_dir)
Example #9
    def start_training(self):
        """Start training for the model"""
        worker_replicas = 1
        ps_tasks = 0
        clone_on_cpu = False
        num_clones = 1

        ensure_path(config.BASE_MODELS_PATH)
        train_dir = self.train_dir
        model_json_path = os.path.join(train_dir, 'job.json')

        job = self.job
        num_steps = int(job['steps'])

        try:
            if config.DEBUG:
                num_steps = 50
        except AttributeError:
            pass
        except Exception as e:
            _LOGGER.error(e)

        job = api.update_job_state(job, 'training', 'Start training for {} steps'.format(num_steps))

        model = self.model
        ensure_path(config.EXPORTED_MODELS)
        model_graph = os.path.join(config.EXPORTED_MODELS, '{}.pb'.format(model['file_name']))

        if not os.path.exists(os.path.join(train_dir, 'checkpoint')):  # New training started
            _LOGGER.debug("Checkpoints doesn't exists")

            base_checkpoints_path = os.path.join(config.BASE_MODELS_PATH, model['architecture'])
            _tmf = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
            if os.path.isdir(_tmf):
                _LOGGER.debug("Model already exists as %s" % model_graph)
                base_checkpoints_path = _tmf
            elif model['type'] == 'new':
                _LOGGER.debug("model type new")
            else:
                _LOGGER.debug("New model from parent model")
                parent_model = api.get_model(model['parent'])
                if not parent_model:
                    raise Exception('Parent model not found on server')

                parent_tmf = os.path.join(config.TRAINED_MODELS_DATA, parent_model['file_name'])
                if os.path.isdir(parent_tmf):
                    base_checkpoints_path = parent_tmf
                else:
                    _LOGGER.error("Parent model not found. please train it first")
                    return False

            if not os.path.exists(os.path.join(base_checkpoints_path, 'model.ckpt.meta')):
                _LOGGER.debug("Base model not found for %s, Downloading now." % model['architecture'])
                _f = api.download_model_files(model['architecture'])

                tmp_model_data = os.path.join(config.DATA_DIR, 'tmp_model_data')
                if tarfile.is_tarfile(_f):
                    if os.path.exists(tmp_model_data):
                        shutil.rmtree(tmp_model_data)
                    ensure_path(tmp_model_data)
                    print("Tar file found")
                    shutil.unpack_archive(_f, tmp_model_data)
                    for root, dirs, files in os.walk(tmp_model_data):
                        for file in files:
                            if 'model.ckpt' in file:
                                path = os.path.join(root, file)
                                # print(path)
                                ensure_path(base_checkpoints_path)
                                shutil.copy(path, os.path.join(base_checkpoints_path, file))
                else:
                    _LOGGER.error("Invalid file")
                    return False
            if os.path.exists(train_dir):
                shutil.rmtree(train_dir)
            shutil.copytree(base_checkpoints_path, train_dir)
            if os.path.exists(os.path.join(train_dir, 'checkpoint')):
                os.remove(os.path.join(train_dir, 'checkpoint'))

        if os.path.exists(os.path.join(train_dir, 'data')):
            shutil.rmtree(os.path.join(train_dir, 'data'))
        shutil.copytree(self.data_dir, os.path.join(train_dir, 'data'))

        counts = {'train': 0, 'test': 1000, 'classes': 1}
        stats_file = os.path.join(train_dir, "data", "stats.json")
        try:
            with open(stats_file) as _f:
                counts = json.load(_f)
        except:
            pass

        pipeline_config_path = os.path.join(train_dir, 'pipeline.config')
        if not os.path.exists(pipeline_config_path):
            pipeline_config_path = os.path.join(self.configs_dir, "{}.config".format(model['architecture']))
        task = '0'
        if task == '0':
            tf.gfile.MakeDirs(train_dir)
        if pipeline_config_path:
            _LOGGER.info("Pipeline config file : {}".format(pipeline_config_path))
            configs = config_util.get_configs_from_pipeline_file(
                pipeline_config_path)
            if task == '0':
                tf.gfile.Copy(pipeline_config_path,
                              os.path.join(train_dir, 'pipeline.config'),
                              overwrite=True)
        else:
            _LOGGER.error("No config found")
            return False

        pipeline_config_path = os.path.join(train_dir, 'pipeline.config')

        # with open(model_json_path, 'w') as mf:
        #     json.dump(job, mf)

        model_config = configs['model']
        train_config = configs['train_config']
        input_config = configs['train_input_config']


        if model_config.HasField('faster_rcnn'):
            model_config.faster_rcnn.num_classes = counts['classes']

        if model_config.HasField('ssd'):
            model_config.ssd.num_classes = counts['classes']

        # Set num_steps
        train_config.num_steps = num_steps
        train_config.fine_tune_checkpoint = os.path.join(train_dir, 'model.ckpt')

        # Update input config to use updated list of input
        input_config.tf_record_input_reader.ClearField('input_path')
        input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "train_baheads.tfrecord-??????"))
        input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt")

        eval_config = configs['eval_config']
        eval_input_config = configs['eval_input_config']

        eval_config.num_examples = counts['test']
        eval_config.max_evals = 1

        # Update input config to use updated list of input
        eval_input_config.tf_record_input_reader.ClearField('input_path')
        eval_input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "test_baheads.tfrecord-??????"))
        eval_input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt")

        # Save the updated config to pipeline file
        config_util.save_pipeline_config(config_util.create_pipeline_proto_from_configs({
            'model': model_config,
            'train_config': train_config,
            'train_input_config': input_config,
            'eval_config': eval_config,
            'eval_input_config': eval_input_config

        }), train_dir)

        model_fn = functools.partial(
            model_builder.build,
            model_config=model_config,
            is_training=True)

        def get_next(config):
            return dataset_builder.make_initializable_iterator(
                dataset_builder.build(config)).get_next()

        create_input_dict_fn = functools.partial(get_next, input_config)

        env = json.loads(os.environ.get('TF_CONFIG', '{}'))
        cluster_data = env.get('cluster', None)
        cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
        task_data = env.get('task', None) or {'type': 'master', 'index': 0}
        task_info = type('TaskSpec', (object,), task_data)

        # Parameters for a single worker.
        ps_tasks = 0
        worker_replicas = 1
        worker_job_name = 'lonely_worker'
        task = 0
        is_chief = True
        master = ''

        if cluster_data and 'worker' in cluster_data:
            # Number of total worker replicas include "worker"s and the "master".
            worker_replicas = len(cluster_data['worker']) + 1
        if cluster_data and 'ps' in cluster_data:
            ps_tasks = len(cluster_data['ps'])

        if worker_replicas > 1 and ps_tasks < 1:
            raise ValueError('At least 1 ps task is needed for distributed training.')

        if worker_replicas >= 1 and ps_tasks > 0:
            # Set up distributed training.
            server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                     job_name=task_info.type,
                                     task_index=task_info.index)
            if task_info.type == 'ps':
                server.join()
                return

            worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
            task = task_info.index
            is_chief = (task_info.type == 'master')
            master = server.target

        graph_rewriter_fn = None
        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(
                configs['graph_rewriter_config'], is_training=True)

        if not os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
            status_timer = StatusThread(tfh, num_steps, job)
            status_timer.start()
            try:
                trainer.train(
                    create_input_dict_fn,
                    model_fn,
                    train_config,
                    master,
                    task,
                    num_clones,
                    worker_replicas,
                    clone_on_cpu,
                    ps_tasks,
                    worker_job_name,
                    is_chief,
                    train_dir,
                    graph_hook_fn=graph_rewriter_fn)
            except KeyboardInterrupt:
                raise
            finally:
                status_timer.stop()
                if status_timer.is_alive():
                    _LOGGER.info("Waiting for status thread to close")
                    status_timer.join()

        if os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
            # Training complete. Export model
            _LOGGER.debug("Training complete for %d steps" % num_steps)
            job = api.update_job_state(job, 'training', 'Training complete')
            export_path = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            ckpt_path = os.path.join(train_dir, 'model.ckpt-{}'.format(num_steps))
            exporter.export(pipeline_config_path, export_path, ckpt_path)

            frozen_graph = os.path.join(export_path, 'frozen_inference_graph.pb')

            if os.path.exists(frozen_graph):  # Successfully exported
                shutil.copy(frozen_graph, model_graph)
                shutil.copy(
                    os.path.join(train_dir, 'data', "labels.pbtxt"),
                    os.path.join(config.EXPORTED_MODELS, '{}.pbtxt'.format(model['file_name']))
                )
                # TODO: Eval the trained graph, Push the result to server.
                eval_dir = 'eval_dir'
                tf.reset_default_graph()
                eval_result = run_eval(train_dir, eval_dir, pipeline_config_path, counts['test'])
                if 'PascalBoxes_Precision/mAP@0.5IOU' in eval_result:
                    acc = eval_result['PascalBoxes_Precision/mAP@0.5IOU'] * 100
                    _LOGGER.info("PascalBoxes_Precision/mAP@0.5IOU : %d %%" % (acc))
                    job = api.update_job_state(job, 'complete', 'PascalBoxes_Precision %d %%' % (acc))
                _LOGGER.info(eval_result)
                if os.path.exists(train_dir):
                    shutil.rmtree(train_dir)
                return True

        return False
Example #10
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = get_configs_from_pipeline_file(
        )  # use this function because a pipeline config file was supplied
    else:
        model_config, train_config, input_config = get_configs_from_multiple_files(
        )

# functools.partial returns model_builder.build with some of its parameters
# already filled in, i.e. a partially applied version of the original function.
    model_fn = functools.partial(  # build the model with the parameters provided by the config file
        model_builder.build,
        model_config=model_config,
        is_training=True)

    # Now create the input pipeline (the input placeholders)

    create_input_dict_fn = functools.partial(  # create the input feed from the protobuf input config
        input_reader_builder.build, input_config)

    # Distributed training setup
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target


# end of the distributed training setup

    trainer.train(
        create_input_dict_fn,
        model_fn,
        train_config,
        master,
        task,  # call the trainer to run training
        FLAGS.num_clones,
        worker_replicas,
        FLAGS.clone_on_cpu,
        ps_tasks,
        worker_job_name,
        is_chief,
        FLAGS.train_dir)
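The comments in this example describe functools.partial as pre-filling some of model_builder.build's arguments. Here is a minimal stand-alone sketch of that pattern; build_model is a hypothetical stand-in for model_builder.build, not the real builder:

import functools

def build_model(model_config, is_training):
    # Hypothetical stand-in for model_builder.build: just echo its inputs.
    return {'config': model_config, 'training': is_training}

# Pre-bind the configuration, exactly like model_fn in the examples.
model_fn = functools.partial(build_model,
                             model_config={'num_classes': 3},
                             is_training=True)

# Callers can now invoke it with no arguments and get the bound values back.
print(model_fn())  # {'config': {'num_classes': 3}, 'training': True}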
Example #11
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      FLAGS.num_clones,
      worker_replicas,
      FLAGS.clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      FLAGS.train_dir,
      graph_hook_fn=graph_rewriter_fn)
Example #12
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.export_model:
    assert FLAGS.pipeline_config_path, '`pipeline_config_path` is required if exporting model'
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    with tf.gfile.GFile(FLAGS.pipeline_config_path, 'r') as f:
      text_format.Merge(f.read(), pipeline_config)
  if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks,
                worker_job_name, is_chief, FLAGS.train_dir)

  if FLAGS.export_model:
    latest_ckpt = tf.train.latest_checkpoint(FLAGS.train_dir)
    exporter.export_inference_graph(FLAGS.input_type, pipeline_config,
                                    latest_ckpt, FLAGS.saved_model_output_dir, FLAGS.input_shape)

if __name__ == '__main__':
  tf.app.run()
Example #14
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)  # create the directory with the tf.gfile module
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)  # read the pipeline_config_path file; returns a dict holding the `model`, `train_config`,
    # `train_input_config`, `eval_config` and `eval_input_config` sections
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)  # copy the pipeline config into train_dir as pipeline.config
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)  # read configs from model_config_path, train_config_path and train_input_config_path
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  """"
  以下这行代码为核心代码,通过传入部分所需要的参数并且 “重新定义” 函数名称。这样简化函数,更少更灵活的函数参数调用。 
  通过functools.partial函数对model_builder.build函数赋予默认值,该目录下有一个model_builder模块,包含了生成网络模型的代码,
  包含ssd,fast_rcnn等众多模型代码,部分代码如下所示
  def build(model_config, is_training):
      if not isinstance(model_config, model_pb2.DetectionModel):
          raise ValueError('model_config not of type model_pb2.DetectionModel.')
      # 获取配置中的模型种类
      meta_architecture = model_config.WhichOneof('model')
      # 进行具体加载
      if meta_architecture == 'ssd':
          return _build_ssd_model(model_config.ssd, is_training)
      if meta_architecture == 'faster_rcnn':
          return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
      raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
      以'faster_rcnn模型为例子,进入_build_faster_rcnn_model(仍在model_builder.py文件中),该类中定义了fast_rcnn所有的参数
      之后说明每一个子模型的构建,比如image_resizer_builder的构建
      """

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)
  # parameter configuration for the second stage
  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)
  # decode the TF_CONFIG JSON object
  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      FLAGS.num_clones,
      worker_replicas,
      FLAGS.clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      FLAGS.train_dir,
      graph_hook_fn=graph_rewriter_fn)
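The docstring above sketches how model_builder.build dispatches on the oneof field of the model proto. Below is a rough stand-alone analogue of that dispatch, using a plain dict in place of the model_pb2.DetectionModel proto; the two builder helpers are hypothetical stand-ins, not the real ones:

def _build_ssd_model(ssd_config, is_training):
    # Hypothetical stand-in for the real SSD builder.
    return ('ssd', ssd_config, is_training)

def _build_faster_rcnn_model(frcnn_config, is_training):
    # Hypothetical stand-in for the real Faster R-CNN builder.
    return ('faster_rcnn', frcnn_config, is_training)

def build(model_config, is_training):
    # A dict with a single key naming the meta architecture stands in for
    # model_config.WhichOneof('model') on the real proto.
    meta_architecture = next(iter(model_config))
    if meta_architecture == 'ssd':
        return _build_ssd_model(model_config['ssd'], is_training)
    if meta_architecture == 'faster_rcnn':
        return _build_faster_rcnn_model(model_config['faster_rcnn'], is_training)
    raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))

print(build({'faster_rcnn': {'num_classes': 20}}, is_training=True))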
Example #15
def main(train_dir, pipeline_config_path, train_config_path="", input_config_path="",
         model_config_path="", master="", task=0, num_clones=1, 
         clone_on_cpu=False, worker_replicas=1, ps_tasks=0):
  """
  DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
  DEFINE_integer('task', 0, 'task id')
  DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.')
  DEFINE_boolean('clone_on_cpu', False,
                       'Force clones to be deployed on CPU.  Note that even if '
                       'set to False (allowing ops to run on gpu), some ops may '
                       'still be run on the CPU if they have no GPU kernel.')
  DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer '
                       'replicas.')
  DEFINE_integer('ps_tasks', 0,
                       'Number of parameter server tasks. If None, does not use '
                       'a parameter server.')
  DEFINE_string('train_dir', '',
                      'Directory to save the checkpoints and training summaries.')

  DEFINE_string('pipeline_config_path', '',
                      'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
                      'file. If provided, other configs are ignored')

  DEFINE_string('train_config_path', '',
                      'Path to a train_pb2.TrainConfig config file.')
  DEFINE_string('input_config_path', '',
                      'Path to an input_reader_pb2.InputReader config file.')
  DEFINE_string('model_config_path', '',
                      'Path to a model_pb2.DetectionModel config file.')
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  if task == 0: tf.gfile.MakeDirs(train_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path)
    if task == 0:
      tf.gfile.Copy(pipeline_config_path,
                    os.path.join(train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        train_config_path=train_config_path,
        train_input_config_path=input_config_path)
    if task == 0:
      for name, config in [('model.config', model_config_path),
                           ('train.config', train_config_path),
                           ('input.config', input_config_path)]:
        tf.gfile.Copy(config, os.path.join(train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  print("\n\n\n\n\nMADE IT HERE\n\n\n\n\n\n")

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      num_clones,
      worker_replicas,
      clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      train_dir,
      graph_hook_fn=graph_rewriter_fn)
  print("MADE IT THERE")
Example #16
def main(_):
    if FLAGS.train_label:
        FLAGS.pipeline_config_path = '../configs/test/' + FLAGS.train_label + '.config'
        FLAGS.train_dir = '../checkpoints/train/' + FLAGS.train_label
        FLAGS.train_tag = FLAGS.train_label

    if FLAGS.pipeline_config_dir_path:
        model_configs, train_configs, input_configs, eval_configs, eval_input_configs = get_configs_from_dir(
        )
    else:
        total_configs = get_configs_from_pipeline_file()
        if FLAGS.pipeline_config_path:
            model_config, train_config, input_config, eval_config, eval_input_config = total_configs
        else:
            model_config, train_config, input_config = total_configs

    if not FLAGS.train_dir:
        root_dir = utils.get_tempdir()
        dataset = os.path.basename(
            input_config.label_map_path).split('_')[0].upper()
        tempfile.tempdir = utils.mkdir_p(os.path.join(root_dir, dataset))
        meta_architecture = model_config.WhichOneof('model')
        model_name = meta_architecture.upper()
        tempfile.tempdir = utils.mkdir_p(
            os.path.join(tempfile.tempdir, model_name))
        if meta_architecture == 'ssd':
            meta_config = model_config.ssd
        elif meta_architecture == 'faster_rcnn':
            meta_config = model_config.faster_rcnn
        else:
            raise ValueError(
                'Unknown meta architecture: {}'.format(meta_architecture))
        feature_extractor = meta_config.feature_extractor.type
        backbone_name = feature_extractor.replace(meta_architecture,
                                                  '').lstrip('_').upper()
        tempfile.tempdir = utils.mkdir_p(
            os.path.join(tempfile.tempdir, backbone_name))

        train_prefix = "small-%s-" % time.strftime("%Y%m%d-%H%M%S")
        FLAGS.train_dir = tempfile.mkdtemp(suffix="-" + FLAGS.train_tag,
                                           prefix=train_prefix)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)

    # Save configuration
    def _save_config(config, prefix):
        config_str = text_format.MessageToString(config)
        save_path = os.path.join(FLAGS.train_dir, prefix + '.config')
        with open(save_path, 'w') as f:
            f.write(config_str)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    if not FLAGS.pipeline_config_dir_path:
        # Not consecutive training
        _save_config(model_config, 'model')
        _save_config(train_config, 'train')
        _save_config(input_config, 'train_input')
        if FLAGS.pipeline_config_path:
            _save_config(eval_config, 'eval')
            _save_config(eval_input_config, 'eval_input')

        model_fn = functools.partial(model_builder.build,
                                     model_config=model_config,
                                     is_training=True)

        create_input_dict_fn = functools.partial(input_reader_builder.build,
                                                 input_config)
        num_examples = sum(1 for _ in tf.python_io.tf_record_iterator(
            input_config.tf_record_input_reader.input_path))

        trainer.train(create_input_dict_fn,
                      model_fn,
                      train_config,
                      master,
                      task,
                      FLAGS.num_clones,
                      worker_replicas,
                      FLAGS.clone_on_cpu,
                      ps_tasks,
                      worker_job_name,
                      is_chief,
                      FLAGS.train_dir,
                      num_examples,
                      total_configs=total_configs,
                      model_config=model_config)
    else:
        # Consecutive training
        num_of_configs = len(model_configs)

        for config_index in range(num_of_configs):
            model_config = model_configs[config_index]
            train_config = train_configs[config_index]
            input_config = input_configs[config_index]
            eval_config = eval_configs[config_index]
            eval_input_config = eval_input_configs[config_index]
            total_configs = (model_config, train_config, input_config,
                             eval_config, eval_input_config)

            _save_config(model_config, 'model')
            _save_config(train_config, 'train')
            _save_config(input_config, 'train_input')
            _save_config(eval_config, 'eval')
            _save_config(eval_input_config, 'eval_input')

            model_fn = functools.partial(model_builder.build,
                                         model_config=model_config,
                                         is_training=True)

            create_input_dict_fn = functools.partial(
                input_reader_builder.build, input_config)
            num_examples = sum(1 for _ in tf.python_io.tf_record_iterator(
                input_config.tf_record_input_reader.input_path))

            trainer.train(
                create_input_dict_fn,
                model_fn,
                train_config,
                master,
                task,
                FLAGS.num_clones,
                worker_replicas,
                FLAGS.clone_on_cpu,
                ps_tasks,
                worker_job_name,
                is_chief,
                FLAGS.train_dir,
                num_examples,
                total_configs=total_configs,
                is_first_training=(True if config_index == 0 else False))

            def _is_last_training():
                return config_index == num_of_configs - 1

            if _is_last_training():
                break

            # Remove all the files except events files in train_dir for the next training.
            for f in os.listdir(FLAGS.train_dir):
                path_to_file = os.path.join(FLAGS.train_dir, f)
                if os.path.isfile(path_to_file) and not f.startswith('events'):
                    os.remove(path_to_file)
Example #17
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    # env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # cluster_data = env.get('cluster', None)
    # cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    # task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    # task_info = type('TaskSpec', (object,), task_data)

    # # Parameters for a single worker.
    # ps_tasks = 0
    # worker_replicas = 1
    # worker_job_name = 'lonely_worker'
    # task = 0
    # is_chief = True
    # master = ''

    # cluster_data, my_job_name, my_task_index = tf_config_from_slurm(ps_number=1)

    parameter_servers = ["localhost:2232"]
    workers = ["localhost:2233", "localhost:2234", "localhost:2235"]
    cluster_data = {"ps": parameter_servers, "worker": workers}

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker'])
        print("Number of replicas: ", worker_replicas)
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])
        print("Number of ps tasks: ", ps_tasks)

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster_data),
                                 protocol='grpc',
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (FLAGS.job_name, FLAGS.task_index)
        task = FLAGS.task_index
        is_chief = (FLAGS.task_index == 0)
        master = server.target
        print("worker_job_name: ", worker_job_name)
        print("task: ", task)
        print("is_chief: ", is_chief)
        print("master: ", master)

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
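This variant hard-codes a one-ps / three-worker cluster on localhost and selects the process role from --job_name and --task_index flags instead of TF_CONFIG. A minimal sketch of the two extra flag definitions it assumes on top of the stock train.py flags (train_dir, pipeline_config_path, num_clones, clone_on_cpu, ...); the defaults are illustrative, not the author's:

import tensorflow as tf

# Hypothetical flag definitions for the snippet above; the names are taken from
# the code, the defaults are placeholders. The remaining flags are the stock ones.
flags = tf.app.flags
flags.DEFINE_string('job_name', 'worker',
                    "Role of this process in the hard-coded cluster: 'ps' or 'worker'.")
flags.DEFINE_integer('task_index', 0,
                     'Index of this task within its job; worker task 0 acts as chief.')
FLAGS = flags.FLAGS

Each process would then be launched with its own pair, e.g. the parameter server with --job_name=ps --task_index=0 and the three workers with --job_name=worker --task_index=0, 1 and 2, matching the hard-coded address lists above.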
Exemple #18
0
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_util.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    print("%s" % str(env))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)
    print("cluster_data %s" % str(cluster_data))
    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        try:
            print("tf.train.Server")
            server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                     protocol='grpc',
                                     job_name=task_info.type,
                                     task_index=task_info.index)
        except KeyboardInterrupt:
            print("ctrl c END")
        if task_info.type == 'ps':
            print("ps")
            try:
                print("tf.Session")
                sess = tf.Session(server.target)
                print("create_done_queue: " + str(worker_replicas))
                queue = create_done_queue(task_info.index, worker_replicas,
                                          ps_tasks)

                # wait until all workers are done
                for i in range(worker_replicas):
                    sess.run(queue.dequeue())
                    print("ps %d received done %d" % (task_info.index, i))

                print("ps %d: quitting" % (task_info.index))
                # server.join()
                return
            except KeyboardInterrupt:
                print("ctrl c END")

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target
    print("is_chief:" + str(is_chief))

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    try:
        trainer.train(create_input_dict_fn,
                      model_fn,
                      train_config,
                      master,
                      task,
                      FLAGS.num_clones,
                      worker_replicas,
                      FLAGS.clone_on_cpu,
                      ps_tasks,
                      worker_job_name,
                      is_chief,
                      FLAGS.train_dir,
                      graph_hook_fn=graph_rewriter_fn)
    except KeyboardInterrupt:
        print("ctrl c END1")
    finally:
        if worker_replicas >= 1 and ps_tasks > 0:
            print("tf.Session")
            sess = tf.Session(server.target)
            print("end create_done_queues:" + str(worker_replicas))
            for q in create_done_queues(worker_replicas, ps_tasks):
                print("enqueue")
                sess.run(q.enqueue(1))
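Exemple #18 coordinates the parameter server's shutdown through create_done_queue and create_done_queues, which are not shown in the snippet. A minimal sketch of the shared-FIFOQueue "done queue" pattern those names suggest; the signatures mirror the call sites above, but the bodies are an assumption, not the author's code:

import tensorflow as tf

def create_done_queue(ps_index, worker_replicas, ps_tasks):
    # Queue hosted on ps task ps_index: each worker enqueues one token when it
    # finishes, and the ps dequeues worker_replicas tokens before exiting.
    # (ps_tasks is accepted only to match the call site above.)
    with tf.device('/job:ps/task:%d' % ps_index):
        return tf.FIFOQueue(worker_replicas, tf.int32,
                            shared_name='done_queue_%d' % ps_index)

def create_done_queues(worker_replicas, ps_tasks):
    # One done queue per ps task, so a finishing worker can signal all of them.
    return [create_done_queue(i, worker_replicas, ps_tasks) for i in range(ps_tasks)]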
def main(_):
    print("starting program . . .")

    # show info to std out during the training process
    tf.logging.set_verbosity(tf.logging.INFO)

    if not checkIfNecessaryPathsAndFilesExist():
        return
    # end if

    configs = config_util.get_configs_from_pipeline_file(PIPELINE_CONFIG_PATH)
    tf.gfile.Copy(PIPELINE_CONFIG_PATH,
                  os.path.join(TRAINING_DATA_DIR, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    # TODO: this nested function seems odd; factor it out eventually
    # nested function
    def get_next(config):
        return dataset_util.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    # end nested function

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # parameters for a single worker
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    # end if

    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])
    # end if

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')
    # end if

    if worker_replicas >= 1 and ps_tasks > 0:
        # set up distributed training
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return
        # end if

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target
    # end if

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  NUM_CLONES, worker_replicas, CLONE_ON_CPU, ps_tasks,
                  worker_job_name, is_chief, TRAINING_DATA_DIR)
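This variant is driven by module-level constants (PIPELINE_CONFIG_PATH, TRAINING_DATA_DIR, NUM_CLONES, CLONE_ON_CPU) and a checkIfNecessaryPathsAndFilesExist() guard rather than command-line flags; none of them appear in the snippet. A sketch of what those definitions could look like, with placeholder values only:

import os

# Illustrative values; the original definitions are not part of the snippet.
PIPELINE_CONFIG_PATH = os.path.join('training_data', 'pipeline.config')
TRAINING_DATA_DIR = 'training_data'
NUM_CLONES = 1
CLONE_ON_CPU = False

def checkIfNecessaryPathsAndFilesExist():
    # Hypothetical guard: refuse to start if the config file or output dir is missing.
    if not os.path.isfile(PIPELINE_CONFIG_PATH):
        print('ERROR: pipeline config not found at ' + PIPELINE_CONFIG_PATH)
        return False
    if not os.path.isdir(TRAINING_DATA_DIR):
        print('ERROR: training data directory not found at ' + TRAINING_DATA_DIR)
        return False
    return True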
Exemple #20
0
def main(_):
    if iswindos():
        FLAGS.train_dir = winprefix + FLAGS.train_dir
        FLAGS.pipeline_config_path = FLAGS.pipeline_config_path + "_win"
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = get_configs_from_pipeline_file(
        )
    else:
        model_config, train_config, input_config = get_configs_from_multiple_files(
        )
    print("[main]: model_config:", model_config)
    print("[main]: train_config:", train_config)
    print("[main]: input_config:", input_config)
    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    print("[main]: model_fn:", model_fn)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)
    print("[main]: create_input_dict_fn:", create_input_dict_fn)
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)
    print("[main]: cluster_data:", cluster_data)
    print("[main]: cluster:", cluster)
    print("[main]: task_data:", task_data)
    print("[main]: task_info:", task_info)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    print("[main]: create_input_dict_fn:", create_input_dict_fn)
    print("[main]: model_fn:", model_fn)
    print("[main]: train_config:", train_config)
    print("[main]: master:", master)
    print("[main]: task:", task)
    print("[main]: FLAGS.num_clones:", FLAGS.num_clones)
    print("[main]: worker_replicas:", worker_replicas)
    print("[main]: FLAGS.clone_on_cpu:", FLAGS.clone_on_cpu)
    print("[main]: ps_tasks:", ps_tasks)
    print("[main]: worker_job_name:", worker_job_name)
    print("[main]: is_chief:", is_chief)

    print("[main]: train_dir:", FLAGS.train_dir)

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
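Exemple #20 rewrites its paths on Windows through iswindos() and winprefix, neither of which is shown in the snippet. A minimal sketch of what they could be, assuming the usual sys.platform check; the prefix value is a placeholder:

import sys

# Placeholder prefix; the original value is not shown in the snippet.
winprefix = 'D:/'

def iswindos():
    # True when running on Windows (sys.platform starts with 'win' there).
    return sys.platform.startswith('win')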
Exemple #21
0
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_util.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
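Like Exemple #18, this snippet builds its cluster from the TF_CONFIG environment variable. A sketch of a TF_CONFIG value that would exercise the distributed branch, with placeholder host:port addresses; the 'master'/'worker'/'ps' job names follow the convention the code expects (the worker count is then len(worker) + 1 to include the master):

import json
import os

# Placeholder cluster; export this before launching each process, changing only
# the 'task' entry so every process knows its own role and index.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['localhost:2220'],
        'worker': ['localhost:2221', 'localhost:2222'],
        'ps': ['localhost:2230'],
    },
    # This particular process runs as the chief ('master', index 0).
    'task': {'type': 'master', 'index': 0},
})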
     TASK, PS_TASKS, TRAIN_DIR, PIPELINE_CONFING_PATH, TRAIN_CONFIG_PATH,
     MODEL_CONFIG_PATH, INPUT_CONFIG_PATH, WORKER_REPLICAS, MASTER)

train_config.num_steps = NUM_EPOCHS

##################################################
# TRAIN THE MODEL #
##################################################

trainer.train(create_input_dict_fn,
              model_fn,
              train_config,
              master,
              task,
              NUM_CLONES,
              worker_replicas,
              CLONE_ON_CPU,
              ps_tasks,
              worker_job_name,
              is_chief,
              TRAIN_DIR,
              graph_hook_fn=graph_rewriter_fn)

######################################
# EXPORT THE MODEL #
######################################

# Exporting the model for Evaluation
from google.protobuf import text_format
from object_detection import exporter
from object_detection.protos import pipeline_pb2
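The snippet above imports the export utilities, but the export call itself is not shown here. A sketch of how those imports are typically used with the Object Detection API's exporter, with placeholder paths for the pipeline config, trained checkpoint, and output directory:

import tensorflow as tf
from google.protobuf import text_format
from object_detection import exporter
from object_detection.protos import pipeline_pb2

# Placeholders; substitute the real paths from the training run above.
PIPELINE_CONFIG_PATH = 'training/pipeline.config'
TRAINED_CKPT_PREFIX = 'training/model.ckpt-10000'
EXPORT_DIR = 'exported_model'

# Parse the same pipeline config that was used for training.
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
with tf.gfile.GFile(PIPELINE_CONFIG_PATH, 'r') as f:
    text_format.Merge(f.read(), pipeline_config)

# Freeze the trained checkpoint into an inference graph.
exporter.export_inference_graph(
    input_type='image_tensor',
    pipeline_config=pipeline_config,
    trained_checkpoint_prefix=TRAINED_CKPT_PREFIX,
    output_directory=EXPORT_DIR)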