Example #1
0
    def __init__(self, role, bridge, data_path, ext,
                 worker_rank=0, num_workers=1):
        """Store loader settings and exchange a start-up barrier with the peer.

        Args:
            role: role of this party. 'leader' and 'follower' trigger the
                barrier exchange over the bridge; any other value skips it.
            bridge: object used to communicate with the peer party.
            data_path: a directory, or the path of a single file. A falsy
                value means no local trainer master client is created.
            ext: forwarded to LocalTrainerMasterClient as `ext`.
            worker_rank: stored on the instance unchanged.
            num_workers: stored on the instance unchanged.
        """
        self._role = role
        self._bridge = bridge
        self._num_workers = num_workers
        self._worker_rank = worker_rank

        # The trainer master client is created with the opposite role name.
        if role == 'leader':
            self._tm_role = 'follower'
        else:
            self._tm_role = 'leader'

        if not data_path:
            self._trainer_master = None
        else:
            if tf.io.gfile.isdir(data_path):
                block_dir, file_names = data_path, None
            else:
                # Not a directory: restrict the client to that one file
                # inside its parent directory.
                file_names = [os.path.basename(data_path)]
                block_dir = os.path.dirname(data_path)
            self._trainer_master = LocalTrainerMasterClient(
                self._tm_role, block_dir, files=file_names, ext=ext)

        self._count = 0
        is_leader = self._role == 'leader'
        if is_leader:
            # Block announcements from the peer are delivered through the
            # handler and buffered in this queue.
            self._block_queue = queue.Queue()
            self._bridge.register_data_block_handler(self._data_block_handler)
        if is_leader or self._role == 'follower':
            # One iteration carrying a single 'barrier' message: the leader
            # sends it, the follower receives it.
            self._bridge.start(self._bridge.new_iter_id())
            iter_id = self._bridge.current_iter_id
            if is_leader:
                self._bridge.send(iter_id, 'barrier', np.asarray([1]))
            else:
                self._bridge.receive(iter_id, 'barrier')
            self._bridge.commit()
Example #2
0
def _run_local(role,
               args,
               input_fn,
               model_fn,
               serving_input_receiver_fn,
               export_model_hook=None):
    """Run a trainer master and a single worker inside this process.

    The master is served from a daemon thread while the calling thread acts
    as worker 0: it registers with the master, runs the estimator in the
    requested mode and finally reports completion to the master.

    Args:
        role: party role; LEADER selects LeaderTrainerMaster, any other
            value selects FollowerTrainerMaster.
        args: parsed command-line namespace. Reads local_addr, peer_addr,
            mode, application_id, sparse_estimator and the checkpoint,
            summary and export options forwarded to the master.
        input_fn: input function handed to the master and the estimator.
        model_fn: model function handed to the master and the estimator.
        serving_input_receiver_fn: forwarded to the master.
        export_model_hook: optional hook forwarded to the master.

    Raises:
        ValueError: if `args.local_addr` or `args.peer_addr` is empty.
    """
    if not args.local_addr:
        raise ValueError("local-addr is required")
    if not args.peer_addr:
        raise ValueError("peer-addr is required")
    mode = args.mode.lower()

    cluster_spec = _create_cluster_spec(args)
    cluster_server = ClusterServer(cluster_spec, "local")

    # run master
    checkpoint_filename_with_path = _get_checkpoint_filename_with_path(args)
    data_visitor = _create_data_visitor(args)
    master_factory = LeaderTrainerMaster \
        if role == LEADER else FollowerTrainerMaster
    local_master = master_factory(
        cluster_server,
        data_visitor,
        mode,
        model_fn,
        input_fn,
        serving_input_receiver_fn,
        checkpoint_filename_with_path,
        checkpoint_path=args.checkpoint_path,
        save_checkpoint_steps=args.save_checkpoint_steps,
        save_checkpoint_secs=args.save_checkpoint_secs,
        summary_path=args.summary_path,
        summary_save_steps=args.summary_save_steps,
        summary_save_secs=args.summary_save_secs,
        export_path=args.export_path,
        sparse_estimator=args.sparse_estimator,
        export_model_hook=export_model_hook)
    master_thread = threading.Thread(target=local_master.run_forever)
    # Thread.setDaemon() is deprecated; the `daemon` attribute is the
    # supported way to keep this thread from blocking interpreter exit.
    master_thread.daemon = True
    master_thread.start()

    # run worker
    trainer_master = LocalTrainerMasterClient(local_master, 0)
    if not trainer_master.worker_register():
        # Registration was refused; there is nothing for this worker to do.
        return
    bridge = Bridge(role, int(args.local_addr.split(':')[1]), args.peer_addr,
                    args.application_id, 0)

    estimator_factory = \
        SparseFLEstimator if args.sparse_estimator else FLEstimator
    estimator = estimator_factory(cluster_server, trainer_master, bridge, role,
                                  model_fn)

    # Only 'train' and 'eval' run the estimator; any other mode falls
    # through to the completion calls below.
    if mode == 'train':
        estimator.train(input_fn)
    elif mode == 'eval':
        estimator.evaluate(input_fn)

    trainer_master.worker_complete(bridge.terminated_at)
    trainer_master.wait_master_complete()
Example #3
0
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Create the bridge, trainer master and estimator, then train.

    The trainer master and TensorFlow cluster spec are chosen from the first
    option that is set, in this order: `args.cluster_spec` (JSON),
    `args.master_addr`, `args.data_path`. After training, a saved model is
    exported when `args.export_path` is set.

    Args:
        role: party role passed to the bridge, trainer master and estimator.
        args: parsed command-line namespace.
        input_fn: input function passed to `estimator.train`.
        model_fn: model function passed to FLEstimator.
        serving_input_receiver_fn: passed to `export_saved_model`.

    Raises:
        ValueError: if none of the three data/master options is set.
        AssertionError: if the JSON cluster spec is malformed or
            `--tf-addr` is missing alongside `--master-addr`.
    """
    listen_port = int(args.local_addr.split(':')[1])
    bridge = Bridge(role, listen_port, args.peer_addr)

    def worker_cluster(ps_hosts):
        # Cluster view for this worker: the given PS hosts plus only this
        # worker's own task.
        return tf.train.ClusterSpec({
            'ps': ps_hosts,
            'worker': {args.worker_rank: args.tf_addr}})

    if args.cluster_spec:
        spec_json = json.loads(args.cluster_spec)
        assert 'clusterSpec' in spec_json, \
            "cluster_spec do not meet legal format"
        sections = spec_json['clusterSpec']
        assert 'Master' in sections, \
            "cluster_spec must include Master"
        assert isinstance(sections['Master'], list), \
            "Master must be list"
        assert 'Worker' in sections, \
            "cluster_spec must include Worker"
        assert isinstance(sections['Worker'], list), \
            "Worker must be list"
        master_addr = sections['Master'][0]
        trainer_master = TrainerMasterClient(
            master_addr, role, args.worker_rank)
        cluster_spec = worker_cluster(sections['PS'])
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(
            args.master_addr, role, args.worker_rank)
        cluster_spec = worker_cluster(args.ps_addrs.split(","))
    elif args.data_path:
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    estimator = FLEstimator(
        model_fn, bridge, trainer_master, role,
        worker_rank=args.worker_rank, cluster_spec=cluster_spec)

    # Checkpoint options are only passed when a checkpoint path is given.
    train_options = {}
    if args.checkpoint_path:
        train_options['checkpoint_path'] = args.checkpoint_path
        train_options['save_checkpoint_steps'] = args.save_checkpoint_steps
    estimator.train(input_fn, **train_options)

    if not args.export_path:
        return
    estimator.export_saved_model(args.export_path,
                                 serving_input_receiver_fn,
                                 checkpoint_path=args.checkpoint_path)
Example #4
0
class DataBlockLoader(object):
    """Hands out data blocks while keeping both parties on the same block.

    A loader whose role is 'leader' receives block ids from the peer through
    a bridge handler and serves them from a queue. A loader with any other
    non-'local' role picks blocks from its trainer master and announces each
    one to the peer via `bridge.load_data_block`. A 'local' loader only
    talks to its trainer master.
    """

    def __init__(self,
                 role,
                 bridge,
                 data_path,
                 ext,
                 worker_rank=0,
                 num_workers=1,
                 output_path=None):
        """Store settings and exchange a start-up barrier with the peer.

        Args:
            role: 'leader', 'follower' or another value such as 'local';
                only 'leader' and 'follower' use the bridge here.
            bridge: object used to communicate with the peer party.
            data_path: a directory, or the path of a single file. A falsy
                value means no local trainer master client is created.
            ext: forwarded to LocalTrainerMasterClient as `ext`.
            worker_rank: index of this worker among `num_workers`.
            num_workers: number of workers sharing the block sequence.
            output_path: optional directory; blocks that already have a
                '<block_id>.output' entry there are skipped when requesting.
        """
        self._role = role
        self._bridge = bridge
        self._num_workers = num_workers
        self._worker_rank = worker_rank
        self._output_path = output_path

        # The trainer master client is created with the opposite role name.
        self._tm_role = 'follower' if role == 'leader' else 'leader'

        if data_path:
            files = None
            if not tf.io.gfile.isdir(data_path):
                # Not a directory: restrict the client to that one file
                # inside its parent directory.
                files = [os.path.basename(data_path)]
                data_path = os.path.dirname(data_path)
            self._trainer_master = LocalTrainerMasterClient(self._tm_role,
                                                            data_path,
                                                            files=files,
                                                            ext=ext)
        else:
            self._trainer_master = None

        # Number of blocks exchanged with the peer so far.
        self._count = 0
        if self._role == 'leader':
            self._block_queue = queue.Queue()
            self._bridge.register_data_block_handler(self._data_block_handler)
            self._bridge.start(self._bridge.new_iter_id())
            self._bridge.send(self._bridge.current_iter_id, 'barrier',
                              np.asarray([1]))
            self._bridge.commit()
        elif self._role == 'follower':
            self._bridge.start(self._bridge.new_iter_id())
            self._bridge.receive(self._bridge.current_iter_id, 'barrier')
            self._bridge.commit()

    def _data_block_handler(self, msg):
        """Handle a block announcement received from the peer.

        Args:
            msg: message exposing `block_id` and `count`. An empty
                `block_id` queues `None` as the block.

        Returns:
            True when the block was queued; False when the trainer master
            has no block for `msg.block_id`, in which case nothing is
            queued and the counter is left untouched.
        """
        logging.debug('DataBlock: recv "%s" at %d', msg.block_id, msg.count)
        assert self._count == msg.count
        if not msg.block_id:
            block = None
        elif self._trainer_master is not None:
            block = self._trainer_master.request_data_block(msg.block_id)
            # Reject only blocks that could not be resolved locally; a
            # resolved block must fall through and be queued.
            if block is None:
                return False
        else:
            block = DataBlockInfo(msg.block_id, None)
        self._count += 1
        self._block_queue.put(block)
        return True

    def _request_data_block(self):
        """Fetch this worker's next block from the trainer master.

        Each round requests `num_workers` blocks and keeps the one at
        position `worker_rank`. Rounds repeat while the kept block already
        has a '<block_id>.output' entry under `output_path`.

        Returns:
            The selected block, or None when the trainer master returned
            None for this worker's slot.
        """
        while True:
            for _ in range(self._worker_rank):
                self._trainer_master.request_data_block()
            block = self._trainer_master.request_data_block()
            for _ in range(self._num_workers - self._worker_rank - 1):
                self._trainer_master.request_data_block()
            if block is None or self._output_path is None or \
                    not tf.io.gfile.exists(os.path.join(
                        self._output_path, block.block_id) + '.output'):
                break
        return block

    def get_next_block(self):
        """Return the next block to process, or None when there is none.

        For role 'local' the block comes straight from the trainer master.
        When this side's trainer master role is 'leader', blocks are
        requested until the peer accepts one through
        `bridge.load_data_block`; an empty block id is sent when the
        trainer master returns None. Otherwise the call blocks on the queue
        filled by the bridge handler.
        """
        if self._role == 'local':
            return self._request_data_block()

        if self._tm_role == 'leader':
            while True:
                block = self._request_data_block()
                if block is not None:
                    # A falsy result means the peer rejected this block;
                    # try the next one.
                    if not self._bridge.load_data_block(
                            self._count, block.block_id):
                        continue
                else:
                    self._bridge.load_data_block(self._count, '')
                break
            self._count += 1
        else:
            block = self._block_queue.get()
        return block
Example #5
0
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Configure logging, build the estimator and run training or evaluation.

    The trainer master and TensorFlow cluster spec are chosen from the first
    option that is set, in this order: `args.data_path`,
    `args.cluster_spec` (JSON), `args.master_addr`, `args.data_source`.

    In 'train' mode, worker 0 additionally exports a saved model to
    '<export_path>/<bridge.terminated_at>' and writes a '_SUCCESS' marker
    file there when `args.export_path` is set.

    Args:
        role: party role passed to the bridge, trainer master and estimator.
        args: parsed command-line namespace.
        input_fn: input function passed to the estimator.
        model_fn: model function passed to the estimator.
        serving_input_receiver_fn: passed to `export_saved_model`.

    Raises:
        ValueError: if no data/master option is set, if `data_source` is
            given without both start and end time, or if `args.mode` is
            neither 'train' nor 'eval'.
        AssertionError: if the JSON cluster spec is malformed or
            `--tf-addr` is missing alongside `--master-addr`.
    """
    logging.basicConfig(
        format="%(asctime)-15s [%(filename)s:%(lineno)d] " \
               "%(levelname)s : %(message)s")
    if args.verbosity == 0:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.verbosity == 1:
        logging.getLogger().setLevel(logging.INFO)
    elif args.verbosity > 1:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.application_id:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr, args.application_id, args.worker_rank)
    else:
        bridge = Bridge(role, int(args.local_addr.split(':')[1]),
                        args.peer_addr)

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_path,
                                                  epoch_num=args.epoch_num)
        if args.ps_addrs is not None:
            ps_addrs = args.ps_addrs.split(",")
            cluster_spec = tf.train.ClusterSpec({
                'ps': ps_addrs,
                'worker': {
                    args.worker_rank: args.tf_addr
                }
            })
        else:
            cluster_spec = None
    elif args.cluster_spec:
        cluster_spec = json.loads(args.cluster_spec)
        assert 'clusterSpec' in cluster_spec, \
            "cluster_spec do not meet legal format"
        assert 'Master' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Master"
        assert isinstance(cluster_spec['clusterSpec']['Master'], list), \
            "Master must be list"
        assert 'Worker' in cluster_spec['clusterSpec'],\
            "cluster_spec must include Worker"
        assert isinstance(cluster_spec['clusterSpec']['Worker'], list), \
            "Worker must be list"
        trainer_master = TrainerMasterClient(
            cluster_spec['clusterSpec']['Master'][0], role, args.worker_rank)
        # The TF cluster only lists this worker's own task next to the PS.
        cluster_spec = tf.train.ClusterSpec({
            'ps':
            cluster_spec['clusterSpec']['PS'],
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(args.master_addr, role,
                                             args.worker_rank)
        ps_addrs = args.ps_addrs.split(",")
        cluster_spec = tf.train.ClusterSpec({
            'ps': ps_addrs,
            'worker': {
                args.worker_rank: args.tf_addr
            }
        })
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(role,
                                                  args.data_source,
                                                  start_time=args.start_time,
                                                  end_time=args.end_time,
                                                  epoch_num=args.epoch_num)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes.
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    if args.sparse_estimator:
        estimator = SparseFLEstimator(model_fn,
                                      bridge,
                                      trainer_master,
                                      role,
                                      worker_rank=args.worker_rank,
                                      application_id=args.application_id,
                                      cluster_spec=cluster_spec)
    else:
        estimator = FLEstimator(model_fn,
                                bridge,
                                trainer_master,
                                role,
                                worker_rank=args.worker_rank,
                                application_id=args.application_id,
                                cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
        if args.export_path and args.worker_rank == 0:
            export_path = '%s/%d' % (args.export_path, bridge.terminated_at)
            estimator.export_saved_model(export_path,
                                         serving_input_receiver_fn,
                                         checkpoint_path=args.checkpoint_path)
            # Context manager guarantees the marker file is closed even if
            # the write fails.
            success_path = '%s/_SUCCESS' % export_path
            with tf.io.gfile.GFile(success_path, 'w') as fsuccess:
                fsuccess.write('%d' % bridge.terminated_at)

    elif run_mode == 'eval':
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)
    else:
        raise ValueError('Allowed values are: --mode=train|eval')
Example #6
0
def train(role, args, input_fn, model_fn, serving_input_receiver_fn):
    """Build the estimator, run it in the requested mode, then export.

    The trainer master and TensorFlow cluster spec are chosen from the first
    option that is set, in this order: `args.data_path`,
    `args.cluster_spec` (JSON), `args.master_addr`, `args.data_source`.
    After a successful 'train' or 'eval' run, a saved model is exported
    when `args.export_path` is set.

    Args:
        role: party role passed to the bridge, trainer master and estimator.
        args: parsed command-line namespace.
        input_fn: input function passed to the estimator.
        model_fn: model function passed to the estimator.
        serving_input_receiver_fn: passed to `export_saved_model`.

    Raises:
        ValueError: if no data/master option is set, if `data_source` is
            given without both start and end time, or if `args.mode` is
            neither 'train' nor 'eval'.
        AssertionError: if the JSON cluster spec is malformed or
            `--tf-addr` is missing alongside `--master-addr`.
    """
    listen_port = int(args.local_addr.split(':')[1])
    if args.application_id:
        bridge = Bridge(role, listen_port, args.peer_addr,
                        args.application_id, args.worker_rank)
    else:
        bridge = Bridge(role, listen_port, args.peer_addr)

    def worker_cluster(ps_hosts):
        # Cluster view for this worker: the given PS hosts plus only this
        # worker's own task.
        return tf.train.ClusterSpec({
            'ps': ps_hosts,
            'worker': {args.worker_rank: args.tf_addr},
        })

    if args.data_path:
        trainer_master = LocalTrainerMasterClient(role, args.data_path)
        cluster_spec = None
        if args.ps_addrs is not None:
            cluster_spec = worker_cluster(args.ps_addrs.split(","))
    elif args.cluster_spec:
        spec_json = json.loads(args.cluster_spec)
        assert 'clusterSpec' in spec_json, \
            "cluster_spec do not meet legal format"
        sections = spec_json['clusterSpec']
        assert 'Master' in sections, \
            "cluster_spec must include Master"
        assert isinstance(sections['Master'], list), \
            "Master must be list"
        assert 'Worker' in sections, \
            "cluster_spec must include Worker"
        assert isinstance(sections['Worker'], list), \
            "Worker must be list"
        trainer_master = TrainerMasterClient(
            sections['Master'][0], role, args.worker_rank)
        cluster_spec = worker_cluster(sections['PS'])
    elif args.master_addr:
        assert args.tf_addr is not None, \
            "--tf-addr must be set when master_addr is set."
        trainer_master = TrainerMasterClient(args.master_addr, role,
                                             args.worker_rank)
        cluster_spec = worker_cluster(args.ps_addrs.split(","))
    elif args.data_source:
        if args.start_time is None or args.end_time is None:
            raise ValueError(
                "data source must be set with start-date and end-date")
        trainer_master = LocalTrainerMasterClient(
            role,
            args.data_source,
            start_time=args.start_time,
            end_time=args.end_time)
        cluster_spec = None
    else:
        raise ValueError("Either --master-addr or --data-path must be set")

    # SummaryHook is configured through class attributes.
    if args.summary_path:
        SummaryHook.summary_path = args.summary_path
        SummaryHook.worker_rank = args.worker_rank
        SummaryHook.role = role
    if args.summary_save_steps:
        SummaryHook.save_steps = args.summary_save_steps

    # Both estimator classes are constructed with the same arguments.
    estimator_cls = SparseFLEstimator if args.sparse_estimator else FLEstimator
    estimator = estimator_cls(model_fn,
                              bridge,
                              trainer_master,
                              role,
                              worker_rank=args.worker_rank,
                              cluster_spec=cluster_spec)

    run_mode = args.mode.lower()
    if run_mode not in ('train', 'eval'):
        raise ValueError('Allowed values are: --mode=train|eval')
    if run_mode == 'train':
        estimator.train(input_fn,
                        checkpoint_path=args.checkpoint_path,
                        save_checkpoint_steps=args.save_checkpoint_steps,
                        save_checkpoint_secs=args.save_checkpoint_secs)
    else:
        estimator.evaluate(input_fn, checkpoint_path=args.checkpoint_path)

    if not args.export_path:
        return
    estimator.export_saved_model(args.export_path,
                                 serving_input_receiver_fn,
                                 checkpoint_path=args.checkpoint_path)
Example #7
0
class DataBlockLoader(object):
    """Hands out data blocks while keeping both parties on the same block.

    A loader whose role is 'leader' receives block ids from the peer through
    a bridge handler and serves them from a queue. A loader with any other
    non-'local' role picks blocks from its trainer master and announces each
    one to the peer via `bridge.load_data_block`. A 'local' loader only
    talks to its trainer master.
    """

    def __init__(self, role, bridge, data_path, ext,
                 worker_rank=0, num_workers=1):
        """Store settings and exchange a start-up barrier with the peer.

        Args:
            role: 'leader', 'follower' or another value such as 'local';
                only 'leader' and 'follower' use the bridge here.
            bridge: object used to communicate with the peer party.
            data_path: a directory, or the path of a single file. A falsy
                value means no local trainer master client is created.
            ext: forwarded to LocalTrainerMasterClient as `ext`.
            worker_rank: index of this worker among `num_workers`.
            num_workers: number of workers sharing the block sequence.
        """
        self._role = role
        self._bridge = bridge
        self._num_workers = num_workers
        self._worker_rank = worker_rank

        # The trainer master client is created with the opposite role name.
        if role == 'leader':
            self._tm_role = 'follower'
        else:
            self._tm_role = 'leader'

        if not data_path:
            self._trainer_master = None
        else:
            if tf.io.gfile.isdir(data_path):
                block_dir, file_names = data_path, None
            else:
                # Not a directory: restrict the client to that one file
                # inside its parent directory.
                file_names = [os.path.basename(data_path)]
                block_dir = os.path.dirname(data_path)
            self._trainer_master = LocalTrainerMasterClient(
                self._tm_role, block_dir, files=file_names, ext=ext)

        # Number of blocks exchanged with the peer so far.
        self._count = 0
        is_leader = self._role == 'leader'
        if is_leader:
            self._block_queue = queue.Queue()
            self._bridge.register_data_block_handler(self._data_block_handler)
        if is_leader or self._role == 'follower':
            # One iteration carrying a single 'barrier' message: the leader
            # sends it, the follower receives it.
            self._bridge.start(self._bridge.new_iter_id())
            iter_id = self._bridge.current_iter_id
            if is_leader:
                self._bridge.send(iter_id, 'barrier', np.asarray([1]))
            else:
                self._bridge.receive(iter_id, 'barrier')
            self._bridge.commit()

    def _data_block_handler(self, msg):
        """Queue the block announced by the peer.

        Args:
            msg: message exposing `block_id` and `count`. An empty
                `block_id` queues `None` as the block.

        Raises:
            ValueError: if a trainer master is configured and it has no
                block for `msg.block_id`.
        """
        logging.debug('DataBlock: recv "%s" at %d', msg.block_id, msg.count)
        assert self._count == msg.count
        if msg.block_id and self._trainer_master is None:
            # No local data: wrap the announced id in a bare descriptor.
            block = DataBlockInfo(msg.block_id, None)
        elif msg.block_id:
            block = self._trainer_master.request_data_block(msg.block_id)
            if block is None:
                raise ValueError("Block %s not found" % msg.block_id)
        else:
            block = None
        self._count += 1
        self._block_queue.put(block)

    def _request_data_block(self):
        """Fetch this worker's next block from the trainer master.

        Requests `num_workers` blocks in a row and keeps the one at position
        `worker_rank`; the others are discarded.
        """
        rank = self._worker_rank
        for _ in range(rank):
            self._trainer_master.request_data_block()
        block = self._trainer_master.request_data_block()
        trailing = self._num_workers - rank - 1
        for _ in range(trailing):
            self._trainer_master.request_data_block()
        return block

    def get_next_block(self):
        """Return the next block to process, or None when there is none.

        For role 'local' the block comes straight from the trainer master.
        When this side's trainer master role is 'leader', blocks are
        requested until `bridge.load_data_block` succeeds without raising;
        an empty block id is sent when the trainer master returns None.
        Otherwise the call blocks on the queue filled by the bridge handler.
        """
        if self._role == 'local':
            return self._request_data_block()

        if self._tm_role != 'leader':
            return self._block_queue.get()

        while True:
            block = self._request_data_block()
            if block is None:
                self._bridge.load_data_block(self._count, '')
                break
            try:
                self._bridge.load_data_block(self._count,
                                             block.block_id)
            except Exception as e:  # pylint: disable=broad-except
                # Log the failure and move on to the next block.
                logging.error('load data block %s with error: %s',
                              block.block_id, repr(e))
            else:
                break
        self._count += 1
        return block