Example #1
    def __init__(self, temperature=0.1, use_beam=False, port=9502, name=None):
        """ Constructor
            :param temperature: The temperature to apply to the logits.
            :param use_beam: Boolean that indicates that we want to use a beam search.
            :param port: The port to use for the tf serving to query the model.
            :param name: Optional. The name of this player.
        """
        model_url = 'https://f002.backblazeb2.com/file/ppaquette-public/benchmarks/neurips2019-rl_model.zip'

        # Launching the serving process if the port is not already open
        if not is_port_opened(port):
            launch_serving(model_url, port)

        # Creating adapter
        grpc_dataset = GRPCDataset(
            hostname='localhost',
            port=port,
            model_name='player',
            signature=rl_neurips2019.PolicyAdapter.get_signature(),
            dataset_builder=rl_neurips2019.BaseDatasetBuilder())
        policy_adapter = rl_neurips2019.PolicyAdapter(grpc_dataset)

        # Building benchmark model
        super(DipNetRLPlayer, self).__init__(policy_adapter=policy_adapter,
                                             temperature=temperature,
                                             use_beam=use_beam,
                                             name=name)
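
A hypothetical instantiation, based only on the constructor signature above (argument values are illustrative):

# Illustrative only: reuses the default port 9502 and samples orders
# at a higher temperature with beam search enabled.
player = DipNetRLPlayer(temperature=0.5, use_beam=True, name='rl_player')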
Example #2
def build_train_server(trainer):
    """ Builds the Tensorflow tf.train.Server
        :param trainer: A reinforcement learning trainer instance.
        :type trainer: diplomacy_research.models.training.reinforcement.trainer.ReinforcementTrainer
    """
    if not trainer.cluster_config:
        return

    from diplomacy_research.utils.tensorflow import tf
    task_address = trainer.cluster_config.cluster_spec[
        trainer.cluster_config.job_name][trainer.cluster_config.task_id]

    # Making sure the port is not used by another process
    task_port = int(task_address.split(':')[1])
    LOGGER.info('Killing any processes that have port %d opened.', task_port)
    kill_processes_using_port(task_port)

    # Starting server
    LOGGER.info('Starting server with task id %d - Address: %s',
                trainer.cluster_config.task_id, task_address)
    LOGGER.info('Checking if port %d is already opened: %s', task_port,
                str(is_port_opened(task_port)))
    trainer.server = tf.train.Server(tf.train.ClusterSpec(
        trainer.cluster_config.cluster_spec),
                                     job_name=trainer.cluster_config.job_name,
                                     task_index=trainer.cluster_config.task_id,
                                     protocol=trainer.cluster_config.protocol)
    LOGGER.info(
        'Server successfully started. Trying to contact other nodes...')
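
For reference, the cluster_spec passed to tf.train.ClusterSpec above is the standard TensorFlow format: a dict mapping each job name to its list of 'host:port' task addresses. An illustrative value (hosts and ports are made up):

cluster_spec = {
    'ps': ['localhost:2222'],                        # parameter servers
    'worker': ['localhost:2223', 'localhost:2224'],  # training workers
}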
Example #3
def task_launch_serving(port,
                        save_dir,
                        force_cpu,
                        config,
                        cluster_config=None):
    """ Launches (or restarts) a TF serving server
        :param port: Integer. The port to use for the TF serving server.
        :param save_dir: The current flags.save_dir
        :param force_cpu: If True, launches the TF serving on CPU; otherwise, the GPU is used during distributed training.
        :param config: The configuration to set on the serving on launch (None to set no config on launch)
        :param cluster_config: The cluster configuration used for distributed training.
    """
    kill_processes_using_port(port)
    kill_subprocesses_on_exit()

    # Computing launch settings
    simult_players = multiprocessing.cpu_count() * 7  # launching 7 simultaneous players per CPU
    max_batch_size = 2 ** int(math.log(0.9 * simult_players, 2))  # rounding down to the nearest power of 2
    batch_timeout = 200000
    batching_parameters = BatchingParameters(
        max_batch_size=max_batch_size,
        batch_timeout_micros=batch_timeout,
        max_enqueued_batches=256,
        num_batch_threads=multiprocessing.cpu_count(),
        pad_variable_length_inputs=True)

    log_file_path = None
    if not cluster_config:
        log_file_path = os.path.join(save_dir, 'tf_serving_%d.log' % port)
    elif get_tf_serving_port(cluster_config, serving_id=0) != port:
        log_file_path = os.path.join(
            save_dir, 'tf_serving_%s_%d.log' % (cluster_config.job_name, port))
    force_cpu = force_cpu or cluster_config is None

    # Launching
    tf_serving_thread = Thread(target=start_tf_serving,
                               args=(port, save_dir, batching_parameters,
                                     cluster_config),
                               kwargs={
                                   'force_cpu': force_cpu,
                                   'log_file_path': log_file_path
                               })
    tf_serving_thread.start()

    # Waiting for port
    for _ in range(120):
        if is_port_opened(port):
            break
        yield gen.sleep(1)
    else:
        LOGGER.error('Unable to connect to TF Serving after 2 mins.')

    # Setting configuration
    yield task_set_config(port, config)
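
Note that task_launch_serving is a Tornado generator-based coroutine (see the yield gen.sleep(1) above), so it must be yielded from another coroutine or driven by an IOLoop. A minimal sketch, assuming Tornado's gen/ioloop and placeholder arguments:

from tornado import gen, ioloop

@gen.coroutine
def relaunch():
    # Placeholder port/save_dir; config=None skips setting a config on launch.
    yield task_launch_serving(9501, '/tmp/save_dir', force_cpu=True, config=None)

ioloop.IOLoop.current().run_sync(relaunch)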
Example #4
def launch_serving(model_name, serving_port):
    """ Launches or relaunches the TF Serving process """
    # Stop all serving child processes
    if is_port_opened(serving_port):
        kill_processes_using_port(serving_port)

    # Launching a new process
    log_file_path = os.path.join(WORKING_DIR, 'data',
                                 'log_serving_%d.txt' % serving_port)
    serving_process = Process(target=start_tf_serving,
                              args=(serving_port, WORKING_DIR),
                              kwargs={
                                  'force_cpu': True,
                                  'log_file_path': log_file_path
                              })
    serving_process.start()
    kill_subprocesses_on_exit()

    # Waiting for port to be opened.
    for attempt_idx in range(30):
        time.sleep(10)
        if is_port_opened(serving_port):
            break
        LOGGER.info('Waiting for TF Serving to come online. - Attempt %d / %d',
                    attempt_idx + 1, 30)
    else:
        LOGGER.error('TF Serving is not online after 5 minutes. Aborting.')
        raise RuntimeError('TF Serving did not come online after 5 minutes.')

    # Setting configuration
    new_config = ModelConfig(name='player',
                             base_path='/work_dir/data/bot_%s' % model_name,
                             version_policy=None)
    for _ in range(30):
        if GRPCDataset.set_config('localhost', serving_port, new_config):
            LOGGER.info('Configuration set successfully.')
            break
        time.sleep(5.)
    else:
        LOGGER.error('Unable to set the configuration file.')
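
The for ... else construct used throughout these examples is Python's loop-else idiom: the else block runs only when the loop finished without hitting break, which makes it a natural way to report that every retry failed. The pattern in isolation (resource_ready is a hypothetical readiness check):

import time

for attempt in range(30):
    if resource_ready():
        break
    time.sleep(10)
else:
    # Reached only if no attempt broke out of the loop.
    raise RuntimeError('Resource did not come online.')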
Example #5
    def __init__(self,
                 hostname,
                 port,
                 model_name,
                 signature,
                 dataset_builder,
                 cluster_config=None,
                 connect_timeout=300,
                 timeout=30,
                 nb_retries=100):
        """ Constructor
            :param hostname: The hostname of the TensorFlow Serving server
            :param port: The port used by the TensorFlow Serving server.
            :param model_name: The name of the model being served by the TensorFlow Serving server.
            :param signature: The output of adapter.get_signature() - signature of all the possible calls
            :param dataset_builder: An instance of `BaseBuilder` containing the proto-fields and generation methods
            :param cluster_config: Optional. If set, the cluster configuration will be used for distributed training.
            :param connect_timeout: The timeout to try to connect to the TF serving server.
            :param timeout: The timeout (in seconds) to wait for a request
            :param nb_retries: The number of retries in case of grpc.RpcError before giving up.
            :type dataset_builder: diplomacy_research.models.datasets.base_builder.BaseBuilder
            :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
        """
        # pylint: disable=too-many-arguments
        super(GRPCDataset, self).__init__(dataset_builder=dataset_builder,
                                          cluster_config=cluster_config)
        self.hostname = hostname
        self.port = port
        self.model_name = model_name
        self.signature = signature
        self.timeout = timeout
        self.nb_retries = nb_retries
        self.channel = None
        self.predict_stub = None
        self.config_stub = None

        # Waiting for port to be opened
        for retry_ix in range(connect_timeout):
            time.sleep(1)
            if is_port_opened(self.port, self.hostname):
                break
            if (retry_ix + 1) % 10 == 0:
                LOGGER.info(
                    'Trying to connect to TF Serving at %s:%d. Attempt %d / %d',
                    self.hostname, self.port, retry_ix + 1, connect_timeout)
        else:
            raise RuntimeError(
                'Unable to connect to %s:%d. Max attempts reached.' %
                (self.hostname, self.port))

        # Building the dataset
        self.build()
Example #6
def start_tf_serving_server(trainer,
                            force_cpu,
                            serving_id,
                            config,
                            endpoint_only=False):
    """ Starts the TF Serving server
        :param trainer: A reinforcement learning trainer instance.
        :param force_cpu: Boolean that indicates that we want the TF serving server to only use the CPU.
        :param serving_id: Integer that represents the serving server id (i.e. when multiple servers are launched)
        :param config: The configuration to set on the serving on launch (None to set no config on launch)
        :param endpoint_only: Boolean that indicates to only launch a sentinel to send orders to another server.
        :type trainer: diplomacy_research.models.training.reinforcement.trainer.ReinforcementTrainer
    """
    assert 'grpc' not in sys.modules, 'gRPC should not be loaded on the main thread.'

    # Making sure we have a model on disk first
    ensure_model_on_disk(trainer)

    # Launching sentinel
    port = get_tf_serving_port(trainer.cluster_config, serving_id)
    trainer.thread_pipes[serving_id], trainer.sentinel_pipes[serving_id] = SPAWN_CONTEXT.Pipe()
    trainer.sentinels[serving_id] = \
        SPAWN_CONTEXT.Process(target=start_serving_sentinel,
                              kwargs={'pipe': trainer.sentinel_pipes[serving_id],
                                      'port': port,
                                      'save_dir': trainer.flags.save_dir,
                                      'force_cpu': force_cpu,
                                      'config': config,
                                      'adapter_ctor': trainer.adapter_constructor,
                                      'dataset_builder_ctor': trainer.dataset_builder_constructor,
                                      'cluster_config': trainer.cluster_config,
                                      'endpoint_only': endpoint_only})
    trainer.sentinels[serving_id].start()

    # Waiting for server
    for attempt_ix in range(300):
        time.sleep(1)
        if is_port_opened(port):
            break
        if (attempt_ix + 1) % 10 == 0:
            LOGGER.info('Waiting for TF Serving to load. Attempt %d / %d.',
                        attempt_ix + 1, 300)
    else:
        LOGGER.error('The TF Serving server did not come online.')
        raise RuntimeError('Unable to contact the serving server.')
    LOGGER.info('Successfully connected to TF Serving.')
Example #7
    def build(self):
        """ Builds the channel and the stub """
        import grpc
        from diplomacy_research.proto.tensorflow_serving.apis.prediction_service_pb2_grpc import PredictionServiceStub

        assert 'request_id' in self.proto_fields, 'You need to have a "request_id" field.'
        assert is_port_opened(self.port, self.hostname), \
            'Unable to connect to %s:%d.' % (self.hostname, self.port)

        # Creating insecure channel with corresponding stubs
        self.channel = grpc.insecure_channel('%s:%d' % (self.hostname, self.port))
        self.predict_stub = PredictionServiceStub(self.channel)

        # Padding output shapes with None
        output_types = self.dataset_builder.output_types
        output_shapes = self.dataset_builder.output_shapes
        output_shapes = {
            key: [None] + list(shape)
            for key, shape in output_shapes.items()
        }

        # Building a list of generic default values from the output types and output shapes
        for feature_name, feature_shape in output_shapes.items():
            if output_types[feature_name] == np.object:  # string/bytes features
                self.default_features[feature_name] = make_tensor_proto(
                    bytes('', 'utf-8'), dtype=np.object, shape=[1])
            elif isinstance(self.proto_fields[feature_name], VarProtoField):
                self.default_features[feature_name] = make_tensor_proto(
                    [], dtype=output_types[feature_name], shape=[1, 0])
            else:
                self.default_features[feature_name] = make_tensor_proto(
                    0,
                    dtype=output_types[feature_name],
                    shape=[1] + feature_shape[1:])
Example #8
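        # Excerpt from a larger command-line entry point: args, games,
        # glob_pattern, main, OPEN_PORTS and the enclosing if/else are
        # defined earlier in the script.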
        callbacks = []
        name = os.path.abspath(glob_pattern)
        for stats_name in args.stats:
            if stats_name == 'ranking':
                stats_callback = lambda games: print_ranking_stats(name, games)
            else:
                continue

            callbacks.append(stats_callback)

        for game in games:
            callback_array([game], callbacks)

    else:
        print('--games=[{}] --stats=[{}] --save-dir=[{}] --existing-games-dir=[{}] '
              '--rules=[{}] --exclude-daide=[{}] --seed=[{}]'
              .format(args.games, args.stats, args.save_dir, args.existing_games_dir,
                      args.rules, args.exclude_daide, args.seed))

        IO_LOOP = ioloop.IOLoop.instance()
        IO_LOOP.spawn_callback(main)
        try:
            start_server(IO_LOOP)
        except KeyboardInterrupt:
            pass
        finally:
            stop_io_loop(IO_LOOP)
            for port in OPEN_PORTS:
                if is_port_opened(port):
                    kill_processes_using_port(port, force=True)
Example #9
def launch_serving(model_url, serving_port, first_launch=True):
    """ Launches or relaunches the TF Serving process
        :param model_url: The URL to use to download the model
        :param serving_port: The port to use for TF serving
        :param first_launch: Boolean that indicates if this is the first launch or a relaunch
    """
    model_url = model_url or ''
    bot_filename = model_url.split('/')[-1]
    bot_name = bot_filename.split('.')[0]
    bot_directory = os.path.join(WORKING_DIR, 'data', 'bot_%s' % bot_name)
    bot_model = os.path.join(bot_directory, bot_filename)

    # If first launch, downloading the model
    if first_launch:
        shutil.rmtree(bot_directory, ignore_errors=True)
        os.makedirs(bot_directory, exist_ok=True)

        # Downloading model
        download_file(model_url, bot_model, force=True)

        # Unzipping file
        with zipfile.ZipFile(bot_model, 'r') as zip_ref:
            zip_ref.extractall(bot_directory)

    # Otherwise, restarting the serving
    elif is_port_opened(serving_port):
        kill_processes_using_port(serving_port)

    # Launching a new process
    log_file_path = os.path.join(WORKING_DIR, 'data',
                                 'log_serving_%d.txt' % serving_port)
    serving_process = Process(target=start_tf_serving,
                              args=(serving_port, WORKING_DIR),
                              kwargs={
                                  'force_cpu': True,
                                  'log_file_path': log_file_path
                              })
    serving_process.start()
    kill_subprocesses_on_exit()

    # Waiting for port to be opened.
    for attempt_ix in range(90):
        time.sleep(10)
        if is_port_opened(serving_port):
            break
        LOGGER.info('Waiting for TF Serving to come online. - Attempt %d / %d',
                    attempt_ix + 1, 90)
    else:
        LOGGER.error('TF Serving is not online after 15 minutes. Aborting.')
        raise RuntimeError('TF Serving did not come online after 15 minutes.')

    # Setting configuration
    new_config = ModelConfig(name='player',
                             base_path='/work_dir/data/bot_%s' % bot_name,
                             version_policy=None)
    for _ in range(30):
        if GRPCDataset.set_config('localhost', serving_port, new_config):
            LOGGER.info('Configuration set successfully.')
            break
        time.sleep(5.)
    else:
        LOGGER.error('Unable to set the configuration file.')
Example #10
def monitor_tf_serving(pipe,
                       port,
                       save_dir,
                       force_cpu,
                       config,
                       adapter_ctor,
                       dataset_builder_ctor,
                       cluster_config=None,
                       endpoint_only=False):
    """ Launches and monitors a TF serving server and restarts it if trouble is detected.
        :param pipe: The multiprocessing pipe to communicate with the main thread
        :param port: Integer. The port to use for the TF serving server.
        :param save_dir: The current flags.save_dir
        :param force_cpu: Boolean that indicates that we want the TF serving server to only use the CPU.
        :param config: The configuration to set on the serving on launch (None to set no config on launch)
        :param adapter_ctor: The constructor to build the adapter to query orders, values and policy details
        :param dataset_builder_ctor: The constructor of `BaseBuilder` to set the required proto fields
        :param cluster_config: The cluster configuration used for distributed training.
        :param endpoint_only: Boolean that indicates to only launch a sentinel to send orders to another server.

        :type pipe: multiprocessing.connection.Pipe
        :type adapter_ctor: diplomacy_research.models.policy.base_policy_adapter.BasePolicyAdapter.__class__
        :type dataset_builder_ctor: diplomacy_research.models.datasets.base_builder.BaseBuilder.__class__
        :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
    """
    # pylint: disable=too-many-arguments

    # Waiting until we have at least 1 model in the save_dir before starting the server
    # Raising an exception after 5 mins
    for attempt_ix in range(300):
        if glob.glob(os.path.join(save_dir, 'serving', 'player', '*', 'saved_model.pb')):
            break
        time.sleep(1)
        if (attempt_ix + 1) % 30 == 0:
            LOGGER.info(
                'Waiting for TF model to be saved to disk. Attempt %d / 300.',
                attempt_ix + 1)
    else:
        LOGGER.error(
            'The TF model was not detected on disk after 300 seconds. Aborting.'
        )
        raise RuntimeError('TF serving not detected on disk')

    # Launching tf serving in a separate thread
    if not endpoint_only and not is_port_opened(port):
        yield task_launch_serving(port, save_dir, force_cpu, config, cluster_config)

    # Creating a game to monitor players
    game = Game()

    # Creating a model-based player for each configuration received
    player_models = []
    if not endpoint_only:
        player_models = task_get_player_models(port, config, adapter_ctor,
                                               dataset_builder_ctor,
                                               cluster_config)

    # Detects if we can monitor and restart the server
    monitoring_enabled = bool(player_models)
    if not monitoring_enabled and not endpoint_only:
        LOGGER.warning(
            'A configuration was not provided. Serving monitoring has been disabled until it is received.'
        )

    # Cannot do the monitoring if the config is not passed
    assert player_models or endpoint_only, 'A configuration is required when the serving is not only an endpoint.'

    # Processing tasks and monitoring server
    last_status_time = 0
    while True:

        # Monitor server (every 30 secs)
        if monitoring_enabled and (time.time() - last_status_time) >= 30:
            status_ok = yield task_monitor_serving(
                player_models,
                game,
                port,
                save_dir,
                force_cpu=force_cpu,
                config=config,
                cluster_config=cluster_config)
            if not status_ok:
                continue
            last_status_time = time.time()

        # Performing requests
        while pipe.poll():
            request_id, request_name, request_args = pipe.recv()

            # Check Opening Orders
            if request_name == 'check_opening_orders':
                yield task_check_openings(player_models)

            # Update Config
            elif request_name == 'update':
                config = request_args
                yield task_set_config(port, config)

                # Regenerating players for serving monitoring
                if not endpoint_only:
                    player_models = task_get_player_models(
                        port=port,
                        config=config,
                        adapter_ctor=adapter_ctor,
                        dataset_builder_ctor=dataset_builder_ctor,
                        cluster_config=cluster_config)
                    if not monitoring_enabled and player_models:
                        LOGGER.info('Serving monitoring is now enabled.')
                    elif monitoring_enabled and not player_models:
                        LOGGER.info('Serving monitoring is now disabled.')
                    monitoring_enabled = bool(player_models)

            # Wait for version
            elif request_name == 'wait_for_version':
                yield task_wait_for_version(port, **request_args)

            else:
                LOGGER.error('Unknown request: %s - Skipping.', request_name)

            # Marking request as processed
            pipe.send((request_id, int(time.time())))

        # Throttling
        yield gen.sleep(0.1)
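
The pipe protocol above is simple: the main process sends (request_id, request_name, request_args) tuples, and the sentinel acknowledges each one with (request_id, timestamp). A hypothetical client-side exchange from the main process (new_config stands in for a real serving configuration):

request_id = 1
thread_pipe.send((request_id, 'update', new_config))  # triggers task_set_config
acked_id, acked_at = thread_pipe.recv()               # blocks until processed
assert acked_id == request_id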
Example #11
def is_redis_running(hostname='127.0.0.1'):
    """ Checks is Redis is running on the specified hostname """
    return is_port_opened(port=6379, hostname=hostname)
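
is_port_opened itself is not shown on this page. A plausible minimal implementation, assuming it is a simple TCP connect probe (the real helper in diplomacy_research.utils.process may differ):

import socket

def is_port_opened(port, hostname='127.0.0.1'):
    """ Returns True if something is listening on hostname:port """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex((hostname, port)) == 0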
Example #12
def start_tf_serving(port,
                     save_dir,
                     batching_parameters=None,
                     cluster_config=None,
                     poll_time=1,
                     force_cpu=False,
                     log_file_path=None):
    """ Starts the tf serving server locally
        :param port: Integer. The port to open for incoming connections.
        :param save_dir: The current flags.save_dir
        :param batching_parameters: A BatchingParameters named tuple. Otherwise, uses the default.
        :param cluster_config: The cluster configuration used for distributed training.
        :param poll_time: The number of seconds between polls on the file system to check for new version.
        :param force_cpu: Boolean. If True, forces the serving to run on CPU. Otherwise, uses CUDA_VISIBLE_DEVICES.
        :param log_file_path: Optional. Specify the path of log file where to output std.out and std.err
        :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
        Note: This automatically blocks the thread
    """
    if is_port_opened(port):
        LOGGER.error(
            'The port %d is already opened locally. Not starting TF Serving.',
            port)
        return

    # Creating serving directory
    os.makedirs(os.path.join(save_dir, 'serving'), exist_ok=True)
    file_suffix = '' if not cluster_config else '_%s.%03d' % (
        cluster_config.job_name, cluster_config.task_id)

    # Copying env variables
    new_env = os.environ.copy()
    if force_cpu:
        new_env['CUDA_VISIBLE_DEVICES'] = ''

    # If log_file_path is set, redirecting stdout and stderr to it.
    stdout = open(log_file_path, 'a') if log_file_path else None

    # Creating batch parameters config file
    batching_parameters = batching_parameters or BatchingParameters(
        max_batch_size=64,
        batch_timeout_micros=250000,
        max_enqueued_batches=256,
        num_batch_threads=multiprocessing.cpu_count(),
        pad_variable_length_inputs=True)
    filename = 'batch%s.txt' % file_suffix
    with open(os.path.join(save_dir, 'serving', filename), 'w') as file:
        file.write(str(batching_parameters))

    # In production, (inside container)
    # Launching directly
    if IN_PRODUCTION:
        command = [
            'tensorflow_model_server',
            '--port=%d' % port, '--enable_batching=true',
            '--batching_parameters_file=%s' %
            os.path.join(save_dir, 'serving', filename),
            '--model_base_path=/data/serving/',
            '--file_system_poll_wait_seconds=%d' % poll_time
        ]

    # Otherwise, downloading containers and starting singularity
    else:
        # Downloading container
        tf_serving_img = os.path.join(WORKING_DIR, 'containers',
                                      TF_SERVING_DOWNLOAD_URL.split('/')[-1])
        download_file(TF_SERVING_DOWNLOAD_URL, tf_serving_img)

        # Never launching an instance, since we need a different port every time
        command = [
            'singularity', 'exec', '-B',
            '%s:/work_dir' % save_dir, tf_serving_img,
            'tensorflow_model_server',
            '--port=%d' % port, '--enable_batching=true',
            '--batching_parameters_file=/work_dir/serving/%s' % filename,
            '--model_base_path=/data/serving/',
            '--file_system_poll_wait_seconds=%d' % poll_time
        ]

    # Launching process
    _start_process(command,
                   block_thread=True,
                   check_fn=lambda: is_port_opened(port),
                   bufsize=0,
                   env=new_env,
                   stdout=stdout,
                   stderr=stdout)
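
kill_processes_using_port is likewise not shown on this page. A hypothetical sketch using psutil (the actual helper may use a different mechanism; the force flag mirrors the call in Example #8):

import psutil

def kill_processes_using_port(port, force=False):
    """ Terminates (or kills, if force=True) any process listening on the port """
    for conn in psutil.net_connections(kind='tcp'):
        if conn.laddr and conn.laddr.port == port and conn.pid:
            try:
                proc = psutil.Process(conn.pid)
                if force:
                    proc.kill()
                else:
                    proc.terminate()
            except psutil.NoSuchProcess:
                pass  # process exited between the scan and the signal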