def __init__(self, temperature=0.1, use_beam=False, port=9502, name=None):
    """ Constructor
        :param temperature: The temperature to apply to the logits.
        :param use_beam: Boolean that indicates that we want to use a beam search.
        :param port: The port to use for the TF serving server used to query the model.
        :param name: Optional. The name of this player.
    """
    model_url = 'https://f002.backblazeb2.com/file/ppaquette-public/benchmarks/neurips2019-rl_model.zip'

    # Creating serving if port is not open
    if not is_port_opened(port):
        launch_serving(model_url, port)

    # Creating adapter
    grpc_dataset = GRPCDataset(hostname='localhost',
                               port=port,
                               model_name='player',
                               signature=rl_neurips2019.PolicyAdapter.get_signature(),
                               dataset_builder=rl_neurips2019.BaseDatasetBuilder())
    policy_adapter = rl_neurips2019.PolicyAdapter(grpc_dataset)

    # Building benchmark model
    super(DipNetRLPlayer, self).__init__(policy_adapter=policy_adapter,
                                         temperature=temperature,
                                         use_beam=use_beam,
                                         name=name)
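# --- Usage sketch (illustrative, not part of the original sources) ---
# Shows how the constructor above would typically be called. The `get_orders` coroutine and the
# tornado-based driver are assumptions based on how players are used elsewhere in
# diplomacy_research; only the constructor arguments (temperature, use_beam, port, name)
# come from the code above.
from tornado import gen, ioloop
from diplomacy import Game

@gen.coroutine
def play_one_phase():
    player = DipNetRLPlayer(temperature=0.1, use_beam=False, port=9502, name='rl_player')
    game = Game()
    orders = yield player.get_orders(game, power_name='FRANCE')     # assumed Player API
    print(orders)

if __name__ == '__main__':
    ioloop.IOLoop.current().run_sync(play_one_phase)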
def build_train_server(trainer):
    """ Builds the TensorFlow tf.train.Server
        :param trainer: A reinforcement learning trainer instance.
        :type trainer: diplomacy_research.models.training.reinforcement.trainer.ReinforcementTrainer
    """
    if not trainer.cluster_config:
        return
    from diplomacy_research.utils.tensorflow import tf
    task_address = trainer.cluster_config.cluster_spec[trainer.cluster_config.job_name][trainer.cluster_config.task_id]

    # Making sure the port is not used by another process
    task_port = int(task_address.split(':')[1])
    LOGGER.info('Killing any processes that have port %d opened.', task_port)
    kill_processes_using_port(task_port)

    # Starting server
    LOGGER.info('Starting server with task id %d - Address: %s', trainer.cluster_config.task_id, task_address)
    LOGGER.info('Checking if port %d is already opened: %s', task_port, str(is_port_opened(task_port)))
    trainer.server = tf.train.Server(tf.train.ClusterSpec(trainer.cluster_config.cluster_spec),
                                     job_name=trainer.cluster_config.job_name,
                                     task_index=trainer.cluster_config.task_id,
                                     protocol=trainer.cluster_config.protocol)
    LOGGER.info('Server successfully started. Trying to contact other nodes...')
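# Illustrative layout of the cluster_spec consumed above (job names and addresses are made up).
# tf.train.ClusterSpec takes a mapping from job name to a list of 'host:port' task addresses,
# and build_train_server indexes the same mapping with (job_name, task_id) to find its own port.
example_cluster_spec = {'ps': ['node-01:2222'],
                        'learner': ['node-02:2222'],
                        'actor': ['node-03:2222', 'node-04:2222']}
# e.g. with job_name='actor' and task_id=1, task_address resolves to 'node-04:2222' and
# kill_processes_using_port(2222) is called before tf.train.Server binds to it.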
def task_launch_serving(port, save_dir, force_cpu, config, cluster_config=None):
    """ Launches (or restarts) a TF serving server
        :param port: Integer. The port to use for the TF serving server.
        :param save_dir: The current flags.save_dir
        :param force_cpu: Launches the TF serving server on CPU; otherwise uses the GPU when doing distributed training.
        :param config: The configuration to set on the serving on launch (None to set no config on launch)
        :param cluster_config: The cluster configuration used for distributed training.
    """
    kill_processes_using_port(port)
    kill_subprocesses_on_exit()

    # Computing launch settings
    simult_players = multiprocessing.cpu_count() * 7                        # 7 simultaneous players per available CPU
    max_batch_size = 2 ** int(math.log(0.9 * simult_players, 2))            # rounding down to the nearest power of 2
    batch_timeout = 200000
    batching_parameters = BatchingParameters(max_batch_size=max_batch_size,
                                             batch_timeout_micros=batch_timeout,
                                             max_enqueued_batches=256,
                                             num_batch_threads=multiprocessing.cpu_count(),
                                             pad_variable_length_inputs=True)
    log_file_path = None
    if not cluster_config:
        log_file_path = os.path.join(save_dir, 'tf_serving_%d.log' % port)
    elif get_tf_serving_port(cluster_config, serving_id=0) != port:
        log_file_path = os.path.join(save_dir, 'tf_serving_%s_%d.log' % (cluster_config.job_name, port))
    force_cpu = force_cpu or bool(cluster_config is None)

    # Launching
    tf_serving_thread = Thread(target=start_tf_serving,
                               args=(port, save_dir, batching_parameters, cluster_config),
                               kwargs={'force_cpu': force_cpu,
                                       'log_file_path': log_file_path})
    tf_serving_thread.start()

    # Waiting for port
    for _ in range(120):
        if is_port_opened(port):
            break
        yield gen.sleep(1)
    else:
        LOGGER.error('Unable to connect to TF Serving after 2 mins.')

    # Setting configuration
    yield task_set_config(port, config)
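# Worked example of the batching arithmetic above, assuming an 8-core machine:
#   simult_players = 8 * 7 = 56
#   0.9 * 56 = 50.4  ->  log2(50.4) ~= 5.65  ->  int(...) = 5  ->  max_batch_size = 2 ** 5 = 32
# i.e. max_batch_size is the largest power of 2 below ~90% of the expected number of simultaneous
# players, presumably so a full batch can always be filled without waiting on requests that will never come.
import math
assert 2 ** int(math.log(0.9 * 56, 2)) == 32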
def launch_serving(model_name, serving_port):
    """ Launches or relaunches the TF Serving process """
    # Stop all serving child processes
    if is_port_opened(serving_port):
        kill_processes_using_port(serving_port)

    # Launching a new process
    log_file_path = os.path.join(WORKING_DIR, 'data', 'log_serving_%d.txt' % serving_port)
    serving_process = Process(target=start_tf_serving,
                              args=(serving_port, WORKING_DIR),
                              kwargs={'force_cpu': True,
                                      'log_file_path': log_file_path})
    serving_process.start()
    kill_subprocesses_on_exit()

    # Waiting for port to be opened.
    for attempt_idx in range(30):
        time.sleep(10)
        if is_port_opened(serving_port):
            break
        LOGGER.info('Waiting for TF Serving to come online. - Attempt %d / %d', attempt_idx + 1, 30)
    else:
        LOGGER.error('TF Serving is not online after 5 minutes. Aborting.')
        raise RuntimeError()

    # Setting configuration
    new_config = ModelConfig(name='player',
                             base_path='/work_dir/data/bot_%s' % model_name,
                             version_policy=None)
    for _ in range(30):
        if GRPCDataset.set_config('localhost', serving_port, new_config):
            LOGGER.info('Configuration set successfully.')
            break
        time.sleep(5.)
    else:
        LOGGER.error('Unable to set the configuration file.')
def __init__(self, hostname, port, model_name, signature, dataset_builder, cluster_config=None,
             connect_timeout=300, timeout=30, nb_retries=100):
    """ Constructor
        :param hostname: The hostname of the TensorFlow Serving server
        :param port: The port used by the TensorFlow Serving server.
        :param model_name: The name of the model being served by the TensorFlow Serving server.
        :param signature: The output of adapter.get_signature() - signature of all the possible calls
        :param dataset_builder: An instance of `BaseBuilder` containing the proto-fields and generation methods
        :param cluster_config: Optional. If set, the cluster configuration will be used for distributed training.
        :param connect_timeout: The timeout to try to connect to the TF serving server.
        :param timeout: The timeout (in seconds) to wait for a request
        :param nb_retries: The number of retries in case of grpc.RpcError before giving up.
        :type dataset_builder: diplomacy_research.models.datasets.base_builder.BaseBuilder
        :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
    """
    # pylint: disable=too-many-arguments
    super(GRPCDataset, self).__init__(dataset_builder=dataset_builder, cluster_config=cluster_config)
    self.hostname = hostname
    self.port = port
    self.model_name = model_name
    self.signature = signature
    self.timeout = timeout
    self.nb_retries = nb_retries
    self.channel = None
    self.predict_stub = None
    self.config_stub = None

    # Waiting for port to be opened
    for retry_ix in range(connect_timeout):
        time.sleep(1)
        if is_port_opened(self.port, self.hostname):
            break
        if (retry_ix + 1) % 10 == 0:
            LOGGER.info('Trying to connect to TF Serving at %s:%d. Attempt %d / %d',
                        self.hostname, self.port, retry_ix + 1, connect_timeout)
    else:
        raise RuntimeError('Unable to connect to %s:%d. Max attempts reached.' % (self.hostname, self.port))

    # Building the dataset
    self.build()
def start_tf_serving_server(trainer, force_cpu, serving_id, config, endpoint_only=False):
    """ Starts the TF Serving server
        :param trainer: A reinforcement learning trainer instance.
        :param force_cpu: Boolean that indicates that we want the TF serving server to only use the CPU.
        :param serving_id: Integer that represents the serving server id (i.e. when multiple servers are launched)
        :param config: The configuration to set on the serving on launch (None to set no config on launch)
        :param endpoint_only: Boolean that indicates to only launch a sentinel to send orders to another server.
        :type trainer: diplomacy_research.models.training.reinforcement.trainer.ReinforcementTrainer
    """
    assert 'grpc' not in sys.modules, 'gRPC should not be loaded on the main thread.'

    # Making sure we have a model on disk first
    ensure_model_on_disk(trainer)

    # Launching sentinel
    port = get_tf_serving_port(trainer.cluster_config, serving_id)
    trainer.thread_pipes[serving_id], trainer.sentinel_pipes[serving_id] = SPAWN_CONTEXT.Pipe()
    trainer.sentinels[serving_id] = \
        SPAWN_CONTEXT.Process(target=start_serving_sentinel,
                              kwargs={'pipe': trainer.sentinel_pipes[serving_id],
                                      'port': port,
                                      'save_dir': trainer.flags.save_dir,
                                      'force_cpu': force_cpu,
                                      'config': config,
                                      'adapter_ctor': trainer.adapter_constructor,
                                      'dataset_builder_ctor': trainer.dataset_builder_constructor,
                                      'cluster_config': trainer.cluster_config,
                                      'endpoint_only': endpoint_only})
    trainer.sentinels[serving_id].start()

    # Waiting for server
    for attempt_ix in range(300):
        time.sleep(1)
        if is_port_opened(port):
            break
        if (attempt_ix + 1) % 10 == 0:
            LOGGER.info('Waiting for TF Serving to load. Attempt %d / %d.', attempt_ix + 1, 300)
    else:
        LOGGER.error('The TF Serving server did not come online.')
        raise RuntimeError('Unable to contact the serving server.')
    LOGGER.info('Successfully connected to TF Serving.')
def build(self):
    """ Builds the channel and the stub """
    import grpc
    from diplomacy_research.proto.tensorflow_serving.apis.prediction_service_pb2_grpc import PredictionServiceStub
    assert 'request_id' in self.proto_fields, 'You need to have a "request_id" field.'
    assert is_port_opened(self.port, self.hostname), 'Unable to connect to %s:%d.' % (self.hostname, self.port)

    # Creating insecure channel with corresponding stubs
    self.channel = grpc.insecure_channel('%s:%d' % (self.hostname, self.port))
    self.predict_stub = PredictionServiceStub(self.channel)

    # Padding output shapes with None
    output_types = self.dataset_builder.output_types
    output_shapes = self.dataset_builder.output_shapes
    output_shapes = {key: [None] + list(shape) for key, shape in output_shapes.items()}

    # Building a list of generic default values from the output types and output shapes
    for feature_name, feature_shape in output_shapes.items():
        if output_types[feature_name] == np.object:
            self.default_features[feature_name] = make_tensor_proto(bytes('', 'utf-8'), dtype=np.object, shape=[1])
        elif isinstance(self.proto_fields[feature_name], VarProtoField):
            self.default_features[feature_name] = make_tensor_proto([], dtype=output_types[feature_name],
                                                                    shape=[1, 0])
        else:
            self.default_features[feature_name] = make_tensor_proto(0, dtype=output_types[feature_name],
                                                                    shape=[1] + feature_shape[1:])
callbacks = []
name = os.path.abspath(glob_pattern)
for stats_name in args.stats:
    if stats_name == 'ranking':
        stats_callback = lambda games: print_ranking_stats(name, games)
    else:
        continue
    callbacks.append(stats_callback)
for game in games:
    callback_array([game], callbacks)
else:
    print('--games=[{}] --stats=[{}] --save-dir=[{}] --existing-games-dir=[{}] '
          '--rules=[{}] --exclude-daide=[{}] --seed=[{}]'
          .format(args.games, args.stats, args.save_dir, args.existing_games_dir,
                  args.rules, args.exclude_daide, args.seed))
    IO_LOOP = ioloop.IOLoop.instance()
    IO_LOOP.spawn_callback(main)
    try:
        start_server(IO_LOOP)
    except KeyboardInterrupt:
        pass
    finally:
        stop_io_loop(IO_LOOP)
        for port in OPEN_PORTS:
            if is_port_opened(port):
                kill_processes_using_port(port, force=True)
def launch_serving(model_url, serving_port, first_launch=True):
    """ Launches or relaunches the TF Serving process
        :param model_url: The URL to use to download the model
        :param serving_port: The port to use for TF serving
        :param first_launch: Boolean that indicates if this is the first launch or a relaunch
    """
    model_url = model_url or ''
    bot_filename = model_url.split('/')[-1]
    bot_name = bot_filename.split('.')[0]
    bot_directory = os.path.join(WORKING_DIR, 'data', 'bot_%s' % bot_name)
    bot_model = os.path.join(bot_directory, bot_filename)

    # If first launch, downloading the model
    if first_launch:
        shutil.rmtree(bot_directory, ignore_errors=True)
        os.makedirs(bot_directory, exist_ok=True)

        # Downloading model
        download_file(model_url, bot_model, force=True)

        # Unzipping file
        zip_ref = zipfile.ZipFile(bot_model, 'r')
        zip_ref.extractall(bot_directory)
        zip_ref.close()

    # Otherwise, restarting the serving
    elif is_port_opened(serving_port):
        kill_processes_using_port(serving_port)

    # Launching a new process
    log_file_path = os.path.join(WORKING_DIR, 'data', 'log_serving_%d.txt' % serving_port)
    serving_process = Process(target=start_tf_serving,
                              args=(serving_port, WORKING_DIR),
                              kwargs={'force_cpu': True,
                                      'log_file_path': log_file_path})
    serving_process.start()
    kill_subprocesses_on_exit()

    # Waiting for port to be opened.
    for attempt_ix in range(90):
        time.sleep(10)
        if is_port_opened(serving_port):
            break
        LOGGER.info('Waiting for TF Serving to come online. - Attempt %d / %d', attempt_ix + 1, 90)
    else:
        LOGGER.error('TF Serving is not online after 15 minutes. Aborting.')
        raise RuntimeError()

    # Setting configuration
    new_config = ModelConfig(name='player',
                             base_path='/work_dir/data/bot_%s' % bot_name,
                             version_policy=None)
    for _ in range(30):
        if GRPCDataset.set_config('localhost', serving_port, new_config):
            LOGGER.info('Configuration set successfully.')
            break
        time.sleep(5.)
    else:
        LOGGER.error('Unable to set the configuration file.')
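# Illustrative call sequence for launch_serving above (the URL is the one used by DipNetRLPlayer):
# the first call downloads and extracts the model archive before starting TF Serving; the second
# call reuses the files already on disk and only restarts the serving process on the same port.
MODEL_URL = 'https://f002.backblazeb2.com/file/ppaquette-public/benchmarks/neurips2019-rl_model.zip'
SERVING_PORT = 9502

launch_serving(MODEL_URL, SERVING_PORT)                         # first launch: download + extract + serve
launch_serving(MODEL_URL, SERVING_PORT, first_launch=False)     # relaunch: kill old process + serve again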
def monitor_tf_serving(pipe, port, save_dir, force_cpu, config, adapter_ctor, dataset_builder_ctor,
                       cluster_config=None, endpoint_only=False):
    """ Launches and monitors a TF serving server and restarts it if trouble is detected.
        :param pipe: The multiprocessing pipe to communicate with the main thread
        :param port: Integer. The port to use for the TF serving server.
        :param save_dir: The current flags.save_dir
        :param force_cpu: Boolean that indicates that we want the TF serving server to only use the CPU.
        :param config: The configuration to set on the serving on launch (None to set no config on launch)
        :param adapter_ctor: The constructor to build the adapter to query orders, values and policy details
        :param dataset_builder_ctor: The constructor of `BaseBuilder` to set the required proto fields
        :param cluster_config: The cluster configuration used for distributed training.
        :param endpoint_only: Boolean that indicates to only launch a sentinel to send orders to another server.
        :type pipe: multiprocessing.connection.Pipe
        :type adapter_ctor: diplomacy_research.models.policy.base_policy_adapter.BasePolicyAdapter.__class__
        :type dataset_builder_ctor: diplomacy_research.models.datasets.base_builder.BaseBuilder.__class__
        :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
    """
    # pylint: disable=too-many-arguments
    # Waiting until we have at least 1 model in the save_dir before starting the server
    # Raising an exception after 5 mins
    for attempt_ix in range(300):
        if glob.glob('%s/*/saved_model.pb' % (os.path.join(save_dir, 'serving', 'player'))):
            break
        time.sleep(1)
        if (attempt_ix + 1) % 30 == 0:
            LOGGER.info('Waiting for TF model to be saved to disk. Attempt %d / 300.', attempt_ix + 1)
    else:
        LOGGER.error('The TF model was not detected on disk after 300 seconds. Aborting.')
        raise RuntimeError('TF serving not detected on disk')

    # Launching TF serving in a separate thread
    if not endpoint_only and not is_port_opened(port):
        yield task_launch_serving(port, save_dir, force_cpu, config, cluster_config)

    # Creating a game to monitor players
    game = Game()

    # Creating a model-based player for each configuration received
    player_models = []
    if not endpoint_only:
        player_models = task_get_player_models(port, config, adapter_ctor, dataset_builder_ctor, cluster_config)

    # Detects if we can monitor and restart the server
    monitoring_enabled = bool(player_models)
    if not monitoring_enabled and not endpoint_only:
        LOGGER.warning('A configuration was not provided. Serving monitoring has been disabled until it is received.')

    # Cannot do the monitoring if the config is not passed
    assert player_models or endpoint_only, 'A configuration is required when the serving is not only an endpoint.'
    # Processing tasks and monitoring server
    last_status_time = 0
    while True:

        # Monitor server (every 30 secs)
        if monitoring_enabled and (time.time() - last_status_time) >= 30:
            status_ok = yield task_monitor_serving(player_models, game, port, save_dir,
                                                   force_cpu=force_cpu,
                                                   config=config,
                                                   cluster_config=cluster_config)
            if not status_ok:
                continue
            last_status_time = time.time()

        # Performing requests
        while pipe.poll():
            request_id, request_name, request_args = pipe.recv()

            # Check Opening Orders
            if request_name == 'check_opening_orders':
                yield task_check_openings(player_models)

            # Update Config
            elif request_name == 'update':
                config = request_args
                yield task_set_config(port, config)

                # Regenerating players for serving monitoring
                if not endpoint_only:
                    player_models = task_get_player_models(port=port,
                                                           config=config,
                                                           adapter_ctor=adapter_ctor,
                                                           dataset_builder_ctor=dataset_builder_ctor,
                                                           cluster_config=cluster_config)
                if not monitoring_enabled and player_models:
                    LOGGER.info('Serving monitoring is now enabled.')
                elif monitoring_enabled and not player_models:
                    LOGGER.info('Serving monitoring is now disabled.')
                monitoring_enabled = bool(player_models)

            # Wait for version
            elif request_name == 'wait_for_version':
                yield task_wait_for_version(port, **request_args)

            else:
                LOGGER.error('Unknown request: %s - Skipping.', request_name)

            # Marking request as processed
            pipe.send((request_id, int(time.time())))

        # Throttling
        yield gen.sleep(0.1)
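# Sketch of the trainer-side half of the pipe protocol handled by the loop above (illustrative;
# such a helper does not appear in this excerpt). The sentinel consumes
# (request_id, request_name, request_args) tuples and acknowledges each with (request_id, timestamp).
def send_sentinel_request(thread_pipe, request_id, request_name, request_args=None):
    """ Sends a request to the serving sentinel and blocks until it is acknowledged. """
    thread_pipe.send((request_id, request_name, request_args))
    while True:
        reply_id, processed_time = thread_pipe.recv()
        if reply_id == request_id:
            return processed_time

# e.g. send_sentinel_request(trainer.thread_pipes[0], 1, 'check_opening_orders')
#      send_sentinel_request(trainer.thread_pipes[0], 2, 'update', new_config)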
def is_redis_running(hostname='127.0.0.1'):
    """ Checks if Redis is running on the specified hostname """
    return is_port_opened(port=6379, hostname=hostname)
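# is_port_opened is used throughout these functions but is not shown in this excerpt; a minimal
# sketch of such a helper (an assumption, not the project's actual implementation) simply attempts
# a TCP connection to the target:
import socket

def is_port_opened_sketch(port, hostname='127.0.0.1'):
    """ Returns True if a TCP connection can be established to hostname:port. """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1.)
        return sock.connect_ex((hostname, port)) == 0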
def start_tf_serving(port, save_dir, batching_parameters=None, cluster_config=None, poll_time=1,
                     force_cpu=False, log_file_path=None):
    """ Starts the TF serving server locally
        :param port: Integer. The port to open for incoming connections.
        :param save_dir: The current flags.save_dir
        :param batching_parameters: A BatchingParameters named tuple. Otherwise, uses the default.
        :param cluster_config: The cluster configuration used for distributed training.
        :param poll_time: The number of seconds between polls on the file system to check for a new version.
        :param force_cpu: Boolean. If true, forces the serving to run on CPU. Otherwise uses CUDA_VISIBLE_DEVICES.
        :param log_file_path: Optional. Specifies the path of the log file where to output std.out and std.err
        :type cluster_config: diplomacy_research.utils.cluster.ClusterConfig
        Note: This automatically blocks the thread
    """
    if is_port_opened(port):
        LOGGER.error('The port %d is already opened locally. Not starting TF Serving.', port)
        return

    # Creating serving directory
    os.makedirs(os.path.join(save_dir, 'serving'), exist_ok=True)
    file_suffix = '' if not cluster_config else '_%s.%03d' % (cluster_config.job_name, cluster_config.task_id)

    # Copying env variables
    new_env = os.environ.copy()
    if force_cpu:
        new_env['CUDA_VISIBLE_DEVICES'] = ''

    # If log_file_path is set, redirecting stdout and stderr to it.
    stdout = open(log_file_path, 'a') if log_file_path else None

    # Creating batch parameters config file
    batching_parameters = batching_parameters or BatchingParameters(max_batch_size=64,
                                                                    batch_timeout_micros=250000,
                                                                    max_enqueued_batches=256,
                                                                    num_batch_threads=multiprocessing.cpu_count(),
                                                                    pad_variable_length_inputs=True)
    filename = 'batch%s.txt' % file_suffix
    with open(os.path.join(save_dir, 'serving', filename), 'w') as file:
        file.write(str(batching_parameters))

    # In production (inside container), launching directly
    if IN_PRODUCTION:
        command = ['tensorflow_model_server',
                   '--port=%d' % port,
                   '--enable_batching=true',
                   '--batching_parameters_file=%s' % os.path.join(save_dir, 'serving', filename),
                   '--model_base_path=/data/serving/',
                   '--file_system_poll_wait_seconds=%d' % poll_time]

    # Otherwise, downloading the container and starting it with Singularity
    else:
        # Downloading container
        tf_serving_img = os.path.join(WORKING_DIR, 'containers', TF_SERVING_DOWNLOAD_URL.split('/')[-1])
        download_file(TF_SERVING_DOWNLOAD_URL, tf_serving_img)

        # Never launching an instance, since we need a different port every time
        command = ['singularity', 'exec',
                   '-B', '%s:/work_dir' % save_dir,
                   tf_serving_img,
                   'tensorflow_model_server',
                   '--port=%d' % port,
                   '--enable_batching=true',
                   '--batching_parameters_file=/work_dir/serving/%s' % filename,
                   '--model_base_path=/data/serving/',
                   '--file_system_poll_wait_seconds=%d' % poll_time]

    # Launching process
    _start_process(command, block_thread=True, check_fn=lambda: is_port_opened(port),
                   bufsize=0, env=new_env, stdout=stdout, stderr=stdout)
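# For reference, the --batching_parameters_file flag of tensorflow_model_server expects a
# text-format BatchingParameters protobuf, so str(batching_parameters) above presumably renders
# something along these lines (an assumption about the helper's __str__; values shown are the
# defaults used in this function on an 8-core machine):
#
#   max_batch_size { value: 64 }
#   batch_timeout_micros { value: 250000 }
#   max_enqueued_batches { value: 256 }
#   num_batch_threads { value: 8 }
#   pad_variable_length_inputs: true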