Code example #1
    def _save_tf_model_to_store(self, checkpoint):
        # rl coach .coach_checkpoint state file
        state_file = checkpoint.rl_coach_checkpoint.coach_checkpoint_state_file

        # upload tensorflow models
        checkpoint.tensorflow_model.persist(
            coach_checkpoint_state_file=state_file,
            s3_kms_extra_args=get_s3_kms_extra_args())

        # persist rl coach checkpoint
        checkpoint.rl_coach_checkpoint.persist(
            s3_kms_extra_args=get_s3_kms_extra_args())

        # Upload the frozen graph which is used for deployment
        if self.graph_manager:
            checkpoint.tensorflow_model.persist_tensorflow_frozen_graph(
                agent_name=checkpoint.agent_name,
                graph_manager=self.graph_manager,
                coach_checkpoint_state_file=state_file,
                best_checkpoint_number=checkpoint.deepracer_checkpoint_json.
                get_deepracer_best_checkpoint_number(),
                last_checkpoint_number=checkpoint.deepracer_checkpoint_json.
                get_deepracer_last_checkpoint_number(),
                s3_kms_extra_args=get_s3_kms_extra_args())

        # Clean up old checkpoints
        checkpoint.tensorflow_model.delete(
            coach_checkpoint_state_file=state_file,
            best_checkpoint=checkpoint.deepracer_checkpoint_json.
            get_deepracer_best_checkpoint())
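Every snippet in this listing funnels its S3 encryption settings through the shared get_s3_kms_extra_args() helper. The sketch below shows one plausible shape for it; the environment variable name and the empty-dict fallback are assumptions, not the real markov implementation. The returned dict matches boto3's standard SSE-KMS arguments, which is consistent with it being splatted into put_object and passed as ExtraArgs to upload_file in the other examples.

    import os

    def get_s3_kms_extra_args():
        # Hypothetical sketch: return SSE-KMS arguments for boto3 when a
        # customer-managed key is configured, else an empty dict.
        kms_key_arn = os.environ.get("S3_KMS_CMK_ARN")  # assumed variable name
        if not kms_key_arn:
            return {}
        return {"ServerSideEncryption": "aws:kms",
                "SSEKMSKeyId": kms_key_arn}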
Code example #2
    def flush_finished(self):
        """upload rl coach .finished file"""
        try:
            # remove lock file if it exists
            self.syncfile_lock.delete()
            # acquire lock
            self.syncfile_lock.persist(s3_kms_extra_args=get_s3_kms_extra_args())
            for _, checkpoint in self.params.checkpoint_dict.items():
                # upload .finished
                checkpoint.syncfile_finished.persist(s3_kms_extra_args=get_s3_kms_extra_args())

            # release lock by deleting it
            self.syncfile_lock.delete()
        except botocore.exceptions.ClientError:
            log_and_exit(
                "Unable to upload .finished",
                SIMAPP_S3_DATA_STORE_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_400,
            )
        except Exception as ex:
            log_and_exit(
                "Exception in uploading .finished file: {}".format(ex),
                SIMAPP_S3_DATA_STORE_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_500,
            )
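The lock used here is purely advisory: a small object written to S3 before the upload and deleted afterwards. Below is a minimal sketch of the same pattern with a raw boto3 client, under the assumption that RlCoachSyncFile.persist()/delete() wrap equivalent calls; bucket and key names are placeholders.

    import boto3

    s3 = boto3.client("s3")

    def acquire_lock(bucket, lock_key, extra_args):
        # creating the zero-byte object signals other workers to back off;
        # note this offers no atomicity guarantee, it is advisory only
        s3.put_object(Bucket=bucket, Key=lock_key, Body=b"", **extra_args)

    def release_lock(bucket, lock_key):
        s3.delete_object(Bucket=bucket, Key=lock_key)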
Code example #3
def _validate(graph_manager, task_parameters, transitions, s3_bucket,
              s3_prefix, aws_region):
    checkpoint = graph_manager.data_store.params.checkpoint_dict['agent']
    checkpoint_dir = task_parameters.checkpoint_restore_path
    graph_manager.data_store.wait_for_checkpoints()

    # validate last checkpoint
    last_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_last_checkpoint()
    if checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=last_model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args()):
        screen.log_title(" Validating Last Checkpoint: {}".format(
            last_model_checkpoint_name))
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " emulate_act_on_trainer on Last Checkpoint completed!")
        # validate best checkpoint: Best checkpoint might not exist.
        best_model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        if checkpoint.rl_coach_checkpoint.update(
                model_checkpoint_name=best_model_checkpoint_name,
                s3_kms_extra_args=utils.get_s3_kms_extra_args()):
            screen.log_title(" Validating Best Checkpoint: {}".format(
                best_model_checkpoint_name))
            # load the best rl coach checkpoint from store
            graph_manager.data_store.load_from_store()
            graph_manager.restore_checkpoint()
            screen.log_title(
                " Start emulate_act_on_trainer on Best Checkpoint")
            graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                                 transitions=transitions)
            screen.log_title(
                " emulate_act_on_trainer on Best Checkpoint completed!")
        else:
            screen.log_title(" No Best Checkpoint to validate.")

    else:
        screen.log_title(" Validating Last Checkpoint")
        # load the last rl coach checkpoint from store
        graph_manager.data_store.load_from_store()
        graph_manager.create_graph(task_parameters)
        graph_manager.phase = RunPhase.TEST
        screen.log_title(" Start emulate_act_on_trainer on Last Checkpoint ")
        graph_manager.emulate_act_on_trainer(EnvironmentSteps(1),
                                             transitions=transitions)
        screen.log_title(
            " emulate_act_on_trainer on Last Checkpoint completed!")
    screen.log_title(" Validation completed!")
Code example #4
    def __init__(self,
                 params: S3BotoDataStoreParameters,
                 graph_manager: MultiAgentGraphManager,
                 ignore_lock: bool = False):
        self.params = params
        self.key_prefixes = dict()
        self.ip_data_keys = dict()
        self.ip_done_keys = dict()
        self.preset_data_keys = dict()
        self.delete_queues = dict()
        for agent_key, s3_folder in self.params.s3_folders.items():
            self.key_prefixes[agent_key] = os.path.join(s3_folder, "model")
            self.ip_data_keys[agent_key] = os.path.join(
                s3_folder, "ip/ip.json")
            self.ip_done_keys[agent_key] = os.path.join(s3_folder, "ip/done")
            self.preset_data_keys[agent_key] = os.path.join(
                s3_folder, "presets/preset.py")
            self.delete_queues[agent_key] = queue.Queue()
        if not graph_manager:
            log_and_exit("None type for graph manager",
                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                         SIMAPP_EVENT_ERROR_CODE_500)

        self.graph_manager = graph_manager
        self.ignore_lock = ignore_lock
        self.s3_extra_args = get_s3_kms_extra_args()
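To make the key layout built in this constructor concrete, here is a hypothetical params.s3_folders mapping for a two-agent job and the keys it would produce; the prefix values are made up.

    s3_folders = {
        "agent_0": "jobs/my-race/agent_0",   # hypothetical prefixes
        "agent_1": "jobs/my-race/agent_1",
    }
    # key_prefixes["agent_0"]     -> "jobs/my-race/agent_0/model"
    # ip_data_keys["agent_0"]     -> "jobs/my-race/agent_0/ip/ip.json"
    # preset_data_keys["agent_1"] -> "jobs/my-race/agent_1/presets/preset.py"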
Code example #5
def write_metrics_to_s3(bucket, key, region, metrics):
    '''Helper method that uploads the desired metrics to s3
       bucket - String with S3 bucket where metrics should be written
       key - String with S3 bucket key where metrics should be written
       region - String with aws region
       metrics - Dictionary with metrics to write to s3
    '''
    try:
        s3_extra_args = get_s3_kms_extra_args()
        session = boto3.session.Session()
        s3_client = session.client('s3',
                                   region_name=region,
                                   config=get_boto_config())
        s3_client.put_object(Bucket=bucket,
                             Key=key,
                             Body=bytes(json.dumps(metrics), encoding='utf-8'),
                             **s3_extra_args)
    except botocore.exceptions.ClientError as err:
        log_and_exit(
            "Unable to write metrics to s3: bucket: {}, error: {}".format(
                bucket, err.response['Error']['Code']),
            SIMAPP_SIMULATION_WORKER_EXCEPTION, SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as ex:
        log_and_exit("Unable to write metrics to s3, exception: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
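A sketch of how write_metrics_to_s3 would be invoked; the bucket, key, and payload values are placeholders.

    write_metrics_to_s3(bucket="my-deepracer-bucket",
                        key="metrics/training/training-metrics.json",
                        region="us-east-1",
                        metrics={"metrics": []})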
Code example #6
    def make_compatible(self, syncfile_ready):
        """update coach checkpoint file to make it compatible

        Args:
            syncfile_ready (RlCoachSyncFile): RlCoachSyncFile class instance for .ready file
        """
        try:
            # download old coach checkpoint
            self._s3_client.download_file(bucket=self._bucket,
                                          s3_key=self._old_s3_key,
                                          local_path=self._old_local_path)
            # parse old coach checkpoint
            with open(self._old_local_path) as old_coach_checkpoint_file:
                coach_checkpoint_value = re.findall(
                    r'"(.*?)"', old_coach_checkpoint_file.readline())
            if len(coach_checkpoint_value) != 1:
                log_and_exit(
                    "No checkpoint file found",
                    SIMAPP_SIMULATION_WORKER_EXCEPTION,
                    SIMAPP_EVENT_ERROR_CODE_400,
                )
            # remove old local coach checkpoint
            os.remove(self._old_local_path)
            # Upload the ready file so that the system can grab the checkpoints
            syncfile_ready.persist(s3_kms_extra_args=get_s3_kms_extra_args())
            # write new temp coach checkpoint file
            with open(self._temp_local_path,
                      "w+") as new_coach_checkpoint_file:
                new_coach_checkpoint_file.write(coach_checkpoint_value[0])
            # upload new temp coach checkpoint file
            self._persist_temp_coach_checkpoint(
                s3_kms_extra_args=get_s3_kms_extra_args())
            # remove new temp local coach checkpoint
            os.remove(self._temp_local_path)
        except botocore.exceptions.ClientError as e:
            log_and_exit(
                "Unable to make model compatible: {}, {}".format(
                    self._bucket, e.response["Error"]["Code"]),
                SIMAPP_SIMULATION_WORKER_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_400,
            )
        except Exception as e:
            log_and_exit(
                "Exception in making model compatible: {}".format(e),
                SIMAPP_SIMULATION_WORKER_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_500,
            )
Code example #7
    def update(self, data):
        self._is_eval_ = data != RunPhase.TRAIN

        if not self._is_eval_ and self._use_model_picker:
            if self._eval_stats_dict_['chkpnt_name'] is None:
                self._eval_stats_dict_[
                    'chkpnt_name'] = self._checkpoint_state_.read().name

            self._eval_trials_ = 0
            mean_metric = statistics.mean(
                self._current_eval_best_model_metric_list_
            ) if self._current_eval_best_model_metric_list_ else None
            msg_format = '[BestModelSelection] Number of evaluations: {} Evaluation episode {}: {}'
            LOGGER.info(
                msg_format.format(
                    len(self._current_eval_best_model_metric_list_),
                    self._best_model_metric_type.value,
                    self._current_eval_best_model_metric_list_))
            LOGGER.info(
                '[BestModelSelection] Evaluation episode {} mean: {}'.format(
                    self._best_model_metric_type.value, mean_metric))
            self._current_eval_best_model_metric_list_.clear()

            time_stamp = self._current_sim_time
            if self._eval_stats_dict_['avg_eval_metric'] is None or \
                    mean_metric >= self._eval_stats_dict_['avg_eval_metric']:
                msg_format = '[BestModelSelection] current {0} mean: {1} >= best {0} mean: {2}'
                LOGGER.info(
                    msg_format.format(
                        self._best_model_metric_type.value, mean_metric,
                        self._eval_stats_dict_['avg_eval_metric']))
                msg_format = '[BestModelSelection] Updating the best checkpoint to "{}" from "{}".'
                LOGGER.info(
                    msg_format.format(self._eval_stats_dict_['chkpnt_name'],
                                      self._best_chkpnt_stats['name']))
                self._eval_stats_dict_['avg_eval_metric'] = mean_metric
                self._best_chkpnt_stats = {
                    'name': self._eval_stats_dict_['chkpnt_name'],
                    'avg_eval_metric': mean_metric,
                    'time_stamp': time_stamp
                }
            last_chkpnt_stats = {
                'name': self._eval_stats_dict_['chkpnt_name'],
                'avg_eval_metric': mean_metric,
                'time_stamp': time_stamp
            }
            self._deepracer_checkpoint_json.persist(
                body=json.dumps({
                    BEST_CHECKPOINT: self._best_chkpnt_stats,
                    LAST_CHECKPOINT: last_chkpnt_stats
                }),
                s3_kms_extra_args=get_s3_kms_extra_args())
            # Update the checkpoint name to the new checkpoint being used for training that will
            # then be evaluated. Note: this class gets notified when the system is put into a
            # training phase and assumes that a training phase only starts when a new checkpoint
            # is available.
            self._eval_stats_dict_[
                'chkpnt_name'] = self._checkpoint_state_.read().name
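The body persisted at the end of update() pairs the best and last checkpoint stats under the BEST_CHECKPOINT and LAST_CHECKPOINT constants. Reconstructed from the dict literals above, the payload looks roughly like the sketch below; the literal key strings and all values are assumed for illustration.

    example_body = {
        "best_checkpoint": {               # assumed value of BEST_CHECKPOINT
            "name": "10_Step-5000.ckpt",   # hypothetical checkpoint name
            "avg_eval_metric": 87.5,
            "time_stamp": 1612345678.9,
        },
        "last_checkpoint": {               # assumed value of LAST_CHECKPOINT
            "name": "12_Step-6000.ckpt",
            "avg_eval_metric": 80.0,
            "time_stamp": 1612345900.1,
        },
    }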
Code example #8
 def __init__(self, bucket=None, s3_prefix=None, aws_region=None):
     self.aws_region = aws_region
     self.bucket = bucket
     self.s3_prefix = s3_prefix
     self.config_key = os.path.normpath(s3_prefix + "/ip/ip.json")
     self.hyperparameters_key = os.path.normpath(s3_prefix + "/ip/hyperparameters.json")
     self.done_file_key = os.path.normpath(s3_prefix + "/ip/done")
     self.model_checkpoints_prefix = os.path.normpath(s3_prefix + "/model/") + "/"
     self.s3_extra_args = get_s3_kms_extra_args()
     LOG.info("Initializing SageS3Client...")
Code example #9
 def __init__(self, s3_bucket, s3_key, s3_endpoint_url=None):
     logger.info("simtrace_data init")
     DeepRacerRacetrackSimTraceData.__instance = self
     self.data_state = SIMTRACE_DATA_UPLOAD_UNKNOWN_STATE
     self.s3_bucket = s3_bucket
     self.s3_object_key = s3_key
     self.s3_endpoint_url = s3_endpoint_url
     if s3_key != "None":
         self.setup_mutipart_upload()
     self.s3_extra_args = utils.get_s3_kms_extra_args()
Code example #10
 def upload_episode_metrics(self):
     # TODO: The service team can't handle a "version" key in the evaluation metrics due to
     # unknown keys in the json. The training metrics change works fine because the metrics.json
     # file is loaded directly by the front-end console, while the evaluation metrics file is
     # loaded through the service API, which can't handle keys in the metrics file that are not
     # defined in the service. Keeping the evaluation metrics as-is (without the version key)
     # since there is no change in the format anyway, but we should change this in the future to
     # match the training metrics format.
     json_metrics = json.dumps({'metrics': self._metrics_})
     self._s3_metrics.persist(body=json_metrics,
                              s3_kms_extra_args=get_s3_kms_extra_args())
Code example #11
    def _update_sector_times(self, info_dict, sector_idx):
        """update curent personal, best personal, and sector best times.
        If there is a sector best time, upload the sector best time into s3 in
        a separted thread

        Args:
            info_dict(dict): infomation dictionary contains all necesary info to update sector times
            sector_idx(int): sector index for sector1 index is 0 and so on so forth

        Returns:
            dict: updated info_dict
        """
        sector = SECTOR_X_FORMAT.format(sector_idx + 1)
        curr_eval_time = info_dict[
            VirtualEventMP4Params.TOTAL_EVAL_SECONDS.value]
        last_eval_time = info_dict[
            VirtualEventMP4Params.LAST_EVAL_SECONDS.value]

        # get sector_time_dict
        sector_time_dict = info_dict[VirtualEventMP4Params.SECTOR_TIMES.value]
        sector_time_dict[self._current_personal_format.format(sector)] \
            = curr_eval_time - last_eval_time
        info_dict[
            VirtualEventMP4Params.LAST_EVAL_SECONDS.value] = curr_eval_time

        if sector_time_dict[self._best_session_format.format(
                sector)] is not None:
            # update sector best personal time
            if sector_time_dict[self._current_personal_format.format(sector)] <= \
                    sector_time_dict[self._best_personal_format.format(sector)]:
                sector_time_dict[self._best_personal_format.format(sector)] = \
                    sector_time_dict[self._current_personal_format.format(sector)]

            # update sector best session time
            if sector_time_dict[self._current_personal_format.format(sector)] <= \
                    sector_time_dict[self._best_session_format.format(sector)]:
                sector_time_dict[self._best_session_format.format(sector)] = \
                    sector_time_dict[self._current_personal_format.format(sector)]
                # persist the updated sector best session time with
                # other sectors into s3 for robomaker crash backup
                # in a new thread
                Thread(target=self._virtual_event_best_sector_time.persist,
                       args=(json.dumps({
                           SECTOR_X_FORMAT.format(idx + 1):
                           sector_time_dict[self._best_session_format.format(
                               SECTOR_X_FORMAT.format(idx + 1))]
                           for idx in range(self._total_sectors)
                       }), get_s3_kms_extra_args())).start()

        # update sector_time_dict to the latest
        info_dict[VirtualEventMP4Params.SECTOR_TIMES.value].update(
            sector_time_dict)
        return info_dict
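The sector bookkeeping above hinges on a handful of format strings. The values below are hypothetical, purely to make the resulting key names concrete; the real constants live elsewhere in the markov package.

    SECTOR_X_FORMAT = "sector{}"                         # assumed
    current_personal_format = "current_personal_{}"      # assumed
    best_personal_format = "best_personal_{}"            # assumed
    best_session_format = "best_session_{}"              # assumed

    sector = SECTOR_X_FORMAT.format(0 + 1)               # -> "sector1"
    current_personal_format.format(sector)               # -> "current_personal_sector1"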
Code example #12
 def signal_ready(self):
     '''upload rl coach .ready file
     '''
     try:
         # remove lock file if it exists
         self.syncfile_lock.delete()
         # acquire lock
         self.syncfile_lock.persist(
             s3_kms_extra_args=get_s3_kms_extra_args())
         for _, checkpoint in self.params.checkpoint_dict.items():
             # upload .ready
             checkpoint.syncfile_ready.persist(
                 s3_kms_extra_args=get_s3_kms_extra_args())
         # release lock by deleting it
         self.syncfile_lock.delete()
     except botocore.exceptions.ClientError:
         log_and_exit("Unable to upload .ready",
                      SIMAPP_S3_DATA_STORE_EXCEPTION,
                      SIMAPP_EVENT_ERROR_CODE_400)
     except Exception as ex:
         log_and_exit("Exception in uploading .ready file: {}".format(ex),
                      SIMAPP_S3_DATA_STORE_EXCEPTION,
                      SIMAPP_EVENT_ERROR_CODE_500)
Code example #13
    def update(self, data):
        self._is_eval_ = data != RunPhase.TRAIN

        if not self._is_eval_ and self._use_model_picker:
            if self._eval_stats_dict_['chkpnt_name'] is None:
                self._eval_stats_dict_[
                    'chkpnt_name'] = self._checkpoint_state_.read().name

            self._eval_trials_ = 0
            mean_pct = statistics.mean(self._current_eval_pct_list_ if \
                                       self._current_eval_pct_list_ else [0.0])
            LOGGER.info(
                'Number of evaluations: {} Evaluation progresses: {}'.format(
                    len(self._current_eval_pct_list_),
                    self._current_eval_pct_list_))
            LOGGER.info('Evaluation progresses mean: {}'.format(mean_pct))
            self._current_eval_pct_list_.clear()

            time_stamp = self._current_sim_time
            if mean_pct >= self._eval_stats_dict_['avg_comp_pct']:
                LOGGER.info('Current mean: {} >= Current best mean: {}'.format(
                    mean_pct, self._eval_stats_dict_['avg_comp_pct']))
                LOGGER.info(
                    'Updating the best checkpoint to "{}" from "{}".'.format(
                        self._eval_stats_dict_['chkpnt_name'],
                        self._best_chkpnt_stats['name']))
                self._eval_stats_dict_['avg_comp_pct'] = mean_pct
                self._best_chkpnt_stats = {
                    'name': self._eval_stats_dict_['chkpnt_name'],
                    'avg_comp_pct': mean_pct,
                    'time_stamp': time_stamp
                }
            last_chkpnt_stats = {
                'name': self._eval_stats_dict_['chkpnt_name'],
                'avg_comp_pct': mean_pct,
                'time_stamp': time_stamp
            }
            self._deepracer_checkpoint_json.persist(
                body=json.dumps({
                    BEST_CHECKPOINT: self._best_chkpnt_stats,
                    LAST_CHECKPOINT: last_chkpnt_stats
                }),
                s3_kms_extra_args=get_s3_kms_extra_args())
            # Update the checkpoint name to the new checkpoint being used for training that will
            # then be evaluated. Note: this class gets notified when the system is put into a
            # training phase and assumes that a training phase only starts when a new checkpoint
            # is available.
            self._eval_stats_dict_[
                'chkpnt_name'] = self._checkpoint_state_.read().name
Code example #14
def exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx):
    """Helper method that shutsdown the sim app if the trainer is done
    checkpoint_dir - direcotry where the done file would be downloaded to
    """
    if should_stop(checkpoint_dir):
        is_save_mp4_enabled = rospy.get_param("MP4_S3_BUCKET", None) and rollout_idx == 0
        if is_save_mp4_enabled:
            unsubscribe_from_save_mp4 = ServiceProxyWrapper(
                "/racecar/save_mp4/unsubscribe_from_save_mp4", Empty
            )
            unsubscribe_from_save_mp4(EmptyRequest())
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        logger.info("Received termination signal from trainer. Goodbye.")
        simapp_exit_gracefully()
Code example #15
 def upload_episode_metrics(self):
     json_metrics = json.dumps({
         'metrics': self._metrics_,
         'version': METRICS_VERSION,
         'best_model_metric': self._best_model_metric_type.value
     })
     self._s3_metrics.persist(body=json_metrics,
                              s3_kms_extra_args=get_s3_kms_extra_args())
     if self._is_eval_:
         if self._best_model_metric_type == BestModelMetricType.REWARD:
             self._current_eval_best_model_metric_list_.append(
                 self._episode_reward_)
         else:
             self._current_eval_best_model_metric_list_.append(
                 self._progress_)
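Side by side, the two metrics payloads differ only in the extra keys: the evaluation metrics in code example #10 carry just the metrics list, while the training metrics here add the version and the metric type used for best-model selection. The values below are hypothetical.

    eval_payload = {"metrics": []}         # evaluation: bare metrics list only
    train_payload = {
        "metrics": [],                     # per-episode entries
        "version": "2",                    # assumed value of METRICS_VERSION
        "best_model_metric": "progress",   # a BestModelMetricType value
    }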
Code example #16
 def save_to_store(self):
     try:
         # remove lock file if it exists
         self.syncfile_lock.delete()
         # acquire lock
         self.syncfile_lock.persist(
             s3_kms_extra_args=get_s3_kms_extra_args())
         for _, checkpoint in self.params.checkpoint_dict.items():
             # upload tensorflow models, tensorflow frozen graph, and rl coach checkpoint
             self._save_tf_model_to_store(checkpoint)
         # release lock by deleting it
         self.syncfile_lock.delete()
     except botocore.exceptions.ClientError:
         log_and_exit("Unable to upload checkpoint",
                      SIMAPP_S3_DATA_STORE_EXCEPTION,
                      SIMAPP_EVENT_ERROR_CODE_400)
     except Exception as ex:
         log_and_exit("Exception in uploading checkpoint: {}".format(ex),
                      SIMAPP_S3_DATA_STORE_EXCEPTION,
                      SIMAPP_EVENT_ERROR_CODE_500)
Code example #17
def main():
    """ Main function for tournament"""
    try:
        # parse argument
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = session.client('s3',
                                   region_name=s3_region,
                                   endpoint_url=s3_endpoint_url,
                                   config=get_boto_config())

        yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name))
        local_yaml_path = os.path.abspath(
            os.path.join(os.getcwd(), s3_yaml_name))
        try:
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=yaml_key,
                                    Filename=local_yaml_path)
        except Exception as e:
            log_and_exit(
                "Failed to download yaml file: s3_bucket: {}, yaml_key: {}, {}"
                .format(s3_bucket, yaml_key,
                        e), SIMAPP_SIMULATION_WORKER_EXCEPTION,
                SIMAPP_EVENT_ERROR_CODE_500)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=queue_pickle_s3_key,
                                    Filename=local_queue_pickle_path)
            s3_client.download_file(Bucket=s3_bucket,
                                    Key=report_pickle_s3_key,
                                    Filename=local_report_pickle_path)
        except botocore.exceptions.ClientError:
            # intermediate files may not exist yet on the first run
            pass

        # Get values passed in yaml files. Default values are for backward compatibility and for single racecar racing
        yaml_dict = get_yaml_dict(local_yaml_path)

        # Forcing the yaml parameter to list
        # TODO: Deprecate the DISPLAY_NAME and use only the RACER_NAME after cloud pushes this YAML parameter
        force_list_params = [
            MODEL_S3_BUCKET_YAML_KEY, MODEL_S3_PREFIX_YAML_KEY,
            MODEL_METADATA_FILE_S3_YAML_KEY, METRICS_S3_BUCKET_YAML_KEY,
            METRICS_S3_PREFIX_YAML_KEY, SIMTRACE_S3_BUCKET_YAML_KEY,
            SIMTRACE_S3_PREFIX_YAML_KEY, MP4_S3_BUCKET_YAML_KEY,
            MP4_S3_PREFIX_YAML_KEY, DISPLAY_NAME_YAML_KEY, RACER_NAME_YAML_KEY
        ]

        for params in force_list_params:
            yaml_dict[params] = force_list(yaml_dict.get(params, None))

        # Populate the model_metadata_s3_key values to handle both training and evaluation for all race_formats
        if None in yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY]:
            # MODEL_METADATA_FILE_S3_KEY not passed as part of yaml file ==> This happens during evaluation
            # Assume model_metadata.json is present in the s3_prefix/model/ folder
            yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY] = list()
            for s3_prefix in yaml_dict[MODEL_S3_PREFIX_YAML_KEY]:
                yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY].append(
                    os.path.join(s3_prefix, 'model/model_metadata.json'))

        # Validate the yaml values
        validate_yaml_values(yaml_dict)
        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(yaml_dict[MODEL_S3_BUCKET_YAML_KEY]):
                tournament_candidate_queue.append((
                    yaml_dict[MODEL_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[MODEL_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[MODEL_METADATA_FILE_S3_YAML_KEY][agent_idx],
                    yaml_dict[METRICS_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[METRICS_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[SIMTRACE_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[SIMTRACE_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[MP4_S3_BUCKET_YAML_KEY][agent_idx],
                    yaml_dict[MP4_S3_PREFIX_YAML_KEY][agent_idx],
                    yaml_dict[DISPLAY_NAME_YAML_KEY][agent_idx],
                    # TODO: Deprecate the DISPLAY_NAME and use only the RACER_NAME without if else check
                    "" if None in yaml_dict[RACER_NAME_YAML_KEY] else
                    yaml_dict[RACER_NAME_YAML_KEY][agent_idx]))
            tournament_report = {"race_results": []}

        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)

            race_car_colors = RACE_CAR_COLORS
            race_model_s3_buckets = [
                car1_model_s3_bucket, car2_model_s3_bucket
            ]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(),
                                                    str(race_idx)))
            os.makedirs(yaml_dir)

            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(
                    race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                # Make a local folder with the racecar name to download the model_metadata.json
                os.makedirs(os.path.join(os.getcwd(), racecar_name))
                dirs_to_delete.append(os.path.join(os.getcwd(), racecar_name))
                local_model_metadata_path = os.path.abspath(
                    os.path.join(os.path.join(os.getcwd(), racecar_name),
                                 'model_metadata.json'))
                json_key = race_model_metadatas[agent_index]
                json_key = json_key.replace('s3://{}/'.format(model_s3_bucket),
                                            '')
                try:
                    s3_client.download_file(Bucket=model_s3_bucket,
                                            Key=json_key,
                                            Filename=local_model_metadata_path)
                except Exception as e:
                    log_and_exit(
                        "Failed to download model_metadata file: s3_bucket: {}, yaml_key: {}, {}"
                        .format(model_s3_bucket, json_key,
                                e), SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = utils_parse_model_metadata.parse_model_metadata(
                    local_model_metadata_path)
                simapp_versions.append(simapp_version)
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx), race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars), ','.join(race_car_colors),
                ','.join(simapp_versions)
            ]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd,
                                                 shell=False,
                                                 stdout=None,
                                                 stderr=None)
            except KeyboardInterrupt:
                logger.info(
                    "KeyboardInterrupt raised, SimApp must be faulted! exiting..."
                )
                return

            # Retrieve winner and append tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report[
                'winner'] == car1_display_name else car2
            logger.info("race {}'s winner: {}".format(race_idx,
                                                      race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # Persist latest queue and report to use after job restarts.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(Filename=local_queue_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=queue_pickle_s3_key,
                                  ExtraArgs=s3_extra_args)

            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)
            s3_client.upload_file(Filename=local_report_pickle_path,
                                  Bucket=s3_bucket,
                                  Key=report_pickle_s3_key,
                                  ExtraArgs=s3_extra_args)

            # If there is more than one candidate, restart the simulation job; otherwise the
            # tournament is finished, so persist the final report and end the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(Bucket=s3_bucket,
                                     Key=final_report_s3_key,
                                     Body=json.dumps(tournament_report),
                                     **s3_extra_args)

                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
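Stripped of the S3 bookkeeping and process management, the bracket logic in this main() is a single-elimination queue: pop two racers, race them, and push the winner back until one remains. A distilled sketch, where race() is a stand-in for the subprocess call to tournament_race_node.py:

    from collections import deque

    def run_bracket(racers, race):
        queue = deque(racers)
        while len(queue) > 1:
            car1, car2 = queue.popleft(), queue.popleft()
            queue.append(race(car1, car2))   # winner re-enters the queue
        return queue[0]                      # tournament champion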
Code example #18
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk',
                        '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument(
        '-ek',
        '--environment_s3_key',
        help="(string) Name of an environment file to download from S3",
        type=str,
        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument(
        '-c',
        '--checkpoint_dir',
        help=
        '(string) Path to a folder containing a checkpoint to write the model to.',
        type=str,
        default='./checkpoint')
    parser.add_argument(
        '--pretrained_checkpoint_dir',
        help='(string) Path to a folder for downloading a pre-trained model',
        type=str,
        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get(
                            "SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()

    s3_client = S3Client(region_name=args.aws_region, max_retry_attempts=0)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata_download = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=args.model_metadata_s3_key,
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_info = model_metadata_download.get_model_metadata_info()
    network_type = model_metadata_info[ModelMetadataKeys.NEURAL_NETWORK.value]
    version = model_metadata_info[ModelMetadataKeys.VERSION.value]

    # upload model metadata
    model_metadata_upload = ModelMetadata(
        bucket=args.s3_bucket,
        s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
        region_name=args.aws_region,
        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(
        s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        try:
            s3_client.download_file(bucket=args.s3_bucket,
                                    s3_key=args.preset_s3_key,
                                    local_path=preset_local_path)
            success_custom_preset = True
        except botocore.exceptions.ClientError:
            pass
        if not success_custom_preset:
            logger.info(
                "Could not download the preset file. Using the default DeepRacer preset."
            )
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location,
                                                 ignore_module_case=True)
            s3_client.upload_file(
                bucket=args.s3_bucket,
                s3_key=os.path.normpath("%s/presets/preset.py" %
                                        args.s3_prefix),
                local_path=preset_local_path,
                s3_kms_extra_args=utils.get_s3_kms_extra_args())
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        #! TODO each agent should have its own config
        agent_config = {
            'model_metadata': model_metadata_download,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [],
                ConfigParams.VELOCITY_LIST.value: {},
                ConfigParams.STEERING_LIST.value: {},
                ConfigParams.CHANGE_START.value: None,
                ConfigParams.ALT_DIR.value: None,
                ConfigParams.MODEL_METADATA.value: model_metadata_download,
                ConfigParams.REWARD.value: None,
                ConfigParams.AGENT_NAME.value: 'racecar'
            }
        }

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(
            hp_dict=sm_hyperparams_dict,
            agent_list=agent_list,
            run_phase_subject=None,
            run_type=str(RunType.TRAINER))

        # Upload hyperparameters to SageMaker shared s3 bucket
        hyperparameters = Hyperparameters(bucket=args.s3_bucket,
                                          s3_key=get_s3_key(
                                              args.s3_prefix,
                                              HYPERPARAMETER_S3_POSTFIX),
                                          region_name=args.aws_region)
        hyperparameters.persist(
            hyperparams_json=robomaker_hyperparams_json,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(
                bucket=args.s3_bucket,
                s3_prefix=args.s3_prefix,
                region_name=args.aws_region,
                max_sample_count=max_sample_count,
                sampling_frequency=int(
                    sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    # persist IP config from sagemaker to s3
    ip_config = IpConfig(bucket=args.s3_bucket,
                         s3_prefix=args.s3_prefix,
                         region_name=args.aws_region)
    ip_config.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    training_algorithm = model_metadata_download.training_algorithm
    output_head_format = FROZEN_HEAD_OUTPUT_GRAPH_FORMAT_MAPPING[
        training_algorithm]

    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    if use_pretrained_model:
        # checkpoint s3 instance for pretrained model
        # TODO: replace 'agent' for multiagent training
        checkpoint = Checkpoint(bucket=args.pretrained_s3_bucket,
                                s3_prefix=args.pretrained_s3_prefix,
                                region_name=args.aws_region,
                                agent_name='agent',
                                checkpoint_dir=args.pretrained_checkpoint_dir,
                                output_head_format=output_head_format)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and not checkpoint.rl_coach_checkpoint.is_compatible():
            checkpoint.rl_coach_checkpoint.make_compatible(
                checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        # Select the best checkpoint model by uploading rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())
        # add checkpoint into checkpoint_dict
        checkpoint_dict = {'agent': checkpoint}
        # load pretrained model
        ds_params_instance_pretrained = S3BotoDataStoreParameters(
            checkpoint_dict=checkpoint_dict)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained,
                                                graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(
        redis_address="localhost",
        redis_port=6379,
        run_type=str(RunType.TRAINER),
        channel=args.s3_prefix,
        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    # checkpoint s3 instance for training model
    checkpoint = Checkpoint(bucket=args.s3_bucket,
                            s3_prefix=args.s3_prefix,
                            region_name=args.aws_region,
                            agent_name='agent',
                            checkpoint_dir=args.checkpoint_dir,
                            output_head_format=output_head_format)
    checkpoint_dict = {'agent': checkpoint}
    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store_params = ds_params_instance

    graph_manager.data_store = S3BotoDataStore(ds_params_instance,
                                               graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=json.loads(robomaker_hyperparams_json)["batch_size"],
        user_episode_per_rollout=json.loads(
            robomaker_hyperparams_json)["num_episodes_between_training"],
        training_algorithm=training_algorithm)
Code example #19
 def upload_finished_file(self):
     for _, checkpoint in self.params.checkpoint_dict.items():
         checkpoint.syncfile_finished.persist(
             s3_kms_extra_args=get_s3_kms_extra_args())
Code example #20
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters,
                   simtrace_video_s3_writers, pause_physics, unpause_physics):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    # TODO: change 'agent' to the specific agent name for the multi-agent case
    checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path,
                                  "agent")
    graph_manager.data_store.wait_for_checkpoints()
    graph_manager.data_store.wait_for_trainer_ready()
    # wait for the required cancel services to become available
    rospy.wait_for_service('/robomaker/job/cancel')
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')

    subscribe_to_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir,
                                               checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range(
        (graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers,
                                 rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.
                              agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.SYNC:
                unpause_physics(EmptyRequest())
                is_save_mp4_enabled = rospy.get_param(
                    'MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    for _ in range(MIN_EVAL_TRIALS):
                        graph_manager.evaluate(EnvironmentSteps(1))

                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir,
                                         simtrace_video_s3_writers,
                                         rollout_idx)
                    if rollout_idx == 0:
                        graph_manager.evaluate(EnvironmentSteps(1))
                    new_checkpoint = data_store.get_coach_checkpoint_number(
                        'agent')
                if is_save_mp4_enabled:
                    unsubscribe_from_save_mp4(EmptyRequest())
                # upload simtrace and mp4 into s3 bucket
                for s3_writer in simtrace_video_s3_writers:
                    s3_writer.persist(utils.get_s3_kms_extra_args())
                pause_physics(EmptyRequest())
                data_store.load_from_store(
                    expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint
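The worker's share of the rollout steps is computed so that the remainder spreads over the lowest-indexed workers. Isolated from the worker loop, the arithmetic is:

    def steps_for_worker(total_steps, num_workers, rollout_idx):
        steps = total_steps // num_workers
        if rollout_idx < total_steps % num_workers:
            steps += 1
        return steps

    # e.g. 20 steps over 3 workers -> 7, 7, 6
    assert sum(steps_for_worker(20, 3, i) for i in range(3)) == 20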
Code example #21
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      simtrace_video_s3_writers, is_continuous,
                      park_positions):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Multi-agent graph manager
        number_of_trials(int): Number of trials to run the evaluation for
        task_parameters(TaskParameters): Information about the checkpoint, gpu/cpu,
            framework etc. of rlcoach
        simtrace_video_s3_writers(list): Writers that upload all the simtrace and mp4 data to the S3 bucket
        is_continuous(bool): The termination condition for the car
        park_positions(list of tuple): list of (x, y) positions for cars to park at
    """
    # Collect profiler information only if IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(
        ), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
                                     else "racecar_{}".format(agent_param.name.split("_")[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()

        # Make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr',
                                              Empty)

        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            unsubscribe_from_save_mp4.append(
                ServiceProxyWrapper(mp4_unsub, Empty))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info(
            "Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()

        # Before each evaluation episode (a single lap for a non-continuous race and the complete
        # race for a continuous race), a new copy of park_positions needs to be loaded into
        # track_data because a park position is popped from park_positions whenever a racer car
        # needs to be parked.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4(EmptyRequest())
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

    # Close down the job
    utils.cancel_simulation_job(
        os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
        rospy.get_param('AWS_REGION'))
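
Note on the park_positions handling above: the same list object is assigned to track_data before every trial, and positions are popped off as cars get parked, which is why a fresh copy must be loaded each episode. A minimal sketch of that contract, using a hypothetical FakeTrackData whose setter takes a defensive copy (the real TrackData is assumed to behave similarly):

from collections import deque

class FakeTrackData:
    """Hypothetical stand-in for markov's TrackData singleton."""
    @property
    def park_positions(self):
        return self._park_positions

    @park_positions.setter
    def park_positions(self, positions):
        # defensive copy: popping parked cars must not drain the caller's list
        self._park_positions = deque(positions)

park_positions = [(0.0, 0.0), (1.0, 1.0)]
track = FakeTrackData()
for _ in range(2):                           # two evaluation trials
    track.park_positions = park_positions    # reload a fresh copy per trial
    track.park_positions.popleft()           # one car gets parked
assert len(park_positions) == 2              # the original list is untouched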
Code example #22
def main():
    """ Main function for evaluation worker """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--preset',
                        help="(string) Name of a preset to run \
                             (class name from the 'presets' directory.)",
                        type=str,
                        required=False)
    parser.add_argument('--s3_bucket',
                        help='list(string) S3 bucket',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_BUCKET",
                                                ["gsaur-test"]))
    parser.add_argument('--s3_prefix',
                        help='list(string) S3 prefix',
                        type=str,
                        nargs='+',
                        default=rospy.get_param("MODEL_S3_PREFIX",
                                                ["sagemaker"]))
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=rospy.get_param("AWS_REGION", "us-east-1"))
    parser.add_argument('--number_of_trials',
                        help='(integer) Number of trials',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_TRIALS", 10)))
    parser.add_argument(
        '-c',
        '--local_model_directory',
        help='(string) Path to a folder containing a checkpoint \
                             to restore the model from.',
        type=str,
        default='./checkpoint')
    parser.add_argument('--number_of_resets',
                        help='(integer) Number of resets',
                        type=int,
                        default=int(rospy.get_param("NUMBER_OF_RESETS", 0)))
    parser.add_argument('--penalty_seconds',
                        help='(float) penalty seconds',
                        type=float,
                        default=float(rospy.get_param("PENALTY_SECONDS", 2.0)))
    parser.add_argument('--job_type',
                        help='(string) job type',
                        type=str,
                        default=rospy.get_param("JOB_TYPE", "EVALUATION"))
    parser.add_argument('--is_continuous',
                        help='(boolean) whether the race continues after lap completion',
                        type=bool,
                        default=utils.str2bool(
                            rospy.get_param("IS_CONTINUOUS", False)))
    parser.add_argument('--race_type',
                        help='(string) Race type',
                        type=str,
                        default=rospy.get_param("RACE_TYPE", "TIME_TRIAL"))
    parser.add_argument('--off_track_penalty',
                        help='(float) off track penalty seconds',
                        type=float,
                        default=float(rospy.get_param("OFF_TRACK_PENALTY",
                                                      2.0)))
    parser.add_argument('--collision_penalty',
                        help='(float) collision penalty seconds',
                        type=float,
                        default=float(rospy.get_param("COLLISION_PENALTY",
                                                      5.0)))

    args = parser.parse_args()
    arg_s3_bucket = args.s3_bucket
    arg_s3_prefix = args.s3_prefix
    logger.info("S3 bucket: %s \n S3 prefix: %s", arg_s3_bucket, arg_s3_prefix)

    metrics_s3_buckets = rospy.get_param('METRICS_S3_BUCKET')
    metrics_s3_object_keys = rospy.get_param('METRICS_S3_OBJECT_KEY')

    arg_s3_bucket, arg_s3_prefix = utils.force_list(
        arg_s3_bucket), utils.force_list(arg_s3_prefix)
    metrics_s3_buckets = utils.force_list(metrics_s3_buckets)
    metrics_s3_object_keys = utils.force_list(metrics_s3_object_keys)

    validate_list = [
        arg_s3_bucket, arg_s3_prefix, metrics_s3_buckets,
        metrics_s3_object_keys
    ]

    simtrace_s3_bucket = rospy.get_param('SIMTRACE_S3_BUCKET', None)
    mp4_s3_bucket = rospy.get_param('MP4_S3_BUCKET', None)
    if simtrace_s3_bucket:
        simtrace_s3_object_prefix = rospy.get_param('SIMTRACE_S3_PREFIX')
        simtrace_s3_bucket = utils.force_list(simtrace_s3_bucket)
        simtrace_s3_object_prefix = utils.force_list(simtrace_s3_object_prefix)
        validate_list.extend([simtrace_s3_bucket, simtrace_s3_object_prefix])
    if mp4_s3_bucket:
        mp4_s3_object_prefix = rospy.get_param('MP4_S3_OBJECT_PREFIX')
        mp4_s3_bucket = utils.force_list(mp4_s3_bucket)
        mp4_s3_object_prefix = utils.force_list(mp4_s3_object_prefix)
        validate_list.extend([mp4_s3_bucket, mp4_s3_object_prefix])

    # each configured list must contain exactly one entry per agent
    if not all(len(x) == len(validate_list[0]) for x in validate_list):
        log_and_exit(
            "Eval worker error: Incorrect arguments passed: {}".format(
                validate_list), SIMAPP_SIMULATION_WORKER_EXCEPTION,
            SIMAPP_EVENT_ERROR_CODE_500)
    if args.number_of_resets != 0 and args.number_of_resets < MIN_RESET_COUNT:
        raise GenericRolloutException(
            "number of resets is less than {}".format(MIN_RESET_COUNT))

    # Instantiate Cameras
    if len(arg_s3_bucket) == 1:
        configure_camera(namespaces=['racecar'])
    else:
        configure_camera(namespaces=[
            'racecar_{}'.format(str(agent_index))
            for agent_index in range(len(arg_s3_bucket))
        ])

    agent_list = list()
    s3_bucket_dict = dict()
    s3_prefix_dict = dict()
    checkpoint_dict = dict()
    simtrace_video_s3_writers = []
    start_positions = get_start_positions(len(arg_s3_bucket))
    done_condition = utils.str_to_done_condition(
        rospy.get_param("DONE_CONDITION", any))
    park_positions = utils.pos_2d_str_to_list(
        rospy.get_param("PARK_POSITIONS", []))
    # if park positions are not passed in (the "all" done condition case), use defaults
    if not park_positions:
        park_positions = [DEFAULT_PARK_POSITION for _ in arg_s3_bucket]
    for agent_index, _ in enumerate(arg_s3_bucket):
        agent_name = 'agent' if len(arg_s3_bucket) == 1 else 'agent_{}'.format(
            str(agent_index))
        racecar_name = 'racecar' if len(
            arg_s3_bucket) == 1 else 'racecar_{}'.format(str(agent_index))
        s3_bucket_dict[agent_name] = arg_s3_bucket[agent_index]
        s3_prefix_dict[agent_name] = arg_s3_prefix[agent_index]

        # download model metadata
        model_metadata = ModelMetadata(
            bucket=arg_s3_bucket[agent_index],
            s3_key=get_s3_key(arg_s3_prefix[agent_index],
                              MODEL_METADATA_S3_POSTFIX),
            region_name=args.aws_region,
            local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(agent_name))
        model_metadata_info = model_metadata.get_model_metadata_info()
        version = model_metadata_info[ModelMetadataKeys.VERSION.value]

        # checkpoint s3 instance
        checkpoint = Checkpoint(bucket=arg_s3_bucket[agent_index],
                                s3_prefix=arg_s3_prefix[agent_index],
                                region_name=args.aws_region,
                                agent_name=agent_name,
                                checkpoint_dir=args.local_model_directory)
        # make coach checkpoint compatible
        if version < SIMAPP_VERSION_2 and \
                not checkpoint.rl_coach_checkpoint.is_compatible():
            checkpoint.rl_coach_checkpoint.make_compatible(
                checkpoint.syncfile_ready)
        # get best model checkpoint string
        model_checkpoint_name = \
            checkpoint.deepracer_checkpoint_json.get_deepracer_best_checkpoint()
        # Select the best checkpoint model by uploading rl coach .coach_checkpoint file
        checkpoint.rl_coach_checkpoint.update(
            model_checkpoint_name=model_checkpoint_name,
            s3_kms_extra_args=utils.get_s3_kms_extra_args())

        checkpoint_dict[agent_name] = checkpoint

        agent_config = {
            'model_metadata': model_metadata,
            ConfigParams.CAR_CTRL_CONFIG.value: {
                ConfigParams.LINK_NAME_LIST.value: [
                    link_name.replace('racecar', racecar_name)
                    for link_name in LINK_NAMES
                ],
                ConfigParams.VELOCITY_LIST.value: [
                    velocity_topic.replace('racecar', racecar_name)
                    for velocity_topic in VELOCITY_TOPICS
                ],
                ConfigParams.STEERING_LIST.value: [
                    steering_topic.replace('racecar', racecar_name)
                    for steering_topic in STEERING_TOPICS
                ],
                ConfigParams.CHANGE_START.value:
                utils.str2bool(rospy.get_param('CHANGE_START_POSITION',
                                               False)),
                ConfigParams.ALT_DIR.value:
                utils.str2bool(
                    rospy.get_param('ALTERNATE_DRIVING_DIRECTION', False)),
                ConfigParams.MODEL_METADATA.value:
                model_metadata,
                ConfigParams.REWARD.value:
                reward_function,
                ConfigParams.AGENT_NAME.value:
                racecar_name,
                ConfigParams.VERSION.value:
                version,
                ConfigParams.NUMBER_OF_RESETS.value:
                args.number_of_resets,
                ConfigParams.PENALTY_SECONDS.value:
                args.penalty_seconds,
                ConfigParams.NUMBER_OF_TRIALS.value:
                args.number_of_trials,
                ConfigParams.IS_CONTINUOUS.value:
                args.is_continuous,
                ConfigParams.RACE_TYPE.value:
                args.race_type,
                ConfigParams.COLLISION_PENALTY.value:
                args.collision_penalty,
                ConfigParams.OFF_TRACK_PENALTY.value:
                args.off_track_penalty,
                ConfigParams.START_POSITION.value:
                start_positions[agent_index],
                ConfigParams.DONE_CONDITION.value:
                done_condition
            }
        }

        metrics_s3_config = {
            MetricsS3Keys.METRICS_BUCKET.value:
            metrics_s3_buckets[agent_index],
            MetricsS3Keys.METRICS_KEY.value:
            metrics_s3_object_keys[agent_index],
            # Use the aws_region argument (or its default) instead of
            # rospy.get_param('AWS_REGION')
            MetricsS3Keys.REGION.value:
            args.aws_region
        }
        aws_region = rospy.get_param('AWS_REGION', args.aws_region)

        if simtrace_s3_bucket:
            simtrace_video_s3_writers.append(
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.SIMTRACE_EVAL.value,
                    bucket=simtrace_s3_bucket[agent_index],
                    s3_prefix=simtrace_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=SIMTRACE_EVAL_LOCAL_PATH_FORMAT.format(
                        agent_name)))
        if mp4_s3_bucket:
            simtrace_video_s3_writers.extend([
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.PIP.value,
                    bucket=mp4_s3_bucket[agent_index],
                    s3_prefix=mp4_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=CAMERA_PIP_MP4_LOCAL_PATH_FORMAT.format(
                        agent_name)),
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.DEGREE45.value,
                    bucket=mp4_s3_bucket[agent_index],
                    s3_prefix=mp4_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=CAMERA_45DEGREE_LOCAL_PATH_FORMAT.format(
                        agent_name)),
                SimtraceVideo(
                    upload_type=SimtraceVideoNames.TOPVIEW.value,
                    bucket=mp4_s3_bucket[agent_index],
                    s3_prefix=mp4_s3_object_prefix[agent_index],
                    region_name=aws_region,
                    local_path=CAMERA_TOPVIEW_LOCAL_PATH_FORMAT.format(
                        agent_name))
            ])

        run_phase_subject = RunPhaseSubject()
        agent_list.append(
            create_rollout_agent(
                agent_config,
                EvalMetrics(agent_name, metrics_s3_config, args.is_continuous),
                run_phase_subject))
    agent_list.append(create_obstacles_agent())
    agent_list.append(create_bot_cars_agent())

    # ROS service to indicate all the robomaker markov packages are ready for consumption
    signal_robomaker_markov_package_ready()

    PhaseObserver('/agent/training_phase', run_phase_subject)
    enable_domain_randomization = utils.str2bool(
        rospy.get_param('ENABLE_DOMAIN_RANDOMIZATION', False))

    sm_hyperparams_dict = {}

    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)

    graph_manager, _ = get_graph_manager(
        hp_dict=sm_hyperparams_dict,
        agent_list=agent_list,
        run_phase_subject=run_phase_subject,
        enable_domain_randomization=enable_domain_randomization,
        done_condition=done_condition,
        pause_physics=pause_physics,
        unpause_physics=unpause_physics)

    ds_params_instance = S3BotoDataStoreParameters(
        checkpoint_dict=checkpoint_dict)

    graph_manager.data_store = S3BotoDataStore(params=ds_params_instance,
                                               graph_manager=graph_manager,
                                               ignore_lock=True)
    graph_manager.env_params.seed = 0

    task_parameters = TaskParameters()
    task_parameters.checkpoint_restore_path = args.local_model_directory

    evaluation_worker(graph_manager=graph_manager,
                      number_of_trials=args.number_of_trials,
                      task_parameters=task_parameters,
                      simtrace_video_s3_writers=simtrace_video_s3_writers,
                      is_continuous=args.is_continuous,
                      park_positions=park_positions,
                      race_type=args.race_type,
                      pause_physics=pause_physics,
                      unpause_physics=unpause_physics)
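
The argument handling above leans on two small markov helpers, utils.force_list and utils.str2bool. Their implementations are not shown in these examples; a plausible sketch with the semantics the call sites rely on (scalars wrapped into lists, common truthy strings parsed):

def force_list(val):
    """Wrap a scalar so single- and multi-agent configs can be iterated alike."""
    return val if isinstance(val, list) else [val]

def str2bool(val):
    """Parse booleans that may arrive as ROS params or CLI strings."""
    if isinstance(val, bool):
        return val
    return str(val).strip().lower() in ('true', '1', 'yes', 'y', 't')

assert force_list('my-bucket') == ['my-bucket']
assert force_list(['a', 'b']) == ['a', 'b']
assert str2bool('False') is False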
Code example #23
    def upload_episode_metrics(self):
        json_metrics = json.dumps({'metrics': self._metrics_})
        self._s3_metrics.persist(body=json_metrics,
                                 s3_kms_extra_args=get_s3_kms_extra_args())
        if self._is_eval_:
            self._current_eval_pct_list_.append(self._progress_)
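
The persist call above amounts to an S3 put of the serialized metrics. A hedged boto3 equivalent of what it plausibly does internally (bucket and key names are illustrative, not from the source):

import json
import boto3

def upload_metrics(bucket, key, metrics, s3_kms_extra_args=None):
    """Serialize metrics to JSON and upload them to S3; a sketch of persist()."""
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket,
                  Key=key,
                  Body=json.dumps({'metrics': metrics}),
                  **(s3_kms_extra_args or {}))

# upload_metrics('my-metrics-bucket', 'eval/metrics.json', [{'progress': 100.0}])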
Code example #24
    def __init__(self,
                 queue_url,
                 aws_region='us-east-1',
                 race_duration=180,
                 number_of_trials=3,
                 number_of_resets=10000,
                 penalty_seconds=2.0,
                 off_track_penalty=2.0,
                 collision_penalty=5.0,
                 is_continuous=False,
                 race_type="TIME_TRIAL"):
        # constructor arguments
        self._model_updater = ModelUpdater.get_instance()
        self._deepracer_path = rospkg.RosPack().get_path(
            DeepRacerPackages.DEEPRACER_SIMULATION_ENVIRONMENT)
        body_shell_path = os.path.join(self._deepracer_path, "meshes", "f1")
        self._valid_body_shells = \
            set(".".join(f.split(".")[:-1]) for f in os.listdir(body_shell_path) if os.path.isfile(
                os.path.join(body_shell_path, f)))
        self._valid_body_shells.add(const.BodyShellType.DEFAULT.value)
        self._valid_car_colors = set(e.value for e in const.CarColorType
                                     if "f1" not in e.value)
        self._num_sectors = int(rospy.get_param("NUM_SECTORS", "3"))
        self._queue_url = queue_url
        self._region = aws_region
        self._number_of_trials = number_of_trials
        self._number_of_resets = number_of_resets
        self._penalty_seconds = penalty_seconds
        self._off_track_penalty = off_track_penalty
        self._collision_penalty = collision_penalty
        self._is_continuous = is_continuous
        self._race_type = race_type
        self._is_save_simtrace_enabled = False
        self._is_save_mp4_enabled = False
        self._is_event_end = False
        self._done_condition = any
        self._race_duration = race_duration
        self._enable_domain_randomization = False

        # sqs client
        # The boto client errors out after polling for 1 hour.
        self._sqs_client = SQSClient(queue_url=self._queue_url,
                                     region_name=self._region,
                                     max_num_of_msg=MAX_NUM_OF_SQS_MESSAGE,
                                     wait_time_sec=SQS_WAIT_TIME_SEC,
                                     session=refreshed_session(self._region))
        self._s3_client = S3Client(region_name=self._region)
        # tracking current state information
        self._track_data = TrackData.get_instance()
        self._start_lane = self._track_data.center_line
        # keep track of the racer specific info, e.g. s3 locations, alias, car color etc.
        self._current_racer = None
        # keep track of the current race car we are using. It is always "racecar".
        car_model_state = ModelState()
        car_model_state.model_name = "racecar"
        self._current_car_model_state = car_model_state
        self._last_body_shell_type = None
        self._last_sensors = None
        self._racecar_model = AgentModel()
        # keep track of the current control agent we are using
        self._current_agent = None
        # keep track of the current control graph manager
        self._current_graph_manager = None
        # Keep track of previous model's name
        self._prev_model_name = None
        self._hide_position_idx = 0
        self._hide_positions = get_hide_positions(race_car_num=1)
        self._run_phase_subject = RunPhaseSubject()
        self._simtrace_video_s3_writers = []

        self._local_model_directory = './checkpoint'

        # virtual events only have a single agent, so set agent_name to "agent"
        self._agent_name = "agent"

        # camera manager
        self._camera_manager = CameraManager.get_instance()

        # set up the virtual event top and follow cameras in CameraManager
        # configure_camera does not need to wait for the car to spawn, because the
        # follow-car camera is not tracking any car initially
        self._main_cameras, self._sub_camera = configure_camera(
            namespaces=[VIRTUAL_EVENT], is_wait_for_model=False)
        self._spawn_cameras()

        # pop out all cameras after configuration to prevent camera from moving
        self._camera_manager.pop(namespace=VIRTUAL_EVENT)

        dummy_metrics_s3_config = {
            MetricsS3Keys.METRICS_BUCKET.value: "dummy-bucket",
            MetricsS3Keys.METRICS_KEY.value: "dummy-key",
            MetricsS3Keys.REGION.value: self._region
        }

        self._eval_metrics = EvalMetrics(
            agent_name=self._agent_name,
            s3_dict_metrics=dummy_metrics_s3_config,
            is_continuous=self._is_continuous,
            pause_time_before_start=PAUSE_TIME_BEFORE_START)

        # upload a default best sector time (inf for every sector)
        # if no best sector time exists in s3 yet

        # use the s3 bucket and prefix stored in the yaml environment variables, because this
        # code path is SimApp-only. For virtual events no s3 bucket and prefix are passed
        # through the yaml file; everything is passed through sqs. For simplicity, reuse the
        # yaml s3 bucket and prefix environment variables.
        virtual_event_best_sector_time = VirtualEventBestSectorTime(
            bucket=os.environ.get("YAML_S3_BUCKET", ''),
            s3_key=get_s3_key(os.environ.get("YAML_S3_PREFIX", ''),
                              SECTOR_TIME_S3_POSTFIX),
            region_name=os.environ.get("APP_REGION", "us-east-1"),
            local_path=SECTOR_TIME_LOCAL_PATH)
        response = virtual_event_best_sector_time.list()
        # this handles situations such as a robomaker job crash, so that the next robomaker
        # job can pick up the best sector times left over from the crashed job
        if "Contents" not in response:
            virtual_event_best_sector_time.persist(
                body=json.dumps({
                    SECTOR_X_FORMAT.format(idx + 1): float("inf")
                    for idx in range(self._num_sectors)
                }),
                s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # ROS service to indicate all the robomaker markov packages are ready for consumption
        signal_robomaker_markov_package_ready()

        PhaseObserver('/agent/training_phase', self._run_phase_subject)

        # setup mp4 services
        self._setup_mp4_services()
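
The default best-sector-time payload persisted in the constructor above is just a JSON object mapping each sector key to infinity. The same payload built standalone (SECTOR_X_FORMAT is assumed to be a 'sector{}'-style format string; the real constant lives in markov):

import json

SECTOR_X_FORMAT = 'sector{}'  # assumed format string
num_sectors = 3

default_sector_times = json.dumps({
    SECTOR_X_FORMAT.format(idx + 1): float('inf')
    for idx in range(num_sectors)
})
print(default_sector_times)  # {"sector1": Infinity, "sector2": Infinity, "sector3": Infinity}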
Code example #25
    def upload_episode_metrics(self):
        json_metrics = json.dumps({'metrics': self._metrics_})
        self._s3_metrics.persist(body=json_metrics,
                                 s3_kms_extra_args=get_s3_kms_extra_args())
Code example #26
def evaluation_worker(graph_manager, number_of_trials, task_parameters,
                      simtrace_video_s3_writers, is_continuous, park_positions,
                      race_type, pause_physics, unpause_physics):
    """ Evaluation worker function

    Arguments:
        graph_manager(MultiAgentGraphManager): Multi-agent graph manager
        number_of_trials(int): Number of trials to run during evaluation
        task_parameters(TaskParameters): rl_coach task information such as the
            checkpoint, gpu/cpu, and framework
        simtrace_video_s3_writers(list): S3 writers that upload the simtrace and mp4 files
        is_continuous(bool): Whether the race continues after lap completion
        park_positions(list of tuple): list of (x, y) positions for cars to park at
        race_type(str): Race type, e.g. TIME_TRIAL or F1
    """
    # Collect profiler information only if IS_PROFILER_ON is true
    with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                        s3_prefix=PROFILER_S3_PREFIX,
                        output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                        enable_profiling=IS_PROFILER_ON):
        subscribe_to_save_mp4_topic, unsubscribe_from_save_mp4_topic = list(), list()
        subscribe_to_save_mp4, unsubscribe_from_save_mp4 = list(), list()
        for agent_param in graph_manager.agents_params:
            racecar_name = 'racecar' if len(agent_param.name.split("_")) == 1 \
                                     else "racecar_{}".format(agent_param.name.split("_")[1])
            subscribe_to_save_mp4_topic.append(
                "/{}/save_mp4/subscribe_to_save_mp4".format(racecar_name))
            unsubscribe_from_save_mp4_topic.append(
                "/{}/save_mp4/unsubscribe_from_save_mp4".format(racecar_name))
        graph_manager.data_store.wait_for_checkpoints()
        graph_manager.data_store.modify_checkpoint_variables()

        # wait for the required cancel services to become available
        if race_type != RaceType.F1.value:
            # TODO: Since we are not running Grand Prix in RoboMaker,
            # we are opting out from waiting for RoboMaker's cancel job service
            # in case of Grand Prix execution.
            # Otherwise, SimApp will hang as service will never come alive.
            #
            # If we don't depend on RoboMaker anymore in the future,
            # we need to remove below line, or do a better job to figure out
            # whether we are running on RoboMaker or not to decide whether
            # we should wait for below service or not.
            rospy.wait_for_service('/robomaker/job/cancel')

        # Make the clients that will allow us to pause and unpause the physics
        rospy.wait_for_service('/gazebo/pause_physics_dr')
        rospy.wait_for_service('/gazebo/unpause_physics_dr')
        pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
        unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr',
                                              Empty)

        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            rospy.wait_for_service(mp4_sub)
            rospy.wait_for_service(mp4_unsub)
        for mp4_sub, mp4_unsub in zip(subscribe_to_save_mp4_topic,
                                      unsubscribe_from_save_mp4_topic):
            subscribe_to_save_mp4.append(ServiceProxyWrapper(mp4_sub, Empty))
            unsubscribe_from_save_mp4.append(
                Thread(target=ServiceProxyWrapper(mp4_unsub, Empty),
                       args=(EmptyRequest(), )))

        graph_manager.create_graph(task_parameters=task_parameters,
                                   stop_physics=pause_physics,
                                   start_physics=unpause_physics,
                                   empty_service_call=EmptyRequest)
        logger.info(
            "Graph manager successfully created the graph: Unpausing physics")
        unpause_physics(EmptyRequest())

        is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None)
        if is_save_mp4_enabled:
            for subscribe_mp4 in subscribe_to_save_mp4:
                subscribe_mp4(EmptyRequest())

        configure_environment_randomizer()
        track_data = TrackData.get_instance()

        # Before each evaluation episode (a single lap for a non-continuous race, the complete
        # race for a continuous one), a fresh copy of park_positions needs to be loaded into
        # track_data, because a position is popped from park_positions whenever a car is parked.
        if is_continuous:
            track_data.park_positions = park_positions
            graph_manager.evaluate(EnvironmentSteps(1))
        else:
            for _ in range(number_of_trials):
                track_data.park_positions = park_positions
                graph_manager.evaluate(EnvironmentSteps(1))
        if is_save_mp4_enabled:
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.start()
            for unsubscribe_mp4 in unsubscribe_from_save_mp4:
                unsubscribe_mp4.join()
        # upload simtrace and mp4 into s3 bucket
        for s3_writer in simtrace_video_s3_writers:
            s3_writer.persist(utils.get_s3_kms_extra_args())
        time.sleep(1)
        pause_physics(EmptyRequest())

    if race_type != RaceType.F1.value:
        # Close down the job
        utils.cancel_simulation_job()
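
Unlike the earlier variant of evaluation_worker, this one wraps each mp4 unsubscribe call in a Thread, starts them all, then joins, so every camera stops recording in parallel. The fan-out/join pattern in isolation, with a plain callable standing in for ServiceProxyWrapper:

from threading import Thread
import time

def fake_unsubscribe(request):
    """Stand-in for a ROS service proxy call such as ServiceProxyWrapper."""
    time.sleep(0.1)  # simulate service latency
    print('unsubscribed', request)

requests = ['racecar_0', 'racecar_1', 'racecar_2']
threads = [Thread(target=fake_unsubscribe, args=(req,)) for req in requests]
for t in threads:   # fan out: the calls overlap instead of running back to back
    t.start()
for t in threads:   # wait for every camera to finish unsubscribing
    t.join()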
Code example #27
def main():
    """ Main function for tournament"""
    try:
        # parse argument
        s3_region = sys.argv[1]
        s3_bucket = sys.argv[2]
        s3_prefix = sys.argv[3]
        s3_yaml_name = sys.argv[4]

        # create boto3 session/client and download yaml/json file
        session = boto3.session.Session()
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", None)
        s3_client = S3Client(region_name=s3_region,
                             s3_endpoint_url=s3_endpoint_url)

        # Intermediate tournament files
        queue_pickle_name = 'tournament_candidate_queue.pkl'
        queue_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, queue_pickle_name))
        local_queue_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), queue_pickle_name))

        report_pickle_name = 'tournament_report.pkl'
        report_pickle_s3_key = os.path.normpath(
            os.path.join(s3_prefix, report_pickle_name))
        local_report_pickle_path = os.path.abspath(
            os.path.join(os.getcwd(), report_pickle_name))

        final_report_name = 'tournament_report.json'
        final_report_s3_key = os.path.normpath(
            os.path.join(s3_prefix, final_report_name))

        try:
            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=queue_pickle_s3_key,
                                    local_path=local_queue_pickle_path)

            s3_client.download_file(bucket=s3_bucket,
                                    s3_key=report_pickle_s3_key,
                                    local_path=local_report_pickle_path)
        except Exception:
            # the intermediate files may not exist yet (e.g. on the first run); start fresh
            pass

        # download yaml file
        yaml_file = YamlFile(
            agent_type=AgentType.TOURNAMENT.value,
            bucket=s3_bucket,
            s3_key=get_s3_key(s3_prefix, s3_yaml_name),
            region_name=s3_region,
            s3_endpoint_url=s3_endpoint_url,
            local_path=YAML_LOCAL_PATH_FORMAT.format(s3_yaml_name))

        yaml_dict = yaml_file.get_yaml_values()

        if os.path.exists(local_queue_pickle_path):
            with open(local_queue_pickle_path, 'rb') as f:
                tournament_candidate_queue = pickle.load(f)
            with open(local_report_pickle_path, 'rb') as f:
                tournament_report = pickle.load(f)
            logger.info('tournament_candidate_queue loaded from existing file')
        else:
            logger.info('tournament_candidate_queue initialized')
            tournament_candidate_queue = deque()
            for agent_idx, _ in enumerate(
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value]):
                tournament_candidate_queue.append((
                    yaml_dict[YamlKey.MODEL_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MODEL_METADATA_FILE_S3_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.METRICS_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.SIMTRACE_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_BUCKET_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.MP4_S3_PREFIX_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.DISPLAY_NAME_YAML_KEY.value][agent_idx],
                    # TODO: Deprecate the DISPLAY_NAME and use only the RACER_NAME without if else check
                    "" if None in yaml_dict.get(YamlKey.RACER_NAME_YAML_KEY.value, [None]) \
                        else yaml_dict[YamlKey.RACER_NAME_YAML_KEY.value][agent_idx],
                    yaml_dict[YamlKey.BODY_SHELL_TYPE_YAML_KEY.value][agent_idx]
                ))
            tournament_report = {"race_results": []}

        race_idx = len(tournament_report["race_results"])
        while len(tournament_candidate_queue) > 1:
            car1 = tournament_candidate_queue.popleft()
            car2 = tournament_candidate_queue.popleft()
            (car1_model_s3_bucket, car1_s3_prefix, car1_model_metadata,
             car1_metrics_bucket, car1_metrics_s3_key, car1_simtrace_bucket,
             car1_simtrace_prefix, car1_mp4_bucket, car1_mp4_prefix,
             car1_display_name, car1_racer_name, car1_body_shell_type) = car1
            (car2_model_s3_bucket, car2_s3_prefix, car2_model_metadata,
             car2_metrics_bucket, car2_metrics_s3_key, car2_simtrace_bucket,
             car2_simtrace_prefix, car2_mp4_bucket, car2_mp4_prefix,
             car2_display_name, car2_racer_name, car2_body_shell_type) = car2

            race_yaml_dict = generate_race_yaml(yaml_dict=yaml_dict,
                                                car1=car1,
                                                car2=car2,
                                                race_idx=race_idx)

            if s3_endpoint_url is not None:
                race_yaml_dict["S3_ENDPOINT_URL"] = s3_endpoint_url

            race_model_s3_buckets = [
                car1_model_s3_bucket, car2_model_s3_bucket
            ]
            race_model_metadatas = [car1_model_metadata, car2_model_metadata]
            body_shell_types = [car1_body_shell_type, car2_body_shell_type]

            # List of directories created
            dirs_to_delete = list()
            yaml_dir = os.path.abspath(os.path.join(os.getcwd(),
                                                    str(race_idx)))
            os.makedirs(yaml_dir)

            dirs_to_delete.append(yaml_dir)
            race_yaml_path = os.path.abspath(
                os.path.join(yaml_dir, 'evaluation_params.yaml'))
            with open(race_yaml_path, 'w') as race_yaml_file:
                yaml.dump(race_yaml_dict, race_yaml_file)

            # List of racecar names that should include second camera while launching
            racecars_with_stereo_cameras = list()
            # List of racecar names that should include lidar while launching
            racecars_with_lidars = list()
            # List of SimApp versions
            simapp_versions = list()
            for agent_index, model_s3_bucket in enumerate(
                    race_model_s3_buckets):
                racecar_name = 'racecar_' + str(agent_index)
                json_key = race_model_metadatas[agent_index]
                # download model metadata
                try:
                    model_metadata = ModelMetadata(
                        bucket=model_s3_bucket,
                        s3_key=json_key,
                        region_name=s3_region,
                        s3_endpoint_url=s3_endpoint_url,
                        local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format(
                            racecar_name))
                    dirs_to_delete.append(model_metadata.local_dir)
                except Exception as e:
                    log_and_exit(
                        "Failed to download model_metadata file: s3_bucket: {}, s3_key: {}, {}"
                        .format(model_s3_bucket, json_key,
                                e), SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        SIMAPP_EVENT_ERROR_CODE_500)
                sensors, _, simapp_version = \
                    model_metadata.get_model_metadata_info()
                simapp_versions.append(str(simapp_version))
                if Input.STEREO.value in sensors:
                    racecars_with_stereo_cameras.append(racecar_name)
                if Input.LIDAR.value in sensors or Input.SECTOR_LIDAR.value in sensors:
                    racecars_with_lidars.append(racecar_name)

            cmd = [
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "tournament_race_node.py"),
                str(race_idx), race_yaml_path,
                ','.join(racecars_with_stereo_cameras),
                ','.join(racecars_with_lidars), ','.join(simapp_versions),
                ','.join(body_shell_types)
            ]
            try:
                return_code, _, stderr = run_cmd(cmd_args=cmd,
                                                 shell=False,
                                                 stdout=None,
                                                 stderr=None)
            except KeyboardInterrupt:
                logger.info(
                    "KeyboardInterrupt raised, SimApp must be faulted! exiting..."
                )
                return

            # Retrieve winner and append tournament report
            with open('race_report.pkl', 'rb') as f:
                race_report = pickle.load(f)
            race_report['race_idx'] = race_idx
            winner = car1 if race_report['winner'] == car1_display_name \
                else car2
            logger.info("race {}'s winner: {}".format(race_idx,
                                                      race_report['winner']))

            tournament_candidate_queue.append(winner)
            tournament_report["race_results"].append(race_report)

            # Clean up directories created
            for dir_to_delete in dirs_to_delete:
                shutil.rmtree(dir_to_delete, ignore_errors=True)
            race_idx += 1

            s3_extra_args = get_s3_kms_extra_args()
            # Persist latest queue and report to use after job restarts.
            with open(local_queue_pickle_path, 'wb') as f:
                pickle.dump(tournament_candidate_queue, f, protocol=2)
            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=queue_pickle_s3_key,
                                  local_path=local_queue_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            with open(local_report_pickle_path, 'wb') as f:
                pickle.dump(tournament_report, f, protocol=2)

            s3_client.upload_file(bucket=s3_bucket,
                                  s3_key=report_pickle_s3_key,
                                  local_path=local_report_pickle_path,
                                  s3_kms_extra_args=s3_extra_args)

            # If more than one candidate remains, restart the simulation job; otherwise the
            # tournament is finished, so persist the final report and end the job.
            if len(tournament_candidate_queue) > 1:
                restart_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
                break
            else:
                # Persist final tournament report in json format
                # and terminate the job by canceling it
                s3_client.put_object(bucket=s3_bucket,
                                     s3_key=final_report_s3_key,
                                     body=json.dumps(tournament_report),
                                     s3_kms_extra_args=s3_extra_args)

                cancel_simulation_job(
                    os.environ.get('AWS_ROBOMAKER_SIMULATION_JOB_ARN'),
                    s3_region)
    except ValueError as ex:
        log_and_exit("User modified model_metadata.json: {}".format(ex),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_400)
    except Exception as e:
        log_and_exit("Tournament node failed: {}".format(e),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
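
The tournament loop above is a single-elimination bracket driven by a deque: pop two racers, run a race, push the winner back, and repeat until one candidate remains. The control flow in miniature, with a hypothetical run_race standing in for the tournament_race_node.py subprocess:

from collections import deque

def run_race(car1, car2):
    """Hypothetical race; the real code shells out to tournament_race_node.py."""
    return max(car1, car2)  # deterministic stand-in for picking a winner

tournament_candidate_queue = deque(['racer_a', 'racer_b', 'racer_c', 'racer_d'])
race_results = []
while len(tournament_candidate_queue) > 1:
    car1 = tournament_candidate_queue.popleft()
    car2 = tournament_candidate_queue.popleft()
    winner = run_race(car1, car2)
    race_results.append({'pair': (car1, car2), 'winner': winner})
    tournament_candidate_queue.append(winner)  # the winner advances

print(tournament_candidate_queue[0], 'wins after', len(race_results), 'races')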
Code example #28
def main():
    screen.set_use_colors(False)

    parser = argparse.ArgumentParser()
    parser.add_argument('-pk', '--preset_s3_key',
                        help="(string) Name of a preset to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('-ek', '--environment_s3_key',
                        help="(string) Name of an environment file to download from S3",
                        type=str,
                        required=False)
    parser.add_argument('--model_metadata_s3_key',
                        help="(string) Model Metadata File S3 Key",
                        type=str,
                        required=False)
    parser.add_argument('-c', '--checkpoint-dir',
                        help='(string) Path to a folder containing a checkpoint to write the model to.',
                        type=str,
                        default='./checkpoint')
    parser.add_argument('--pretrained-checkpoint-dir',
                        help='(string) Path to a folder for downloading a pre-trained model',
                        type=str,
                        default=PRETRAINED_MODEL_DIR)
    parser.add_argument('--s3_bucket',
                        help='(string) S3 bucket',
                        type=str,
                        default=os.environ.get("SAGEMAKER_SHARED_S3_BUCKET_PATH", "gsaur-test"))
    parser.add_argument('--s3_prefix',
                        help='(string) S3 prefix',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--s3_endpoint_url',
                        help='(string) S3 endpoint URL',
                        type=str,
                        default=os.environ.get("S3_ENDPOINT_URL", None))                            
    parser.add_argument('--framework',
                        help='(string) tensorflow or mxnet',
                        type=str,
                        default='tensorflow')
    parser.add_argument('--pretrained_s3_bucket',
                        help='(string) S3 bucket for pre-trained model',
                        type=str)
    parser.add_argument('--pretrained_s3_prefix',
                        help='(string) S3 prefix for pre-trained model',
                        type=str,
                        default='sagemaker')
    parser.add_argument('--aws_region',
                        help='(string) AWS region',
                        type=str,
                        default=os.environ.get("AWS_REGION", "us-east-1"))

    args, _ = parser.parse_known_args()
    logger.info("S3 bucket: %s \n S3 prefix: %s \n S3 endpoint URL: %s", args.s3_bucket, args.s3_prefix, args.s3_endpoint_url)

    s3_client = SageS3Client(bucket=args.s3_bucket, s3_prefix=args.s3_prefix, aws_region=args.aws_region, s3_endpoint_url=args.s3_endpoint_url)

    # download model metadata
    # TODO: replace 'agent' with name of each agent
    model_metadata_download = ModelMetadata(bucket=args.s3_bucket,
                                            s3_key=args.model_metadata_s3_key,
                                            region_name=args.aws_region,
                                            s3_endpoint_url=args.s3_endpoint_url,
                                            local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    _, network_type, version = model_metadata_download.get_model_metadata_info()

    # upload model metadata
    model_metadata_upload = ModelMetadata(bucket=args.s3_bucket,
                                          s3_key=get_s3_key(args.s3_prefix, MODEL_METADATA_S3_POSTFIX),
                                          region_name=args.aws_region,
                                          s3_endpoint_url=args.s3_endpoint_url,
                                          local_path=MODEL_METADATA_LOCAL_PATH_FORMAT.format('agent'))
    model_metadata_upload.persist(s3_kms_extra_args=utils.get_s3_kms_extra_args())

    shutil.copy2(model_metadata_download.local_path, SM_MODEL_OUTPUT_DIR)

    success_custom_preset = False
    if args.preset_s3_key:
        preset_local_path = "./markov/presets/preset.py"
        success_custom_preset = s3_client.download_file(s3_key=args.preset_s3_key, local_path=preset_local_path)
        if not success_custom_preset:
            logger.info("Could not download the preset file. Using the default DeepRacer preset.")
        else:
            preset_location = "markov.presets.preset:graph_manager"
            graph_manager = short_dynamic_import(preset_location, ignore_module_case=True)
            success_custom_preset = s3_client.upload_file(
                s3_key=os.path.normpath("%s/presets/preset.py" % args.s3_prefix), local_path=preset_local_path)
            if success_custom_preset:
                logger.info("Using preset: %s" % args.preset_s3_key)

    if not success_custom_preset:
        params_blob = os.environ.get('SM_TRAINING_ENV', '')
        if params_blob:
            params = json.loads(params_blob)
            sm_hyperparams_dict = params["hyperparameters"]
        else:
            sm_hyperparams_dict = {}

        # TODO: each agent should have its own config
        agent_config = {'model_metadata': model_metadata_download,
                        ConfigParams.CAR_CTRL_CONFIG.value: {ConfigParams.LINK_NAME_LIST.value: [],
                                           ConfigParams.VELOCITY_LIST.value : {},
                                           ConfigParams.STEERING_LIST.value : {},
                                           ConfigParams.CHANGE_START.value : None,
                                           ConfigParams.ALT_DIR.value : None,
                                           ConfigParams.ACTION_SPACE_PATH.value : model_metadata_download.local_path,
                                           ConfigParams.REWARD.value : None,
                                           ConfigParams.AGENT_NAME.value : 'racecar'}}

        agent_list = list()
        agent_list.append(create_training_agent(agent_config))

        graph_manager, robomaker_hyperparams_json = get_graph_manager(hp_dict=sm_hyperparams_dict,
                                                                      agent_list=agent_list,
                                                                      run_phase_subject=None)

        # Upload hyperparameters to SageMaker shared s3 bucket
        hyperparameters = Hyperparameters(bucket=args.s3_bucket,
                                          s3_key=get_s3_key(args.s3_prefix, HYPERPARAMETER_S3_POSTFIX),
                                          region_name=args.aws_region,
                                          s3_endpoint_url=args.s3_endpoint_url)
        hyperparameters.persist(hyperparams_json=robomaker_hyperparams_json,
                                s3_kms_extra_args=utils.get_s3_kms_extra_args())

        # Attach sample collector to graph_manager only if sample count > 0
        max_sample_count = int(sm_hyperparams_dict.get("max_sample_count", 0))
        if max_sample_count > 0:
            sample_collector = SampleCollector(s3_client=s3_client, s3_prefix=args.s3_prefix,
                                               max_sample_count=max_sample_count,
                                               sampling_frequency=int(sm_hyperparams_dict.get("sampling_frequency", 1)))
            graph_manager.sample_collector = sample_collector

    host_ip_address = utils.get_ip_from_host()
    s3_client.write_ip_config(host_ip_address)
    logger.info("Uploaded IP address information to S3: %s" % host_ip_address)
    use_pretrained_model = args.pretrained_s3_bucket and args.pretrained_s3_prefix
    # Handle backward compatibility
    if use_pretrained_model:
        if version < SIMAPP_VERSION_2 and \
                not utils.has_current_ckpnt_name(args.pretrained_s3_bucket,
                                                 args.pretrained_s3_prefix,
                                                 args.aws_region,
                                                 args.s3_endpoint_url):
            utils.make_compatible(args.pretrained_s3_bucket, args.pretrained_s3_prefix,
                                  args.aws_region, SyncFiles.TRAINER_READY.value)
        # Select the optimal model for the starting weights
        utils.do_model_selection(s3_bucket=args.s3_bucket,
                                 s3_prefix=args.s3_prefix,
                                 region=args.aws_region,
                                 s3_endpoint_url=args.s3_endpoint_url)

        ds_params_instance_pretrained = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                                  bucket_names={'agent':args.pretrained_s3_bucket},
                                                                  base_checkpoint_dir=args.pretrained_checkpoint_dir,
                                                                  s3_folders={'agent':args.pretrained_s3_prefix},
                                                                  s3_endpoint_url=args.s3_endpoint_url)
        data_store_pretrained = S3BotoDataStore(ds_params_instance_pretrained, graph_manager, True)
        data_store_pretrained.load_from_store()

    memory_backend_params = DeepRacerRedisPubSubMemoryBackendParameters(redis_address="localhost",
                                                                        redis_port=6379,
                                                                        run_type=str(RunType.TRAINER),
                                                                        channel=args.s3_prefix,
                                                                        network_type=network_type)

    graph_manager.memory_backend_params = memory_backend_params

    ds_params_instance = S3BotoDataStoreParameters(aws_region=args.aws_region,
                                                   bucket_names={'agent':args.s3_bucket},
                                                   base_checkpoint_dir=args.checkpoint_dir,
                                                   s3_folders={'agent':args.s3_prefix},
                                                   s3_endpoint_url=args.s3_endpoint_url)

    graph_manager.data_store_params = ds_params_instance

    graph_manager.data_store = S3BotoDataStore(ds_params_instance, graph_manager)

    task_parameters = TaskParameters()
    task_parameters.experiment_path = SM_MODEL_OUTPUT_DIR
    task_parameters.checkpoint_save_secs = 20
    if use_pretrained_model:
        task_parameters.checkpoint_restore_path = args.pretrained_checkpoint_dir
    task_parameters.checkpoint_save_dir = args.checkpoint_dir

    # parse the hyperparameters JSON once instead of once per argument
    hyperparams = json.loads(robomaker_hyperparams_json)
    training_worker(
        graph_manager=graph_manager,
        task_parameters=task_parameters,
        user_batch_size=hyperparams["batch_size"],
        user_episode_per_rollout=hyperparams["num_episodes_between_training"]
    )
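
Nearly every upload in these examples threads utils.get_s3_kms_extra_args() into an S3 call. Its body is not shown anywhere above; a plausible sketch, assuming it builds SSE-KMS parameters from an optional KMS key ARN (the environment variable name here is an assumption, not from the source):

import os

def get_s3_kms_extra_args():
    """Return extra S3 kwargs enabling SSE-KMS when a key is configured."""
    kms_key_arn = os.environ.get('S3_KMS_CMK_ARN', '')  # assumed variable name
    if not kms_key_arn:
        return {}  # fall back to the bucket's default encryption
    return {'ServerSideEncryption': 'aws:kms',
            'SSEKMSKeyId': kms_key_arn}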