Ejemplo n.º 1
0
    def load_from_store(self, expected_checkpoint_number=-1):
        """Download each agent's checkpoint files from S3 to the local disk.

        For every ``(agent_key, bucket)`` pair in ``self.params.buckets`` the
        method polls S3 until:

        * no lock file is listed under the agent's prefix (skipped when
          ``self.ignore_lock`` is truthy),
        * the checkpoint state file can be downloaded, and
        * the checkpoint number read from that state file is not lower than
          ``expected_checkpoint_number``.

        It then downloads the matching checkpoint files (everything except
        ``.pb`` files). If no object in the bucket matches the checkpoint named
        in the state file, every non-``.pb`` object is downloaded and the state
        file is rewritten to point at the last entry returned by
        ``_filter_checkpoint_files``.

        Args:
            expected_checkpoint_number: Lowest acceptable checkpoint number.
                While the state file reports a smaller number the method
                sleeps and retries. The default of ``-1`` makes the check
                ``num < -1``.

        Returns:
            ``True`` after all agents have been processed. When an error
            handler runs, ``log_and_exit`` is called instead.
            NOTE(review): ``log_and_exit`` presumably terminates the process;
            if it returns, this method falls through and returns ``None``.
        """
        try:
            base_checkpoint_dir = self.params.base_checkpoint_dir
            for agent_key, bucket in self.params.buckets.items():
                # A single agent uses the base directory directly; several
                # agents each get their own sub-directory.
                if len(self.graph_manager.agents_params) == 1:
                    checkpoint_dir = base_checkpoint_dir
                else:
                    checkpoint_dir = os.path.join(base_checkpoint_dir, agent_key)
                # exist_ok avoids the check-then-create race of exists() + makedirs().
                os.makedirs(checkpoint_dir, exist_ok=True)

                while True:
                    # A client is requested on every attempt, as in the original
                    # polling loop.
                    s3_client = self._get_client()
                    state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir))

                    # wait until lock is removed
                    response = s3_client.list_objects_v2(
                        Bucket=bucket,
                        Prefix=self._get_s3_key(SyncFiles.LOCKFILE.value, agent_key))
                    if "Contents" in response and not self.ignore_lock:
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue

                    try:
                        checkpoint_file_path = os.path.abspath(
                            os.path.join(checkpoint_dir, state_file.path))
                        # fetch checkpoint state file from S3
                        s3_client.download_file(
                            Bucket=bucket,
                            Key=self._get_s3_key(state_file.filename, agent_key),
                            Filename=checkpoint_file_path)
                    except botocore.exceptions.ClientError:
                        # With ignore_lock set a missing state file is reported
                        # as an error; otherwise keep polling.
                        if self.ignore_lock:
                            log_and_exit("Checkpoint not found",
                                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                                         SIMAPP_EVENT_ERROR_CODE_400)
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue
                    except Exception:
                        if self.ignore_lock:
                            log_and_exit("Checkpoint not found",
                                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                                         SIMAPP_EVENT_ERROR_CODE_500)
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue

                    # Mirror the Finished and Ready marker files locally when
                    # they are present in the bucket.
                    for sync_file in (SyncFiles.FINISHED, SyncFiles.TRAINER_READY):
                        sync_key = self._get_s3_key(sync_file.value, agent_key)
                        response = s3_client.list_objects_v2(Bucket=bucket,
                                                             Prefix=sync_key)
                        if "Contents" in response:
                            try:
                                sync_file_path = os.path.abspath(
                                    os.path.join(checkpoint_dir, sync_file.value))
                                s3_client.download_file(Bucket=bucket,
                                                        Key=sync_key,
                                                        Filename=sync_file_path)
                            except Exception:
                                # Deliberately best-effort: a failed marker
                                # download must not abort the checkpoint load.
                                pass

                    checkpoint_state = state_file.read()
                    if checkpoint_state is not None:

                        # if we get a checkpoint that is older than the expected
                        # checkpoint, we wait for the new checkpoint to arrive.
                        if checkpoint_state.num < expected_checkpoint_number:
                            time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                            continue

                        # list_objects_v2 returns at most 1000 keys per call, so
                        # follow the continuation token to see every object.
                        contents = []
                        list_kwargs = {"Bucket": bucket,
                                       "Prefix": self._get_s3_key("", agent_key)}
                        while True:
                            response = s3_client.list_objects_v2(**list_kwargs)
                            contents.extend(response.get("Contents", []))
                            if not response.get("IsTruncated"):
                                break
                            list_kwargs["ContinuationToken"] = response["NextContinuationToken"]

                        if contents:
                            # Check to see if the desired checkpoint is in the bucket
                            has_chkpnt = any(
                                os.path.basename(obj["Key"]).startswith(checkpoint_state.name)
                                for obj in contents)
                            full_key_prefix = os.path.normpath(self.key_prefixes[agent_key]) + "/"
                            for obj in contents:
                                # NOTE(review): replace() strips every occurrence
                                # of the prefix, not only the leading one.
                                filename = os.path.abspath(
                                    os.path.join(checkpoint_dir,
                                                 obj["Key"].replace(full_key_prefix, "")))
                                dirname, basename = os.path.split(filename)
                                # Download all the checkpoints but not the frozen
                                # models since they are not necessary
                                _, file_extension = os.path.splitext(obj["Key"])
                                if file_extension != '.pb' \
                                        and (basename.startswith(checkpoint_state.name) or not has_chkpnt):
                                    os.makedirs(dirname, exist_ok=True)
                                    s3_client.download_file(Bucket=bucket,
                                                            Key=obj["Key"],
                                                            Filename=filename)
                            # Change the coach checkpoint file to point to the latest
                            # available checkpoint, also log that we are changing the
                            # checkpoint.
                            if not has_chkpnt:
                                all_ckpnts = _filter_checkpoint_files(os.listdir(checkpoint_dir))
                                if all_ckpnts:
                                    # Adjacent literals instead of a backslash inside
                                    # the string, which embedded indentation spaces
                                    # in the logged message.
                                    LOG.info("%s not in s3 bucket, downloading all checkpoints "
                                             "and using %s", checkpoint_state.name, all_ckpnts[-1])
                                    state_file.write(all_ckpnts[-1])
                                else:
                                    log_and_exit("No checkpoint files",
                                                 SIMAPP_S3_DATA_STORE_EXCEPTION,
                                                 SIMAPP_EVENT_ERROR_CODE_400)
                    # This agent is done; move on to the next bucket.
                    break
            return True

        except botocore.exceptions.ClientError:
            log_and_exit("Unable to download checkpoint",
                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                         SIMAPP_EVENT_ERROR_CODE_400)
        except Exception:
            log_and_exit("Unable to download checkpoint",
                         SIMAPP_S3_DATA_STORE_EXCEPTION,
                         SIMAPP_EVENT_ERROR_CODE_500)
    def load_from_store(self, expected_checkpoint_number=-1):
        """Download the checkpoint files from S3 to ``self.params.checkpoint_dir``.

        The method polls ``self.params.bucket`` until:

        * no lock file is listed under the key prefix,
        * the checkpoint state file can be downloaded, and
        * the checkpoint number read from that state file is not lower than
          ``expected_checkpoint_number``.

        It then downloads the matching checkpoint files (everything except
        ``.pb`` files). If no object in the bucket matches the checkpoint named
        in the state file, every non-``.pb`` object is downloaded and the state
        file is rewritten to point at the last entry returned by
        ``_filter_checkpoint_files``.

        Args:
            expected_checkpoint_number: Lowest acceptable checkpoint number.
                While the state file reports a smaller number the method
                sleeps and retries. The default of ``-1`` makes the check
                ``num < -1``.

        Returns:
            ``True`` once a polling pass completes. When an error handler
            runs, the error is logged through ``utils.json_format_logger`` and
            ``utils.simapp_exit_gracefully()`` is called instead.
            NOTE(review): ``simapp_exit_gracefully`` presumably terminates the
            process; if it returns from an ``except`` handler, this method
            returns ``None``.
        """
        try:
            checkpoint_dir = self.params.checkpoint_dir
            bucket = self.params.bucket
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(checkpoint_dir, exist_ok=True)

            while True:
                # A client is requested on every attempt, as in the original
                # polling loop.
                s3_client = self._get_client()
                state_file = CheckpointStateFile(os.path.abspath(checkpoint_dir))

                # wait until lock is removed
                response = s3_client.list_objects_v2(
                    Bucket=bucket,
                    Prefix=self._get_s3_key(SyncFiles.LOCKFILE.value))
                if "Contents" in response:
                    time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                    continue

                try:
                    # fetch checkpoint state file from S3
                    s3_client.download_file(Bucket=bucket,
                                            Key=self._get_s3_key(state_file.filename),
                                            Filename=state_file.path)
                except Exception:
                    # State file not available yet: wait and poll again.
                    time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                    continue

                # Mirror the Finished and Ready marker files locally when they
                # are present in the bucket.
                for sync_file in (SyncFiles.FINISHED, SyncFiles.TRAINER_READY):
                    sync_key = self._get_s3_key(sync_file.value)
                    response = s3_client.list_objects_v2(Bucket=bucket,
                                                         Prefix=sync_key)
                    if "Contents" in response:
                        try:
                            sync_file_path = os.path.abspath(
                                os.path.join(checkpoint_dir, sync_file.value))
                            s3_client.download_file(Bucket=bucket,
                                                    Key=sync_key,
                                                    Filename=sync_file_path)
                        except Exception:
                            # Deliberately best-effort: a failed marker download
                            # must not abort the checkpoint load.
                            pass

                checkpoint_state = state_file.read()
                if checkpoint_state is not None:

                    # if we get a checkpoint that is older than the expected
                    # checkpoint, we wait for the new checkpoint to arrive.
                    if checkpoint_state.num < expected_checkpoint_number:
                        time.sleep(SLEEP_TIME_WHILE_WAITING_FOR_DATA_FROM_TRAINER_IN_SECOND)
                        continue

                    # list_objects_v2 returns at most 1000 keys per call, so
                    # follow the continuation token to see every object.
                    contents = []
                    list_kwargs = {"Bucket": bucket,
                                   "Prefix": self._get_s3_key("")}
                    while True:
                        response = s3_client.list_objects_v2(**list_kwargs)
                        contents.extend(response.get("Contents", []))
                        if not response.get("IsTruncated"):
                            break
                        list_kwargs["ContinuationToken"] = response["NextContinuationToken"]

                    if contents:
                        # Check to see if the desired checkpoint is in the bucket
                        has_chkpnt = any(
                            os.path.basename(obj["Key"]).startswith(checkpoint_state.name)
                            for obj in contents)
                        full_key_prefix = os.path.normpath(self.key_prefix) + "/"
                        for obj in contents:
                            # NOTE(review): replace() strips every occurrence of
                            # the prefix, not only the leading one.
                            filename = os.path.abspath(
                                os.path.join(checkpoint_dir,
                                             obj["Key"].replace(full_key_prefix, "")))
                            dirname, basename = os.path.split(filename)
                            # Download all the checkpoints but not the frozen
                            # models since they are not necessary
                            _, file_extension = os.path.splitext(obj["Key"])
                            if file_extension != '.pb' \
                                    and (basename.startswith(checkpoint_state.name) or not has_chkpnt):
                                os.makedirs(dirname, exist_ok=True)
                                s3_client.download_file(Bucket=bucket,
                                                        Key=obj["Key"],
                                                        Filename=filename)
                        # Change the coach checkpoint file to point to the latest
                        # available checkpoint, also log that we are changing the
                        # checkpoint.
                        if not has_chkpnt:
                            all_ckpnts = _filter_checkpoint_files(os.listdir(checkpoint_dir))
                            if all_ckpnts:
                                # Adjacent literals instead of a backslash inside the
                                # string, which embedded indentation spaces in the
                                # logged message.
                                logger.info("%s not in s3 bucket, downloading all checkpoints "
                                            "and using %s", checkpoint_state.name, all_ckpnts[-1])
                                state_file.write(all_ckpnts[-1])
                            else:
                                utils.json_format_logger("No checkpoint files found in {}".format(self.params.bucket),
                                                         **utils.build_user_error_dict(utils.SIMAPP_S3_DATA_STORE_EXCEPTION,
                                                                                       utils.SIMAPP_EVENT_ERROR_CODE_400))
                                utils.simapp_exit_gracefully()
                # Reaching this point ends the polling loop.
                return True

        except botocore.exceptions.ClientError as e:
            utils.json_format_logger("Unable to download checkpoint from {}, {}"
                                     .format(self.params.bucket, e.response['Error']['Code']),
                                     **utils.build_user_error_dict(utils.SIMAPP_S3_DATA_STORE_EXCEPTION,
                                                                   utils.SIMAPP_EVENT_ERROR_CODE_400))
            utils.simapp_exit_gracefully()
        except Exception as e:
            utils.json_format_logger("Unable to download checkpoint from {}, {}"
                                     .format(self.params.bucket, e),
                                     **utils.build_system_error_dict(utils.SIMAPP_S3_DATA_STORE_EXCEPTION,
                                                                     utils.SIMAPP_EVENT_ERROR_CODE_500))
            utils.simapp_exit_gracefully()