Example #1
def set_task_status(
    bucket: Bucket,
    job_id: str,
    task_id: str,
    state: TaskState,
    worker: Optional[str] = _LOCAL_FQDN,
):
    """Set the status of a task.

    Uploads the JSON serialization of a TaskStatus into a bucket, recording its
    present state.

    Parameters
    ----------
    bucket : Bucket
        The Google Cloud Storage bucket that hosts the given job and task.
    job_id : str
        The ID of the job.
    task_id : str
        The ID of the task.
    state : TaskState
        The state of the task.
    worker : Optional[str]
        An identifier for the worker reporting the state of the task (or None if
        no worker is handling the task). When state is TaskState.REQUESTED, the
        recorded worker is always None.
    """

    if state == TaskState.REQUESTED:
        worker = None
    status = TaskStatus(state, worker)
    blob_path = _task_status_path(job_id, task_id)
    bucket.blob(blob_path).upload_from_string(status.to_bytes())
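A minimal usage sketch for the function above, assuming a google-cloud-storage client; the bucket name, job and task IDs, worker name, and the SUCCEEDED member of TaskState are illustrative assumptions, not taken from the original module.

from google.cloud import storage

# Illustrative only: bucket name, IDs, and TaskState.SUCCEEDED are assumptions.
client = storage.Client()
bucket = client.bucket("thor-jobs")
set_task_status(
    bucket,
    job_id="job-2021-01-01",
    task_id="task-0001",
    state=TaskState.SUCCEEDED,
    worker="worker-1.example.com",
)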
Example #2
def upload_task_inputs(bucket: Bucket, task: Task, orbit: Orbits):
    """Uploads the inputs required to execute a specific task.

    These inputs are uploaded into the given bucket. The related method
    upload_job_inputs should also be executed, but just once for all tasks in a
    job; it uploads the observations and configuration.

    Parameters
    ----------
    bucket : Bucket
        The bucket hosting the task's job.
    task : Task
        The Task to be executed.
    orbit : Orbits
        The test orbits to use in a THOR run.

    """

    # Upload orbit
    orbit_buf = io.BytesIO()
    orbit.to_csv(orbit_buf)
    orbit_bytes = orbit_buf.getvalue()

    orbit_path = _task_input_path(task.job_id, task.task_id, "orbit.csv")
    logger.info("uploading task input %s", orbit_path)
    bucket.blob(orbit_path).upload_from_string(orbit_bytes)
Example #3
    def _upload_failure(self, bucket: Bucket, result_directory: str,
                        exception: Exception):
        output_blobdir = _task_output_path(self.job_id, self.task_id)
        # traceback.format_exception returns a list of lines, so join them into a
        # single string; the etype/value/tb keywords were removed in Python 3.10,
        # so pass the arguments positionally.
        exception_string = "".join(traceback.format_exception(
            type(exception),
            exception,
            exception.__traceback__,
        ))
        blobpath = posixpath.join(output_blobdir, "error_message.txt")
        logger.error("uploading exception trace to %s", blobpath)
        bucket.blob(blobpath).upload_from_string(exception_string)
        self._upload_results(bucket, result_directory)

        # Nothing beyond uploading the trace and partial results is handled here.
        raise NotImplementedError()
Example #4
    def _upload_results(self, bucket: Bucket, result_directory: str):
        # Task-wide directory in the bucket where results go
        output_blobdir = _task_output_path(self.job_id, self.task_id)
        for (dirpath, _, filenames) in os.walk(result_directory):
            # Trim off the result_directory prefix from dirpath.
            relative_dir = os.path.relpath(dirpath, result_directory)
            for filename in filenames:
                # filepath is the path of the file locally
                filepath = os.path.join(dirpath, filename)
                # blobpath is the path that we want to use remotely.
                blobpath = posixpath.join(
                    output_blobdir,
                    relative_dir,
                    filename,
                )
                logger.debug("uploading %s to %s", filepath, blobpath)
                bucket.blob(blobpath).upload_from_filename(filepath)
Example #5
def download_task_inputs(
        bucket: Bucket,
        task: Task) -> Tuple[Configuration, pd.DataFrame, Orbits]:
    """Download the data required to process this task.

    All data are downloaded into memory; nothing is written to disk.

    Parameters
    ----------
    bucket : Bucket
        The bucket hosting the task.
    task : Task
        The Task to be performed.

    Returns
    -------
    Tuple[Configuration, pd.DataFrame, Orbits]
        The configuration, observations, and orbits that form the inputs to a
        runTHOR task.

    """

    cfg_path = _job_input_path(task.job_id, "config.yml")
    logger.info("downloading task input %s", cfg_path)
    cfg_bytes = bucket.blob(cfg_path).download_as_string()
    config = Configuration().fromYamlString(cfg_bytes.decode("utf8"))

    obs_path = _job_input_path(task.job_id, "observations.csv")
    logger.info("downloading task input %s", obs_path)
    obs_bytes = bucket.blob(obs_path).download_as_string()
    observations = pd.read_csv(
        io.BytesIO(obs_bytes),
        index_col=False,
        dtype={"obs_id": str},
    )

    orbit_path = _task_input_path(task.job_id, task.task_id, "orbit.csv")
    logger.info("downloading task input %s", orbit_path)
    orbit_bytes = bucket.blob(orbit_path).download_as_string()
    orbit = Orbits.from_csv(io.BytesIO(orbit_bytes))

    return (config, observations, orbit)
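A brief sketch of how a worker might fetch its inputs, assuming a bucket handle and an already-constructed Task instance named task (construction not shown); the logging line is illustrative and reuses the module's logger.

# `bucket` and `task` are assumed to exist already (see the examples above).
config, observations, orbit = download_task_inputs(bucket, task)
logger.info("loaded %d observation rows for task %s",
            len(observations), task.task_id)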
Example #6
def gcp_covdir_exists(bucket: Bucket, repository: str, revision: str,
                      platform: str, suite: str) -> bool:
    """
    Check whether a covdir report exists in the Google Cloud Storage bucket.
    """
    path = GCP_COVDIR_PATH.format(repository=repository,
                                  revision=revision,
                                  platform=platform,
                                  suite=suite)
    blob = bucket.blob(path)
    return blob.exists()
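A short sketch of how such a check might gate report generation; the repository, revision, platform, and suite values are placeholders, and generate_and_upload_report is a hypothetical helper introduced only for illustration.

# Placeholder arguments; skip regeneration when a report is already in the bucket.
if not gcp_covdir_exists(bucket, "mozilla-central", "abc123def456",
                         "linux", "mochitest"):
    generate_and_upload_report()  # hypothetical helper, not part of the example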
Example #7
def upload_job_inputs(bucket: Bucket, job_id: str, config: Configuration,
                      observations: pd.DataFrame):
    """Upload all the inputs required to execute a task.

    These inputs are uploaded into the given bucket. This function uploads the
    inputs that are common to all tasks in a job: the configuration and the
    observations. The related method upload_task_inputs uploads the inputs that
    are specific to a single task, namely the orbits.

    Parameters
    ----------
    bucket : Bucket
        The bucket hosting the job.
    job_id : str
        The ID of the job.
    config : Configuration
        A THOR configuration which the Task executors should use.
    observations : pd.DataFrame
        The preprocessed observations which should be used by task executors.

    """

    # Upload configuration file
    cfg_bytes = config.toYamlString()
    cfg_path = _job_input_path(job_id, "config.yml")
    logger.info("uploading job input %s", cfg_path)
    bucket.blob(cfg_path).upload_from_string(cfg_bytes)

    # Upload observations
    observations_buf = io.BytesIO()
    observations.to_csv(observations_buf, index=False)
    observations_bytes = observations_buf.getvalue()

    observations_path = _job_input_path(job_id, "observations.csv")
    logger.info("uploading job input %s", observations_path)
    bucket.blob(observations_path).upload_from_string(observations_bytes)
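The split described in this docstring and in upload_task_inputs can be sketched as follows; the tasks list and its per-task orbits are assumptions introduced for illustration.

# Shared inputs go up once per job; each task then gets only its own orbit file.
upload_job_inputs(bucket, job_id, config, observations)
for task, task_orbit in zip(tasks, task_orbits):
    upload_task_inputs(bucket, task, task_orbit)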
Example #8
def get_task_status(bucket: Bucket, job_id: str, task_id: str) -> TaskStatus:
    """Get the status of a task.

    Parameters
    ----------
    bucket : Bucket
        The Google Cloud Storage bucket that hosts the given job and task.
    job_id : str
        The ID of the job.
    task_id : str
        The ID of the task.

    Returns
    -------
    TaskStatus
        The status of the Task.
    """

    blob_path = _task_status_path(job_id, task_id)
    status_str = bucket.blob(blob_path).download_as_string()
    return TaskStatus.from_bytes(status_str)
Example #9
def _copy_local_directory_to_gcs(_local_path: str,
                                 _bucket: Bucket,
                                 _gcs_path: str,
                                 _n_to_remove: int = 0):
    for local_file in glob.glob(_local_path + '/**'):
        # Recurse into subdirectories, keeping the same prefix length to strip.
        if not os.path.isfile(local_file):
            _copy_local_directory_to_gcs(local_file, _bucket, _gcs_path,
                                         _n_to_remove)
            continue
        # Drop the first _n_to_remove characters of the local path so only the
        # portion relative to the upload root is joined onto the GCS prefix.
        remote_path_tmp = os.path.join(_gcs_path,
                                       local_file[_n_to_remove:])
        if 'part' in remote_path_tmp:
            # For paths containing 'part' (typically Spark/Hadoop output shards),
            # insert the current day and hour before the file extension.
            now = datetime.now()
            day, hour = now.day, now.hour
            folder = '/'.join(remote_path_tmp.split('/')[:-1]) + '/'
            file_name = remote_path_tmp.split('/')[-1]
            remote_path = (folder + file_name.split('.')[0] +
                           '-{}{}.'.format(day, hour) +
                           '.'.join(file_name.split('.')[1:]))
        else:
            remote_path = remote_path_tmp
        blob = _bucket.blob(remote_path)
        blob.upload_from_filename(local_file)
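A usage sketch under the assumption that _n_to_remove is the number of leading characters to strip from each local path, so only the part relative to the upload root is appended to the GCS prefix; the bucket, paths, and prefix below are illustrative.

local_root = "/tmp/spark-output"
# Strip "/tmp/spark-output/" so files land under exports/daily/... in the bucket.
_copy_local_directory_to_gcs(local_root, bucket, "exports/daily",
                             _n_to_remove=len(local_root) + 1)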
Example #10
def _download(bucket: Bucket, data_path: str):
    # Download the blob named data_path into a local file at the same path.
    blob = bucket.blob(data_path)
    blob.download_to_filename(data_path)
Example #11
def upload_data_to_bucket(bucket: Bucket):
    """Upload data to a GCS bucket"""
    blob = bucket.blob(object_name)
    blob.upload_from_filename(product_resource_file)
    print("Data from {} has being uploaded to {}".format(
        product_resource_file, bucket.name))
Example #12
class StorageUtil:
    """
    Auxiliary class for Google Cloud Storage

    Example:
        buckets = ''
        for bucket in StorageUtil.buckets():
            buckets += f'{bucket}\n'

        project_ref = ProjectReference('project_id', 'location')
        strge = StorageUtil('bucket-name', 'all_buckets.txt', project=project_ref).new_content(buckets)
        print(f'File content [{strge.file_name}]: {strge.get_content()}')
    """

    client = storage.Client()

    def __init__(self,
                 bucket_name,
                 file_name=None,
                 project=None,
                 project_id=None,
                 location=None):
        self.project = project if project else ProjectReference(
            project_id, location)
        self._bucket_name = bucket_name
        self._bucket = None
        self._blob = None
        self._file_name = file_name
        if file_name:
            self.set_blob(file_name)

    @property
    def bucket(self):
        if not self._bucket:
            try:
                self._bucket = self.client.get_bucket(self._bucket_name)
            except NotFound:
                self._bucket = Bucket(client=self.client,
                                      name=self._bucket_name)
                self._bucket.create(client=self.client,
                                    location=self.project.location)
                logging.info('Bucket {} not found and was created.'.format(
                    self._bucket.name))

        return self._bucket

    @property
    def blob(self):
        if not self._blob:
            raise NoBlobSetException()
        return self._blob

    @property
    def file_name(self):
        if not self._blob:
            raise NoBlobSetException()
        return self._file_name

    def set_blob(self, file_name):
        self._file_name = file_name
        self._blob = self.bucket.get_blob(file_name)
        if not self._blob:
            self._blob = self._bucket.blob(file_name)
            logging.info(f'File not found; created a new blob reference: {file_name}')
        return self

    def delete_blob(self):
        try:
            self.bucket.delete_blob(self.file_name)
            logging.info(f'File deleted: {self.file_name}')
        except NotFound:
            logging.info(f'File not found: {self.file_name}')

        self._file_name = None
        self._blob = None
        return self

    def new_content(self, new_content: str):
        content_len = 80
        content = f'{new_content[:content_len]} [...]' if len(
            new_content) > content_len else new_content
        logging.info(
            f'Uploading new content to file "{self._file_name}": {repr(content)}'
        )
        self.blob.upload_from_string(new_content)
        return self

    def get_content(self):
        return self.blob.download_as_string()

    @classmethod
    def buckets(cls):
        return list(cls.client.list_buckets())
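A brief round-trip sketch using only the methods defined above; the bucket name, project settings, and file name are placeholders.

util = StorageUtil('my-bucket', project_id='my-project', location='EU')
util.set_blob('notes.txt').new_content('hello from StorageUtil')
print(util.get_content())  # b'hello from StorageUtil'
util.delete_blob()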