def set_task_status(
    bucket: Bucket,
    job_id: str,
    task_id: str,
    state: TaskState,
    worker: Optional[str] = _LOCAL_FQDN,
):
    """Set the status of a task.

    Uploads the JSON serialization of a TaskStatus into a bucket, recording
    its present state.

    Parameters
    ----------
    bucket : Bucket
        The Google Cloud Storage bucket that hosts the given job and task.
    job_id : str
        The ID of the job.
    task_id : str
        The ID of the task.
    state : TaskState
        The state of the task.
    worker : Optional[str]
        An identifier for the worker reporting the state of the task (or None
        if no worker is handling the task).
    """
    if state == TaskState.REQUESTED:
        worker = None
    status = TaskStatus(state, worker)
    blob_path = _task_status_path(job_id, task_id)
    bucket.blob(blob_path).upload_from_string(status.to_bytes())
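# Usage sketch for set_task_status (illustrative only: the bucket name and IDs
# are placeholders). For REQUESTED tasks the worker field is cleared before
# upload, so the default _LOCAL_FQDN value is ignored in this call.
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("thor-jobs")  # hypothetical bucket name
set_task_status(bucket, job_id="job-001", task_id="task-001", state=TaskState.REQUESTED)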
def upload_task_inputs(bucket: Bucket, task: Task, orbit: Orbits):
    """Uploads the inputs required to execute a specific task.

    These inputs are uploaded into the given bucket. The related method
    upload_job_inputs should also be executed, but just once for all tasks in
    a job; it uploads the observations and configuration.

    Parameters
    ----------
    bucket : Bucket
        The bucket hosting the task's job.
    task : Task
        The Task to be executed.
    orbit : Orbits
        The test orbits to use in a THOR run.
    """
    # Upload orbit
    orbit_buf = io.BytesIO()
    orbit.to_csv(orbit_buf)
    orbit_bytes = orbit_buf.getvalue()

    orbit_path = _task_input_path(task.job_id, task.task_id, "orbit.csv")
    logger.info("uploading task input %s", orbit_path)
    bucket.blob(orbit_path).upload_from_string(orbit_bytes)
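# Minimal per-task call (assumes `bucket`, `task`, and `orbit` already exist;
# this writes the task's orbit.csv under its task-specific input path, while
# upload_job_inputs below handles the job-wide configuration and observations).
upload_task_inputs(bucket, task, orbit)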
def _upload_failure(self, bucket: Bucket, result_directory: str, exception: Exception):
    output_blobdir = _task_output_path(self.job_id, self.task_id)

    # format_exception returns a list of strings, so join it into a single
    # string before uploading. Positional arguments keep this compatible with
    # Python 3.10+, where the etype keyword was removed.
    exception_string = "".join(
        traceback.format_exception(type(exception), exception, exception.__traceback__)
    )
    blobpath = posixpath.join(output_blobdir, "error_message.txt")
    logger.error("uploading exception trace to %s", blobpath)
    bucket.blob(blobpath).upload_from_string(exception_string)

    # Upload whatever partial results exist alongside the error message.
    self._upload_results(bucket, result_directory)
    raise NotImplementedError()
def _upload_results(self, bucket: Bucket, result_directory: str):
    # Task-wide directory in the bucket where results go
    output_blobdir = _task_output_path(self.job_id, self.task_id)
    for dirpath, _, filenames in os.walk(result_directory):
        # Trim off the result_directory prefix from dirpath.
        relative_dir = os.path.relpath(dirpath, result_directory)
        for filename in filenames:
            # filepath is the path of the file locally
            filepath = os.path.join(dirpath, filename)
            # blobpath is the path that we want to use remotely.
            blobpath = posixpath.join(
                output_blobdir,
                relative_dir,
                filename,
            )
            logger.debug("uploading %s to %s", filepath, blobpath)
            bucket.blob(blobpath).upload_from_filename(filepath)
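# Side note on the blob paths produced above (standalone illustration):
# os.path.relpath returns "." for files sitting directly in result_directory,
# and posixpath.join keeps that component, so those objects get a literal
# "/./" segment in their names unless the path is normalized first.
import posixpath

print(posixpath.join("jobs/j1/tasks/t1/outputs", ".", "result.csv"))
# -> jobs/j1/tasks/t1/outputs/./result.csv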
def download_task_inputs(
    bucket: Bucket, task: Task
) -> Tuple[Configuration, pd.DataFrame, Orbits]:
    """Download the data required to process this task.

    All data are downloaded into memory, not onto disk anywhere.

    Parameters
    ----------
    bucket : Bucket
        The bucket hosting the task.
    task : Task
        The Task to be performed.

    Returns
    -------
    Tuple[Configuration, pd.DataFrame, Orbits]
        The configuration, observations, and orbits that form the inputs to a
        runTHOR task.
    """
    cfg_path = _job_input_path(task.job_id, "config.yml")
    logger.info("downloading task input %s", cfg_path)
    cfg_bytes = bucket.blob(cfg_path).download_as_string()
    config = Configuration().fromYamlString(cfg_bytes.decode("utf8"))

    obs_path = _job_input_path(task.job_id, "observations.csv")
    logger.info("downloading task input %s", obs_path)
    obs_bytes = bucket.blob(obs_path).download_as_string()
    observations = pd.read_csv(
        io.BytesIO(obs_bytes),
        index_col=False,
        dtype={"obs_id": str},
    )

    orbit_path = _task_input_path(task.job_id, task.task_id, "orbit.csv")
    logger.info("downloading task input %s", orbit_path)
    orbit_bytes = bucket.blob(orbit_path).download_as_string()
    orbit = Orbits.from_csv(io.BytesIO(orbit_bytes))

    return (config, observations, orbit)
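# Worker-side sketch (assumes `bucket` is the job's google.cloud.storage
# Bucket and `task` is the Task claimed by this worker; names are illustrative).
config, observations, orbit = download_task_inputs(bucket, task)
logger.info("loaded %d observations for task %s", len(observations), task.task_id)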
def gcp_covdir_exists(
    bucket: Bucket, repository: str, revision: str, platform: str, suite: str
) -> bool:
    """
    Check if a covdir report exists on the Google Cloud Storage bucket
    """
    path = GCP_COVDIR_PATH.format(
        repository=repository, revision=revision, platform=platform, suite=suite
    )
    blob = bucket.blob(path)
    return blob.exists()
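# Example lookup (all argument values are illustrative; GCP_COVDIR_PATH is a
# module-level template whose exact format is not shown here).
if not gcp_covdir_exists(bucket, "mozilla-central", "abc123def456", "linux", "all"):
    raise Exception("covdir report has not been generated for this revision")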
def upload_job_inputs(bucket: Bucket, job_id: str, config: Configuration, observations: pd.DataFrame):
    """Upload all the inputs required to execute a task.

    These inputs are uploaded into the given bucket. This function uploads the
    inputs that are common to all tasks in a job: the configuration and the
    observations. The related method upload_task_inputs uploads the inputs
    that are specific to a single task, namely the orbits.

    Parameters
    ----------
    bucket : Bucket
        The bucket hosting the job.
    job_id : str
        The ID of the job.
    config : Configuration
        A THOR configuration which the Task executors should use.
    observations : pd.DataFrame
        The preprocessed observations which should be used by task executors.
    """
    # Upload configuration file
    cfg_bytes = config.toYamlString()
    cfg_path = _job_input_path(job_id, "config.yml")
    logger.info("uploading job input %s", cfg_path)
    bucket.blob(cfg_path).upload_from_string(cfg_bytes)

    # Upload observations
    observations_buf = io.BytesIO()
    observations.to_csv(observations_buf, index=False)
    observations_bytes = observations_buf.getvalue()

    observations_path = _job_input_path(job_id, "observations.csv")
    logger.info("uploading job input %s", observations_path)
    bucket.blob(observations_path).upload_from_string(observations_bytes)
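# End-to-end job setup sketch combining the helpers above (assumes `config`,
# `observations`, and `task_orbits` -- an iterable of (Task, Orbits) pairs --
# were built elsewhere; the bucket name and job ID are placeholders).
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("thor-jobs")  # hypothetical bucket name

upload_job_inputs(bucket, "job-001", config, observations)
for task, orbit in task_orbits:
    upload_task_inputs(bucket, task, orbit)
    set_task_status(bucket, task.job_id, task.task_id, TaskState.REQUESTED)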
def get_task_status(bucket: Bucket, job_id: str, task_id: str) -> TaskStatus:
    """Get the status of a task.

    Parameters
    ----------
    bucket : Bucket
        The Google Cloud Storage bucket that hosts the given job and task.
    job_id : str
        The ID of the job.
    task_id : str
        The ID of the task.

    Returns
    -------
    TaskStatus
        The status of the Task.
    """
    blob_path = _task_status_path(job_id, task_id)
    status_str = bucket.blob(blob_path).download_as_string()
    return TaskStatus.from_bytes(status_str)
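# Polling sketch built on get_task_status (illustrative; assumes TaskStatus
# exposes the state it was constructed with as a `state` attribute). It
# returns once a worker has moved the task past the REQUESTED state.
import time

def wait_until_claimed(bucket: Bucket, job_id: str, task_id: str, poll_seconds: int = 30) -> TaskStatus:
    while True:
        status = get_task_status(bucket, job_id, task_id)
        if status.state != TaskState.REQUESTED:
            return status
        time.sleep(poll_seconds)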
def _copy_local_directory_to_gcs(
    _local_path: str, _bucket: Bucket, _gcs_path: str, _n_to_remove: int = 0
):
    for local_file in glob.glob(_local_path + '/**'):
        # Recurse into subdirectories.
        if not os.path.isfile(local_file):
            _copy_local_directory_to_gcs(local_file, _bucket, _gcs_path, _n_to_remove)
            continue
        # Strip the first _n_to_remove characters of the local path and place
        # the remainder under the GCS prefix.
        remote_path_tmp = os.path.join(_gcs_path, local_file[_n_to_remove:])
        if 'part' in remote_path_tmp:
            # Append the current day and hour to the base name of 'part' files.
            now = datetime.now()
            day, hour = now.day, now.hour
            folder = '/'.join(remote_path_tmp.split('/')[:-1]) + '/'
            file_name = remote_path_tmp.split('/')[-1]
            remote_path = (folder + file_name.split('.')[0]
                           + '-{}{}.'.format(day, hour)
                           + '.'.join(file_name.split('.')[1:]))
        else:
            remote_path = remote_path_tmp
        blob = _bucket.blob(remote_path)
        blob.upload_from_filename(local_file)
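# Example invocation (paths are illustrative; assumes `bucket` is a
# google.cloud.storage Bucket). Passing the length of the local prefix as
# _n_to_remove strips it from each file path, so only the relative layout is
# recreated under the GCS prefix.
_copy_local_directory_to_gcs('/tmp/output', bucket, 'exports/daily',
                             _n_to_remove=len('/tmp/output/'))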
def _download(bucket: Bucket, data_path: str):
    # Download the blob at data_path to a local file with the same path.
    blob = bucket.blob(data_path)
    blob.download_to_filename(data_path)
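# Usage sketch for _download (the object path is a placeholder; assumes
# `bucket` exists). The blob is written to the same relative path locally,
# so the parent directory must exist before the call.
import os

os.makedirs(os.path.dirname("data/model.bin"), exist_ok=True)
_download(bucket, "data/model.bin")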
def upload_data_to_bucket(bucket: Bucket):
    """Upload data to a GCS bucket"""
    blob = bucket.blob(object_name)
    blob.upload_from_filename(product_resource_file)
    print("Data from {} has been uploaded to {}".format(
        product_resource_file, bucket.name))
class StorageUtil:
    """
    Auxiliary class for Google Cloud Storage

    Example:
        buckets = ''
        for bucket in StorageUtil.buckets():
            buckets += f'{bucket}\n'

        project_ref = ProjectReference('project_id', 'location')
        strge = StorageUtil('bucket-name', 'all_buckets.txt', project=project_ref).new_content(buckets)
        print(f'File content [{strge.file_name}]: {strge.get_content()}')
    """
    client = storage.Client()

    def __init__(self, bucket_name, file_name=None, project=None,
                 project_id=None, location=None):
        self.project = project if project else ProjectReference(
            project_id, location)
        self._bucket_name = bucket_name
        self._bucket = None
        self._blob = None
        self._file_name = file_name
        if file_name:
            self.set_blob(file_name)

    @property
    def bucket(self):
        if not self._bucket:
            try:
                self._bucket = self.client.get_bucket(self._bucket_name)
            except NotFound:
                self._bucket = Bucket(client=self.client, name=self._bucket_name)
                self._bucket.create(client=self.client,
                                    location=self.project.location)
                logging.info('Bucket {} not found and was created.'.format(
                    self._bucket.name))
        return self._bucket

    @property
    def blob(self):
        if not self._blob:
            raise NoBlobSetException()
        return self._blob

    @property
    def file_name(self):
        if not self._blob:
            raise NoBlobSetException()
        return self._file_name

    def set_blob(self, file_name):
        self._file_name = file_name
        self._blob = self.bucket.get_blob(file_name)
        if not self._blob:
            self._blob = self._bucket.blob(file_name)
            logging.info(f'File not found and was created: {file_name}')
        return self

    def delete_blob(self):
        try:
            self.bucket.delete_blob(self.file_name)
            logging.info(f'File deleted: {self.file_name}')
        except NotFound:
            logging.info(f'File not found: {self.file_name}')
        self._file_name = None
        self._blob = None
        return self

    def new_content(self, new_content: str):
        content_len = 80
        content = (f'{new_content[:content_len]} [...]'
                   if len(new_content) > content_len else new_content)
        logging.info(
            f'Uploading new content to file "{self._file_name}": {repr(content)}')
        self.blob.upload_from_string(new_content)
        return self

    def get_content(self):
        return self.blob.download_as_string()

    @classmethod
    def buckets(cls):
        return [b for b in cls.client.list_buckets()]
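# Quick round-trip sketch for StorageUtil (bucket and file names are
# placeholders; assumes the bucket already exists or a ProjectReference with a
# location is passed so it can be created, and that default GCP credentials
# are available, since storage.Client() runs when the class is defined).
util = StorageUtil('example-bucket', 'notes/hello.txt')
util.new_content('hello, storage')
print(util.get_content())  # b'hello, storage'
util.delete_blob()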