def _open_gcs_url(self, binary) -> object:
    mode = "rb" if binary else "r"
    service_account_json = self._provider.get("service_account_json")
    credentials = None
    if service_account_json:
        try:
            credentials = json.loads(self._provider["service_account_json"])
        except json.decoder.JSONDecodeError as err:
            error_msg = f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}"
            logger.error(error_msg)
            raise ConfigurationError(error_msg) from err

    if credentials:
        credentials = service_account.Credentials.from_service_account_info(credentials)
        client = GCSClient(credentials=credentials, project=credentials._project_id)
    else:
        client = GCSClient.create_anonymous_client()
    file_to_close = smart_open.open(self.full_url, transport_params=dict(client=client), mode=mode)
    return file_to_close
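# --- Hedged usage sketch (not from the source) ---
# A minimal, standalone illustration of the anonymous-client fallback that
# _open_gcs_url above uses: smart_open reads a gs:// URL through a
# google-cloud-storage client passed via transport_params. The bucket and
# object names below are placeholders.
import smart_open
from google.cloud.storage import Client as GCSClient

client = GCSClient.create_anonymous_client()
with smart_open.open("gs://example-public-bucket/example.csv",
                     mode="r",
                     transport_params=dict(client=client)) as fp:
    first_line = fp.readline()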
def main():
    args = get_args()

    # Log in to Box
    LOG.info("Authenticating with Box and impersonating user {}.".format(args.login))
    kms = kms_v1.KeyManagementServiceClient()
    config_ciphertext = open(args.config, 'rb').read()
    config_plaintext = kms.decrypt(args.keyname, config_ciphertext).plaintext
    box = impersonate_mirror_user(get_box_client(config_plaintext), args.login)

    # Log in to GCS and get bucket
    bucket_name = args.bucket.replace("gs://", "")
    LOG.info("Authenticating with GCS and fetching bucket {}.".format(bucket_name))
    bucket = GCSClient().get_bucket(bucket_name)

    # Walk Box, schedule async copies to GCS, and get a list of new blobs.
    # We will also opportunistically form a cache of the Box items.
    LOG.info("Walking Box directories and copying to GCS as needed.")
    box_cache = {'/': box.root_folder()}
    copy_jobs = sync_box_to_gcs(box, bucket, cache=box_cache)

    # Check for and log exceptions. Doing this here also institutes a "pause" between the two sync phases.
    for exc in get_exceptions(copy_jobs):
        LOG.exception(exc)

    # Walk GCS, checking against the cache of Box items, and as needed, schedule async uploads to Box
    LOG.info("Listing GCS blobs and looking for blobs to upload or delete.")
    copy_jobs = sync_gcs_to_box(bucket, box, cache=box_cache)

    # Check for and log exceptions
    for exc in get_exceptions(copy_jobs):
        LOG.exception(exc)

    LOG.info("Synchronization complete.")
def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing to avoid
    # numba JIT time if the arguments are invalid or the user asked for --help.
    import thor.utils.logging
    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(args.preprocessed_observations,
                                            index_col=False,
                                            dtype={"obs_id": str})

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)

    taskqueue_client = TaskQueueClient(bucket, queue)
    manifest = taskqueue_client.launch_job(config, preprocessed_observations, test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)
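# --- Hedged sketch (assumption, not from the source) ---
# The ConnectionParameters/PlainCredentials objects built above are standard pika
# types; outside of thor's TaskQueueConnection wrapper they would be used roughly
# like this. Host, port, credentials, and queue name are placeholders.
import pika

params = pika.ConnectionParameters(
    host="localhost",
    port=5672,
    credentials=pika.PlainCredentials(username="guest", password="guest"),
)
connection = pika.BlockingConnection(params)
channel = connection.channel()
channel.queue_declare(queue="thor-tasks", durable=True)
connection.close()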
def google_storage_bucket(request):
    client = GCSClient()
    bucket_name = f"test_bucket__{request.function.__name__}"
    try:
        bucket = client.create_bucket(bucket_name)
    except google.cloud.exceptions.Conflict:
        logger.warning("bucket %s already exists; tests may be unpredictable", bucket_name)
        bucket = client.bucket(bucket_name)
    yield bucket
    bucket.delete(force=True, client=client)
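# --- Hedged usage sketch (assumption, not from the source) ---
# In its original module the generator above is presumably decorated with
# @pytest.fixture; it is registered explicitly here for illustration. The blob
# name and payload in the consuming test are placeholders.
import pytest

google_storage_bucket = pytest.fixture(google_storage_bucket)

def test_blob_roundtrip(google_storage_bucket):
    blob = google_storage_bucket.blob("roundtrip.txt")
    blob.upload_from_string(b"hello")
    assert blob.download_as_bytes() == b"hello"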
def main(project_id, dataset_id, bucket_name, hpo_id, folder_name):
    """
    Main function to load submission into dataset

    :param project_id: Identifies the project
    :param dataset_id: Identifies the destination dataset
    :param bucket_name: the bucket in GCS containing the archive files
    :param hpo_id: Identifies the HPO site
    :param folder_name: Name of the submission folder to load
    :return:
    """
    bq_client = get_client(project_id)
    gcs_client = GCSClient(project_id)
    site_bucket = get_bucket(bq_client, hpo_id)
    prefix = f'{hpo_id}/{site_bucket}/{folder_name}'
    LOGGER.info(f'Starting jobs for loading {bucket_name}/{prefix} into {dataset_id}')
    _ = load_folder(dataset_id, bq_client, bucket_name, prefix, gcs_client, hpo_id)
    LOGGER.info(f'Successfully loaded {bucket_name}/{prefix} into {dataset_id}')
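# --- Hedged sketch (assumption, not from the source) ---
# Assuming GCSClient above wraps or aliases google.cloud.storage.Client, the
# submission prefix it builds could be inspected with a standard prefix listing.
# Project, bucket, and path components below are placeholders.
from google.cloud import storage

client = storage.Client(project="example-project")
prefix = "example_hpo/example_site_bucket/2024_submission/"
for blob in client.list_blobs("example-bucket", prefix=prefix):
    print(blob.name, blob.size)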
def gcs(self):
    if self._gcs is None:
        self._gcs = GCSClient()
    return self._gcs
def storageclient(self):
    if self._storageclient is None:
        self._storageclient = GCSClient()
    return self._storageclient
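# --- Hedged sketch (not from the source) ---
# Both accessors above follow the same lazy-initialization pattern: the storage
# client is created on first access and cached for reuse. A minimal
# self-contained version of that pattern; the class name is illustrative.
from google.cloud.storage import Client as GCSClient

class LazyStorage:
    def __init__(self):
        self._gcs = None

    @property
    def gcs(self):
        # Build the client once, then reuse it on every subsequent access.
        if self._gcs is None:
            self._gcs = GCSClient()
        return self._gcs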
def test_client_roundtrip(queue_connection, google_storage_bucket, orbits, observations):
    taskqueue_client = client.Client(google_storage_bucket, queue_connection)
    taskqueue_worker = client.Worker(GCSClient(), queue_connection)

    # trim down to 3 orbits
    orbits = Orbits.from_df(orbits.to_df()[:3])
    n_task = 3

    manifest = taskqueue_client.launch_job(test_config, observations, orbits)
    assert len(manifest.task_ids) == n_task

    statuses = taskqueue_client.get_task_statuses(manifest)
    assert len(statuses) == n_task
    assert all(s.state == tasks.TaskState.REQUESTED for s in statuses.values()), \
        "all tasks should initially be in 'requested' state"

    received_tasks = list(taskqueue_worker.poll_for_tasks(poll_interval=0.5, limit=5))
    assert len(received_tasks) == n_task

    statuses = taskqueue_client.get_task_statuses(manifest)
    assert all(s.state == tasks.TaskState.IN_PROGRESS for s in statuses.values()), \
        "all tasks should be in 'in_progress' state once received"

    # Handle the first task. It should be marked as succeeded, but others still
    # in progress.
    taskqueue_worker.handle_task(received_tasks[0])
    statuses = taskqueue_client.get_task_statuses(manifest)
    task1_state, task2_state, task3_state = (
        statuses[received_tasks[0].task_id].state,
        statuses[received_tasks[1].task_id].state,
        statuses[received_tasks[2].task_id].state,
    )
    assert task1_state == tasks.TaskState.SUCCEEDED
    assert task2_state == tasks.TaskState.IN_PROGRESS
    assert task3_state == tasks.TaskState.IN_PROGRESS

    # Download results. We should only have results for the first task.
    with tempfile.TemporaryDirectory(prefix="thor.test_client_roundtrip_1") as outdir:
        taskqueue_client.download_results(manifest, outdir)
        _assert_results_downloaded(outdir, received_tasks[0].task_id)

    # Handle another task.
    taskqueue_worker.handle_task(received_tasks[1])
    statuses = tasks.get_task_statuses(google_storage_bucket, manifest)
    task1_state, task2_state, task3_state = (
        statuses[received_tasks[0].task_id].state,
        statuses[received_tasks[1].task_id].state,
        statuses[received_tasks[2].task_id].state,
    )
    assert task1_state == tasks.TaskState.SUCCEEDED
    assert task2_state == tasks.TaskState.SUCCEEDED
    assert task3_state == tasks.TaskState.IN_PROGRESS

    # Download results. Now we should have results for the first two tasks.
    with tempfile.TemporaryDirectory(prefix="thor.test_client_roundtrip_2") as outdir:
        taskqueue_client.download_results(manifest, outdir)
        _assert_results_downloaded(outdir, received_tasks[0].task_id)
        _assert_results_downloaded(outdir, received_tasks[1].task_id)