def test_load_id_no_confdir(local_gcp, mocked_confdir, mocked_alternate_confdir):
    """With both config directories removed, no endpoint ID is reported."""
    # Remove the default and the alternate config directories before any lookup.
    for confdir in (mocked_confdir, mocked_alternate_confdir):
        shutil.rmtree(confdir)

    gcp_with_alt_dir = globus_sdk.LocalGlobusConnectPersonal(
        config_dir=mocked_alternate_confdir
    )

    assert local_gcp.endpoint_id is None
    assert gcp_with_alt_dir.endpoint_id is None
def get_local_endpoint(self):
    """Look up the ID of the local Globus Connect Personal endpoint.

    Make sure Globus Connect Personal is running before calling this.

    Returns:
        str: the local endpoint id, or None if no local endpoint is available
    """
    return globus_sdk.LocalGlobusConnectPersonal().endpoint_id
def test_localep_load_id_alternate_conf_dir(mocked_alternate_confdir, write_gcp_id_file):
    """Endpoint ID comes from the alternate config dir and is held until deleted."""
    config_dir = normalize_config_dir_argument(mocked_alternate_confdir)
    local_ep = globus_sdk.LocalGlobusConnectPersonal(config_dir=config_dir)

    # Nothing has been written to the alternate config dir yet.
    assert local_ep.endpoint_id is None

    write_gcp_id_file("foobar", alternate=True)
    assert local_ep.endpoint_id == "foobar"

    # A rewritten ID file is expected NOT to be picked up while the old
    # value is still held on the object...
    write_gcp_id_file("xyz", alternate=True)
    assert local_ep.endpoint_id == "foobar"

    # ...and to be re-read once the attribute has been deleted.
    del local_ep.endpoint_id
    assert local_ep.endpoint_id == "xyz"
def login(refresh_tokens, force, local_server, browser):
    """Log in through the pilot client and refresh saved local-endpoint options.

    Args:
        refresh_tokens: Forwarded to ``pc.login`` as ``refresh_tokens``.
        force: When already logged in, log out first and log in again;
            also forwarded to ``pc.login``.
        local_server: Negated and forwarded as ``no_local_server``.
        browser: Negated and forwarded as ``no_browser``.

    Returns:
        None. All feedback is written with ``click``.
    """
    pc = pilot.commands.get_pilot_client()
    is_logged_in = pc.is_logged_in()
    if is_logged_in and not force:
        click.echo('You are already logged in.')
        return
    elif is_logged_in and force:
        pc.logout()
    # Snapshot of the profile taken before login so a change can be reported
    # at the end of this function.
    prev_info = pc.profile.load_user_info()
    scopes = pc.context.get_value('scopes') or pc.DEFAULT_SCOPES
    pc.login(refresh_tokens=refresh_tokens,
             no_local_server=not local_server,
             no_browser=not browser,
             force=force,
             requested_scopes=scopes)
    if not pc.project.load_all():
        log.debug('NO project info saved, updating...')
        pc.context.update_with_diff()
    click.secho('You have been logged in.', fg='green')
    # Prefer the endpoint saved in the profile; otherwise fall back to whatever
    # Globus Connect Personal reports locally (may be None).
    local_ep = (pc.profile.load_option('local_endpoint') or
                globus_sdk.LocalGlobusConnectPersonal().endpoint_id)
    local_path = pc.profile.load_option('local_endpoint_path')
    log.debug('Local Endpoint set to {}:{}'.format(local_ep, local_path))
    tc = pc.get_transfer_client()
    try:
        # NOTE(review): the nesting of test_local_endpoint() under this `if`
        # should be confirmed against the intended behavior when no local
        # endpoint is known.
        if local_ep:
            name = tc.get_endpoint(local_ep).data['display_name']
            pc.profile.save_option('local_endpoint', local_ep)
            pc.profile.save_option('local_endpoint_path', local_path)
            pc.profile.save_option('local_endpoint_name', name)
            endpoint_utils.test_local_endpoint()
    except pilot.exc.LocalEndpointUnresponsive as leu:
        # An unresponsive endpoint is only a warning; login itself succeeded.
        log.debug('Endpoint UUID: {}, local path: {}'.format(
            local_ep, local_path))
        click.secho(str(leu), fg='yellow')
    # Only print the profile summary when logging in actually changed it.
    if prev_info != pc.profile.load_user_info():
        pitems = [('Name:', pc.profile.name),
                  ('Organization:', pc.profile.organization),
                  ('Local Endpoint:', pc.profile.load_option('local_endpoint_name'))]
        pstr = '\n'.join(['{:16}{}'.format(t, v) for t, v in pitems])
        report = ('Your personal info has been saved as: \n{}\n\n'
                  'You can update these with "pilot profile -i"'.format(pstr))
        click.secho(report, fg='blue')
def get_local_endpoint_id():
    """
    Look up the endpoint ID of the local Globus Connect Personal endpoint.

    Returns
    -------
    endpoint_id : `str`
        The endpoint ID.

    Raises
    ------
    ConnectionError
        If no local endpoint can be detected.
    """
    endpoint_id = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    if endpoint_id:
        return endpoint_id
    raise ConnectionError(
        "Can not find a local Globus Connect endpoint.")
def start_deriva_flow(self, data_path, dcc_id, catalog_id=None,
                      schema=None, server=None,
                      output_dir=None, delete_dir=False, handle_git_repos=True,
                      dry_run=False, test_sub=False, globus=False,
                      disable_validation=False, **kwargs):
    """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        dcc_id (str): The CFDE-recognized DCC ID for this submission.
                If it contains no ':', it is prefixed with ``cfde_registry_dcc:``.
        catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                Default None, to create a new catalog.
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        server (str): The DERIVA server to ingest to.
                Default None, to use the Action Provider-set default.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        dry_run (bool): Should the data be validated and bagged without starting the Flow?
                When True, does not ingest into DERIVA or start the Globus Automate Flow,
                and the return value will not have valid DERIVA Flow information.
                Default False.
        test_sub (bool): Should the submission be run in "test mode" where
                the submission will be ingested into DERIVA and immediately deleted?
                When True, the data will not remain in DERIVA to be viewed and the
                Flow will terminate before any curation step.
        globus (bool): Should the data be transferred using Globus Transfer?
                When False, the archive is uploaded with an HTTPS PUT instead.
                Default False.
        disable_validation (bool): When True, the call to
                ``validation.validate_user_submission`` is skipped.
                Default False.

    Other keyword arguments are passed directly to the ``make_bag()`` function of the
    BDBag API (see https://github.com/fair-research/bdbag for details).

    Returns:
        dict: Always has a ``success`` key. On a dry run only ``message`` is added.
        When the Flow cannot be accessed (HTTP 404) ``success`` is False and
        ``error`` explains why. Otherwise the dict carries ``message``, ``flow_id``,
        ``flow_instance_id``, ``cfde_dest_path``, ``http_link`` and ``globus_web_link``.

    Raises:
        ValueError: If ``schema`` is given together with a named catalog.
        exc.InvalidInput: If ``self.valid_dcc`` rejects the DCC ID.
        exc.EndpointUnavailable: If ``globus`` is True and the local Globus Connect
            Personal endpoint is missing or the directory listing against it fails.
        globus_sdk.GlobusAPIError: Re-raised when starting the Flow fails with
            any status other than 404.
    """
    self.check()
    logger.debug("Startup: Validating input")
    catalogs = self.remote_config['CATALOGS']
    # A named catalog brings its own schema, so an explicit one is rejected.
    if catalog_id in catalogs.keys():
        if schema:
            raise ValueError("You may not specify a schema ('{}') when ingesting to "
                             "a named catalog ('{}'). Retry without specifying "
                             "a schema.".format(schema, catalog_id))
        schema = catalogs[catalog_id]
    # Verify the dcc is valid
    if ':' not in dcc_id:
        dcc_id = f"cfde_registry_dcc:{dcc_id}"
    if not self.valid_dcc(dcc_id):
        raise exc.InvalidInput("Error: The dcc you've specified is not valid. Please double "
                               "check the spelling and try again.")
    # Coerces the BDBag path to a .zip archive
    data_path = bdbag_utils.get_bag(
        data_path, output_dir=output_dir, delete_dir=delete_dir,
        handle_git_repos=handle_git_repos, bdbag_kwargs=kwargs
    )
    # Raises exc.ValidationException if something doesn't match up with the schema
    if not disable_validation:
        validation.validate_user_submission(data_path, schema)
    flow_info = self.remote_config["FLOWS"][self.service_instance]
    # Destination is the configured endpoint path plus the archive's file name.
    dest_path = "{}{}".format(flow_info["cfde_ep_path"], os.path.basename(data_path))
    # If doing dry run, stop here before making Flow input
    if dry_run:
        return {
            "success": True,
            "message": "Dry run validated successfully. No data was transferred."
        }
    logger.debug("Creating input for Flow")
    flow_input = {
        "cfde_ep_id": flow_info["cfde_ep_id"],
        "test_sub": test_sub,
        "dcc_id": dcc_id
    }
    if catalog_id:
        flow_input["catalog_id"] = str(catalog_id)
    if server:
        flow_input["server"] = server
    # Transfer data via globus
    if globus:
        local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        logger.debug(f'Local endpoint: {local_endpoint}')
        if not local_endpoint:
            raise exc.EndpointUnavailable("Globus Connect Personal installation not found. To "
                                          "install, please visit "
                                          "https://www.globus.org/globus-connect-personal")
        # List the archive's directory on the local endpoint as a connectivity
        # and permission check before handing the path to the Flow.
        try:
            self.transfer_client.operation_ls(local_endpoint, path=os.path.dirname(data_path))
            logger.debug("Successfully connected to Globus Connect Personal endpoint "
                         f"'{local_endpoint}'")
        except globus_sdk.exc.TransferAPIError as e:
            # Unable to connect
            if e.http_status == 502:
                raise exc.EndpointUnavailable(f"Unable to connect to local endpoint "
                                              f"'{local_endpoint}'. Please verify that Globus "
                                              "Connect Personal is running.")
            # Forbidden
            elif e.http_status == 403:
                raise exc.EndpointUnavailable(f"Unable to access '{data_path}' on local "
                                              f"endpoint '{local_endpoint}'. Please set the "
                                              "access preferences in Globus Connect Personal "
                                              "to permit access.")
            else:
                raise exc.EndpointUnavailable(e.message)
        # Populate Transfer fields in Flow
        flow_input.update({
            "source_endpoint_id": local_endpoint,
            "source_path": data_path,
            "cfde_ep_path": dest_path,
            "cfde_ep_url": flow_info["cfde_ep_url"],
            "is_directory": False,
        })
    # Otherwise, HTTP PUT the BDBag on the server
    else:
        logger.debug("Uploading with HTTPS PUT")
        data_url = "{}{}".format(flow_info["cfde_ep_url"], dest_path)
        globus_http.upload(data_path, data_url, self.https_authorizer)
        # A False source endpoint together with data_url marks the HTTP path.
        flow_input.update({
            "source_endpoint_id": False,
            "data_url": data_url,
        })
    logger.debug("Flow input populated:\n{}".format(json.dumps(flow_input,
                                                                 indent=4, sort_keys=True)))
    # Get Flow scope
    flow_id = flow_info["flow_id"]
    # Start Flow
    logger.debug("Starting Flow - Submitting data")
    try:
        flow_res = self.flow_client.run_flow(flow_id, self.flow_scope, flow_input)
    except globus_sdk.GlobusAPIError as e:
        # 404 is reported to the caller as a failed result; anything else propagates.
        if e.http_status == 404:
            return {
                "success": False,
                "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                          "Demo Globus Group? Check your membership or apply for access "
                          "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                          "0efb3ba9a670/about")
            }
        else:
            raise
    # Remember this run on the instance.
    self.last_flow_run = {
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"]
    }
    logger.debug("Flow started successfully.")
    return {
        "success": True,
        "message": ("Started DERIVA ingest flow\nYour dataset has been "
                    "submitted\nYou can check the progress with: cfde-submit status\n"),
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"],
        "cfde_dest_path": dest_path,
        "http_link": "{}{}".format(flow_info["cfde_ep_url"], dest_path),
        "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}"
                            .format(flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
    }
def do_transfers(transfer):
    """Build one Globus transfer task from ``p.opt["archiveItems"]`` and submit it.

    For every archive item, date/time placeholders and environment variables
    in its path-like fields are expanded, wildcard sources are globbed, and
    the result is handed to ``prepare_and_add_transfer``. The accumulated
    ``TransferData`` is submitted only when ``p.opt['submitTasks']`` is truthy.

    Items that cannot be processed (wildcard matching nothing, wildcard
    matching both files and directories, missing source) are reported through
    ``log_and_email`` and skipped; they do not abort the run.

    Args:
        transfer: Transfer client handed to ``globus_sdk.TransferData``,
            ``prepare_and_add_transfer`` and ``submit_transfer_task``.
    """
    local_ep_id = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    archive_time = p.opt["archive_date_time"]

    p.opt["task_label"] = archive_time.strftime(p.opt["taskLabel"])
    logging.info(
        f"Creating TransferData object with label '{p.opt['task_label']}'")
    tdata = globus_sdk.TransferData(transfer,
                                    local_ep_id,
                                    p.opt["archiveEndPoint"],
                                    label=p.opt["task_label"])

    logging.info("\nBEGINNING PROCESSING OF archiveItems")
    for item, item_info in p.opt["archiveItems"].items():
        logging.info(f"Starting on {item}")
        # Work on a copy so the configured item is never mutated.
        ii = copy.deepcopy(item_info)
        ii["key"] = item
        logging.verbose(f"Storing {item} as key")

        # substitute date/time strings and env variables in item info
        for ii_key in ("source", "destination", "tarFileName", "cdDirTar"):
            if ii.get(ii_key):
                logging.verbose(f"swapping {ii_key}: {ii[ii_key]}")
                ii[ii_key] = archive_time.strftime(ii[ii_key])
                ii[ii_key] = os.path.expandvars(ii[ii_key])
                logging.verbose(f"after swap {ii_key}: {ii[ii_key]}")

        # initialize number of files to 0
        ii['num_files'] = 0

        add_to_email(f"\nSOURCE: {ii['source']}\n")
        add_to_email(f"DESTINATION: {ii['destination']}\n")

        # Is there a '*' or '?' in the source?
        expanded_sources = []
        ii["glob"] = "*" in ii["source"] or "?" in ii["source"]
        if ii["glob"]:
            logging.verbose(f"Found wildcard in source: {ii['source']}")
            expanded_sources = glob.glob(ii['source'])
            if not expanded_sources:
                log_and_email(
                    f"Source expands to zero targets: {ii['source']}. SKIPPING!",
                    logging.error)
                continue

        if ii["glob"] and not ii.get("tarFileName"):
            # can't handle both dirs and files in a glob
            file_glob = any(os.path.isfile(es) for es in expanded_sources)
            dir_glob = any(os.path.isdir(es) for es in expanded_sources)
            if file_glob and dir_glob:
                log_and_email(
                    f"glob: {ii['source']} expands to files and dirs. Not allowed. Skipping this archive item.",
                    logging.error)
                continue

            # skip files that start with underscore if set to skip them.
            # The test is made on the entry's own name, not on the whole
            # expanded path, and the filtering happens up front so that the
            # last entry actually transferred is the one flagged last_glob.
            if ii.get("skipUnderscoreFiles"):
                expanded_sources = [
                    es for es in expanded_sources
                    if not os.path.basename(os.path.normpath(es)).startswith('_')
                ]

            last_ix = len(expanded_sources) - 1
            for es_ix, es in enumerate(expanded_sources):
                ii["source"] = es
                ii["last_glob"] = es_ix == last_ix
                prepare_and_add_transfer(transfer, tdata, ii)
        else:
            if not ii["glob"] and not os.path.exists(ii["source"]):
                log_and_email(
                    f"{ii['source']} does not exist. Skipping this archive item.",
                    logging.error)
                continue
            # setting last glob to True for tarring with a glob so expected file size/number is checked
            ii["last_glob"] = True
            prepare_and_add_transfer(transfer, tdata, ii)

    # submit all tasks for transfer
    if p.opt['submitTasks']:
        submit_transfer_task(transfer, tdata)
def upload(dataframe, destination, metadata, gcp, update, test, dry_run,
           verbose, no_analyze):
    """
    Create a search entry and upload this file to the GCS Endpoint.

    The search record is ingested first; the file itself is then sent either
    as a Globus transfer from the local Globus Connect Personal endpoint
    (``gcp`` truthy) or through ``pc.upload``.

    Args:
        dataframe: Local path of the file to upload.
        destination: Remote directory to upload into. When falsy, the
            available directories are listed and nothing is uploaded.
        metadata: Path to a JSON file with user metadata, or None.
        gcp: Use a Globus transfer from the local endpoint instead of
            ``pc.upload``.
        update: Allow replacing an existing search record.
        test: Forwarded to every pilot-client call that accepts it.
        dry_run: Report what would be ingested, then stop before ingesting.
        verbose: With ``dry_run``, also print the metadata to be ingested.
        no_analyze: Forwarded to ``scrape_metadata``.

    Returns:
        None on the not-logged-in, no-destination, dry-run, up-to-date and
        normal completion paths; 1 when the destination listing fails,
        metadata validation fails, the record is an exact match, or a record
        exists and ``update`` is falsy.

    Raises:
        Exception: If ``gcp`` is set but no local endpoint ID is found.

    # TODO: Fault tolerance for interrupted or failed file uploads (rollback)
    """
    pc = pilot.commands.get_pilot_client()
    if not pc.is_logged_in():
        click.echo('You are not logged in.')
        return
    if test:
        click.secho('Using test location: {}'.format(pc.TESTING_DIR),
                    fg='yellow')
        click.secho('Using test index for Globus Search', fg='yellow')
    if not destination:
        path = pc.get_path('', '', test)
        dirs = pc.ls('', '', test)
        click.echo('No Destination Provided. Please select one from the '
                   'directory "{}":\n{}'.format(path, '\t '.join(dirs)))
        return
    # The listing result is discarded; the call is only made for the
    # TransferAPIError it raises when something is wrong with the destination.
    try:
        pc.ls(dataframe, destination, test)
    except globus_sdk.exc.TransferAPIError as tapie:
        if tapie.code == 'ClientError.NotFound':
            url = pc.get_globus_app_url('', test)
            click.secho('Directory does not exist: "{}"\nPlease create it at: '
                        '{}'.format(destination, url), err=True, bg='red')
            return 1
        else:
            click.secho(tapie.message, err=True, bg='red')
            return 1
    if metadata is not None:
        with open(metadata) as mf_fh:
            user_metadata = json.load(mf_fh)
    else:
        user_metadata = {}
    filename = os.path.basename(dataframe)
    prev_metadata = pc.get_search_entry(filename, destination, test)
    url = pc.get_globus_http_url(filename, destination, test)
    new_metadata = scrape_metadata(dataframe, url, no_analyze, test)
    try:
        new_metadata = update_metadata(new_metadata, prev_metadata,
                                       user_metadata)
        subject = pc.get_subject_url(filename, destination, test)
        gmeta = gen_gmeta(subject, pc.GROUP, new_metadata)
    except (RequiredUploadFields, ValidationError) as e:
        click.secho('Error Validating Metadata: {}'.format(e), fg='red')
        return 1
    # Compare the serialized forms of the new and previous records.
    if json.dumps(new_metadata) == json.dumps(prev_metadata):
        click.secho(
            'Files and search entry are an exact match. No update '
            'necessary.', fg='green')
        return 1
    if prev_metadata and not update:
        # The last entry of dc.dates is taken as the last-updated timestamp.
        last_updated = prev_metadata['dc']['dates'][-1]['date']
        dt = datetime.datetime.strptime(last_updated, '%Y-%m-%dT%H:%M:%S.%fZ')
        click.echo('Existing record found for {}, specify -u to update.\n'
                   'Last updated: {: %A, %b %d, %Y}'
                   ''.format(filename, dt))
        return 1
    if dry_run:
        click.echo('Success! (Dry Run -- No changes made.)')
        click.echo(
            'Pre-existing record: {}'.format('yes' if prev_metadata else 'no'))
        click.echo('Version: {}'.format(new_metadata['dc']['version']))
        click.echo('Search Subject: {}\nURL: {}'.format(subject, url))
        if verbose:
            click.echo('Ingesting the following data:')
            click.echo(json.dumps(new_metadata, indent=2))
        return
    click.echo('Ingesting record into search...')
    pc.ingest_entry(gmeta, test)
    click.echo('Success!')
    # When files_modified reports no change, only the record needed ingesting.
    if prev_metadata and not files_modified(new_metadata['files'],
                                            prev_metadata['files']):
        click.echo('Metadata updated, dataframe is already up to date.')
        return
    if gcp:
        local_ep = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        if not local_ep:
            raise Exception('No local GCP client found')
        auth = pc.get_authorizers()['transfer.api.globus.org']
        tc = globus_sdk.TransferClient(authorizer=auth)
        tdata = globus_sdk.TransferData(tc, local_ep, pc.ENDPOINT,
                                        label='{} Transfer'.format(
                                            pc.APP_NAME),
                                        notify_on_succeeded=False,
                                        sync_level='checksum',
                                        encrypt_data=True)
        path = pc.get_path(filename, destination, test)
        tdata.add_item(dataframe, path)
        click.echo('Starting Transfer...')
        transfer_result = tc.submit_transfer(tdata)
        short_path = os.path.join(destination, filename)
        # Record the submitted task in the pilot config's transfer log.
        pilot.config.config.add_transfer_log(transfer_result, short_path)
        click.echo('{}. You can check the status below: \n'
                   'https://app.globus.org/activity/{}/overview\n'
                   'URL will be: {}'.format(transfer_result['message'],
                                            transfer_result['task_id'], url))
    else:
        click.echo('Uploading data...')
        response = pc.upload(dataframe, destination, test)
        if response.status_code == 200:
            click.echo('Upload Successful! URL is \n{}'.format(url))
        else:
            click.echo('Failed with status code: {}'.format(
                response.status_code))
def local_gcp():
    """Build a ``LocalGlobusConnectPersonal`` using its default arguments."""
    gcp = globus_sdk.LocalGlobusConnectPersonal()
    return gcp
def start_deriva_flow(self, data_path, dcc_id, catalog_id=None,
                      schema=None, server=None,
                      output_dir=None, delete_dir=False, handle_git_repos=True,
                      dry_run=False, test_sub=False, verbose=False,
                      force_http=False, **kwargs):
    """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        dcc_id (str): The CFDE-recognized DCC ID for this submission.
        catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                Default None, to create a new catalog.
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        server (str): The DERIVA server to ingest to.
                Default None, to use the Action Provider-set default.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        dry_run (bool): Should the data be validated and bagged without starting the Flow?
                When True, does not ingest into DERIVA or start the Globus Automate Flow,
                and the return value will not have valid DERIVA Flow information.
                Default False.
        test_sub (bool): Should the submission be run in "test mode" where
                the submission will be ingested into DERIVA and immediately deleted?
                When True, the data will not remain in DERIVA to be viewed and the
                Flow will terminate before any curation step.
        verbose (bool): Should intermediate status messages be printed out?
                Default False.

    Keyword Arguments:
        force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                even if Globus Transfer is available? Because Globus Transfer is more
                robust than HTTP, it is highly recommended to leave this False.
                Default False.

    Other keyword arguments are passed directly to the ``make_bag()`` function of the
    BDBag API (see https://github.com/fair-research/bdbag for details).

    Returns:
        dict: Always has a ``success`` key. On a dry run only ``message`` is added.
        When the Flow cannot be accessed (HTTP 404) ``success`` is False and
        ``error`` explains why. Otherwise the dict carries ``message``, ``flow_id``,
        ``flow_instance_id``, ``cfde_dest_path``, ``http_link`` and ``globus_web_link``.

    Raises:
        ValueError: If ``schema`` is given together with a named catalog.
        globus_sdk.GlobusAPIError: Re-raised when starting the Flow fails with
            any status other than 404.
    """
    self.check()
    if verbose:
        print("Startup: Validating input")
    catalogs = self.remote_config['CATALOGS']
    # A named catalog brings its own schema, so an explicit one is rejected.
    if catalog_id in catalogs.keys():
        if schema:
            raise ValueError(
                "You may not specify a schema ('{}') when ingesting to "
                "a named catalog ('{}'). Retry without specifying "
                "a schema.".format(schema, catalog_id))
        schema = catalogs[catalog_id]
    # Coerces the BDBag path to a .zip archive
    data_path = validation.validate_user_submission(
        data_path, schema, output_dir=output_dir, delete_dir=delete_dir,
        handle_git_repos=handle_git_repos, bdbag_kwargs=kwargs)
    flow_info = self.remote_config["FLOWS"][self.service_instance]
    # Destination is the configured endpoint path plus the archive's file name.
    dest_path = "{}{}".format(flow_info["cfde_ep_path"],
                              os.path.basename(data_path))
    # If doing dry run, stop here before making Flow input
    if dry_run:
        return {
            "success": True,
            "message": "Dry run validated successfully. No data was transferred."
        }
    logger.debug("Creating input for Flow")
    flow_input = {
        "cfde_ep_id": flow_info["cfde_ep_id"],
        "test_sub": test_sub,
        "dcc_id": dcc_id
    }
    if catalog_id:
        flow_input["catalog_id"] = str(catalog_id)
    if server:
        flow_input["server"] = server
    # If local EP exists (and not force_http), can use Transfer
    # Local EP fetched now in case GCP started after Client creation
    local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    logger.debug(f'Local endpoint: {local_endpoint}')
    if local_endpoint and not force_http:
        logger.debug(
            "Using local Globus Connect Personal Endpoint '{}'".format(
                local_endpoint))
        # Populate Transfer fields in Flow
        flow_input.update({
            "source_endpoint_id": local_endpoint,
            "source_path": data_path,
            "cfde_ep_path": dest_path,
            "cfde_ep_url": flow_info["cfde_ep_url"],
            "is_directory": False,
        })
    # Otherwise, we must PUT the BDBag on the server
    else:
        # NOTE(review): this branch is also taken when force_http is set while
        # an endpoint exists, in which case the message below is inaccurate.
        logger.debug("GCP Not installed, uploading with HTTP PUT instead")
        data_url = "{}{}".format(flow_info["cfde_ep_url"], dest_path)
        globus_http.upload(data_path, data_url, self.https_authorizer)
        # A False source endpoint together with data_url marks the HTTP path.
        flow_input.update({
            "source_endpoint_id": False,
            "data_url": data_url,
        })
    if verbose:
        print("Flow input populated:\n{}".format(
            json.dumps(flow_input, indent=4, sort_keys=True)))
    # Get Flow scope
    flow_id = flow_info["flow_id"]
    # Start Flow
    if verbose:
        print("Starting Flow - Submitting data")
    try:
        flow_res = self.flow_client.run_flow(flow_id, self.flow_scope,
                                             flow_input)
    except globus_sdk.GlobusAPIError as e:
        # 404 is reported to the caller as a failed result; anything else propagates.
        if e.http_status == 404:
            return {
                "success": False,
                "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                          "Demo Globus Group? Check your membership or apply for access "
                          "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                          "0efb3ba9a670/about")
            }
        else:
            raise
    # Remember this run on the instance.
    self.last_flow_run = {
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"]
    }
    if verbose:
        print("Flow started successfully.")
    return {
        "success": True,
        "message": ("Started DERIVA ingest flow\nYour dataset has been "
                    "submitted\nYou can check the progress with: cfde-submit status\n"
                    ),
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"],
        "cfde_dest_path": dest_path,
        "http_link": "{}{}".format(flow_info["cfde_ep_url"], dest_path),
        "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}".
                            format(flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
    }
def start_deriva_flow(self, data_path, dcc_id, catalog_id=None,
                      schema=None, server=None, dataset_acls=None,
                      output_dir=None, delete_dir=False, handle_git_repos=True,
                      dry_run=False, test_sub=False, verbose=False, **kwargs):
    """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        dcc_id (str): The CFDE-recognized DCC ID for this submission.
        catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                Default None, to create a new catalog.
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        server (str): The DERIVA server to ingest to.
                Default None, to use the Action Provider-set default.
        dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                Default None, to use the CFDE default ACLs.
                NOTE(review): this argument is not referenced anywhere in the
                function body - confirm whether it should be added to the Flow input.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        dry_run (bool): Should the data be validated and bagged without starting the Flow?
                When True, does not ingest into DERIVA or start the Globus Automate Flow,
                and the return value will not have valid DERIVA Flow information.
                Default False.
        test_sub (bool): Should the submission be run in "test mode" where
                the submission will be ingested into DERIVA and immediately deleted?
                When True, the data will not remain in DERIVA to be viewed and the
                Flow will terminate before any curation step.
        verbose (bool): Should intermediate status messages be printed out?
                Default False.

    Keyword Arguments:
        force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                even if Globus Transfer is available? Because Globus Transfer is more
                robust than HTTP, it is highly recommended to leave this False.
                Default False.

    Other keyword arguments are passed directly to the ``make_bag()`` function of the
    BDBag API (see https://github.com/fair-research/bdbag for details).

    Returns:
        dict: Always has a ``success`` key. ``success`` is False with an ``error``
        message when TableSchema validation fails, the HTTP upload returns a
        status of 300 or above, or the Flow cannot be accessed (HTTP 404).
        On a dry run only ``message`` is added. Otherwise the dict carries
        ``message``, ``flow_id``, ``flow_instance_id``, ``cfde_dest_path``,
        ``http_link`` and ``globus_web_link``.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If ``schema`` is given together with a named catalog, if
            ``output_dir`` lies inside ``data_path``, or if the BDBag could not
            be created.
        FileExistsError: If ``output_dir`` already exists.
        globus_sdk.GlobusAPIError: Re-raised when starting the Flow fails with
            any status other than 404.
    """
    if verbose:
        print("Startup: Validating input")
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            "Path '{}' does not exist".format(data_path))
    # A named catalog brings its own schema, so an explicit one is rejected.
    if catalog_id in self.catalogs.keys():
        if schema:
            raise ValueError(
                "You may not specify a schema ('{}') when ingesting to "
                "a named catalog ('{}'). Retry without specifying "
                "a schema.".format(schema, catalog_id))
        schema = self.catalogs[catalog_id]
    # Pull out known kwargs
    # (popped so it is not forwarded to bdbag_api.make_bag with the rest)
    force_http = kwargs.pop("force_http", False)
    if handle_git_repos:
        if verbose:
            print("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            if verbose:
                print("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            if verbose:
                print("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            # NOTE(review): data_path already went through os.path.abspath
            # above - confirm whether this strip can still trigger.
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                          str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path),
                                      new_dir_name)
            # Delete temp dir after archival
            delete_dir = True
    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        if verbose:
            print("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            if verbose:
                print("Copying data to '{}' before creating BDBag".format(
                    output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath(
                    [data_path, output_dir]):
                raise ValueError(
                    "The output_dir ('{}') must not be in data_path ('{}')"
                    .format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future."
                     ).format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError(
                "Failed to create BDBag from {}".format(data_path))
        elif verbose:
            print("BDBag created at '{}'".format(data_path))
    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        if verbose:
            print("Archiving BDBag at '{}' using '{}'".format(
                data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path,
                                              CONFIG["ARCHIVE_FORMAT"])
        if verbose:
            print("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            if verbose:
                print("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path
    # Validate TableSchema in BDBag
    if verbose:
        print("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        return {
            "success": False,
            "error": ("TableSchema invalid due to the following errors: \n{}\n".
                      format(validation_res["error"]))
        }
    elif verbose:
        print("Validation successful")
    # Now BDBag is archived file
    # Set path on destination
    dest_path = "{}{}".format(self.flow_info["cfde_ep_path"],
                              os.path.basename(data_path))
    # If doing dry run, stop here before making Flow input
    if dry_run:
        return {
            "success": True,
            "message": "Dry run validated successfully. No data was transferred."
        }
    # Set up Flow
    if verbose:
        print("Creating input for Flow")
    # If local EP exists (and not force_http), can use Transfer
    # Local EP fetched now in case GCP started after Client creation
    local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    if local_endpoint and not force_http:
        if verbose:
            print(
                "Using local Globus Connect Personal Endpoint '{}'".format(
                    local_endpoint))
        # Populate Transfer fields in Flow
        flow_id = self.flow_info["flow_id"]
        flow_input = {
            "source_endpoint_id": local_endpoint,
            "source_path": data_path,
            "cfde_ep_id": self.flow_info["cfde_ep_id"],
            "cfde_ep_path": dest_path,
            "cfde_ep_url": self.flow_info["cfde_ep_url"],
            "is_directory": False,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server
    # Otherwise, we must PUT the BDBag on the server
    else:
        if verbose:
            print("No Globus Endpoint detected; using HTTP upload instead")
        headers = {}
        self.__https_authorizer.set_authorization_header(headers)
        data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)
        # The whole archive is read into memory before the PUT.
        with open(data_path, 'rb') as bag_file:
            bag_data = bag_file.read()
        put_res = requests.put(data_url, data=bag_data, headers=headers)
        # Regenerate headers on 401
        # (a single retry; a second 401 falls through to the error return below)
        if put_res.status_code == 401:
            self.__https_authorizer.handle_missing_authorization()
            self.__https_authorizer.set_authorization_header(headers)
            put_res = requests.put(data_url, data=bag_data, headers=headers)
        # Error message on failed PUT or any unexpected response
        if put_res.status_code >= 300:
            return {
                "success": False,
                "error": ("Could not upload BDBag to server (error {}):\n{}".format(
                    put_res.status_code, put_res.content))
            }
        elif put_res.status_code != 200:
            print(
                "Warning: HTTP upload returned status code {}, which was unexpected."
                .format(put_res.status_code))
        if verbose:
            print("Upload successful to '{}': {} {}".format(
                data_url, put_res.status_code, put_res.content))
        flow_id = self.flow_info["flow_id"]
        # A False source endpoint together with data_url marks the HTTP path.
        flow_input = {
            "source_endpoint_id": False,
            "data_url": data_url,
            "test_sub": test_sub,
            "dcc_id": dcc_id
        }
        if catalog_id:
            flow_input["catalog_id"] = str(catalog_id)
        if server:
            flow_input["server"] = server
    if verbose:
        print("Flow input populated:\n{}".format(
            json.dumps(flow_input, indent=4, sort_keys=True)))
    # Get Flow scope
    flow_def = self.flow_client.get_flow(flow_id)
    flow_scope = flow_def["globus_auth_scope"]
    # Start Flow
    if verbose:
        print("Starting Flow - Submitting data")
    try:
        flow_res = self.flow_client.run_flow(flow_id, flow_scope, flow_input)
    except globus_sdk.GlobusAPIError as e:
        # 404 is reported to the caller as a failed result; anything else propagates.
        if e.http_status == 404:
            return {
                "success": False,
                "error": ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                          "Demo Globus Group? Check your membership or apply for access "
                          "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                          "0efb3ba9a670/about")
            }
        else:
            raise
    # Remember this run on the instance.
    self.last_flow_run = {
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"]
    }
    if verbose:
        print("Flow started successfully.")
    return {
        "success": True,
        "message": ("Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}".
                    format(flow_id, flow_res["action_id"])),
        "flow_id": flow_id,
        "flow_instance_id": flow_res["action_id"],
        "cfde_dest_path": dest_path,
        "http_link": "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
        "globus_web_link": ("https://app.globus.org/file-manager?origin_id={}&origin_path={}".
                            format(self.flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
    }
def getTokens():
    """Authenticate with Globus and submit a transfer for each archive item.

    Despite its name, this function performs the whole archive workflow:

    1. Load cached tokens from ``p.opt["globusTokenFile"]``; if none are
       available, run the Native App authentication flow and try to cache
       the resulting tokens.
    2. Build a ``TransferClient`` backed by a ``RefreshTokenAuthorizer``.
    3. Auto-activate ``p.opt["archiveEndPoint"]``, prompting the user to
       activate it manually for as long as auto-activation fails.
    4. For every entry in ``p.opt["archiveItems"]``, create
       ``<destination>/<leaf of source>/`` on the archive endpoint and submit
       a recursive transfer from the local Globus Connect Personal endpoint.
       Items with a non-absolute source/destination, or whose destination
       directory already exists, are logged and skipped.

    Returns:
        None

    Raises:
        SystemExit: if the refresh token is rejected (HTTP 401), if no local
            Globus Connect Personal endpoint ID can be found, or if any
            Transfer API call for an item fails unexpectedly.
        globus_sdk.exc.GlobusAPIError: if endpoint activation fails with a
            status other than 401.
    """
    token_file = p.opt["globusTokenFile"]
    archive_ep = p.opt["archiveEndPoint"]

    tokens = None
    try:
        # if we already have tokens, load and use them
        tokens = load_tokens_from_file(token_file)
    except Exception as exc:
        # Best effort: a missing/unreadable cache just means we authenticate.
        logging.debug("Could not load cached tokens from %s: %s",
                      token_file, exc)

    if not tokens:
        # if we need to get tokens, start the Native App authentication process
        tokens = do_native_app_authentication(CLIENT_ID, REDIRECT_URI, SCOPES)
        try:
            save_tokens_to_file(token_file, tokens)
        except Exception as exc:
            # Best effort: the tokens are still usable for this run.
            logging.warning("Could not save tokens to %s: %s",
                            token_file, exc)

    transfer_tokens = tokens['transfer.api.globus.org']

    auth_client = globus_sdk.NativeAppAuthClient(client_id=CLIENT_ID)
    authorizer = globus_sdk.RefreshTokenAuthorizer(
        transfer_tokens['refresh_token'],
        auth_client,
        access_token=transfer_tokens['access_token'],
        expires_at=transfer_tokens['expires_at_seconds'],
        on_refresh=update_tokens_file_on_refresh)
    transfer = globus_sdk.TransferClient(authorizer=authorizer)

    myproxy_lifetime = 720  # in hours. What's the maximum?
    try:
        r = transfer.endpoint_autoactivate(archive_ep, if_expires_in=3600)
        while r["code"] == "AutoActivationFailed":
            print(
                "Endpoint requires manual activation, please use your UCAS name/password for this activation. "
                "You can activate via the command line or via web browser:\n"
                "WEB BROWSER -- Open the following URL in a browser to activate the "
                "endpoint:")
            print(
                f"https://app.globus.org/file-manager?origin_id={archive_ep}"
            )
            print("CMD LINE -- run this from your shell: ")
            print(
                f"globus endpoint activate --myproxy --myproxy-lifetime {myproxy_lifetime} {archive_ep}"
            )
            input("Press ENTER after activating the endpoint:")
            # Re-check activation with the same client and endpoint as above.
            r = transfer.endpoint_autoactivate(archive_ep, if_expires_in=3600)
    except globus_sdk.exc.GlobusAPIError as ex:
        print("endpoint_autoactivation failed.")
        print(ex)
        if ex.http_status == 401:
            sys.exit('Refresh token has expired. '
                     'Please delete refresh-tokens.json and try again.')
        raise

    local_ep_id = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
    if local_ep_id is None:
        # Fail before creating any remote directory: a directory left behind
        # by a failed run would make later runs skip the item as "already
        # exists".
        logging.critical(
            "Could not determine the local Globus Connect Personal endpoint "
            "ID. Is Globus Connect Personal installed and set up?")
        sys.exit(1)

    logging.info("BEGINNING PROCESSING OF archiveItems")
    for item, item_info in p.opt["archiveItems"].items():
        logging.info(f"Transferring {item}")

        if not item_info["source"].startswith('/'):
            logging.error(
                f"{item} source: {item_info['source']} must be absolute. SKIPPING!"
            )
            continue
        if not item_info["destination"].startswith('/'):
            logging.error(
                f"{item} destination: {item_info['destination']} must be absolute. SKIPPING!"
            )
            continue

        # The parent destination must already exist on the archive endpoint.
        try:
            transfer.operation_ls(archive_ep, path=item_info["destination"])
        except globus_sdk.exc.TransferAPIError as e:
            logging.critical(
                f"Destination path ({item_info['destination']}) does not exist on archiveEndPoint."
            )
            logging.critical(e)
            sys.exit(1)

        # get leaf dir from source, and add it to destination
        dirname, leaf = os.path.split(item_info['source'])
        if leaf == '':
            # Source ended with '/', so the leaf is the last part of dirname.
            _, leaf = os.path.split(dirname)
        destination_directory = os.path.join(item_info['destination'],
                                             leaf) + '/'

        # Check if destination_dir already exists, and skip if so
        # TODO: add support to overwrite?
        try:
            transfer.operation_ls(archive_ep, path=destination_directory)
        except globus_sdk.exc.TransferAPIError as e:
            # NotFound is the expected outcome; anything else is fatal.
            if e.code != 'ClientError.NotFound':
                logging.critical(
                    f"Can't ls {archive_ep} : {destination_directory}"
                )
                logging.critical(e)
                sys.exit(1)
        else:
            logging.error(
                f"Destination {destination_directory} already exists on archiveEndPoint. SKIPPING!"
            )
            continue

        # create destination directory
        try:
            logging.info(
                f"Creating destination directory {destination_directory}")
            transfer.operation_mkdir(archive_ep, destination_directory)
        except globus_sdk.exc.TransferAPIError as e:
            logging.critical(
                f"Can't mkdir {archive_ep} : {destination_directory}"
            )
            logging.critical(e)
            sys.exit(1)

        # TODO: set permissions for users to read dir
        # look at https://github.com/globus/automation-examples/blob/master/share_data.py
        # TODO: pass label=item_info["transfer-label"] to TransferData once
        # the label values are known to be acceptable to the Transfer API.
        tdata = globus_sdk.TransferData(transfer, local_ep_id, archive_ep)
        tdata.add_item(item_info["source"],
                       destination_directory,
                       recursive=True)
        try:
            logging.info(
                f"Submitting transfer task - {item_info['transfer-label']}")
            task = transfer.submit_transfer(tdata)
        except globus_sdk.exc.TransferAPIError as e:
            logging.critical("Transfer task submission failed")
            logging.critical(e)
            sys.exit(1)

        logging.info(f"Task ID: {task['task_id']}")
        logging.info(
            f"This task can be monitored via the Web UI: https://app.globus.org/activity/{task['task_id']}"
        )