def __getTransferClient(self):
    """Build a Globus TransferClient from this object's authorizer and cache it on self."""
    transfer_client = globus_sdk.TransferClient(authorizer=self.authorizer)
    self.transferClient = transfer_client
# Script fragment: exchange an OAuth2 code for tokens, build a TransferClient,
# and start queuing transfers between the terraref and workbench endpoints.
# NOTE(review): the final TransferData(...) call below is truncated in the
# source file (argument list is cut off) -- recover the rest from the
# original script before running.
auth_code = input('Please enter the code you get after login here: ').strip()
token_response = client.oauth2_exchange_code_for_tokens(auth_code)
# the useful values that you want at the end of this
globus_auth_data = token_response.by_resource_server['auth.globus.org']
globus_transfer_data = token_response.by_resource_server[
    'transfer.api.globus.org']
globus_auth_token = globus_auth_data['access_token']
globus_transfer_token = globus_transfer_data['access_token']
# a GlobusAuthorizer is an auxiliary object we use to wrap the token. In
# more advanced scenarios, other types of GlobusAuthorizers give us
# expressive power
authorizer = globus_sdk.AccessTokenAuthorizer(globus_transfer_token)
tc = globus_sdk.TransferClient(authorizer=authorizer)
# Endpoint IDs come from a config file read via `parser` (defined elsewhere).
terraref = tc.get_endpoint(parser.get("globus", "terraref_endpoint"))
workbench = tc.get_endpoint(parser.get("globus", "workbench_endpoint"))
print("Terraref Endpoint name:", terraref["display_name"] or terraref["canonical_name"])
# Queue one transfer per (working_dir, date) combination.
# NOTE(review): `working_dirs`, `dates`, and `root_dir` are defined elsewhere.
for dir in working_dirs:
    for date in dates:
        dir_to_transfer = "/".join([root_dir, dir, date])
        print(dir_to_transfer)
        tdata = globus_sdk.TransferData(tc, source_endpoint=terraref["id"], destination_endpoint=workbench["id"],
def login(credentials=None, clear_old_tokens=False, **kwargs):
    """Login to Globus services

    Arguments:
    credentials (str or dict): A string filename, string JSON, or dictionary
        with credential and config information.
        By default, looks in ~/mdf/credentials/globus_login.json.
        Contains:
        app_name (str): Name of script/client. This will form the name of
            the token cache file.
        services (list of str): Services to authenticate with (can be
            transfer, search, search_ingest, or mdf).
        index: The default Search index. Only required if services contains
            'search' or 'search_ingest'.
    clear_old_tokens (bool): If True, delete old token file if it exists,
        forcing user to re-login. If False, use existing token file if
        there is one. Default False.

    Returns:
    dict: The clients and authorizers requested, indexed by service name.
        For example, if login() is told to auth with 'search' then the
        search client will be in the 'search' field.

    Raises:
    ValueError: If credentials cannot be read from any supported location.
    """
    NATIVE_CLIENT_ID = "98bfc684-977f-4670-8669-71f8337688e4"
    DEFAULT_CRED_FILENAME = "globus_login.json"
    DEFAULT_CRED_PATH = os.path.expanduser("~/mdf/credentials")
    SCOPES = {
        "transfer": "urn:globus:auth:scope:transfer.api.globus.org:all",
        "search": "urn:globus:auth:scope:search.api.globus.org:search",
        "search_ingest": "urn:globus:auth:scope:search.api.globus.org:all",
        "mdf": "urn:globus:auth:scope:data.materialsdatafacility.org:all"
        # urn:globus:auth:scope:api.materialsdatafacility.org:all"
    }

    def _get_tokens(client, scopes, app_name, force_refresh=False):
        # Cache tokens on disk so the user only has to log in once per
        # app_name; force_refresh wipes the cache and re-runs the flow.
        token_path = os.path.join(DEFAULT_CRED_PATH, app_name + "_tokens.json")
        if force_refresh:
            if os.path.exists(token_path):
                os.remove(token_path)
        if os.path.exists(token_path):
            with open(token_path, "r") as tf:
                tokens = json.load(tf)
        else:
            os.makedirs(DEFAULT_CRED_PATH, exist_ok=True)
            client.oauth2_start_flow(requested_scopes=scopes,
                                     refresh_tokens=True)
            authorize_url = client.oauth2_get_authorize_url()
            print("It looks like this is the first time you're accessing this client.\nPlease log in to Globus at this link:\n", authorize_url)
            auth_code = input(
                "Copy and paste the authorization code here: ").strip()
            print("Thanks!")
            token_response = client.oauth2_exchange_code_for_tokens(auth_code)
            tokens = token_response.by_resource_server
            # Restrict the token cache to the current user before writing.
            os.umask(0o077)
            with open(token_path, "w") as tf:
                json.dump(tokens, tf)
        return tokens

    # Credentials may be a filename, a JSON string, a dict, or absent
    # (fall back to cwd, then the default credential directory).
    if type(credentials) is str:
        try:
            with open(credentials) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                creds = json.loads(credentials)
            except json.JSONDecodeError:
                raise ValueError("Credential string unreadable")
    elif type(credentials) is dict:
        creds = credentials
    else:
        try:
            with open(os.path.join(os.getcwd(),
                                   DEFAULT_CRED_FILENAME)) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                with open(os.path.join(DEFAULT_CRED_PATH,
                                       DEFAULT_CRED_FILENAME)) as cred_file:
                    creds = json.load(cred_file)
            except IOError:
                raise ValueError("Credentials/configuration must be passed as a filename string, JSON string, or dictionary, or provided in '" + DEFAULT_CRED_FILENAME + "' or '" + DEFAULT_CRED_PATH + "'.")

    native_client = globus_sdk.NativeAppAuthClient(NATIVE_CLIENT_ID,
                                                   app_name=creds["app_name"])

    # Normalize the requested services into a flat list of service names.
    # BUG FIX: the original called serv.lower().strip() *before* the type
    # check, so any non-string entry raised AttributeError and the list
    # branch was unreachable. Only lower/strip actual strings.
    servs = []
    for serv in creds.get("services", []):
        if type(serv) is str:
            servs += serv.lower().strip().split(" ")
        else:
            servs += list(serv)
    scopes = " ".join([SCOPES[sc] for sc in servs])

    all_tokens = _get_tokens(native_client, scopes, creds["app_name"],
                             force_refresh=clear_old_tokens)

    clients = {}
    if "transfer" in servs:
        transfer_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["transfer.api.globus.org"]["refresh_token"],
            native_client)
        clients["transfer"] = globus_sdk.TransferClient(
            authorizer=transfer_authorizer)
    # search_ingest supersedes search (both use the same resource server).
    if "search_ingest" in servs:
        ingest_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["search.api.globus.org"]["refresh_token"],
            native_client)
        clients["search_ingest"] = SearchClient(
            default_index=(creds.get("index", None)
                           or kwargs.get("index", None)),
            authorizer=ingest_authorizer)
    elif "search" in servs:
        search_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["search.api.globus.org"]["refresh_token"],
            native_client)
        clients["search"] = SearchClient(
            default_index=(creds.get("index", None)
                           or kwargs.get("index", None)),
            authorizer=search_authorizer)
    if "mdf" in servs:
        # For mdf, the authorizer itself is returned (no client wrapper).
        mdf_authorizer = globus_sdk.RefreshTokenAuthorizer(
            all_tokens["data.materialsdatafacility.org"]["refresh_token"],
            native_client)
        clients["mdf"] = mdf_authorizer
    return clients
def login(globus_client_id):
    """Run the login flow (no refresh tokens) and return a TransferClient."""
    tokens = _login(globus_client_id, refresh_tokens=False)
    access_authorizer = globus.AccessTokenAuthorizer(tokens['transfer_token'])
    return globus.TransferClient(authorizer=access_authorizer)
def _authTransferStart(self):
    """
    _authTransferStart(self)

    DESCRIPTION:
    Handles all of the administrative details of initializing self's
    authorization and transfer client objects. Globus requires a specific
    set of steps to be completed in order to authorize transfer requests.
    The user may be required to visit some URLs in order to retrieve
    authorization tokens for the Globus Python SDK.

    ARGUMENTS:
    self

    EFFECTS:
    Creates two files, self._refreshPath and self._clientIDPath, if needed
    in order to save the refresh token and the Globus client ID.

    RETURN:
    dict with keys:
        'tc' (TransferClient) -- The TransferClient object returned by
            Globus, used to submit transfer requests.
        'authorizer' (RefreshTokenAuthorizer) -- The Authorizer object
            from Globus used to create the transfer client.
    """
    # Py2/py3-compatible prompt (raw_input on py2, input on py3).
    get_input = getattr(__builtins__, 'raw_input', input)
    if not os.path.isfile(self._clientIDPath):
        URL = "http://globus-sdk-python.readthedocs.io/en/latest/tutorial/"
        print(
            "Please go to this URL and follow steps 1 and 2 to obtain a Client ID: {}"
            .format(URL))
        # BUG FIX: the original called bare raw_input (py2-only; the other
        # branch of this function already used the get_input fallback) and
        # discarded the result of .strip() -- str.strip() returns a new
        # string, it does not mutate in place.
        self._clientID = get_input("Please enter the Client ID: ").strip()
        # Save the Client ID in a file
        with open(self._clientIDPath, "w") as f:
            f.write(self._clientID)
    else:
        # Open the Client ID file and read it in
        with open(self._clientIDPath, "r") as f:
            self._clientID = f.readline().strip()
    client = globus_sdk.NativeAppAuthClient(self._clientID)
    if not self.refreshIsValid():
        # The refresh token either doesn't exist or it's not valid
        client.oauth2_start_flow(refresh_tokens=True)
        print('Please go to this URL and login: {0}'.format(
            client.oauth2_get_authorize_url()))
        auth_code = get_input('Please enter the code here: ').strip()
        token_response = client.oauth2_exchange_code_for_tokens(auth_code)
        # Get the data from the transfer API
        globus_transfer_data = token_response.by_resource_server[
            'transfer.api.globus.org']
        # Get the refresh token
        transfer_rt = globus_transfer_data['refresh_token']
        # Save the refresh token
        with open(self._refreshPath, 'w') as f:
            f.write(transfer_rt)
        # Get the access token
        transfer_at = globus_transfer_data['access_token']
        # Get the expiration time
        expires_at_s = globus_transfer_data['expires_at_seconds']
        # Create a refresh authorizer so the client keeps working after the
        # access token expires.
        authorizer = globus_sdk.RefreshTokenAuthorizer(
            transfer_rt, client, access_token=transfer_at,
            expires_at=expires_at_s)
        tc = globus_sdk.TransferClient(authorizer=authorizer)
        # Prompt the user if the endpoints need to be activated
        self._autoActivate(self._srcEndpoint, tc)
        self._autoActivate(self._destEndpoint, tc)
    else:
        # A valid refresh token already exists on disk; reuse it.
        refreshToken = self._getRefreshToken()
        authorizer = globus_sdk.RefreshTokenAuthorizer(refreshToken, client)
        tc = globus_sdk.TransferClient(authorizer=authorizer)
    return {'tc': tc, 'authorizer': authorizer}
def confidential_login(credentials=None):
    """Login to Globus services as a confidential client
    (a client with its own login information).

    Arguments:
    credentials (str or dict): A string filename, string JSON, or dictionary
        with credential and config information.
        By default, looks in ~/mdf/credentials/confidential_globus_login.json.
        Contains:
        client_id (str): The ID of the client.
        client_secret (str): The client's secret for authentication.
        services (list of str): Services to authenticate with (can be
            transfer, search, search_ingest, or mdf).
        index: The default Search index. Only required if services contains
            'search' or 'search_ingest'.

    Returns:
    dict: The clients and authorizers requested, indexed by service name.
        For example, if login() is told to auth with 'search' then the
        search client will be in the 'search' field.

    Raises:
    ValueError: If credentials cannot be read from any supported location.
    """
    DEFAULT_CRED_FILENAME = "confidential_globus_login.json"
    DEFAULT_CRED_PATH = os.path.expanduser("~/mdf/credentials")
    SCOPES = {
        "transfer": "urn:globus:auth:scope:transfer.api.globus.org:all",
        "search": "urn:globus:auth:scope:search.api.globus.org:search",
        "search_ingest": "urn:globus:auth:scope:search.api.globus.org:all",
        "mdf": "urn:globus:auth:scope:data.materialsdatafacility.org:all"
        # urn:globus:auth:scope:api.materialsdatafacility.org:all"
    }
    # Read credentials: filename, JSON string, dict, or default locations
    # (cwd first, then the default credential directory).
    if type(credentials) is str:
        try:
            with open(credentials) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                creds = json.loads(credentials)
            except json.JSONDecodeError:
                raise ValueError("Credentials unreadable or missing")
    elif type(credentials) is dict:
        creds = credentials
    else:
        try:
            with open(os.path.join(os.getcwd(),
                                   DEFAULT_CRED_FILENAME)) as cred_file:
                creds = json.load(cred_file)
        except IOError:
            try:
                with open(os.path.join(DEFAULT_CRED_PATH,
                                       DEFAULT_CRED_FILENAME)) as cred_file:
                    creds = json.load(cred_file)
            except IOError:
                raise ValueError(
                    "Credentials/configuration must be passed as a filename string, JSON string, or dictionary, or provided in '" + DEFAULT_CRED_FILENAME + "' or '" + DEFAULT_CRED_PATH + "'.")

    conf_client = globus_sdk.ConfidentialAppAuthClient(
        creds["client_id"], creds["client_secret"])

    # Normalize requested services into a flat list of service names.
    # BUG FIX (consistent with login()): the original called
    # serv.lower().strip() before the type check, so non-string entries
    # raised AttributeError and the list branch was unreachable.
    servs = []
    for serv in creds["services"]:
        if type(serv) is str:
            servs += serv.lower().strip().split(" ")
        else:
            servs += list(serv)
    scopes = " ".join([SCOPES[sc] for sc in servs])
    # One client-credentials authorizer is shared by every requested client.
    conf_authorizer = globus_sdk.ClientCredentialsAuthorizer(
        conf_client, scopes)

    clients = {}
    if "transfer" in servs:
        clients["transfer"] = globus_sdk.TransferClient(
            authorizer=conf_authorizer)
    # search_ingest supersedes search (same resource server).
    if "search_ingest" in servs:
        clients["search_ingest"] = SearchClient(
            default_index=creds.get("index", None),
            authorizer=conf_authorizer)
    elif "search" in servs:
        clients["search"] = SearchClient(
            default_index=creds.get("index", None),
            authorizer=conf_authorizer)
    if "mdf" in servs:
        # For mdf, the authorizer itself is returned (no client wrapper).
        clients["mdf"] = conf_authorizer
    return clients
def direct(
        files,
        force=False,
        local_path_prefix=sdconfig.sandbox_folder,
        verify_checksum=False,
        network_bandwidth_test=False,
        debug=True,
        verbosity=0):
    """
    Returns:
        a list of files that cannot be transferred by Globus because they
        haven't been published with globus: or gsiftp: access. After all
        Globus transfer jobs are complete, Synda will download the files
        using the HTTP protocol.
    """
    # Transfers are grouped by source endpoint; each group becomes one
    # Globus transfer task.
    globus_transfers = {}
    """
    globus_transfers = {
    <src_endpoint>: {
        "items": [
            {
            "src_path": <src_path>,
            "dst_path": <dst_path>
            }...
        ],
        "task_id": <task_id>
    }
    }
    """
    non_globus_files = []
    for file_ in files:
        # Files not published with the Globus protocol fall back to HTTP.
        if file_.get("attached_parameters").get("protocol") != sdconst.TRANSFER_PROTOCOL_GLOBUS:
            non_globus_files.append(file_)
            continue
        src_endpoint, src_path, path = map_to_globus(file_.get("url"))
        if src_endpoint is None:
            non_globus_files.append(file_)
            continue
        # NOTE(review): dst_directory, client_id, and dst_endpoint are not
        # defined in this function -- presumably module-level globals;
        # verify against the full module.
        dst_path = os.path.join(dst_directory, file_.get("dataset_path"),
                                file_.get("filename"))
        if src_endpoint not in globus_transfers:
            globus_transfers[src_endpoint] = {"task_id": None, "items": []}
        globus_transfers.get(src_endpoint).get("items").append({
            "src_path": src_path,
            "dst_path": dst_path
        })
        sdlog.info("SDDMGLOB-001",
                   "src_endpoint: %s, src_path: %s, dst_path: %s" %
                   (src_endpoint, src_path, dst_path))
    # create a TransferClient object
    authorizer = get_native_app_authorizer(client_id=client_id)
    tc = globus_sdk.TransferClient(authorizer=authorizer)
    for src_endpoint in globus_transfers:
        # activate the ESGF endpoint
        resp = tc.endpoint_autoactivate(src_endpoint, if_expires_in=36000)
        if resp["code"] == "AutoActivationFailed":
            # Fall back to delegate-proxy activation with the ESGF X.509 proxy.
            requirements_data = fill_delegate_proxy_activation_requirements(
                resp.data, sdconfig.esgf_x509_proxy)
            r = tc.endpoint_activate(src_endpoint, requirements_data)
            if r["code"] != "Activated.ClientProxyCredential":
                sdlog.error("SDGLOBUS-028",
                            "Error: Cannot activate the source endpoint: (%s)" % src_endpoint)
                raise FatalException()
        # submit a transfer job
        td = globus_sdk.TransferData(tc, src_endpoint, dst_endpoint)
        for item in globus_transfers.get(src_endpoint).get("items"):
            td.add_item(item.get("src_path"), item.get("dst_path"))
        try:
            task = tc.submit_transfer(td)
            task_id = task.get("task_id")
            print("Submitted Globus transfer: {}".format(task_id))
            globus_transfers.get(src_endpoint)["task_id"] = task_id
        except Exception as e:
            raise Exception("Globus transfer from {} to {} failed due to error: {}".format(
                src_endpoint, dst_endpoint, e))
    # monitor the transfer jobs -- one watcher thread per transfer task
    threads = []
    for src_endpoint in globus_transfers:
        task_id = globus_transfers.get(src_endpoint).get("task_id")
        thread = threading.Thread(target=globus_wait,
                                  args=(tc, task_id, src_endpoint,))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    return non_globus_files
def share_data(args):
    """Recursively transfer a source directory to a shared endpoint and grant
    read access to the requested user/group.

    Falls back to module-level defaults (source_endpoint, shared_endpoint,
    source_path, destination_path) when the corresponding args are unset.
    Exits the process with status 1 on any validation or Transfer API error.
    """
    user_source_endpoint = args.source_endpoint or source_endpoint
    user_shared_endpoint = args.shared_endpoint or shared_endpoint
    if not user_shared_endpoint:
        eprint('Invalid shared endpoint')
        sys.exit(1)
    user_source_path = args.source_path or source_path
    user_destination_path = args.destination_path or destination_path
    if not user_source_path.startswith('/'):
        eprint('Source path must be absolute')
        sys.exit(1)
    if not user_destination_path.startswith('/'):
        eprint('Destination path must be absolute')
        sys.exit(1)
    if args.auth == 'native':
        # get an authorizer if it is a Native App
        authorizer = get_native_app_authorizer(client_id=CLIENT_ID)
    elif args.auth == 'client-credentials':
        secret = args.client_secret or CLIENT_SECRET
        if not secret:
            eprint('--auth client-credentials chosen, but no secret provided!'
                   ' Set "--client-secret <your secret>"')
            sys.exit(1)
        # get an authorizer if it is a Confidential App
        authorizer = get_confidential_app_authorizer(client_id=CLIENT_ID,
                                                     client_secret=secret)
    else:
        raise ValueError('Invalid Authenticator, this script only understands '
                         'Native and Client Credential')
    # look for an identity uuid for the specified identity username
    username_uuid = None
    if args.username:
        ac = globus_sdk.AuthClient(authorizer=authorizer)
        r = ac.get_identities(usernames=args.username)
        if not len(r['identities']):
            eprint('No such identity username \'{}\''.format(args.username))
            exit(1)
        username_uuid = r['identities'][0]['id']
    # create a TransferClient object
    tc = globus_sdk.TransferClient(authorizer=authorizer)
    # check if a destination directory exists at all
    try:
        tc.operation_ls(user_shared_endpoint, path=user_destination_path)
    except TransferAPIError as e:
        eprint(e)
        sys.exit(1)
    # Destination directory name is the leaf of the source path (handles a
    # trailing slash by looking one level up).
    dirname, leaf = os.path.split(user_source_path)
    if leaf == '':
        _, leaf = os.path.split(dirname)
    destination_directory = os.path.join(user_destination_path, leaf) + '/'
    """
    check if a directory with the same name was already transferred to the
    destination path if it was and --delete option is specified, delete the
    directory
    """
    try:
        tc.operation_ls(user_shared_endpoint, path=destination_directory)
        if not args.delete:
            eprint('Destination directory exists. Delete the directory or '
                   'use --delete option')
            sys.exit(1)
        print('Destination directory, {}, exists and will be deleted'.format(
            destination_directory))
        ddata = globus_sdk.DeleteData(tc, user_shared_endpoint,
                                      label='Share Data Example',
                                      recursive=True)
        ddata.add_item(destination_directory)
        print('Submitting a delete task')
        task = tc.submit_delete(ddata)
        print('\ttask_id: {}'.format(task['task_id']))
        tc.task_wait(task['task_id'])
    except TransferAPIError as e:
        # NotFound just means there is nothing to delete; anything else is fatal.
        if e.code != u'ClientError.NotFound':
            eprint(e)
            sys.exit(1)
    # create a destination directory
    try:
        print('Creating destination directory {}'.format(
            destination_directory))
        tc.operation_mkdir(user_shared_endpoint, destination_directory)
    except TransferAPIError as e:
        eprint(e)
        sys.exit(1)
    # grant group/user read access to the destination directory
    if args.user_uuid:
        rule_data = {
            "DATA_TYPE": "access",
            "principal_type": "identity",
            "principal": args.user_uuid,
            "path": destination_directory,
            "permissions": "r",
        }
        try:
            print('Granting user, {}, read access to the destination directory'.
                  format(args.user_uuid))
            tc.add_endpoint_acl_rule(user_shared_endpoint, rule_data)
        except TransferAPIError as e:
            # An already-existing ACL rule is not an error.
            if e.code != u'Exists':
                eprint(e)
                sys.exit(1)
    if username_uuid:
        rule_data = {
            "DATA_TYPE": "access",
            "principal_type": "identity",
            "principal": username_uuid,
            "path": destination_directory,
            "permissions": "r",
        }
        try:
            print('Granting user, {}, read access to the destination directory'.
                  format(username_uuid))
            tc.add_endpoint_acl_rule(user_shared_endpoint, rule_data)
        except TransferAPIError as e:
            if e.code != u'Exists':
                eprint(e)
                sys.exit(1)
    if args.group_uuid:
        rule_data = {
            "DATA_TYPE": "access",
            "principal_type": "group",
            "principal": args.group_uuid,
            "path": destination_directory,
            "permissions": "r",
        }
        try:
            print('Granting group, {}, read access to '.format(
                args.group_uuid))
            tc.add_endpoint_acl_rule(user_shared_endpoint, rule_data)
        except TransferAPIError as e:
            if e.code != u'Exists':
                eprint(e)
                sys.exit(1)
    # transfer data - source directory recursively
    tdata = globus_sdk.TransferData(tc, user_source_endpoint,
                                    user_shared_endpoint,
                                    label='Share Data Example')
    tdata.add_item(user_source_path, destination_directory, recursive=True)
    try:
        print('Submitting a transfer task')
        task = tc.submit_transfer(tdata)
    except TransferAPIError as e:
        eprint(e)
        sys.exit(1)
    print('\ttask_id: {}'.format(task['task_id']))
    print('You can monitor the transfer task programmatically using Globus SDK'
          ', or go to the Web UI, https://www.globus.org/app/activity/{}.'.
          format(task['task_id']))
def clean():
    """Interactively log in as the SDK Tester account and delete its test
    assets: files in /~/ on the Go tutorial endpoints, all bookmarks, and
    all endpoints it owns. Aborts if the logged-in primary identity is not
    the SDK Tester (guards against wiping a personal account).
    """
    # constants
    SDK_USER_ID = "84942ca8-17c4-4080-9036-2f58e0093869"
    GO_EP1_ID = "ddb59aef-6d04-11e5-ba46-22000b92c6ec"
    GO_EP2_ID = "ddb59af0-6d04-11e5-ba46-22000b92c6ec"
    # TODO: remove EP3 when EP1 and EP2 support symlinks
    GO_EP3_ID = "4be6107f-634d-11e7-a979-22000bf2d287"
    CLIENT_ID = 'd0f1d9b0-bd81-4108-be74-ea981664453a'
    SCOPES = 'urn:globus:auth:scope:transfer.api.globus.org:all'
    # py2/py3-compatible prompt
    get_input = getattr(__builtins__, 'raw_input', input)
    # create an authorized transfer client
    client = globus_sdk.NativeAppAuthClient(client_id=CLIENT_ID)
    client.oauth2_start_flow(requested_scopes=SCOPES)
    url = client.oauth2_get_authorize_url()
    print("Login with SDK Tester: \n{0}".format(url))
    auth_code = get_input("Enter auth code: ").strip()
    # get tokens and make a transfer client
    tokens = client.oauth2_exchange_code_for_tokens(
        auth_code).by_resource_server
    globus_transfer_data = tokens['transfer.api.globus.org']
    transfer_rt = globus_transfer_data['refresh_token']
    transfer_at = globus_transfer_data['access_token']
    expires_at_s = globus_transfer_data['expires_at_seconds']
    authorizer = globus_sdk.RefreshTokenAuthorizer(transfer_rt, client,
                                                   access_token=transfer_at,
                                                   expires_at=expires_at_s)
    tc = globus_sdk.TransferClient(authorizer=authorizer)
    # prevent accidental cleaning of a personal account
    auth_client = globus_sdk.AuthClient(authorizer=authorizer)
    res = auth_client.get('/p/whoami')
    if res['identities'][0]["id"] != SDK_USER_ID:  # assume the primary ID
        print("The primary ID was not the SDK Tester, stopping clean")
        return
    # now clean test assets
    # clean SDK Tester's home /~/ on go#ep1 go#ep2 and go#ep3
    ep_ids = [GO_EP1_ID, GO_EP2_ID, GO_EP3_ID]
    task_ids = []
    file_deletions = 0
    for ep_id in ep_ids:
        kwargs = {"notify_on_succeeded": False}  # prevent email spam
        ddata = globus_sdk.DeleteData(tc, ep_id, recursive=True, **kwargs)
        r = tc.operation_ls(ep_id)
        for item in r:
            ddata.add_item("/~/" + item["name"])
            print("deleting {0}: {1}".format(item["type"], item["name"]))
            file_deletions += 1
        # Only submit when there is actually something to delete.
        if len(ddata["DATA"]):
            r = tc.submit_delete(ddata)
            task_ids.append(r["task_id"])
    # clean SDK Tester's bookmarks
    bookmark_deletions = 0
    r = tc.bookmark_list()
    for bookmark in r:
        tc.delete_bookmark(bookmark["id"])
        print("deleting bookmark: {0}".format(bookmark["name"]))
        bookmark_deletions += 1
    # clean endpoints owned by SDK Tester
    # Loop until a full search pass deletes nothing (paging may hide results).
    endpoint_deletions = 0
    cleaning = True
    while (cleaning):
        cleaning = False
        r = tc.endpoint_search(filter_scope="my-endpoints", num_results=None)
        for ep in r:
            tc.delete_endpoint(ep["id"])
            print("deleting endpoint: {0}".format(ep["display_name"]))
            endpoint_deletions += 1
            cleaning = True
    # wait for deletes to complete
    for task_id in task_ids:
        tc.task_wait(task_id, polling_interval=1)
    print("{0} files or folders cleaned".format(file_deletions))
    print("{0} endpoints cleaned".format(endpoint_deletions))
    print("{0} bookmarks cleaned".format(bookmark_deletions))
def get_globus_tc(transfer_token):
    """Wrap a transfer access token in an authorizer and return a TransferClient."""
    token_authorizer = globus_sdk.AccessTokenAuthorizer(transfer_token)
    return globus_sdk.TransferClient(authorizer=token_authorizer)
def client():
    """Construct and return a default (unauthorized) Globus TransferClient."""
    transfer_client = globus_sdk.TransferClient()
    return transfer_client
def main(args):
    """E3SM Data Stager entry point: log in to Globus, extract the requested
    files from a zstash archive, and transfer them (plus a manifest) from the
    source endpoint to the destination endpoint.
    """
    # Obtain Globus tokens
    cli = NativeClient(client_id=client_id, app_name="Data Stager")
    cli.login(no_local_server=True, requested_scopes=scopes,
              refresh_tokens=True, force=args.login)
    authorizers = cli.get_authorizers()
    if args.login:
        # --login only performs authentication, then exits.
        sys.exit(0)
    # Determine source and destination Globus endpoints and directories
    source_endpoint = args.source
    hostname = socket.gethostname()
    if not source_endpoint:
        # Infer the source endpoint from the local hostname prefix.
        source_endpoint = None
        for h, ep in hostname_endpoint.items():
            if hostname.startswith(h):
                source_endpoint = ep
                break
    if not source_endpoint:
        logger.error("The source Globus endpoint is required")
        sys.exit(1)
    try:
        destination_endpoint, destination_dir = args.destination.split(":", 1)
    except ValueError:
        logger.error("Globus destination endpoint and path are incorrect")
        sys.exit(1)
    # Allow a symbolic destination name in place of an endpoint UUID.
    for name, ep in name_endpoint.items():
        if destination_endpoint == name:
            destination_endpoint = ep
            break
    # Try to activate source and destination Globus endpoints
    tc = globus_sdk.TransferClient(
        authorizer=authorizers["transfer.api.globus.org"])
    resp = tc.endpoint_autoactivate(source_endpoint, if_expires_in=36000)
    if resp["code"] == "AutoActivationFailed":
        logger.error(
            "The source endpoint is not active. Please go to https://app.globus.org/file-manager/collections/{} to activate the endpoint."
            .format(source_endpoint))
        sys.exit(1)
    logger.info("The source Globus endpoint has been activated")
    resp = tc.endpoint_autoactivate(destination_endpoint, if_expires_in=36000)
    if resp["code"] == "AutoActivationFailed":
        logger.error(
            "The destination endpoint is not active. Please go to https://app.globus.org/file-manager/collections/{} to activate the endpoint."
            .format(destination_endpoint))
        sys.exit(1)
    logger.info("The destination Globus endpoint has been activated")
    # Load pattern file if provided
    global patterns
    if args.pattern_file:
        with open(args.pattern_file, "r") as f:
            patterns = json.load(f)
    components = []
    if args.component:
        components = args.component.split(",")
    # Data file patterns
    file_patterns = []
    for c in components:
        p = patterns.get(c)
        if isinstance(p, str):
            file_patterns.append(p)
        elif isinstance(p, list):
            file_patterns = file_patterns + p
    file_patterns = file_patterns + args.files
    if not file_patterns:
        file_patterns = ["*"]
    logger.debug("File patterns: {}".format(file_patterns))
    # Restart file patterns
    # NOTE(review): if patterns has no "restart" key (or it is neither str
    # nor list), restart_patterns is never bound and the debug line below
    # raises NameError -- confirm the pattern file always defines it.
    p = patterns.get("restart")
    if isinstance(p, str):
        restart_patterns = [p]
    elif isinstance(p, list):
        restart_patterns = p
    logger.debug("Restart file patterns: {}".format(restart_patterns))
    # Namelist file patterns
    # NOTE(review): same possible NameError as restart_patterns above.
    p = patterns.get("namelist")
    if isinstance(p, str):
        namelist_patterns = [p]
    elif isinstance(p, list):
        namelist_patterns = p
    logger.debug("Namelist file patterns: {}".format(namelist_patterns))
    # Create temporary directory for all zstash files, etc.
    tmp_directory = tempfile.mkdtemp(prefix="stager-", dir=".")
    os.chdir(tmp_directory)
    # Download and open database
    logger.info('Opening index database')
    config.hpss = args.zstash
    hpss_get(config.hpss, DB_FILENAME)
    con = sqlite3.connect(DB_FILENAME, detect_types=sqlite3.PARSE_DECLTYPES)
    cur = con.cursor()
    # Retrieve some configuration settings from database
    for attr in dir(config):
        value = getattr(config, attr)
        if not callable(value) and not attr.startswith("__"):
            cur.execute(u"select value from config where arg=?", (attr, ))
            value = cur.fetchone()[0]
            setattr(config, attr, value)
    config.maxsize = int(config.maxsize)
    config.keep = bool(int(config.keep))
    # The command line arg should always have precedence
    config.keep = True
    if args.zstash is not None:
        config.hpss = args.zstash
    logger.info("Local path: {}".format(config.path))
    logger.info("HPSS path: {}".format(config.hpss))
    logger.info("Max size: {}".format(config.maxsize))
    # Find matching files
    file_matches = []
    for p in file_patterns:
        cur.execute(u"select * from files where name GLOB ? or tar GLOB ?",
                    (p, p))
        file_matches = file_matches + cur.fetchall()
    # For restart/namelist, the first pattern with any match wins.
    restart_matches = []
    for p in restart_patterns:
        cur.execute(
            u"select * from files where name GLOB ? or tar GLOB ? limit 1",
            (p, p))
        restart_matches = cur.fetchall()
        if restart_matches:
            break
    namelist_matches = []
    for p in namelist_patterns:
        cur.execute(
            u"select * from files where name GLOB ? or tar GLOB ? limit 1",
            (p, p))
        namelist_matches = cur.fetchall()
        if namelist_matches:
            break
    logger.debug("Matching files: {}".format(file_matches))
    logger.debug("Matching restart file: {}".format(restart_matches))
    logger.debug("Matching namelist file: {}".format(namelist_matches))
    matches = file_matches + restart_matches + namelist_matches
    # Sort by the filename, tape (so the tar archive), and order within tapes (offset).
    matches.sort(key=lambda x: (x[1], x[5], x[6]))
    """
    Based off the filenames, keep only the last instance of a file.
    This is because we may have different versions of the same file
    across many tars.
    """
    # In-place dedup of the name-sorted rows: keep the last row per filename.
    insert_idx, iter_idx = 0, 1
    for iter_idx in range(1, len(matches)):
        # If the filenames are unique, just increment insert_idx.
        # iter_idx will increment after this iteration.
        if matches[insert_idx][1] != matches[iter_idx][1]:
            insert_idx += 1
        # Always copy over the value at the correct location.
        matches[insert_idx] = matches[iter_idx]
    matches = matches[:insert_idx + 1]
    logger.info(
        "{} matching files including restart and namelist files".format(
            len(matches)))
    # Sort by tape and offset, so that we make sure that extract the files by tape order.
    matches.sort(key=lambda x: (x[5], x[6]))
    # Retrieve from tapes
    if args.workers > 1:
        logger.debug("Running zstash with multiprocessing")
        failures = multiprocess_extract(args.workers, matches, True)
    else:
        failures = extractFiles(matches, True)
    # Close database
    logger.debug('Closing index database')
    con.close()
    if failures:
        logger.error("Encountered an error for files:")
        for fail in failures:
            logger.error("{} in {}".format(fail[1], fail[5]))
        broken_tars = sorted(set([f[5] for f in failures]))
        logger.error("The following tar archives had errors:")
        for tar in broken_tars:
            logger.error(tar)
        sys.exit(1)
    # Create a manifest file
    manifest = []
    for m in matches:
        manifest.append({"filename": m[1], "length": m[2], "md5": m[4]})
    # NOTE(review): if args.m is falsy, manifest_name is never initialized
    # before the += below -- confirm args.m is required by the arg parser.
    if args.m:
        manifest_name = args.m + "-"
    manifest_name += "manifest.json"
    with open(manifest_name, "w+") as f:
        f.write(json.dumps(manifest))
    # Transfer the files downloaded from the zstash archive
    if args.t:
        label = args.t
    else:
        label = "E3SM Data Stager on {}".format(hostname)
    td = globus_sdk.TransferData(tc, source_endpoint, destination_endpoint,
                                 label=label)
    cwd = os.getcwd()
    source_path = os.path.join(cwd, manifest_name)
    destination_path = os.path.join(destination_dir, manifest_name)
    td.add_item(source_path, destination_path)
    for m in matches:
        source_path = os.path.join(cwd, m[1])
        destination_path = os.path.join(destination_dir, m[1])
        td.add_item(source_path, destination_path)
    try:
        task = tc.submit_transfer(td)
        task_id = task.get("task_id")
        logger.info("Submitted Globus transfer: {}".format(task_id))
    except Exception as e:
        logger.error("Globus transfer failed due to error: {}".format(e))
        sys.exit(1)
    if not args.block:
        logger.info(
            "You can monitor the status of the transfer at https://app.globus.org/activity/{}"
            .format(task_id))
        sys.exit(0)
    """
    A Globus transfer job (task) can be in one of the three states:
    ACTIVE, SUCCEEDED, FAILED. The Data Stager polls a status of the
    transfer job (task) from the Globus Transfer service every 15 seconds
    with 60 second timeout limit. If the task is ACTIVE after time runs
    out, 'tc.task_wait()' returns False, and True otherwise.
    """
    last_event_time = None
    while not tc.task_wait(task_id, 60, 15):
        task = tc.get_task(task_id)
        # Get the last error Globus event
        events = tc.task_event_list(task_id, num_results=1,
                                    filter="is_error:1")
        try:
            event = next(events)
        except StopIteration:
            continue
        # Log the error event if it was not yet logged
        if event["time"] != last_event_time:
            last_event_time = event["time"]
            logger.warn(
                "Non-critical Globus Transfer error event: {} at {}".format(
                    event["description"], event["time"]))
            logger.warn("Globus Transfer error details: {}".format(
                event["details"]))
    """
    The Globus transfer job (task) has been terminated (is not ACTIVE).
    Check if the transfer SUCCEEDED or FAILED.
    """
    task = tc.get_task(task_id)
    if task["status"] == "SUCCEEDED":
        logger.info("Globus transfer {} succeeded".format(task_id))
    else:
        logger.error("Globus Transfer task: {}".format(task_id))
        events = tc.task_event_list(task_id, num_results=1,
                                    filter="is_error:1")
        event = next(events)
        logger.error("Globus transfer {} failed due to error: {}".format(
            task_id, event["details"]))
        sys.exit(1)
    if args.e:
        logger.info("Deleting downloaded zstash archives and extracted files")
        os.chdir("..")
        shutil.rmtree(tmp_directory)
# Script fragment: authenticate via the Native App flow, build a
# TransferClient, and load/filter the Deep Blue crawl CSV before batching
# transfers.
# NOTE(review): `native_auth_client`, `args`, and `pd` (pandas) must be
# defined earlier in the full script -- not visible in this chunk.
native_auth_client.oauth2_start_flow()
print("Login Here:\n\n{0}".format(
    native_auth_client.oauth2_get_authorize_url()))
# Authorization code
auth_code = str(input("Input auth code:"))
# Create transfer client
token_response = native_auth_client.oauth2_exchange_code_for_tokens(
    auth_code)
transfer_access_token = token_response.by_resource_server[
    'transfer.api.globus.org']['access_token']
transfer_authorizer = globus_sdk.AccessTokenAuthorizer(
    transfer_access_token)
transfer_client = globus_sdk.TransferClient(authorizer=transfer_authorizer)
deep_blue_crawl_df = pd.read_csv(args.crawl_csv)
# Map the first CSV column to the fifth (presumably path -> file UUID;
# verify against the crawl CSV schema).
file_uuid_mapping = dict()
for index, row in deep_blue_crawl_df.iterrows():
    file_uuid_mapping[row[0]] = row[4]
# Filter files
filtered_files = deep_blue_crawl_df[
    deep_blue_crawl_df.file_uuid.str.endswith(
        args.compression_extension)].sort_values(by=["size_bytes"])
max_size_threshold = args.max_transfer_size  # Just to make sure we don't blow up the Jetstream instance
transferred_files = []
batch_n = 1
def submission_driver(metadata: dict, sub_conf: dict, source_id: str,
                      access_token: str, user_id: str):
    """The driver function for MOC.

    Modifies the status database as steps are completed.

    The pipeline is: setup/auth -> cancel older versions -> data download ->
    data transfer -> extraction -> (optional) curation hibernation ->
    post-curation service ingestion (Search, Publish, Citrine, MRR) -> cleanup.
    Status steps are recorded via ``utils.update_status`` with single-letter
    codes (observed here: "P", "S", "F", "M", "N", "T", "R", "L") —
    NOTE(review): exact code semantics live in ``utils``; confirm there.

    Arguments:
        metadata (dict): The JSON passed to /submit.
        sub_conf (dict): Submission configuration information.
        source_id (str): The source name of this submission.
        access_token (str): The Globus Auth access token for the submitting user.
        user_id (str): The Globus ID of the submitting user.

    Returns:
        dict: ``{"success": True, "source_id": source_id}`` on full success.
        None: on any failure or early-exit path (cancellation, hibernation).
    """
    # Setup
    utils.update_status(source_id, "sub_start", "P", except_on_fail=True)
    utils.modify_status_entry(source_id, {
        "pid": os.getpid(),
        "hibernating": False
    }, except_on_fail=True)
    try:
        # Connect auth
        # CAAC required for user auth later
        mdf_conf_client = globus_sdk.ConfidentialAppAuthClient(
            CONFIG["API_CLIENT_ID"], CONFIG["API_CLIENT_SECRET"])
        mdf_creds = mdf_toolbox.dict_merge(CONFIG["GLOBUS_CREDS"],
                                           {"services": ["transfer"]})
        mdf_clients = mdf_toolbox.confidential_login(**mdf_creds)
        mdf_transfer_client = mdf_clients["transfer"]

        # User auth
        # When coming from curation, the access token (from the curator) is not used
        access_token = access_token.replace("Bearer ", "")
        # Exchange the user's token for a dependent token usable by this service
        dependent_grant = mdf_conf_client.oauth2_get_dependent_tokens(
            access_token)
        # Get specifically Transfer's access token
        # NOTE(review): if no grant matches, user_transfer_token is unbound and
        # the NameError below is swallowed into the "sub_start" failure path.
        for grant in dependent_grant.data:
            if grant["resource_server"] == "transfer.api.globus.org":
                user_transfer_token = grant["access_token"]
        user_transfer_authorizer = globus_sdk.AccessTokenAuthorizer(
            user_transfer_token)
        user_transfer_client = globus_sdk.TransferClient(
            authorizer=user_transfer_authorizer)
    except Exception as e:
        utils.update_status(source_id, "sub_start", "F", text=repr(e),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    # Cancel the previous version(s)
    source_info = utils.split_source_id(source_id)
    # "^" filter: source_id starts with source_name; "<": strictly older version
    scan_res = utils.scan_table(table_name="status",
                                fields=["source_id", "active"],
                                filters=[("source_id", "^",
                                          source_info["source_name"]),
                                         ("source_id", "<", source_id)])
    if not scan_res["success"]:
        utils.update_status(source_id, "sub_start", "F",
                            text=scan_res["error"], except_on_fail=True)
        utils.complete_submission(source_id)
        return
    old_source_ids = [
        oldsub["source_id"] for oldsub in scan_res["results"]
        if oldsub["active"]
    ]
    if old_source_ids:
        utils.update_status(
            source_id, "sub_start", "M",
            text=("The following submissions will be cancelled: {}".format(
                old_source_ids)),
            except_on_fail=True)
        utils.update_status(source_id, "old_cancel", "P", except_on_fail=True)
        for old_source_id in old_source_ids:
            cancel_res = utils.cancel_submission(old_source_id, wait=True)
            if not cancel_res["stopped"]:
                utils.update_status(
                    source_id, "sub_start", "F",
                    text=cancel_res.get(
                        "error", ("Unable to cancel previous "
                                  "submission '{}'").format(old_source_id)),
                    except_on_fail=True)
                utils.complete_submission(source_id)
                return
            if cancel_res["success"]:
                logger.info("{}: Cancelled source_id {}".format(
                    source_id, old_source_id))
            else:
                logger.debug("{}: Stopped source_id {}".format(
                    source_id, old_source_id))
        utils.update_status(source_id, "old_cancel", "S", except_on_fail=True)
    else:
        utils.update_status(source_id, "sub_start", "S", except_on_fail=True)
        utils.update_status(source_id, "old_cancel", "N", except_on_fail=True)

    # NOTE: Cancellation point
    if utils.read_table("status", source_id).get("status", {}).get("cancelled"):
        logger.debug("{}: Cancel signal acknowledged".format(source_id))
        utils.complete_submission(source_id)
        return

    # Working paths for this submission (trailing "/" marks directories)
    local_path = os.path.join(CONFIG["LOCAL_PATH"], source_id) + "/"
    feedstock_file = os.path.join(CONFIG["FEEDSTOCK_PATH"],
                                  source_id + ".json")
    curation_state_file = os.path.join(CONFIG["CURATION_DATA"],
                                       source_id + ".json")
    service_data = os.path.join(CONFIG["SERVICE_DATA"], source_id) + "/"
    os.makedirs(service_data, exist_ok=True)
    num_files = 0

    # Curation skip point
    # sub_conf["curation"] is a str only when re-entering after a curator's
    # decision; otherwise we run the full download/transfer/extract pipeline.
    if type(sub_conf["curation"]) is not str:
        # If we're extracting, download data locally, then set canon source to local
        # This allows non-Globus sources (because to download to Connect's EP)
        if not sub_conf["no_extract"]:
            utils.update_status(source_id, "data_download", "P",
                                except_on_fail=True)
            try:
                # Download from user
                # NOTE(review): dl_res is reused after the loop — relies on
                # the generator yielding at least one final result.
                for dl_res in utils.download_data(
                        user_transfer_client, sub_conf["data_sources"],
                        CONFIG["LOCAL_EP"], local_path,
                        admin_client=mdf_transfer_client, user_id=user_id):
                    if not dl_res["success"]:
                        msg = "During data download: " + dl_res["error"]
                        utils.update_status(source_id, "data_download", "T",
                                            text=msg, except_on_fail=True)
                if not dl_res["success"]:
                    raise ValueError(dl_res["error"])
                num_files = dl_res["total_files"]
            except Exception as e:
                utils.update_status(source_id, "data_download", "F",
                                    text=repr(e), except_on_fail=True)
                utils.complete_submission(source_id)
                return
            utils.update_status(
                source_id, "data_download", "M",
                text=(
                    "{} files will be grouped and extracted (from {} archives)"
                    .format(num_files, dl_res["num_extracted"])),
                except_on_fail=True)
            canon_data_sources = [
                "globus://{}{}".format(CONFIG["LOCAL_EP"], local_path)
            ]
        # If we're not extracting, set canon source to only source
        # Also create local dir with no data to "extract" for dataset entry
        else:
            utils.update_status(source_id, "data_download", "N",
                                except_on_fail=True)
            os.makedirs(local_path)
            canon_data_sources = sub_conf["data_sources"]

        # Move data from canon source(s) to canon dest (if different)
        utils.update_status(source_id, "data_transfer", "P",
                            except_on_fail=True)
        # If not extracting, set up user TC for backup use
        if sub_conf["no_extract"]:
            backup_user_id = user_id
            backup_user_client = user_transfer_client
        else:
            backup_user_id = None
            backup_user_client = None
        for data_source in canon_data_sources:
            if data_source != sub_conf["canon_destination"]:
                logger.debug("Data transfer: '{}' to '{}'".format(
                    data_source, sub_conf["canon_destination"]))
                try:
                    for backup_res in utils.backup_data(
                            mdf_transfer_client, data_source,
                            sub_conf["canon_destination"],
                            acl=sub_conf["storage_acl"],
                            data_client=backup_user_client,
                            data_user=backup_user_id):
                        if not backup_res["success"]:
                            # NOTE(review): message says "download" but this is
                            # the transfer step — likely copy-paste from above.
                            msg = ("During data download: {}".format(
                                backup_res.get("error", "Unknown error")))
                            utils.update_status(source_id, "data_transfer",
                                                "T", text=msg,
                                                except_on_fail=True)
                    if not backup_res["success"]:
                        raise ValueError(backup_res.get("error"))
                    elif not backup_res[
                            sub_conf["canon_destination"]]["success"]:
                        raise ValueError(
                            backup_res[sub_conf["canon_destination"]]["error"])
                except Exception as e:
                    err_text = (
                        "Transfer from '{}' to primary/canon destination '{}' failed: {}"
                        .format(data_source, sub_conf["canon_destination"],
                                str(e)))
                    utils.update_status(source_id, "data_transfer", "F",
                                        text=err_text, except_on_fail=True)
                    return
        utils.update_status(source_id, "data_transfer", "S",
                            except_on_fail=True)

        # Add file info data
        sub_conf["index"]["file"] = {
            "globus_host": sub_conf["canon_destination"],
            "http_host": utils.lookup_http_host(sub_conf["canon_destination"]),
            "local_path": local_path,
        }
        extract_params = {
            "dataset": metadata,
            "extractors": sub_conf["index"],
            "service_data": service_data,
            "feedstock_file": feedstock_file,
            "group_config": mdf_toolbox.dict_merge(
                sub_conf["extraction_config"], CONFIG["GROUPING_RULES"]),
            "validation_info": {
                "project_blocks": sub_conf.get("project_blocks", []),
                "required_fields": sub_conf.get("required_fields", []),
                "allowed_nulls": CONFIG["SCHEMA_NULLS"],
                "base_acl": sub_conf["acl"]
            }
        }

        # NOTE: Cancellation point
        if utils.read_table("status",
                            source_id).get("status", {}).get("cancelled"):
            logger.debug("{}: Cancel signal acknowledged".format(source_id))
            utils.complete_submission(source_id)
            return

        # Extract data
        utils.update_status(source_id, "extracting", "P", except_on_fail=True)
        try:
            extract_res = start_extractors(local_path, extract_params)
            if not extract_res["success"]:
                utils.update_status(source_id, "extracting", "F",
                                    text=extract_res["error"],
                                    except_on_fail=True)
                return
            dataset = extract_res["dataset"]
            num_records = extract_res["num_records"]
            num_groups = extract_res["num_groups"]
            extensions = extract_res["extensions"]
        except Exception as e:
            utils.update_status(source_id, "extracting", "F", text=repr(e),
                                except_on_fail=True)
            utils.complete_submission(source_id)
            return
        else:
            utils.modify_status_entry(source_id, {"extensions": extensions})

        # If nothing in dataset, panic
        if not dataset:
            utils.update_status(source_id, "extracting", "F",
                                text="Could not process dataset entry",
                                except_on_fail=True)
            utils.complete_submission(source_id)
            return
        # If not extracting, show status as skipped
        # Also check if records were extracted inappropriately, flag error in log
        elif sub_conf.get("no_extract"):
            if num_records != 0:
                logger.error(
                    "{}: Records extracted with no_extract flag ({} records)"
                    .format(source_id, num_records))
            utils.update_status(source_id, "extracting", "N",
                                except_on_fail=True)
        else:
            utils.update_status(
                source_id, "extracting", "M",
                text=("{} metadata records extracted out of {} file groups"
                      .format(num_records, num_groups)),
                except_on_fail=True)
            # NOTE(review): "+ 1" presumably counts the dataset entry itself
            # in addition to the records — confirm.
            logger.debug("{}: {} entries extracted".format(
                source_id, num_records + 1))

        # NOTE: Cancellation point
        if utils.read_table("status",
                            source_id).get("status", {}).get("cancelled"):
            logger.debug("{}: Cancel signal acknowledged".format(source_id))
            utils.complete_submission(source_id)
            return

        ###################
        #  Curation step  #
        ###################

        # Trigger curation if required
        if sub_conf.get("curation"):
            utils.update_status(source_id, "curation", "P",
                                except_on_fail=True)
            # Create curation task in curation table
            with open(feedstock_file) as f:
                # Discard dataset entry
                f.readline()
                # Save first few records
                # Append the json-loaded form of records
                # The number of records should be at most the default number,
                # and less if less are present
                # NOTE(review): list comprehension used purely for side
                # effects (appending); the built list is discarded.
                curation_records = []
                [
                    curation_records.append(json.loads(f.readline()))
                    for i in range(
                        min(CONFIG["NUM_CURATION_RECORDS"], num_records))
                ]
            curation_dataset = deepcopy(dataset)
            # Numbers can be extracted into Decimal by DynamoDB, which causes JSON errors
            curation_dataset["mdf"].pop("scroll_id", None)
            curation_dataset["mdf"].pop("version", None)
            curation_task = {
                "source_id": source_id,
                "allowed_curators": sub_conf.get("permission_groups",
                                                 sub_conf["acl"]),
                "dataset": json.dumps(dataset),
                "sample_records": json.dumps(curation_records),
                "submission_info": sub_conf,
                "extraction_summary":
                    ("{} records were extracted out of {} groups from {} files".
                     format(num_records, num_groups, num_files)),
                "curation_start_date": str(datetime.today())
            }
            # If no allowed curators or public allowed, set to public
            if (not curation_task["allowed_curators"]
                    or "public" in curation_task["allowed_curators"]):
                curation_task["allowed_curators"] = ["public"]

            # Create task in database
            create_res = utils.create_curation_task(curation_task)
            if not create_res["success"]:
                utils.update_status(source_id, "curation", "F",
                                    text=create_res.get(
                                        "error",
                                        "Unable to create curation task"),
                                    except_on_fail=True)
                return

            # Save state
            os.makedirs(CONFIG["CURATION_DATA"], exist_ok=True)
            with open(curation_state_file, 'w') as save_file:
                state_data = {
                    "source_id": source_id,
                    "sub_conf": sub_conf,
                    "dataset": dataset
                }
                json.dump(state_data, save_file)
            logger.debug("{}: Saved state for curation".format(source_id))

            # Trigger hibernation
            # The driver exits here; it is re-entered later with
            # sub_conf["curation"] set to the curator's decision string.
            utils.modify_status_entry(source_id, {"hibernating": True},
                                      except_on_fail=True)
            return
        else:
            utils.update_status(source_id, "curation", "N",
                                except_on_fail=True)

    # Returning from curation
    # Submission accepted
    elif sub_conf["curation"].startswith("Accept"):
        # Save curation message
        curation_message = sub_conf["curation"]
        # Load state
        with open(curation_state_file) as save_file:
            state_data = json.load(save_file)
        # Verify source_ids match
        if state_data["source_id"] != source_id:
            logger.error("State data incorrect: '{}' is not '{}'".format(
                state_data["source_id"], source_id))
            utils.update_status(source_id, "curation", "F",
                                text="Submission corrupted",
                                except_on_fail=True)
            return
        # Load state variables back
        sub_conf = state_data["sub_conf"]
        dataset = state_data["dataset"]
        logger.debug("{}: Loaded state from curation".format(source_id))
        # Delete state file
        try:
            os.remove(curation_state_file)
        except FileNotFoundError:
            utils.update_status(
                source_id, "curation", "F",
                text="Unable to cleanly load curation information",
                except_on_fail=True)
            return
        # Delete curation task
        delete_res = utils.delete_from_table("curation", source_id)
        if not delete_res["success"]:
            utils.update_status(source_id, "curation", "F",
                                text=delete_res.get("error",
                                                    "Curation cleanup failed"),
                                except_on_fail=True)
            return
        utils.update_status(source_id, "curation", "M", text=curation_message,
                            except_on_fail=True)

    # Submission rejected
    elif sub_conf["curation"].startswith("Reject"):
        # Delete state file
        try:
            os.remove(curation_state_file)
        except FileNotFoundError:
            logger.error(
                "{}: Unable to delete curation state file '{}'".format(
                    source_id, curation_state_file))
        # Delete curation task
        delete_res = utils.delete_from_table("curation", source_id)
        if not delete_res["success"]:
            logger.error(
                "{}: Unable to delete rejected curation from database: {}".
                format(source_id, delete_res.get("error")))
        utils.update_status(source_id, "curation", "F",
                            text=sub_conf["curation"], except_on_fail=True)
        return

    # Curation invalid
    else:
        utils.update_status(source_id, "curation", "F",
                            text="Unknown curation state: '{}'".format(
                                sub_conf["curation"]),
                            except_on_fail=True)
        return

    ###################
    #  Post-curation  #
    ###################

    # Integrations
    service_res = {}

    # NOTE: Cancellation point
    if utils.read_table("status", source_id).get("status", {}).get("cancelled"):
        logger.debug("{}: Cancel signal acknowledged".format(source_id))
        utils.complete_submission(source_id)
        return

    # MDF Search (mandatory)
    utils.update_status(source_id, "ingest_search", "P", except_on_fail=True)
    search_config = sub_conf["services"].get("mdf_search", {})
    try:
        search_args = {
            "feedstock_file": feedstock_file,
            "source_id": source_id,
            "index": search_config.get("index", CONFIG["INGEST_INDEX"]),
            "delete_existing": True,
            "batch_size": CONFIG["SEARCH_BATCH_SIZE"]
        }
        search_res = utils.search_ingest(**search_args)
        if not search_res["success"]:
            utils.update_status(source_id, "ingest_search", "F",
                                text="; ".join(search_res["errors"]),
                                except_on_fail=True)
            return
    except Exception as e:
        utils.update_status(source_id, "ingest_search", "F", text=repr(e),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return
    else:
        # Handle errors
        if len(search_res["errors"]) > 0:
            utils.update_status(
                source_id, "ingest_search", "F",
                text=(
                    "{} batches of records failed to ingest (up to {} records "
                    "total)").format(len(search_res["errors"]),
                                     (len(search_res["errors"]) *
                                      CONFIG["SEARCH_BATCH_SIZE"])),
                except_on_fail=True)
            utils.complete_submission(source_id)
            return
        utils.update_status(source_id, "ingest_search", "S",
                            except_on_fail=True)
        # Feedstock is no longer needed once fully ingested
        os.remove(feedstock_file)
        service_res["mdf_search"] = "This dataset was ingested to MDF Search."

    # Move files to data_destinations
    if sub_conf.get("data_destinations"):
        utils.update_status(source_id, "ingest_backup", "P",
                            except_on_fail=True)
        try:
            # NOTE(review): backup_res is reused after the loop — relies on
            # the generator yielding a final aggregate result.
            for backup_res in utils.backup_data(
                    mdf_transfer_client,
                    storage_loc=sub_conf["canon_destination"],
                    backup_locs=sub_conf["data_destinations"],
                    acl=sub_conf["storage_acl"]):
                if not backup_res["success"]:
                    msg = "During data backup: " + backup_res.get(
                        "error", "Unknown error")
                    utils.update_status(source_id, "ingest_backup", "T",
                                        text=msg, except_on_fail=True)
            if not backup_res["success"]:
                raise ValueError(backup_res.get("error"))
        except Exception as e:
            err_msg = "Destination backup failed: {}".format(str(e))
            utils.update_status(source_id, "ingest_backup", "F", text=err_msg,
                                except_on_fail=True)
            return
        # On any complete failure, fail submission
        if not all([val["success"] is True for val in backup_res.values()]):
            err_msg = "; ".join([
                "'{}' failed: {}".format(k, v["error"])
                for k, v in backup_res.items() if v["success"] is not True
            ])
            utils.update_status(source_id, "ingest_backup", "F", text=err_msg,
                                except_on_fail=True)
            return
        # On an error with a successful Transfer, notify user but continue
        elif not all([val["error"] == "" for val in backup_res.values()]):
            err_msg = "; ".join([
                "on '{}': {}".format(k, v["error"])
                for k, v in backup_res.items() if v["error"]
            ])
            utils.update_status(source_id, "ingest_backup", "R", text=err_msg,
                                except_on_fail=True)
        else:
            utils.update_status(source_id, "ingest_backup", "S",
                                except_on_fail=True)
    else:
        utils.update_status(source_id, "ingest_backup", "N",
                            except_on_fail=True)

    # MDF Publish
    if sub_conf["services"].get("mdf_publish"):
        publish_conf = sub_conf["services"]["mdf_publish"]
        # Data already moved to canon dest as a requirement of success so far
        # Mint DOI
        try:
            # Create DOI and add to dataset DC
            dataset["dc"]["identifier"] = {
                "identifier": utils.make_dc_doi(test=publish_conf["doi_test"]),
                "identifierType": "DOI"
            }
            # Add publication dates and publisher
            dataset["dc"]["publisher"] = "Materials Data Facility"
            dataset["dc"]["publicationYear"] = datetime.now().year
            if not dataset["dc"].get("dates"):
                dataset["dc"]["dates"] = []
            dataset["dc"]["dates"].append({
                "date": str(datetime.now().date()),
                "dateType": "Accepted"
            })
            landing_page = CONFIG["DATASET_LANDING_PAGE"].format(source_id)
            mdf_publish_res = utils.datacite_mint_doi(
                dataset["dc"], test=publish_conf["doi_test"],
                url=landing_page)
        except Exception as e:
            logger.error("DOI minting exception: {}".format(repr(e)))
            utils.update_status(source_id, "ingest_publish", "F",
                                text="DOI minting failed",
                                except_on_fail=True)
            return
        else:
            if not mdf_publish_res["success"]:
                logger.error("DOI minting failed: {}".format(
                    mdf_publish_res["error"]))
                utils.update_status(source_id, "ingest_publish", "F",
                                    text="Unable to mint DOI for publication",
                                    except_on_fail=True)
                return
        # NOTE(review): "though" in the user-visible text below looks like a
        # typo for "through" (left as-is: runtime string).
        utils.update_status(
            source_id, "ingest_publish", "L",
            text=("Dataset published though MDF Publish with DOI '{}'".format(
                dataset["dc"]["identifier"]["identifier"])),
            link=landing_page,
            except_on_fail=True)
        service_res["mdf_publish"] = landing_page
    else:
        utils.update_status(source_id, "ingest_publish", "N",
                            except_on_fail=True)

    # Citrine (skip if not extracted)
    if sub_conf["services"].get("citrine") and not sub_conf.get("no_extract"):
        utils.update_status(source_id, "ingest_citrine", "P",
                            except_on_fail=True)
        # Get old Citrine dataset version, if exists
        scan_res = utils.scan_table(table_name="status",
                                    fields=["source_id", "citrine_id"],
                                    filters=[("source_name", "==",
                                              source_info["source_name"]),
                                             ("citrine_id", "!=", None)])
        if not scan_res["success"]:
            logger.error("Status scan failed: {}".format(scan_res["error"]))
        old_cit_subs = scan_res.get("results", [])
        if len(old_cit_subs) == 0:
            old_citrine_id = None
        elif len(old_cit_subs) == 1:
            old_citrine_id = old_cit_subs[0]["citrine_id"]
        else:
            old_citrine_id = max([sub["citrine_id"] for sub in old_cit_subs])

        try:
            # Check for PIFs to ingest
            cit_path = os.path.join(service_data, "citrine")
            if len(os.listdir(cit_path)) > 0:
                cit_res = utils.citrine_upload(
                    cit_path,
                    CONFIG["CITRINATION_API_KEY"],
                    dataset,
                    old_citrine_id,
                    public=sub_conf["services"]["citrine"].get("public", True))
            else:
                cit_res = {
                    "success": False,
                    "error": "No PIFs were generated from this dataset",
                    "success_count": 0,
                    "failure_count": 0
                }
        except Exception as e:
            utils.update_status(source_id, "ingest_citrine", "R", text=str(e),
                                except_on_fail=True)
        else:
            if not cit_res["success"]:
                if cit_res.get("error"):
                    text = cit_res["error"]
                elif cit_res.get("failure_count"):
                    text = "All {} PIFs failed to upload".format(
                        cit_res["failure_count"])
                elif cit_res.get("failure_count") == 0:
                    text = "No PIFs were found"
                    logger.warning("{}: PIFs not found!".format(source_id))
                else:
                    text = "An error prevented PIF uploading"
                utils.update_status(source_id, "ingest_citrine", "R",
                                    text=text, except_on_fail=True)
            else:
                text = "{}/{} PIFs uploaded successfully".format(
                    cit_res["success_count"],
                    cit_res["success_count"] + cit_res["failure_count"])
                link = CONFIG["CITRINATION_LINK"].format(
                    cit_ds_id=cit_res["cit_ds_id"])
                utils.update_status(source_id, "ingest_citrine", "L",
                                    text=text, link=link,
                                    except_on_fail=True)
                stat_res_2 = utils.modify_status_entry(
                    source_id, {"citrine_id": cit_res["cit_ds_id"]})
                if not stat_res_2["success"]:
                    raise ValueError(str(stat_res_2))
                service_res["citrine"] = link
    else:
        utils.update_status(source_id, "ingest_citrine", "N",
                            except_on_fail=True)

    # MRR
    if sub_conf["services"].get("mrr"):
        utils.update_status(source_id, "ingest_mrr", "P", except_on_fail=True)
        try:
            if (isinstance(sub_conf["services"]["mrr"], dict)
                    and sub_conf["services"]["mrr"].get("test")):
                mrr_title = "TEST_" + dataset["dc"]["titles"][0]["title"]
            else:
                mrr_title = dataset["dc"]["titles"][0]["title"]
            mrr_contributors = ""
            for author in dataset["dc"]["creators"]:
                mrr_contributors += CONFIG["MRR_CONTRIBUTOR"].format(
                    name=(author.get("givenName", "") + " " +
                          author.get("familyName", "")),
                    affiliation=author.get("affiliation", ""))
            mrr_description = ""
            for desc in dataset["dc"].get("descriptions", []):
                mrr_description += desc["description"] + " "
            # Must add at least one subject to MRR entry
            mrr_subjects = "<subject>MDF Dataset</subject>"
            for subj in dataset["dc"].get("subjects", []):
                mrr_subjects += "<subject>" + subj["subject"] + "</subject>"
            mrr_entry = {
                "title": dataset["dc"]["titles"][0]["title"],
                "template": CONFIG["MRR_SCHEMA"],
                "xml_content": CONFIG["MRR_TEMPLATE"].format(
                    title=mrr_title,
                    publisher=dataset["dc"]["publisher"],
                    contributors=mrr_contributors,
                    contact_name=dataset["dc"]["creators"][0]["creatorName"],
                    description=mrr_description,
                    subjects=mrr_subjects,
                    landing_page=CONFIG["DATASET_LANDING_PAGE"].format(
                        source_id))
            }
        except Exception as e:
            # NOTE(review): missing space after ':' in this runtime message
            # (left as-is in a doc-only change).
            utils.update_status(source_id, "ingest_mrr", "R",
                                text="Unable to create MRR metadata:" +
                                repr(e),
                                except_on_fail=True)
        else:
            try:
                mrr_res_raw = requests.post(CONFIG["MRR_URL"],
                                            auth=(CONFIG["MRR_USERNAME"],
                                                  CONFIG["MRR_PASSWORD"]),
                                            data=mrr_entry)
                try:
                    mrr_res = mrr_res_raw.json()
                except json.JSONDecodeError:
                    raise ValueError("Invalid MRR response: {}".format(
                        mrr_res_raw.content))
                if mrr_res_raw.status_code not in [201, 202]:
                    raise ValueError(
                        "MRR ingest failed with error code {}: '{}'".format(
                            mrr_res_raw.status_code, mrr_res))
            except Exception as e:
                utils.update_status(source_id, "ingest_mrr", "R",
                                    text="Unable to submit MRR entry: " +
                                    repr(e),
                                    except_on_fail=True)
            else:
                try:
                    mrr_id = mrr_res.get("id")
                    if not mrr_id:
                        raise ValueError("MRR entry has no ID")
                except Exception:
                    utils.update_status(source_id, "ingest_mrr", "R",
                                        text=mrr_res.get(
                                            "message", "Unknown MRR failure"),
                                        except_on_fail=True)
                else:
                    text = "Dataset successfully registered with the MRR"
                    mrr_link = CONFIG["MRR_LINK"].format(mrr_id)
                    utils.update_status(source_id, "ingest_mrr", "L",
                                        text=text, link=mrr_link,
                                        except_on_fail=True)
                    service_res["mrr"] = mrr_link
    else:
        utils.update_status(source_id, "ingest_mrr", "N",
                            except_on_fail=True)

    # Dataset update, start cleanup
    utils.update_status(source_id, "ingest_cleanup", "P", except_on_fail=True)
    dataset["services"] = service_res
    ds_update = utils.update_search_entries(search_config.get(
        "index", CONFIG["INGEST_INDEX"]),
                                            entries=[dataset],
                                            overwrite=False)
    if not ds_update["success"]:
        utils.update_status(source_id, "ingest_cleanup", "F",
                            text=ds_update.get("error",
                                               "Unable to update dataset"),
                            except_on_fail=True)
        utils.complete_submission(source_id)
        return

    # Cleanup
    try:
        fin_res = utils.complete_submission(source_id,
                                            cleanup=CONFIG["FINAL_CLEANUP"])
    except Exception as e:
        utils.update_status(source_id, "ingest_cleanup", "F", text=repr(e),
                            except_on_fail=True)
        return
    if not fin_res["success"]:
        utils.update_status(source_id, "ingest_cleanup", "F",
                            text=fin_res["error"], except_on_fail=True)
        return
    utils.update_status(source_id, "ingest_cleanup", "S", except_on_fail=True)

    logger.debug("{}: Ingest complete".format(source_id))
    return {"success": True, "source_id": source_id}
def getTokens():
    """Authenticate with Globus and archive each configured item.

    Loads cached Globus tokens (or runs the Native App auth flow), builds a
    refresh-token-backed TransferClient, ensures the archive endpoint is
    activated, then for each entry in ``p.opt["archiveItems"]`` creates the
    destination directory on the archive endpoint and submits a recursive
    transfer from the local Globus Connect Personal endpoint.

    Relies on module-level configuration: ``p.opt`` (globusTokenFile,
    archiveEndPoint, archiveItems), ``CLIENT_ID``, ``REDIRECT_URI``,
    ``SCOPES``, and the helpers ``load_tokens_from_file``,
    ``save_tokens_to_file``, ``do_native_app_authentication``,
    ``update_tokens_file_on_refresh``.

    Exits the process (``sys.exit``) on unrecoverable transfer-API errors.
    """
    tokens = None
    try:
        # If we already have tokens, load and use them
        tokens = load_tokens_from_file(p.opt["globusTokenFile"])
    except Exception:
        # Best-effort cache read; narrowed from bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        pass
    if not tokens:
        # No cached tokens: run the Native App authentication process
        tokens = do_native_app_authentication(CLIENT_ID, REDIRECT_URI, SCOPES)
        try:
            save_tokens_to_file(p.opt["globusTokenFile"], tokens)
        except Exception:
            # Cache write is best-effort; tokens still work for this run
            pass

    transfer_tokens = tokens['transfer.api.globus.org']

    auth_client = globus_sdk.NativeAppAuthClient(client_id=CLIENT_ID)
    # RefreshTokenAuthorizer transparently renews the access token and
    # persists updates via the on_refresh callback
    authorizer = globus_sdk.RefreshTokenAuthorizer(
        transfer_tokens['refresh_token'],
        auth_client,
        access_token=transfer_tokens['access_token'],
        expires_at=transfer_tokens['expires_at_seconds'],
        on_refresh=update_tokens_file_on_refresh)

    transfer = globus_sdk.TransferClient(authorizer=authorizer)

    myproxy_lifetime = 720  # in hours. What's the maximum?
    try:
        r = transfer.endpoint_autoactivate(p.opt["archiveEndPoint"],
                                           if_expires_in=3600)
        while r["code"] == "AutoActivationFailed":
            print(
                "Endpoint requires manual activation, please use your UCAS name/password for this activation. "
                "You can activate via the command line or via web browser:\n"
                "WEB BROWSER -- Open the following URL in a browser to activate the "
                "endpoint:")
            print(
                f"https://app.globus.org/file-manager?origin_id={p.opt['archiveEndPoint']}"
            )
            print("CMD LINE -- run this from your shell: ")
            print(
                f"globus endpoint activate --myproxy --myproxy-lifetime {myproxy_lifetime} {p.opt['archiveEndPoint']}"
            )
            input("Press ENTER after activating the endpoint:")
            # BUG FIX: previously called `tc.endpoint_autoactivate(ep_id, ...)`
            # with undefined names `tc` and `ep_id`, raising NameError whenever
            # manual activation was required.
            r = transfer.endpoint_autoactivate(p.opt["archiveEndPoint"],
                                               if_expires_in=3600)
    except globus_sdk.exc.GlobusAPIError as ex:
        print("endpoint_autoactivation failed.")
        print(ex)
        if ex.http_status == 401:
            sys.exit('Refresh token has expired. '
                     'Please delete refresh-tokens.json and try again.')
        else:
            raise ex

    # Local Globus Connect Personal endpoint is the transfer source
    local_ep = globus_sdk.LocalGlobusConnectPersonal()
    local_ep_id = local_ep.endpoint_id

    logging.info("BEGINNING PROCESSING OF archiveItems")
    for item, item_info in p.opt["archiveItems"].items():
        logging.info(f"Transferring {item}")

        # Both source and destination must be absolute paths
        if not item_info["source"].startswith('/'):
            logging.error(
                f"{item} source: {item_info['source']} must be absolute. SKIPPING!"
            )
            continue
        if not item_info["destination"].startswith('/'):
            # BUG FIX: message previously mislabeled the destination as "source"
            logging.error(
                f"{item} destination: {item_info['destination']} must be absolute. SKIPPING!"
            )
            continue

        # The destination's parent directory must already exist on the archive
        try:
            transfer.operation_ls(p.opt["archiveEndPoint"],
                                  path=item_info["destination"])
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal(
                f"Destination path ({item_info['destination']}) does not exist on archiveEndPoint."
            )
            logging.fatal(e)
            sys.exit(1)

        # Get leaf dir from source, and add it to destination
        dirname, leaf = os.path.split(item_info['source'])
        if leaf == '':
            # Source ended with '/'; take the last real path component
            _, leaf = os.path.split(dirname)
        destination_directory = os.path.join(item_info['destination'],
                                             leaf) + '/'

        # Check if destination_dir already exists, and skip if so
        # TODO: add support to overwrite?
        try:
            transfer.operation_ls(p.opt["archiveEndPoint"],
                                  path=destination_directory)
            logging.error(
                f"Destination {destination_directory} already exists on archiveEndPoint. SKIPPING!"
            )
            continue
        except globus_sdk.exc.TransferAPIError as e:
            # NotFound is the expected outcome; anything else is fatal
            if e.code != u'ClientError.NotFound':
                logging.fatal(
                    f"Can't ls {p.opt['archiveEndPoint']} : {destination_directory}"
                )
                logging.fatal(e)
                sys.exit(1)

        # Create destination directory
        try:
            logging.info(
                f"Creating destination directory {destination_directory}")
            transfer.operation_mkdir(p.opt["archiveEndPoint"],
                                     destination_directory)
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal(
                f"Can't mkdir {p.opt['archiveEndPoint']} : {destination_directory}"
            )
            logging.fatal(e)
            sys.exit(1)

        # TODO: set permissions for users to read dir
        # look at https://github.com/globus/automation-examples/blob/master/share_data.py
        # TODO: TransferData accepts label=item_info["transfer-label"] if
        # labeled tasks are wanted.
        tdata = globus_sdk.TransferData(transfer, local_ep_id,
                                        p.opt["archiveEndPoint"])
        tdata.add_item(item_info["source"], destination_directory,
                       recursive=True)
        try:
            logging.info(
                f"Submitting transfer task - {item_info['transfer-label']}")
            task = transfer.submit_transfer(tdata)
        except globus_sdk.exc.TransferAPIError as e:
            logging.fatal("Transfer task submission failed")
            logging.fatal(e)
            sys.exit(1)
        logging.info(f"Task ID: {task['task_id']}")
        logging.info(
            f"This task can be monitored via the Web UI: https://app.globus.org/activity/{task['task_id']}"
        )