def get_file(url, output_path, auth_config, token=None, dest_endpoint=None):
    try:
        src_endpoint = urlsplit(url).hostname
        src_path = urlsplit(url).path
        if platform.system() == "Windows":
            dest_path = ''.join(('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)

        if not token:
            token, dest_endpoint = authenticate(url, auth_config)
        if token is None:
            logger.warn("A valid Globus access token is required to create transfers. "
                        "Check keychain.json for valid parameters.")
            return False
        if dest_endpoint is None:
            logger.warn("A valid Globus destination endpoint must be specified. "
                        "Check keychain.json for valid parameters.")
            return False

        # initialize transfer client
        authorizer = globus_sdk.AccessTokenAuthorizer(token)
        client = globus_sdk.TransferClient(authorizer=authorizer)

        # Activate source endpoint
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        data = client.endpoint_autoactivate(src_endpoint, if_expires_in=600)

        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        data = client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)

        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))

        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client, src_endpoint, dest_endpoint, label=label)
        tdata.add_item(src_path, dest_path, recursive=False)

        # start the transfer
        data = client.submit_transfer(tdata)
        task_id = data["task_id"]
        logger.info("Globus transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return True
    except Exception as e:
        logger.error('Globus transfer request exception: %s' % get_typed_exception(e))
        return False

def fetch_file(url, path, auth, **kwargs):
    scheme = urlsplit(url).scheme.lower()
    if SCHEME_HTTP == scheme or SCHEME_HTTPS == scheme:
        return fetch_http.get_file(url, path, auth, **kwargs)
    if SCHEME_FTP == scheme:
        return fetch_ftp.get_file(url, path, auth, **kwargs)
    if SCHEME_S3 == scheme or SCHEME_GS == scheme:
        return fetch_boto3.get_file(url, path, auth, **kwargs)
    if SCHEME_GLOBUS == scheme:
        return fetch_globus.get_file(url, path, auth, **kwargs)
    if SCHEME_TAG == scheme:  # pragma: no cover
        logger.info("The fetch entry for file %s specifies the tag URI %s. Tag URIs may represent objects that "
                    "cannot be directly resolved as network resources and therefore cannot be automatically fetched. "
                    "Such files must be acquired outside of the context of this software." % (path, url))
        return path

    # if we get here, assume the url is an identifier and try to resolve it
    config = kwargs.get("config")
    resolver_config = config.get(RESOLVER_CONFIG_TAG, DEFAULT_RESOLVER_CONFIG) if config else DEFAULT_RESOLVER_CONFIG
    supported_resolvers = resolver_config.keys()
    if scheme in supported_resolvers:
        for entry in resolve(url, resolver_config):
            url = entry.get("url")
            if url:
                output_path = fetch_file(url, path, auth, **kwargs)
                if output_path:
                    return output_path
        return None

    logger.warning(UNIMPLEMENTED % scheme)
    return None

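# Illustrative caller sketch for the scheme dispatcher above. The import paths and the
# read_keychain helper are assumptions about the surrounding package layout rather than a
# confirmed public API, so this is kept as commented pseudocode.
#
#   from bdbag.fetch.fetcher import fetch_file
#   from bdbag.fetch.auth.keychain import read_keychain, DEFAULT_KEYCHAIN_FILE
#
#   auth = read_keychain(DEFAULT_KEYCHAIN_FILE)
#   local_path = fetch_file("https://example.org/data/sample.txt", "data/sample.txt", auth)
#   if local_path is None:
#       raise RuntimeError("fetch failed")
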
def get_file(url, output_path, auth_config, **kwargs):
    try:
        credentials = kwargs.get("credentials")
        if not credentials:
            credentials = get_credentials(url, auth_config)
        output_path = ensure_valid_output_path(url, output_path)
        logger.info("Attempting FTP retrieve from URL: %s" % url)
        creds = "%s:%s@" % (credentials[0] or "anonymous", credentials[1] or "*****@*****.**")
        url_parts = urlsplit(url)
        full_url = urlunsplit((url_parts.scheme, "%s%s" % (creds, url_parts.netloc),
                               url_parts.path, url_parts.query, url_parts.fragment))
        start = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        urlretrieve(full_url, output_path)
        elapsed = datetime.datetime.now() - start
        total = os.path.getsize(output_path)
        summary = get_transfer_summary(total, elapsed)
        logger.info('File [%s] transfer successful. %s' % (output_path, summary))
        return output_path
    except Exception as e:
        logger.error('FTP Request Exception: %s' % (get_typed_exception(e)))
        logger.warning('File transfer failed: [%s]' % output_path)
        return None

def resolve(self, identifier, headers=None):
    if identifier is None:
        return []

    if stob(self.args.get("simple", False)):
        urls = list()
        for identifier_resolver in self.identifier_resolvers:
            urls.append({"url": self.get_resolver_url(identifier, identifier_resolver)})
        return urls

    session = requests.session()
    if headers:
        session.headers = headers
    for resolver in self.identifier_resolvers:
        resolver_url = self.get_resolver_url(identifier, resolver)
        logger.info("Attempting to resolve %s into a valid set of URLs." % identifier)
        r = session.get(resolver_url)
        if r.status_code != 200:
            logger.error('HTTP GET Failed for %s with code: %s' % (r.url, r.status_code))
            logger.error("Host %s responded:\n\n%s" % (urlsplit(r.url).netloc, r.text))
            continue
        else:
            urls = self.handle_response(r)
        if urls:
            logger.info("The identifier %s resolved into the following locations: [%s]" %
                        (identifier, ', '.join([url["url"] for url in urls])))
        else:
            logger.warning("No file locations were found for identifier %s" % identifier)
        return urls

def get_file(url, output_path, auth_config, credentials=None):
    try:
        if not credentials:
            credentials = get_credentials(url, auth_config)
        output_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        logger.info("Attempting FTP retrieve from URL: %s" % url)
        creds = "%s:%s@" % (credentials[0] or "anonymous", credentials[1] or "*****@*****.**")
        url_parts = urlsplit(url)
        full_url = urlunsplit((url_parts.scheme, "%s%s" % (creds, url_parts.netloc),
                               url_parts.path, url_parts.query, url_parts.fragment))
        start = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        urlretrieve(full_url, output_path)
        elapsed = datetime.datetime.now() - start
        total = os.path.getsize(output_path)
        totalSecs = elapsed.total_seconds()
        totalMBs = float(total) / float((1024 * 1024))
        throughput = str("%.3f MB/second" % (totalMBs / totalSecs if totalSecs > 0 else 0.001))
        logger.info('File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. ' %
                    (output_path, totalMBs, throughput, elapsed))
        return True
    except Exception as e:
        logger.error('FTP Request Exception: %s' % (get_typed_exception(e)))
        logger.warning('File transfer failed: [%s]' % output_path)
        return False

def resolve(identifier, resolvers=DEFAULT_ID_RESOLVERS):
    urls = []
    if identifier is None:
        return urls

    for resolver in resolvers:
        resolver_scheme = "http://" if not (resolver.startswith("http://") or resolver.startswith("https://")) else ''
        resolver_url = ''.join((resolver_scheme, resolver, '/', identifier))
        logger.info("Attempting to resolve %s into a valid set of URLs." % identifier)
        r = requests.get(resolver_url, headers={'accept': 'application/json', 'Connection': 'keep-alive'})
        if r.status_code != 200:
            logger.error('HTTP GET Failed for: %s' % r.url)
            logger.error("Host %s responded:\n\n%s" % (urlsplit(r.url).netloc, r.text))
            continue
        else:
            info = {}
            try:
                info = json.loads(r.text, object_pairs_hook=OrderedDict)
            except Exception as e:
                logger.warning("Unable to parse identifier resolution result, a MINID or other supported JSON metadata "
                               "structure was not found. Exception: %s" % get_typed_exception(e))
            # need a better way to validate minid response structure
            locations = info.get('locations', list())
            for location in locations:
                uri = location.get('uri', None)
                if uri:
                    urls.append(uri)

    if urls:
        logger.info("The identifier %s resolved into the following locations: %s" % (identifier, urls))
    else:
        logger.warning("No file locations were found for identifier %s" % identifier)

    return urls

def ensure_valid_output_path(url, output_path=None):
    if not output_path:
        upr = urlsplit(url, allow_fragments=False)
        output_path = os.path.join(os.curdir, urlunquote(os.path.basename(upr.path)))
    output_path = os.path.abspath(output_path)
    output_dir = os.path.dirname(output_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    return output_path

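# A standalone sketch of the path-derivation behavior implemented by ensure_valid_output_path()
# above, using only the standard library (the makedirs side effect is intentionally omitted).
# The example URL is hypothetical.
import os
from urllib.parse import urlsplit, unquote

def _derive_output_path_demo(url, output_path=None):
    # With no explicit output_path, take the unquoted basename of the URL path
    # and place it under the current working directory.
    if not output_path:
        upr = urlsplit(url, allow_fragments=False)
        output_path = os.path.join(os.curdir, unquote(os.path.basename(upr.path)))
    return os.path.abspath(output_path)

# e.g. _derive_output_path_demo("https://example.org/data/My%20File.txt")
# resolves to the absolute path of "./My File.txt" in the current working directory.
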
def get_file(url, output_path, auth_config, headers=None, session=None):
    try:
        if not session:
            session = get_session(url, auth_config)
        output_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if not headers:
            headers = HEADERS
        else:
            headers.update(HEADERS)
        logger.info("Attempting GET from URL: %s" % url)
        r = session.get(url, headers=headers, stream=True, verify=certifi.where())
        if r.status_code == 401:
            session = get_session(url, auth_config)
            r = session.get(url, headers=headers, stream=True, verify=certifi.where())
        if r.status_code != 200:
            logger.error('HTTP GET Failed for URL: %s' % url)
            logger.error("Host %s responded:\n\n%s" % (urlsplit(url).netloc, r.text))
            logger.warning('File transfer failed: [%s]' % output_path)
        else:
            total = 0
            start = datetime.datetime.now()
            logger.debug("Transferring file %s to %s" % (url, output_path))
            with open(output_path, 'wb') as data_file:
                for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                    data_file.write(chunk)
                    total += len(chunk)
            elapsed = datetime.datetime.now() - start
            totalSecs = elapsed.total_seconds()
            totalMBs = float(total) / float((1024 * 1024))
            throughput = str("%.3f MB/second" % (totalMBs / totalSecs if totalSecs > 0 else 0.001))
            logger.info('File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. ' %
                        (output_path, totalMBs, throughput, elapsed))
            return True
    except requests.exceptions.RequestException as e:
        logger.error('HTTP Request Exception: %s' % (get_typed_exception(e)))

    return False

def create_rfm_from_file(args):
    if not (args.md5_col or args.sha1_col or args.sha256_col or args.sha512_col):
        raise ValueError("At least one checksum algorithm column mapping must be specified.")

    with open(args.output_file, 'w') as rfm_file, open(args.input_file, 'r') as input_file:
        rfm = list()
        if not args.input_format == 'json':
            dialect = Sniffer().sniff(input_file.read(4096))
            input_file.seek(0)
            rows = DictReader(input_file, dialect=dialect)
        else:
            rows = json.load(input_file)

        for row in rows:
            if not filter_dict(args.filter, row):
                continue
            rfm_entry = dict()
            rfm_entry["url"] = row[args.url_col]
            rfm_entry["length"] = int(row[args.length_col])
            rfm_entry["filename"] = urlsplit(row[args.filename_col]).path.lstrip("/")
            if args.md5_col:
                rfm_entry["md5"] = row[args.md5_col]
                rfm_entry["md5_base64"] = encode_hex_to_base64(rfm_entry["md5"])
            if args.sha1_col:
                rfm_entry["sha1"] = row[args.sha1_col]
                rfm_entry["sha1_base64"] = encode_hex_to_base64(rfm_entry["sha1"])
            if args.sha256_col:
                rfm_entry["sha256"] = row[args.sha256_col]
                rfm_entry["sha256_base64"] = encode_hex_to_base64(rfm_entry["sha256"])
            if args.sha512_col:
                rfm_entry["sha512"] = row[args.sha512_col]
                rfm_entry["sha512_base64"] = encode_hex_to_base64(rfm_entry["sha512"])
            rfm.append(rfm_entry)

        entries = deduplicate_rfm_entries(rfm)
        logger.info("Writing %d entries to remote file manifest" % len(entries))
        rfm_file.write(json.dumps(entries, sort_keys=True, indent=2))

    logger.info("Successfully created remote file manifest: %s" % args.output_file)

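# A sketch of a single remote file manifest entry as written by create_rfm_from_file() above.
# Key names mirror the rfm_entry assignments in the function; the values are placeholders, and
# only the checksum columns actually mapped on the command line would be present.
EXAMPLE_RFM_ENTRY = {
    "url": "https://example.org/data/sample.txt",
    "length": 1024,
    "filename": "data/sample.txt",
    "md5": "<hex digest>",
    "md5_base64": "<base64 digest>",
    "sha256": "<hex digest>",
    "sha256_base64": "<base64 digest>",
}
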
def find_resolver(identifier, resolver_config):
    upr = urlsplit(identifier, allow_fragments=True)
    scheme = upr.scheme.lower()
    path = upr.path
    resolver = None
    resolvers = resolver_config.get(scheme, [])
    for resolver in resolvers:
        prefix = resolver.get("prefix")
        if prefix and prefix in path.lstrip("/"):
            break
    if not resolver:
        raise RuntimeError("Unable to locate resolver for identifier scheme: %s" % scheme)

    resolver_args = resolver.get("args", {})
    resolver_class = resolver.get("handler")
    if not resolver_class:
        resolver_class = "bdbag.fetch.resolvers.base_resolver.BaseResolverHandler"
        resolver_args.update({"simple": True})

    clazz = None
    try:
        module_name, class_name = resolver_class.rsplit(".", 1)
        try:
            module = sys.modules[module_name]
        except KeyError:
            module = import_module(module_name)
        clazz = getattr(module, class_name) if module else None
    except (ImportError, AttributeError):
        pass
    if not clazz:
        raise RuntimeError("Unable to import specified resolver class %s" % resolver_class)

    return clazz(resolver.get(ID_RESOLVER_TAG, DEFAULT_ID_RESOLVERS), resolver_args)

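# A sketch of the resolver_config structure that find_resolver() above expects, inferred from
# the lookups in the function: a mapping of identifier scheme to a list of resolver entries.
# The prefix and handler values are placeholders, and the key named by the ID_RESOLVER_TAG
# constant (whose literal value is not shown in this section) would hold the resolver URLs.
EXAMPLE_RESOLVER_CONFIG = {
    "minid": [
        {
            "prefix": "example-prefix",  # hypothetical; matched against the identifier path
            "handler": "bdbag.fetch.resolvers.base_resolver.BaseResolverHandler",
            "args": {"simple": True},
        }
    ],
}
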
def fetch_file(url, size, path, auth, **kwargs):
    scheme = urlsplit(url, allow_fragments=True).scheme.lower()
    if SCHEME_HTTP == scheme or SCHEME_HTTPS == scheme:
        return fetch_http.get_file(url, path, auth)
    if SCHEME_FTP == scheme:
        return fetch_ftp.get_file(url, path, auth)
    elif SCHEME_GLOBUS == scheme:
        return fetch_globus.get_file(url, path, auth)
    elif SCHEME_ARK == scheme or SCHEME_MINID == scheme:
        resolvers = kwargs.get("resolvers")
        for url in fetch_identifier.resolve(url, resolvers):
            if fetch_file(url, size, path, auth):
                return True
        return False
    elif SCHEME_TAG == scheme:
        logger.info("The fetch entry for file %s specifies the tag URI %s. Tag URIs may represent objects that "
                    "cannot be directly resolved as network resources and therefore cannot be automatically "
                    "fetched. Such files must be acquired outside of the context of this software." % (path, url))
        return True
    else:
        logger.warning(UNIMPLEMENTED % scheme)
        return False

def get_session(url, auth_config):
    session = None
    response = None

    for auth in list((entry for entry in auth_config
                      if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):
        try:
            if not validate_auth_config(auth):
                continue

            if auth.uri in SESSIONS:
                session = SESSIONS[auth.uri]
                break
            else:
                session = get_new_session()

            if auth.auth_type == 'cookie':
                if auth.auth_params and hasattr(auth.auth_params, 'cookies'):
                    cookies = auth.auth_params.cookies
                    for cookie in cookies:
                        name, value = cookie.split('=', 1)
                        session.cookies.set(name, value, domain=urlsplit(auth.uri).hostname, path='/')
                    SESSIONS[auth.uri] = session
                    break

            # if we get here the assumption is that the auth_type is either http-basic or http-form
            auth_uri = auth.uri
            if keychain.has_auth_attr(auth, 'auth_uri'):
                auth_uri = auth.auth_uri

            if not (keychain.has_auth_attr(auth.auth_params, 'username') and
                    keychain.has_auth_attr(auth.auth_params, 'password')):
                logging.warning(
                    "Missing required parameters [username, password] for auth_type [%s] for keychain entry [%s]" %
                    (auth.auth_type, auth.uri))
                continue

            if auth.auth_type == 'http-basic':
                session.auth = (auth.auth_params.username, auth.auth_params.password)
                auth_method = "post"
                if keychain.has_auth_attr(auth.auth_params, 'auth_method'):
                    auth_method = auth.auth_params.auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth_uri, auth=session.auth)
                else:
                    logging.warning("Unsupported auth_method [%s] for auth_type [%s] for keychain entry [%s]" %
                                    (auth_method, auth.auth_type, auth.uri))
            elif auth.auth_type == 'http-form':
                response = session.post(auth_uri,
                                        {auth.auth_params.username_field or "username": auth.auth_params.username,
                                         auth.auth_params.password_field or "password": auth.auth_params.password})

            if response.status_code > 203:
                logger.warning('Authentication failed with Status Code: %s %s\n' %
                               (response.status_code, response.text))
            else:
                logger.info("Session established: %s", auth.uri)
                SESSIONS[auth.uri] = session
                break
        except Exception as e:
            logger.warning("Unhandled exception during HTTP(S) authentication: %s" % get_typed_exception(e))

    if not session:
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = get_new_session()
            SESSIONS[base_url] = session

    return session

def generate_remote_file_manifest(args):
    keychain_file = args.keychain_file if args.keychain_file else DEFAULT_KEYCHAIN_FILE
    auth = read_keychain(keychain_file)
    with open(args.output_file, 'w') as rfm_file, open(args.input_file, 'r') as input_file:
        rfm = list()
        for url in input_file.readlines():
            rfm_entry = dict()
            logger.debug("Processing input URL %s" % url)
            try:
                headers = headForHeaders(url, auth, raise_for_status=True)
            except Exception as e:
                logging.warning("HEAD request failed for URL [%s]: %s" % (url, gte(e)))
                continue

            length = headers.get("Content-Length")
            content_type = headers.get("Content-Type")
            content_disposition = headers.get("Content-Disposition")
            md5 = headers.get("Content-MD5")
            if md5:
                md5 = decodeBase64toHex(md5)
            sha256 = headers.get("Content-SHA256")
            if sha256:
                sha256 = decodeBase64toHex(sha256)

            # if content length or both hash values are missing, there is a problem
            if not length:
                logging.warning("Could not determine Content-Length for %s" % url)
            if not (md5 or sha256):
                logging.warning("Could not locate an MD5 or SHA256 hash for %s" % url)

            # try to construct filename using content_disposition, if available, else fallback to the URL path fragment
            filepath = urlsplit(url).path
            filename = os.path.basename(filepath).split(":")[0] if not content_disposition else \
                parse_content_disposition(content_disposition)
            subdir = args.base_payload_path if args.base_payload_path else ""
            output_path = ''.join([subdir, os.path.dirname(filepath), "/", filename])

            rfm_entry['url'] = url
            rfm_entry['length'] = length
            rfm_entry['filename'] = output_path
            if md5:
                rfm_entry['md5'] = md5
            if sha256:
                rfm_entry['sha256'] = sha256
            if content_type:
                rfm_entry["content_type"] = content_type
            rfm_entry.update({"metadata": {"title": os.path.basename(rfm_entry["filename"])}})

            if args.streaming_json:
                rfm_file.writelines(''.join([json.dumps(rfm_entry), '\n']))
            else:
                rfm.append(rfm_entry)

        if not args.streaming_json:
            rfm_file.write(json.dumps(rfm, indent=4))

    logger.info("Successfully generated remote file manifest: %s" % args.output_file)

def get_session(url, auth_config, config):
    session = None
    response = None

    for auth in keychain.get_auth_entries(url, auth_config):
        try:
            if not validate_auth_config(auth):
                continue

            uri = auth.get("uri")
            if uri in SESSIONS:
                session = SESSIONS[uri]
                break
            else:
                session = init_new_session(config["session_config"])

            auth_type = auth.get("auth_type")
            auth_params = auth.get("auth_params", {})

            if auth_type == 'cookie':
                if auth_params:
                    cookies = auth_params.get("cookies", [])
                    if cookies:
                        for cookie in cookies:
                            name, value = cookie.split('=', 1)
                            session.cookies.set(name, value, domain=urlsplit(uri).hostname, path='/')
                    session.headers.update(auth_params.get("additional_request_headers", {}))
                    SESSIONS[uri] = session
                    break

            if auth_type == 'bearer-token':
                token = auth_params.get("token")
                if token:
                    session.headers.update({"Authorization": "Bearer " + token})
                    session.headers.update(auth_params.get("additional_request_headers", {}))
                    SESSIONS[uri] = session
                    break
                else:
                    logging.warning("Missing required parameters [token] for auth_type [%s] for keychain entry [%s]" %
                                    (auth_type, uri))

            # if we get here the assumption is that the auth_type is either http-basic or http-form and that an
            # actual session "login" request is necessary
            auth_uri = auth.get("auth_uri", uri)
            username = auth_params.get("username")
            password = auth_params.get("password")
            if not (username and password):
                logging.warning(
                    "Missing required parameters [username, password] for auth_type [%s] for keychain entry [%s]" %
                    (auth_type, uri))
                continue

            session.headers.update(auth_params.get("additional_request_headers", {}))

            auth_method = auth_params.get("auth_method", "post")
            if auth_type == 'http-basic':
                session.auth = (username, password)
                if auth_method:
                    auth_method = auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth_uri, auth=session.auth)
                else:
                    logging.warning("Unsupported auth_method [%s] for auth_type [%s] for keychain entry [%s]" %
                                    (auth_method, auth_type, uri))
            elif auth_type == 'http-form':
                username_field = auth_params.get("username_field", "username")
                password_field = auth_params.get("password_field", "password")
                response = session.post(auth_uri, {username_field: username, password_field: password})

            if response.status_code > 203:
                logger.warning('Authentication failed with Status Code: %s %s\n' %
                               (response.status_code, response.text))
            else:
                logger.info("Session established: %s", uri)
                SESSIONS[uri] = session
                break
        except Exception as e:
            logger.warning("Unhandled exception during HTTP(S) authentication: %s" % get_typed_exception(e))

    if not session:
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = init_new_session(config["session_config"])
            SESSIONS[base_url] = session

    return session

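# A sketch of the keychain entry shapes that get_session() above reads via auth.get(...).
# The field names are taken directly from the lookups in the function; the URIs and
# credential values below are placeholders.
EXAMPLE_KEYCHAIN_ENTRIES = [
    {
        "uri": "https://example.org/protected/",
        "auth_type": "http-basic",
        "auth_uri": "https://example.org/login",
        "auth_params": {"username": "user", "password": "pass", "auth_method": "post"},
    },
    {
        "uri": "https://api.example.org/",
        "auth_type": "bearer-token",
        "auth_params": {"token": "<access-token>",
                        "additional_request_headers": {"X-Requested-With": "XMLHttpRequest"}},
    },
    {
        "uri": "https://portal.example.org/",
        "auth_type": "cookie",
        "auth_params": {"cookies": ["sessionid=abc123"]},
    },
]
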
def get_file(url, output_path, auth_config, **kwargs):
    try:
        headers = kwargs.get("headers", HEADERS)
        bdbag_config = kwargs.get("config", DEFAULT_CONFIG)
        fetch_config = bdbag_config.get(FETCH_CONFIG_TAG, DEFAULT_FETCH_CONFIG)
        config = fetch_config.get("http", DEFAULT_FETCH_CONFIG["http"])
        redirect_status_codes = config.get(FETCH_HTTP_REDIRECT_STATUS_CODES_TAG,
                                           DEFAULT_FETCH_HTTP_REDIRECT_STATUS_CODES)
        session = get_session(url, auth_config, config)
        output_path = ensure_valid_output_path(url, output_path)

        allow_redirects = config.get("allow_redirects", False)
        allow_redirects_with_token = False
        auth = get_auth(url, auth_config) or {}
        auth_type = auth.get("auth_type")
        auth_params = auth.get("auth_params")
        if auth_type == 'bearer-token':
            allow_redirects = False
            # Force setting the "X-Requested-With": "XMLHttpRequest" header is a workaround for some OIDC servers that
            # on an unauthenticated request redirect to a login flow instead of responding with a 401 Unauthorized.
            headers.update({"X-Requested-With": "XMLHttpRequest"})
            if auth_params:
                allow_redirects_with_token = stob(auth_params.get("allow_redirects_with_token", False))

        while True:
            logger.info("Attempting GET from URL: %s" % url)
            r = session.get(url, stream=True, headers=headers, allow_redirects=allow_redirects,
                            verify=certifi.where(), cookies=kwargs.get("cookies"))
            if r.status_code in redirect_status_codes:
                url = r.headers['Location']
                logger.info("Server responded with redirect to: %s" % url)
                if auth_type == 'bearer-token':
                    if allow_redirects_with_token:
                        authorization = session.headers.get("Authorization")
                        if authorization:
                            headers.update({"Authorization": authorization})
                        else:
                            logger.warning(
                                "Unable to locate Authorization header in requests session headers after redirect")
                    else:
                        logger.warning("Authorization bearer token propagation on redirect is disabled for security "
                                       "purposes. Enable token propagation for this URL in keychain.json")
                        if session.headers.get("Authorization"):
                            del session.headers["Authorization"]
                continue
            else:
                break

        if r.status_code != 200:
            logger.error('HTTP GET Failed for URL: %s' % url)
            logger.error("Host %s responded:\n\n%s" % (urlsplit(url).netloc, r.text))
            logger.warning('File transfer failed: [%s]' % output_path)
        else:
            total = 0
            start = datetime.datetime.now()
            logger.debug("Transferring file %s to %s" % (url, output_path))
            with open(output_path, 'wb') as data_file:
                for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                    data_file.write(chunk)
                    total += len(chunk)
            elapsed_time = datetime.datetime.now() - start
            summary = get_transfer_summary(total, elapsed_time)
            logger.info('File [%s] transfer successful. %s' % (output_path, summary))
            return output_path
    except requests.exceptions.RequestException as e:
        logger.error('HTTP Request Exception: %s' % (get_typed_exception(e)))

    return None

def create_rfm_from_url_list(args):
    keychain_file = args.keychain_file if args.keychain_file else DEFAULT_KEYCHAIN_FILE
    auth = read_keychain(keychain_file)
    with open(args.output_file, 'w') as rfm_file, open(args.input_file, 'r') as input_file:
        rfm = list()
        for url in input_file.readlines():
            rfm_entry = dict()
            url = url.strip()
            logger.debug("Processing input URL %s" % url)
            try:
                headers = head_for_headers(url, auth, raise_for_status=True)
            except Exception as e:
                logging.warning("HEAD request failed for URL [%s]: %s" % (url, gte(e)))
                continue
            logger.debug("Result headers: %s" % headers)

            length = headers.get("Content-Length")
            content_type = headers.get("Content-Type")
            content_disposition = headers.get("Content-Disposition")

            md5_header = args.md5_header if args.md5_header else "Content-MD5"
            md5 = headers.get(md5_header)
            md5 = get_checksum_from_string_list("md5", md5)
            if md5 and not args.disable_hash_decode_base64:
                rfm_entry["md5_base64"] = md5
                md5 = decode_base64_to_hex(md5)
                rfm_entry["md5"] = md5

            sha256_header = args.sha256_header if args.sha256_header else "Content-SHA256"
            sha256 = headers.get(sha256_header)
            sha256 = get_checksum_from_string_list("sha256", sha256)
            if sha256 and not args.disable_hash_decode_base64:
                rfm_entry["sha256_base64"] = sha256
                sha256 = decode_base64_to_hex(sha256)
                rfm_entry["sha256"] = sha256

            # if content length or both hash values are missing, there is a problem
            if not length:
                logging.warning("Could not determine Content-Length for %s" % url)
            if not (md5 or sha256):
                logging.warning("Could not locate an MD5 or SHA256 hash for %s" % url)

            # try to construct filename using content_disposition, if available, else fallback to the URL path fragment
            filepath = urlsplit(url).path
            filename = os.path.basename(filepath).split(":")[0] if not content_disposition else \
                parse_content_disposition(content_disposition)
            subdir = args.base_payload_path if args.base_payload_path else ""
            output_path = ''.join([subdir, os.path.dirname(filepath), "/", filename])

            rfm_entry['url'] = url
            rfm_entry['length'] = length
            rfm_entry['filename'] = output_path.lstrip("/")
            if content_type:
                rfm_entry["content_type"] = content_type
            if not filter_dict(args.filter, rfm_entry):
                continue

            if args.streaming_json:
                rfm_file.writelines(''.join([json.dumps(rfm_entry, sort_keys=True), '\n']))
            else:
                rfm.append(rfm_entry)

        if not args.streaming_json:
            rfm_file.write(json.dumps(deduplicate_rfm_entries(rfm), sort_keys=True, indent=2))

    logger.info("Successfully created remote file manifest: %s" % args.output_file)

def get_file(url, output_path, auth_config, **kwargs):
    # locate library
    global globus_sdk
    if globus_sdk is None:
        try:
            globus_sdk = importlib.import_module(globus_sdk_name)
        except ImportError:
            pass
    if globus_sdk is None:
        raise RuntimeError("Cannot fetch file using Globus Transfer: unable to find the Globus SDK. "
                           "Ensure that the Python module \"%s\" is installed." % globus_sdk_name)

    try:
        src_endpoint = urlsplit(url).hostname
        src_path = urlsplit(url).path
        output_path = ensure_valid_output_path(url, output_path)
        if platform.system() == "Windows":
            dest_path = ''.join(('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)

        token, dest_endpoint = get_credentials(url, auth_config)
        if token is None:
            logger.warn("A valid Globus Transfer access token is required to create transfers. "
                        "Check keychain.json for invalid parameters.")
            return None
        if dest_endpoint is None:
            logger.warn("A valid Globus Transfer destination endpoint must be specified. "
                        "Check keychain.json for invalid parameters.")
            return None

        # initialize transfer client
        authorizer = globus_sdk.AccessTokenAuthorizer(token)
        client = globus_sdk.TransferClient(authorizer=authorizer)

        # Activate source endpoint
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        data = client.endpoint_autoactivate(src_endpoint, if_expires_in=600)

        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        data = client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)

        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))

        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client, src_endpoint, dest_endpoint, label=label)
        tdata.add_item(src_path, dest_path, recursive=False)

        # start the transfer
        data = client.submit_transfer(tdata)
        task_id = data["task_id"]
        logger.info("Globus Transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return output_path
    except Exception as e:
        logger.error('Globus Transfer request exception: %s' % get_typed_exception(e))

    return None

def get_file(url, output_path, auth_config, **kwargs):
    success = False
    output_path = ensure_valid_output_path(url, output_path)
    try:
        import_boto3()
        bdbag_config = kwargs.get("config", DEFAULT_CONFIG)
        fetch_config = bdbag_config.get(FETCH_CONFIG_TAG, DEFAULT_FETCH_CONFIG)
        config = fetch_config.get("s3", DEFAULT_FETCH_CONFIG["s3"])

        credentials = get_credentials(url, auth_config) or {}
        key = credentials.get("key")
        secret = credentials.get("secret")
        token = credentials.get("token")
        role_arn = credentials.get("role_arn")
        profile_name = credentials.get("profile")

        try:
            session = BOTO3.session.Session(profile_name=profile_name)
        except Exception as e:
            raise RuntimeError("Unable to create Boto3 session: %s" % get_typed_exception(e))

        if role_arn:
            try:
                sts = session.client('sts')
                response = sts.assume_role(RoleArn=role_arn, RoleSessionName='BDBag-Fetch', DurationSeconds=3600)
                temp_credentials = response['Credentials']
                key = temp_credentials['AccessKeyId']
                secret = temp_credentials['SecretAccessKey']
                token = temp_credentials['SessionToken']
            except Exception as e:
                raise RuntimeError("Unable to get temporary credentials using arn [%s]. %s" %
                                   (role_arn, get_typed_exception(e)))

        upr = urlsplit(url, allow_fragments=False)
        try:
            if upr.scheme == "gs":
                endpoint_url = "https://storage.googleapis.com"
                session_config = BOTO3.session.Config(signature_version="s3v4")
                kwargs = {"aws_access_key_id": key,
                          "aws_secret_access_key": secret,
                          "endpoint_url": endpoint_url,
                          "config": session_config}
            else:
                kwargs = {"aws_access_key_id": key,
                          "aws_secret_access_key": secret}
                if token:
                    kwargs.update({"aws_session_token": token})
            s3_client = session.client("s3", **kwargs)
        except Exception as e:
            raise RuntimeError("Unable to create Boto3 storage client: %s" % get_typed_exception(e))

        logger.info("Attempting GET from URL: %s" % url)
        response = s3_client.get_object(Bucket=upr.netloc, Key=upr.path.lstrip("/"))

        chunk_size = config.get("read_chunk_size", CHUNK_SIZE)
        max_retries = config.get("max_read_retries", 5)
        retry_count = 0
        total = 0
        logger.debug("Transferring file %s to %s" % (url, output_path))
        start = datetime.datetime.now()
        with open(output_path, 'wb') as data_file:
            stream = response["Body"]
            stream.set_socket_timeout(config.get("read_timeout_seconds", 120))
            chunk = None
            while True:
                while retry_count < max_retries:
                    try:
                        chunk = stream.read(chunk_size)
                        break
                    except BOTOCORE.exceptions.ReadTimeoutError as rt:
                        retry_count += 1
                        logging.warning("Boto3 read timeout. Retrying attempt %s of %s" % (retry_count, max_retries))
                        if retry_count == max_retries:
                            raise rt
                if chunk == b"" or chunk is None:
                    break
                data_file.write(chunk)
                total += len(chunk)
            stream.close()
        elapsed_time = datetime.datetime.now() - start
        summary = get_transfer_summary(total, elapsed_time)
        logger.info('File [%s] transfer successful. %s' % (output_path, summary))
        success = True
    except BOTOCORE.exceptions.ClientError as e:
        logger.error('Boto3 Client Error: %s' % get_typed_exception(e))
    except BOTOCORE.exceptions.BotoCoreError as e:
        logger.error('Boto3 Error: %s' % get_typed_exception(e))
    except Exception as e:
        logger.error(get_typed_exception(e))
    finally:
        if not success:
            logger.error('Boto3 GET Failed for URL: %s' % url)
            logger.warning('File transfer failed: [%s]' % output_path)

    return output_path if success else None

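# A sketch of the per-scheme fetch configuration and keychain credential fields read by
# get_file() above. Key names mirror the .get(...) lookups in the function; every value
# shown is a placeholder.
EXAMPLE_S3_FETCH_CONFIG = {
    "read_chunk_size": 10 * 1024 * 1024,  # bytes per stream.read() call
    "max_read_retries": 5,                # retries on Boto3 read timeouts
    "read_timeout_seconds": 120,          # socket timeout applied to the response body stream
}
EXAMPLE_S3_CREDENTIALS = {
    "key": "<access-key-id>",
    "secret": "<secret-access-key>",
    "token": "<optional-session-token>",
    "role_arn": "arn:aws:iam::123456789012:role/example",  # optional; triggers sts.assume_role
    "profile": "default",                                  # optional; selects a local Boto3 profile
}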