예제 #1
0
def get_file(url, output_path, auth_config, token=None, dest_endpoint=None):
    """Submit an asynchronous Globus transfer for the file referenced by ``url``.

    The source endpoint and source path are parsed from ``url``; ``output_path``
    is converted to an absolute destination path (Windows paths are rewritten to
    the POSIX-style form Globus expects). If ``token`` is not supplied, a token
    and destination endpoint are obtained via ``authenticate()`` using
    ``auth_config``.

    Returns True when the transfer task was successfully submitted, False
    otherwise. Note that only *submission* is confirmed here -- the transfer
    itself completes asynchronously on the Globus service.
    """
    try:
        src_endpoint = urlsplit(url).hostname
        src_path = urlsplit(url).path
        if platform.system() == "Windows":
            # Globus expects POSIX-style paths: strip the drive-letter colon and
            # convert backslashes, e.g. "C:\\data\\f" -> "/C/data/f"
            dest_path = ''.join(('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)

        if not token:
            token, dest_endpoint = authenticate(url, auth_config)
        if token is None:
            # logger.warn() is a deprecated alias of warning() -- use warning()
            logger.warning("A valid Globus access token is required to create transfers. "
                           "Check keychain.json for valid parameters.")
            return False

        if dest_endpoint is None:
            logger.warning("A valid Globus destination endpoint must be specified. "
                           "Check keychain.json for valid parameters.")
            return False

        # initialize transfer client
        authorizer = globus_sdk.AccessTokenAuthorizer(token)
        client = globus_sdk.TransferClient(authorizer=authorizer)

        # Activate source endpoint
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        client.endpoint_autoactivate(src_endpoint, if_expires_in=600)

        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)

        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))

        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client,
                                        src_endpoint,
                                        dest_endpoint,
                                        label=label)

        tdata.add_item(src_path, dest_path, recursive=False)

        # start the transfer
        data = client.submit_transfer(tdata)
        task_id = data["task_id"]

        logger.info("Globus transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return True

    except Exception as e:
        logger.error('Globus transfer request exception: %s' % get_typed_exception(e))

    return False
예제 #2
0
def fetch_file(url, path, auth, **kwargs):
    """Dispatch a fetch request for ``url`` to the scheme-appropriate handler.

    Direct network schemes are delegated to the matching fetch module; tag URIs
    are logged and returned as-is; any other scheme is treated as a resolvable
    identifier. Returns the local output path on success, or None on failure.
    """
    scheme = urlsplit(url).scheme.lower()

    if scheme in (SCHEME_HTTP, SCHEME_HTTPS):
        return fetch_http.get_file(url, path, auth, **kwargs)
    elif scheme == SCHEME_FTP:
        return fetch_ftp.get_file(url, path, auth, **kwargs)
    elif scheme in (SCHEME_S3, SCHEME_GS):
        return fetch_boto3.get_file(url, path, auth, **kwargs)
    elif scheme == SCHEME_GLOBUS:
        return fetch_globus.get_file(url, path, auth, **kwargs)
    elif scheme == SCHEME_TAG:  # pragma: no cover
        logger.info("The fetch entry for file %s specifies the tag URI %s. Tag URIs may represent objects that "
                    "cannot be directly resolved as network resources and therefore cannot be automatically fetched. "
                    "Such files must be acquired outside of the context of this software." % (path, url))
        return path

    # if we get here, assume the url is an identifier and try to resolve it
    config = kwargs.get("config")
    if config:
        resolver_config = config.get(RESOLVER_CONFIG_TAG, DEFAULT_RESOLVER_CONFIG)
    else:
        resolver_config = DEFAULT_RESOLVER_CONFIG

    if scheme in resolver_config:
        # try each resolved location until one fetch succeeds
        for entry in resolve(url, resolver_config):
            resolved_url = entry.get("url")
            if not resolved_url:
                continue
            output_path = fetch_file(resolved_url, path, auth, **kwargs)
            if output_path:
                return output_path
        return None

    logger.warning(UNIMPLEMENTED % scheme)
    return None
예제 #3
0
def get_file(url, output_path, auth_config, **kwargs):
    """Retrieve a single file over FTP.

    Credentials may be passed via ``kwargs["credentials"]``; otherwise they are
    looked up from ``auth_config``. Returns the local output path on success,
    or None on failure.
    """
    try:
        credentials = kwargs.get("credentials") or get_credentials(url, auth_config)
        output_path = ensure_valid_output_path(url, output_path)
        logger.info("Attempting FTP retrieve from URL: %s" % url)
        # fall back to anonymous FTP conventions when credentials are absent
        user = credentials[0] or "anonymous"
        password = credentials[1] or "*****@*****.**"
        parts = urlsplit(url)
        netloc = "%s:%s@%s" % (user, password, parts.netloc)
        full_url = urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment))
        start = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        urlretrieve(full_url, output_path)
        elapsed = datetime.datetime.now() - start
        summary = get_transfer_summary(os.path.getsize(output_path), elapsed)
        logger.info('File [%s] transfer successful. %s' % (output_path, summary))
        return output_path

    except Exception as e:
        logger.error('FTP Request Exception: %s' % (get_typed_exception(e)))
        logger.warning('File transfer failed: [%s]' % output_path)

    return None
예제 #4
0
    def resolve(self, identifier, headers=None):
        """Resolve ``identifier`` into a list of ``{"url": ...}`` location dicts.

        In "simple" mode a URL is constructed for every configured resolver
        without any network traffic. Otherwise resolvers are queried in order
        and the locations from the first resolver that responds with HTTP 200
        are returned. Always returns a list (possibly empty).
        """
        if identifier is None:
            return []

        if stob(self.args.get("simple", False)):
            # no network round-trip: just build a resolver URL per resolver
            urls = list()
            for identifier_resolver in self.identifier_resolvers:
                urls.append({"url": self.get_resolver_url(identifier, identifier_resolver)})
            return urls

        session = requests.session()
        if headers:
            session.headers = headers
        for resolver in self.identifier_resolvers:
            resolver_url = self.get_resolver_url(identifier, resolver)
            logger.info("Attempting to resolve %s into a valid set of URLs." % identifier)
            r = session.get(resolver_url)
            if r.status_code != 200:
                logger.error('HTTP GET Failed for %s with code: %s' % (r.url, r.status_code))
                logger.error("Host %s responded:\n\n%s" % (urlsplit(r.url).netloc, r.text))
                continue
            else:
                urls = self.handle_response(r)

            if urls:
                logger.info(
                    "The identifier %s resolved into the following locations: [%s]" %
                    (identifier, ', '.join([url["url"] for url in urls])))
            else:
                logger.warning("No file locations were found for identifier %s" % identifier)

            return urls

        # BUG FIX: previously fell off the loop and implicitly returned None
        # when every resolver request failed; callers iterate over the result,
        # so always return a list.
        return []
예제 #5
0
def get_file(url, output_path, auth_config, credentials=None):
    """Retrieve a single file over FTP.

    Looks up credentials from ``auth_config`` when none are given, creates the
    destination directory if needed, and logs a throughput summary. Returns
    True on success, False on failure.
    """
    try:
        credentials = credentials or get_credentials(url, auth_config)
        target_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        logger.info("Attempting FTP retrieve from URL: %s" % url)
        # fall back to anonymous FTP conventions when credentials are absent
        user = credentials[0] or "anonymous"
        password = credentials[1] or "*****@*****.**"
        parts = urlsplit(url)
        netloc = "%s:%s@%s" % (user, password, parts.netloc)
        full_url = urlunsplit(
            (parts.scheme, netloc, parts.path, parts.query, parts.fragment))
        start = datetime.datetime.now()
        logger.debug("Transferring file %s to %s" % (url, output_path))
        urlretrieve(full_url, output_path)
        elapsed = datetime.datetime.now() - start
        num_bytes = os.path.getsize(output_path)
        seconds = elapsed.total_seconds()
        megabytes = float(num_bytes) / float((1024 * 1024))
        rate = megabytes / seconds if seconds > 0 else 0.001
        throughput = str("%.3f MB/second" % rate)
        logger.info(
            'File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. '
            % (output_path, megabytes, throughput, elapsed))
        return True

    except Exception as e:
        logger.error('FTP Request Exception: %s' % (get_typed_exception(e)))
        logger.warning('File transfer failed: [%s]' % output_path)

    return False
예제 #6
0
def resolve(identifier, resolvers=DEFAULT_ID_RESOLVERS):
    """Resolve ``identifier`` (e.g. a MINID) into a list of location URLs.

    Each resolver in ``resolvers`` is queried in order; the locations from the
    first resolver that responds with HTTP 200 are returned. Always returns a
    list (possibly empty).
    """
    urls = []
    if identifier is None:
        return urls

    for resolver in resolvers:
        # default to http:// when the resolver entry has no explicit scheme
        resolver_scheme = "http://" if not (resolver.startswith("http://") or resolver.startswith("https://")) else ''
        resolver_url = ''.join((resolver_scheme, resolver, '/', identifier))
        logger.info("Attempting to resolve %s into a valid set of URLs." % identifier)
        r = requests.get(resolver_url, headers={'accept': 'application/json', 'Connection': 'keep-alive'})
        if r.status_code != 200:
            logger.error('HTTP GET Failed for: %s' % r.url)
            logger.error("Host %s responded:\n\n%s" % (urlsplit(r.url).netloc, r.text))
            continue
        else:
            info = {}
            try:
                info = json.loads(r.text, object_pairs_hook=OrderedDict)
            except Exception as e:
                logger.warning("Unable to parse identifier resolution result, a MINID or other supported JSON metadata "
                               "structure was not found. Exception: %s" % get_typed_exception(e))
            # need a better way to validate minid response structure
            locations = info.get('locations', list())
            for location in locations:
                uri = location.get('uri', None)
                if uri:
                    urls.append(uri)

        if urls:
            logger.info("The identifier %s resolved into the following locations: %s" % (identifier, urls))
        else:
            logger.warning("No file locations were found for identifier %s" % identifier)

        return urls

    # BUG FIX: previously fell off the loop and implicitly returned None when
    # every resolver request failed; callers iterate over the result, so
    # always return a list.
    return urls
예제 #7
0
def ensure_valid_output_path(url, output_path=None):
    """Return an absolute output path for a fetch of ``url``, creating the
    parent directory if it does not exist.

    When ``output_path`` is not given, the (URL-unquoted) final path component
    of ``url`` is used as a filename in the current directory.
    """
    if not output_path:
        upr = urlsplit(url, allow_fragments=False)
        output_path = os.path.join(os.curdir,
                                   urlunquote(os.path.basename(upr.path)))
    output_path = os.path.abspath(output_path)
    output_dir = os.path.dirname(output_path)
    if not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            # BUG FIX: the exists()/makedirs() pair is racy -- another process
            # (or a concurrent fetch) may create the directory in between.
            # Only re-raise when the directory genuinely could not be created.
            if not os.path.isdir(output_dir):
                raise

    return output_path
예제 #8
0
def get_file(url, output_path, auth_config, headers=None, session=None):
    """Download ``url`` to ``output_path`` over HTTP(S), streaming in chunks.

    An authenticated session is obtained from ``auth_config`` unless one is
    supplied. On a 401 response the session is re-established once and the
    request retried. Returns True on success, False on failure.
    """
    try:
        if not session:
            session = get_session(url, auth_config)
        output_dir = os.path.dirname(os.path.abspath(output_path))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if not headers:
            headers = HEADERS
        else:
            # BUG FIX: copy before merging so the caller-supplied dict is not
            # mutated in place by update()
            headers = dict(headers)
            headers.update(HEADERS)
        logger.info("Attempting GET from URL: %s" % url)
        r = session.get(url,
                        headers=headers,
                        stream=True,
                        verify=certifi.where())
        if r.status_code == 401:
            # credentials may have expired: re-authenticate and retry once
            session = get_session(url, auth_config)
            r = session.get(url,
                            headers=headers,
                            stream=True,
                            verify=certifi.where())
        if r.status_code != 200:
            logger.error('HTTP GET Failed for URL: %s' % url)
            logger.error("Host %s responded:\n\n%s" %
                         (urlsplit(url).netloc, r.text))
            logger.warning('File transfer failed: [%s]' % output_path)
        else:
            total = 0
            start = datetime.datetime.now()
            logger.debug("Transferring file %s to %s" % (url, output_path))
            with open(output_path, 'wb') as data_file:
                for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                    data_file.write(chunk)
                    total += len(chunk)
            elapsed = datetime.datetime.now() - start
            totalSecs = elapsed.total_seconds()
            totalMBs = float(total) / float((1024 * 1024))
            throughput = str(
                "%.3f MB/second" %
                (totalMBs / totalSecs if totalSecs > 0 else 0.001))
            logger.info(
                'File [%s] transfer successful. %.3f MB transferred at %s. Elapsed time: %s. '
                % (output_path, totalMBs, throughput, elapsed))
            return True

    except requests.exceptions.RequestException as e:
        logger.error('HTTP Request Exception: %s' % (get_typed_exception(e)))

    return False
예제 #9
0
def create_rfm_from_file(args):
    """Create a remote file manifest from a delimited-text or JSON input file.

    Each input row is mapped to a manifest entry using the column mappings in
    ``args`` (url_col, length_col, filename_col, and one or more checksum
    columns). Rows not matching ``args.filter`` are skipped. Entries are
    de-duplicated and written to ``args.output_file`` as JSON.

    Raises ValueError when no checksum column mapping is configured.
    """
    if not (args.md5_col or args.sha1_col or args.sha256_col
            or args.sha512_col):
        raise ValueError(
            "At least one checksum algorithm column mapping must be specified."
        )

    with open(args.output_file, 'w') as rfm_file, open(args.input_file,
                                                       'r') as input_file:
        rfm = list()
        if not args.input_format == 'json':
            # sniff the delimiter/quoting from a sample, then rewind
            dialect = Sniffer().sniff(input_file.read(4096))
            input_file.seek(0)
            rows = DictReader(input_file, dialect=dialect)
        else:
            rows = json.load(input_file)

        for row in rows:
            if not filter_dict(args.filter, row):
                continue
            rfm_entry = dict()
            rfm_entry["url"] = row[args.url_col]
            rfm_entry["length"] = int(row[args.length_col])
            rfm_entry["filename"] = urlsplit(
                row[args.filename_col]).path.lstrip("/")
            # One loop replaces four copy-pasted per-algorithm blocks: for each
            # configured checksum column, record both the hex digest and its
            # base64 form.
            for alg in ("md5", "sha1", "sha256", "sha512"):
                col = getattr(args, "%s_col" % alg)
                if col:
                    rfm_entry[alg] = row[col]
                    rfm_entry["%s_base64" % alg] = encode_hex_to_base64(
                        rfm_entry[alg])
            rfm.append(rfm_entry)

        entries = deduplicate_rfm_entries(rfm)
        logger.info("Writing %d entries to remote file manifest" %
                    len(entries))
        rfm_file.write(json.dumps(entries, sort_keys=True, indent=2))
        logger.info("Successfully created remote file manifest: %s" %
                    args.output_file)
예제 #10
0
def find_resolver(identifier, resolver_config):
    """Locate and instantiate a resolver handler for ``identifier``.

    The identifier's scheme selects a list of candidate resolver entries from
    ``resolver_config``; the first entry whose "prefix" occurs in the
    identifier path is chosen (NOTE: when no prefix matches, the last entry is
    used as a fallback -- preserved from the original logic). When the entry
    declares no "handler" class, the base resolver is used in "simple" mode.

    Raises RuntimeError when no resolver is configured for the scheme or the
    handler class cannot be imported.
    """
    upr = urlsplit(identifier, allow_fragments=True)
    scheme = upr.scheme.lower()
    path = upr.path

    resolver = None
    resolvers = resolver_config.get(scheme, [])
    for resolver in resolvers:
        prefix = resolver.get("prefix")
        if prefix and prefix in path.lstrip("/"):
            break

    if not resolver:
        raise RuntimeError(
            "Unable to locate resolver for identifier scheme: %s" % scheme)

    # BUG FIX: copy the args dict -- the original code mutated the shared
    # resolver_config entry in place via update(), leaking "simple": True
    # into subsequent lookups.
    resolver_args = dict(resolver.get("args", {}))
    resolver_class = resolver.get("handler")
    if not resolver_class:
        resolver_class = "bdbag.fetch.resolvers.base_resolver.BaseResolverHandler"
        resolver_args.update({"simple": True})

    clazz = None
    try:
        module_name, class_name = resolver_class.rsplit(".", 1)
        try:
            # reuse an already-imported module when possible
            module = sys.modules[module_name]
        except KeyError:
            module = import_module(module_name)
        clazz = getattr(module, class_name) if module else None
    except (ImportError, AttributeError):
        pass
    if not clazz:
        raise RuntimeError("Unable to import specified resolver class %s" %
                           resolver_class)

    return clazz(resolver.get(ID_RESOLVER_TAG, DEFAULT_ID_RESOLVERS),
                 resolver_args)
예제 #11
0
def fetch_file(url, size, path, auth, **kwargs):
    """Dispatch a fetch request for ``url`` to the scheme-appropriate handler.

    Identifier schemes (ark/minid) are resolved first and each resolved
    location is tried in turn. Returns True on success, False on failure.
    """
    scheme = urlsplit(url, allow_fragments=True).scheme.lower()

    if scheme in (SCHEME_HTTP, SCHEME_HTTPS):
        return fetch_http.get_file(url, path, auth)
    if scheme == SCHEME_FTP:
        return fetch_ftp.get_file(url, path, auth)
    if scheme == SCHEME_GLOBUS:
        return fetch_globus.get_file(url, path, auth)
    if scheme in (SCHEME_ARK, SCHEME_MINID):
        resolvers = kwargs.get("resolvers")
        # try each resolved location until one fetch succeeds
        for resolved_url in fetch_identifier.resolve(url, resolvers):
            if fetch_file(resolved_url, size, path, auth):
                return True
        return False
    if scheme == SCHEME_TAG:
        logger.info("The fetch entry for file %s specifies the tag URI %s. Tag URIs may represent objects that "
                    "cannot be directly resolvable as network resources and therefore cannot be automatically "
                    "fetched. Such files must be acquired outside of the context of this software." % (path, url))
        return True

    logger.warning(UNIMPLEMENTED % scheme)
    return False
예제 #12
0
def get_session(url, auth_config):
    """Return a requests session for ``url``, authenticating via ``auth_config``.

    Keychain entries whose ``uri`` is a substring of ``url`` are tried in
    order; cookie, http-basic, and http-form auth types are supported.
    Established sessions are cached in the module-level SESSIONS map keyed by
    the keychain entry URI. When no entry matches (or all fail), an anonymous
    session cached per scheme://netloc is returned.
    """
    session = None
    response = None

    for auth in list((entry for entry in auth_config if hasattr(entry, 'uri') and (entry.uri.lower() in url.lower()))):

        try:
            if not validate_auth_config(auth):
                continue

            if auth.uri in SESSIONS:
                # reuse a previously authenticated session for this entry
                session = SESSIONS[auth.uri]
                break
            else:
                session = get_new_session()

            if auth.auth_type == 'cookie':
                if auth.auth_params and hasattr(auth.auth_params, 'cookies'):
                    cookies = auth.auth_params.cookies
                    for cookie in cookies:
                        # each cookie is a "name=value" string
                        name, value = cookie.split('=', 1)
                        session.cookies.set(name, value, domain=urlsplit(auth.uri).hostname, path='/')
                    SESSIONS[auth.uri] = session
                    break

            # if we get here the assumption is that the auth_type is either http-basic or http-form
            auth_uri = auth.uri
            if keychain.has_auth_attr(auth, 'auth_uri'):
                auth_uri = auth.auth_uri

            if not (keychain.has_auth_attr(auth.auth_params, 'username') and
                    keychain.has_auth_attr(auth.auth_params, 'password')):
                # CONSISTENCY FIX: use the module logger (was the root-logger
                # call logging.warning) like the rest of this function
                logger.warning(
                    "Missing required parameters [username, password] for auth_type [%s] for keychain entry [%s]" %
                    (auth.auth_type, auth.uri))
                continue

            if auth.auth_type == 'http-basic':
                session.auth = (auth.auth_params.username, auth.auth_params.password)
                auth_method = "post"
                if keychain.has_auth_attr(auth.auth_params, 'auth_method'):
                    auth_method = auth.auth_params.auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth_uri, auth=session.auth)
                else:
                    logger.warning("Unsupported auth_method [%s] for auth_type [%s] for keychain entry [%s]" %
                                   (auth_method, auth.auth_type, auth.uri))
            elif auth.auth_type == 'http-form':
                response = session.post(auth_uri,
                                        {auth.auth_params.username_field or "username": auth.auth_params.username,
                                         auth.auth_params.password_field or "password": auth.auth_params.password})
            if response.status_code > 203:
                logger.warning(
                    'Authentication failed with Status Code: %s %s\n' % (response.status_code, response.text))
            else:
                logger.info("Session established: %s", auth.uri)
                SESSIONS[auth.uri] = session
                break

        except Exception as e:
            logger.warning("Unhandled exception during HTTP(S) authentication: %s" % get_typed_exception(e))

    if not session:
        # no keychain entry matched: fall back to an anonymous session cached
        # per scheme://netloc
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = get_new_session()
            SESSIONS[base_url] = session

    return session
예제 #13
0
def generate_remote_file_manifest(args):
    """Generate a remote file manifest by issuing a HEAD request for every URL
    listed (one per line) in ``args.input_file``.

    Content length, type, disposition, and any Content-MD5 / Content-SHA256
    headers are recorded per entry. Entries are written to
    ``args.output_file`` either as one JSON document or, when
    ``args.streaming_json`` is set, as newline-delimited JSON.
    """
    keychain_file = args.keychain_file if args.keychain_file else DEFAULT_KEYCHAIN_FILE
    auth = read_keychain(keychain_file)
    with open(args.output_file, 'w') as rfm_file, open(args.input_file,
                                                       'r') as input_file:
        rfm = list()
        for url in input_file.readlines():
            # BUG FIX: readlines() preserves the trailing newline; strip it so
            # the HEAD request and the manifest entry get a clean URL (matches
            # the behavior of create_rfm_from_url_list).
            url = url.strip()
            rfm_entry = dict()
            logger.debug("Processing input URL %s" % url)
            try:
                headers = headForHeaders(url, auth, raise_for_status=True)
            except Exception as e:
                logging.warning("HEAD request failed for URL [%s]: %s" %
                                (url, gte(e)))
                continue
            length = headers.get("Content-Length")
            content_type = headers.get("Content-Type")
            content_disposition = headers.get("Content-Disposition")
            md5 = headers.get("Content-MD5")
            if md5:
                md5 = decodeBase64toHex(md5)
            sha256 = headers.get("Content-SHA256")
            if sha256:
                sha256 = decodeBase64toHex(sha256)

            # if content length or both hash values are missing, there is a problem
            if not length:
                logging.warning("Could not determine Content-Length for %s" %
                                url)
            if not (md5 or sha256):
                logging.warning(
                    "Could not locate an MD5 or SHA256 hash for %s" % url)

            # try to construct filename using content_disposition, if available, else fallback to the URL path fragment
            filepath = urlsplit(url).path
            filename = os.path.basename(filepath).split(":")[0] if not content_disposition else \
                parse_content_disposition(content_disposition)
            subdir = args.base_payload_path if args.base_payload_path else ""
            output_path = ''.join(
                [subdir, os.path.dirname(filepath), "/", filename])

            rfm_entry['url'] = url
            rfm_entry['length'] = length
            rfm_entry['filename'] = output_path
            if md5:
                rfm_entry['md5'] = md5
            if sha256:
                rfm_entry['sha256'] = sha256
            if content_type:
                rfm_entry["content_type"] = content_type
            rfm_entry.update({
                "metadata": {
                    "title": os.path.basename(rfm_entry["filename"])
                }
            })

            if args.streaming_json:
                # newline-delimited JSON: one entry per line as we go
                rfm_file.writelines(''.join([json.dumps(rfm_entry), '\n']))
            else:
                rfm.append(rfm_entry)
        if not args.streaming_json:
            rfm_file.write(json.dumps(rfm, indent=4))
        logger.info("Successfully generated remote file manifest: %s" %
                    args.output_file)
예제 #14
0
def get_session(url, auth_config, config):
    """Return a requests session for ``url``, authenticating via ``auth_config``.

    Keychain entries matching ``url`` are tried in order; cookie,
    bearer-token, http-basic, and http-form auth types are supported.
    Established sessions are cached in the module-level SESSIONS map keyed by
    the keychain entry URI. When no entry matches (or all fail), an anonymous
    session cached per scheme://netloc is returned. New sessions are
    initialized from ``config["session_config"]``.
    """
    session = None
    response = None

    for auth in keychain.get_auth_entries(url, auth_config):
        try:
            if not validate_auth_config(auth):
                continue

            uri = auth.get("uri")
            if uri in SESSIONS:
                # reuse a previously authenticated session for this entry
                session = SESSIONS[uri]
                break
            else:
                session = init_new_session(config["session_config"])

            auth_type = auth.get("auth_type")
            auth_params = auth.get("auth_params", {})

            if auth_type == 'cookie':
                if auth_params:
                    cookies = auth_params.get("cookies", [])
                    if cookies:
                        for cookie in cookies:
                            # each cookie is a "name=value" string
                            name, value = cookie.split('=', 1)
                            session.cookies.set(name,
                                                value,
                                                domain=urlsplit(uri).hostname,
                                                path='/')
                    session.headers.update(
                        auth_params.get("additional_request_headers", {}))
                    SESSIONS[uri] = session
                    break

            if auth_type == 'bearer-token':
                token = auth_params.get("token")
                if token:
                    session.headers.update(
                        {"Authorization": "Bearer " + token})
                    session.headers.update(
                        auth_params.get("additional_request_headers", {}))
                    SESSIONS[uri] = session
                    break
                else:
                    # CONSISTENCY FIX: use the module logger (was the
                    # root-logger call logging.warning) like the rest of
                    # this function
                    logger.warning(
                        "Missing required parameters [token] for auth_type [%s] for keychain entry [%s]"
                        % (auth_type, uri))

            # if we get here the assumption is that the auth_type is either http-basic or http-form and that an
            # actual session "login" request is necessary
            auth_uri = auth.get("auth_uri", uri)
            username = auth_params.get("username")
            password = auth_params.get("password")
            if not (username and password):
                logger.warning(
                    "Missing required parameters [username, password] for auth_type [%s] for keychain entry [%s]"
                    % (auth_type, uri))
                continue

            session.headers.update(
                auth_params.get("additional_request_headers", {}))

            auth_method = auth_params.get("auth_method", "post")
            if auth_type == 'http-basic':
                session.auth = (username, password)
                if auth_method:
                    auth_method = auth_method.lower()
                if auth_method == 'post':
                    response = session.post(auth_uri, auth=session.auth)
                elif auth_method == 'get':
                    response = session.get(auth_uri, auth=session.auth)
                else:
                    logger.warning(
                        "Unsupported auth_method [%s] for auth_type [%s] for keychain entry [%s]"
                        % (auth_method, auth_type, uri))
            elif auth_type == 'http-form':
                username_field = auth_params.get("username_field", "username")
                password_field = auth_params.get("password_field", "password")
                response = session.post(auth_uri, {
                    username_field: username,
                    password_field: password
                })
            if response.status_code > 203:
                logger.warning(
                    'Authentication failed with Status Code: %s %s\n' %
                    (response.status_code, response.text))
            else:
                logger.info("Session established: %s", uri)
                SESSIONS[uri] = session
                break

        except Exception as e:
            logger.warning(
                "Unhandled exception during HTTP(S) authentication: %s" %
                get_typed_exception(e))

    if not session:
        # no keychain entry matched: fall back to an anonymous session cached
        # per scheme://netloc
        url_parts = urlsplit(url)
        base_url = str("%s://%s" % (url_parts.scheme, url_parts.netloc))
        session = SESSIONS.get(base_url, None)
        if not session:
            session = init_new_session(config["session_config"])
            SESSIONS[base_url] = session

    return session
예제 #15
0
def get_file(url, output_path, auth_config, **kwargs):
    """Download ``url`` to ``output_path`` over HTTP(S), streaming in chunks.

    Redirects are followed manually when the server responds with one of the
    configured redirect status codes; for bearer-token auth, the Authorization
    header is only propagated across redirects when the keychain entry
    explicitly allows it. Returns the local output path on success, None on
    failure.
    """
    try:
        # BUG FIX: copy the header dict -- the original code mutated the
        # module-level HEADERS default (or a caller-supplied dict) in place
        # via headers.update() below.
        headers = dict(kwargs.get("headers", HEADERS))
        bdbag_config = kwargs.get("config", DEFAULT_CONFIG)
        fetch_config = bdbag_config.get(FETCH_CONFIG_TAG, DEFAULT_FETCH_CONFIG)
        config = fetch_config.get("http", DEFAULT_FETCH_CONFIG["http"])
        redirect_status_codes = config.get(
            FETCH_HTTP_REDIRECT_STATUS_CODES_TAG,
            DEFAULT_FETCH_HTTP_REDIRECT_STATUS_CODES)

        session = get_session(url, auth_config, config)
        output_path = ensure_valid_output_path(url, output_path)

        allow_redirects = config.get("allow_redirects", False)
        allow_redirects_with_token = False
        auth = get_auth(url, auth_config) or {}
        auth_type = auth.get("auth_type")
        auth_params = auth.get("auth_params")
        if auth_type == 'bearer-token':
            # redirects are handled manually for bearer tokens so the
            # Authorization header is only propagated when explicitly allowed
            allow_redirects = False
            # Force setting the "X-Requested-With": "XMLHttpRequest" header is a workaround for some OIDC servers that
            # on an unauthenticated request redirect to a login flow instead of responding with a 401 Unauthorized.
            headers.update({"X-Requested-With": "XMLHttpRequest"})
            if auth_params:
                allow_redirects_with_token = stob(
                    auth_params.get("allow_redirects_with_token", False))

        while True:
            logger.info("Attempting GET from URL: %s" % url)
            r = session.get(url,
                            stream=True,
                            headers=headers,
                            allow_redirects=allow_redirects,
                            verify=certifi.where(),
                            cookies=kwargs.get("cookies"))
            if r.status_code in redirect_status_codes:
                url = r.headers['Location']
                logger.info("Server responded with redirect to: %s" % url)
                if auth_type == 'bearer-token':
                    if allow_redirects_with_token:
                        authorization = session.headers.get("Authorization")
                        if authorization:
                            headers.update({"Authorization": authorization})
                        else:
                            logger.warning(
                                "Unable to locate Authorization header in requests session headers after redirect"
                            )
                    else:
                        logger.warning(
                            "Authorization bearer token propagation on redirect is disabled for security "
                            "purposes. Enable token propagation for this URL in keychain.json"
                        )
                        if session.headers.get("Authorization"):
                            del session.headers["Authorization"]
                continue
            else:
                break

        if r.status_code != 200:
            logger.error('HTTP GET Failed for URL: %s' % url)
            logger.error("Host %s responded:\n\n%s" %
                         (urlsplit(url).netloc, r.text))
            logger.warning('File transfer failed: [%s]' % output_path)
        else:
            total = 0
            start = datetime.datetime.now()
            logger.debug("Transferring file %s to %s" % (url, output_path))
            with open(output_path, 'wb') as data_file:
                for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                    data_file.write(chunk)
                    total += len(chunk)
            elapsed_time = datetime.datetime.now() - start
            summary = get_transfer_summary(total, elapsed_time)
            logger.info('File [%s] transfer successful. %s' %
                        (output_path, summary))
            return output_path

    except requests.exceptions.RequestException as e:
        logger.error('HTTP Request Exception: %s' % (get_typed_exception(e)))

    return None
예제 #16
0
def create_rfm_from_url_list(args):
    """Create a remote file manifest (RFM) from a newline-delimited list of URLs.

    For each URL read from ``args.input_file``, issue an HTTP HEAD request
    (authenticated via the keychain) and build an RFM entry from the response
    headers: ``Content-Length``, ``Content-Type``, ``Content-Disposition``, and
    MD5/SHA256 checksum headers (header names overridable via
    ``args.md5_header`` / ``args.sha256_header``). Entries are written to
    ``args.output_file`` either as newline-delimited JSON objects (when
    ``args.streaming_json`` is set) or as a single de-duplicated, indented
    JSON array. URLs whose HEAD request fails are skipped with a warning.
    """
    keychain_file = args.keychain_file if args.keychain_file else DEFAULT_KEYCHAIN_FILE
    auth = read_keychain(keychain_file)
    with open(args.output_file, 'w') as rfm_file, open(args.input_file,
                                                       'r') as input_file:
        rfm = list()
        for url in input_file.readlines():
            rfm_entry = dict()
            url = url.strip()
            logger.debug("Processing input URL %s" % url)
            try:
                headers = head_for_headers(url, auth, raise_for_status=True)
            except Exception as e:
                # Skip this URL rather than aborting the whole manifest run.
                logger.warning("HEAD request failed for URL [%s]: %s" %
                               (url, gte(e)))
                continue
            logger.debug("Result headers: %s" % headers)
            length = headers.get("Content-Length")
            content_type = headers.get("Content-Type")
            content_disposition = headers.get("Content-Disposition")
            md5_header = args.md5_header if args.md5_header else "Content-MD5"
            md5 = headers.get(md5_header)
            md5 = get_checksum_from_string_list("md5", md5)
            if md5 and not args.disable_hash_decode_base64:
                # Servers typically send base64-encoded digests; record both forms.
                rfm_entry["md5_base64"] = md5
                md5 = decode_base64_to_hex(md5)
                rfm_entry["md5"] = md5
            sha256_header = args.sha256_header if args.sha256_header else "Content-SHA256"
            sha256 = headers.get(sha256_header)
            sha256 = get_checksum_from_string_list("sha256", sha256)
            if sha256 and not args.disable_hash_decode_base64:
                rfm_entry["sha256_base64"] = sha256
                sha256 = decode_base64_to_hex(sha256)
                rfm_entry["sha256"] = sha256

            # if content length or both hash values are missing, there is a problem
            if not length:
                logger.warning("Could not determine Content-Length for %s" %
                               url)
            if not (md5 or sha256):
                logger.warning(
                    "Could not locate an MD5 or SHA256 hash for %s" % url)

            # try to construct filename using content_disposition, if available, else fallback to the URL path fragment
            filepath = urlsplit(url).path
            filename = os.path.basename(filepath).split(":")[0] if not content_disposition else \
                parse_content_disposition(content_disposition)
            subdir = args.base_payload_path if args.base_payload_path else ""
            output_path = ''.join(
                [subdir, os.path.dirname(filepath), "/", filename])

            rfm_entry['url'] = url
            rfm_entry['length'] = length
            rfm_entry['filename'] = output_path.lstrip("/")
            if content_type:
                rfm_entry["content_type"] = content_type

            # Apply any user-supplied filter expression before emitting the entry.
            if not filter_dict(args.filter, rfm_entry):
                continue

            if args.streaming_json:
                rfm_file.writelines(''.join(
                    [json.dumps(rfm_entry, sort_keys=True), '\n']))
            else:
                rfm.append(rfm_entry)
        if not args.streaming_json:
            rfm_file.write(
                json.dumps(deduplicate_rfm_entries(rfm),
                           sort_keys=True,
                           indent=2))
        logger.info("Successfully created remote file manifest: %s" %
                    args.output_file)
# Example #17
def get_file(url, output_path, auth_config, **kwargs):
    """Fetch a file via the Globus Transfer service.

    Resolves the source endpoint and path from the URL, obtains an access
    token and destination endpoint from the keychain, auto-activates both
    endpoints, and submits an asynchronous transfer of a single file.

    Returns the local output path if the transfer was successfully
    *submitted* (completion is not awaited), or None on any failure.
    """
    # Lazily locate the Globus SDK on first use so that it remains an
    # optional dependency for installations that never use Globus transfers.
    global globus_sdk
    if globus_sdk is None:
        try:
            globus_sdk = importlib.import_module(globus_sdk_name)
        except ImportError:
            pass
    if globus_sdk is None:
        raise RuntimeError(
            "Cannot fetch file using Globus Transfer: unable to find the Globus SDK. "
            "Ensure that the Python module \"%s\" is installed." %
            globus_sdk_name)

    try:
        src_endpoint = urlsplit(url).hostname
        src_path = urlsplit(url).path
        output_path = ensure_valid_output_path(url, output_path)
        # Globus expects POSIX-style absolute destination paths: strip the
        # drive-letter colon and flip backslashes on Windows.
        if platform.system() == "Windows":
            dest_path = ''.join(
                ('/', output_path.replace('\\', '/').replace(':', '')))
        else:
            dest_path = os.path.abspath(output_path)

        token, dest_endpoint = get_credentials(url, auth_config)
        if token is None:
            # logger.warn is a deprecated alias; use warning()
            logger.warning(
                "A valid Globus Transfer access token is required to create transfers. "
                "Check keychain.json for invalid parameters.")
            return None

        if dest_endpoint is None:
            logger.warning(
                "A valid Globus Transfer destination endpoint must be specified. "
                "Check keychain.json for invalid parameters.")
            return None

        # initialize transfer client
        authorizer = globus_sdk.AccessTokenAuthorizer(token)
        client = globus_sdk.TransferClient(authorizer=authorizer)

        # Activate source endpoint (no-op if already activated; 600s min lifetime)
        logger.debug("Activating source endpoint: %s" % src_endpoint)
        client.endpoint_autoactivate(src_endpoint, if_expires_in=600)

        # Activate destination endpoint
        logger.debug("Activating destination endpoint: %s" % dest_endpoint)
        client.endpoint_autoactivate(dest_endpoint, if_expires_in=600)

        filename = src_path.rsplit('/', 1)[-1]
        label = "".join(("BDBag Fetch -- ", filename.replace('.', '_')))

        # get a unique ID for this transfer
        tdata = globus_sdk.TransferData(client,
                                        src_endpoint,
                                        dest_endpoint,
                                        label=label)

        tdata.add_item(src_path, dest_path, recursive=False)

        # start the transfer
        data = client.submit_transfer(tdata)
        task_id = data["task_id"]

        logger.info("Globus Transfer started with ID %s" % task_id)
        logger.debug("Transferring file %s to %s" % (url, output_path))
        return output_path

    except Exception as e:
        logger.error('Globus Transfer request exception: %s' %
                     get_typed_exception(e))

    return None
# Example #18
def get_file(url, output_path, auth_config, **kwargs):
    """Fetch an object from S3 (s3://) or Google Cloud Storage (gs://) via boto3.

    Resolves credentials (key/secret/token, optional STS role assumption, or a
    named profile) from the keychain, streams the object body to
    ``output_path`` in chunks with bounded read-timeout retries, and logs a
    transfer summary.

    Returns the local output path on success, or None on any failure.
    """
    success = False
    output_path = ensure_valid_output_path(url, output_path)

    try:
        import_boto3()

        bdbag_config = kwargs.get("config", DEFAULT_CONFIG)
        fetch_config = bdbag_config.get(FETCH_CONFIG_TAG, DEFAULT_FETCH_CONFIG)
        config = fetch_config.get("s3", DEFAULT_FETCH_CONFIG["s3"])
        credentials = get_credentials(url, auth_config) or {}
        key = credentials.get("key")
        secret = credentials.get("secret")
        token = credentials.get("token")
        role_arn = credentials.get("role_arn")
        profile_name = credentials.get("profile")

        try:
            session = BOTO3.session.Session(profile_name=profile_name)
        except Exception as e:
            raise RuntimeError("Unable to create Boto3 session: %s" %
                               get_typed_exception(e))

        if role_arn:
            # Exchange the configured role ARN for short-lived STS credentials,
            # replacing any statically configured key/secret/token.
            try:
                sts = session.client('sts')
                response = sts.assume_role(RoleArn=role_arn,
                                           RoleSessionName='BDBag-Fetch',
                                           DurationSeconds=3600)
                temp_credentials = response['Credentials']
                key = temp_credentials['AccessKeyId']
                secret = temp_credentials['SecretAccessKey']
                token = temp_credentials['SessionToken']
            except Exception as e:
                raise RuntimeError(
                    "Unable to get temporary credentials using arn [%s]. %s" %
                    (role_arn, get_typed_exception(e)))

        upr = urlsplit(url, allow_fragments=False)
        try:
            # NOTE: use a dedicated dict for client arguments instead of
            # rebinding the function's **kwargs parameter (previous shadowing
            # made later reads of the original kwargs silently wrong).
            if upr.scheme == "gs":
                # GCS is accessed through its S3-compatible XML API endpoint.
                endpoint_url = "https://storage.googleapis.com"
                session_config = BOTO3.session.Config(signature_version="s3v4")
                client_kwargs = {
                    "aws_access_key_id": key,
                    "aws_secret_access_key": secret,
                    "endpoint_url": endpoint_url,
                    "config": session_config
                }
            else:
                client_kwargs = {
                    "aws_access_key_id": key,
                    "aws_secret_access_key": secret
                }
                if token:
                    client_kwargs.update({"aws_session_token": token})
            s3_client = session.client("s3", **client_kwargs)
        except Exception as e:
            raise RuntimeError("Unable to create Boto3 storage client: %s" %
                               get_typed_exception(e))

        logger.info("Attempting GET from URL: %s" % url)
        response = s3_client.get_object(Bucket=upr.netloc,
                                        Key=upr.path.lstrip("/"))
        chunk_size = config.get("read_chunk_size", CHUNK_SIZE)
        max_retries = config.get("max_read_retries", 5)
        retry_count = 0
        total = 0

        logger.debug("Transferring file %s to %s" % (url, output_path))
        start = datetime.datetime.now()
        with open(output_path, 'wb') as data_file:
            stream = response["Body"]
            stream.set_socket_timeout(config.get("read_timeout_seconds", 120))
            chunk = None
            while True:
                # Retry budget is cumulative across the whole transfer; the
                # final failed attempt re-raises the timeout to the caller.
                while retry_count < max_retries:
                    try:
                        chunk = stream.read(chunk_size)
                        break
                    except BOTOCORE.exceptions.ReadTimeoutError as rt:
                        retry_count += 1
                        logger.warning(
                            "Boto3 read timeout. Retrying attempt %s of %s" %
                            (retry_count, max_retries))
                        if retry_count == max_retries:
                            raise rt
                if chunk == b"" or chunk is None:
                    break
                data_file.write(chunk)
                total += len(chunk)
            stream.close()
        elapsed_time = datetime.datetime.now() - start
        summary = get_transfer_summary(total, elapsed_time)
        logger.info('File [%s] transfer successful. %s' %
                    (output_path, summary))
        success = True
    except BOTOCORE.exceptions.ClientError as e:
        logger.error('Boto3 Client Error: %s' % get_typed_exception(e))
    except BOTOCORE.exceptions.BotoCoreError as e:
        logger.error('Boto3 Error: %s' % get_typed_exception(e))
    except Exception as e:
        logger.error(get_typed_exception(e))
    finally:
        if not success:
            logger.error('Boto3 GET Failed for URL: %s' % url)
            logger.warning('File transfer failed: [%s]' % output_path)

    return output_path if success else None