def index_document(self):
    """
    Fetch this file's record from indexd.

    Return:
        dict: the indexd record (must contain a ``urls`` field)

    Raises:
        UnavailableError: indexd could not be reached, or returned an
            unexpected status code
        NotFound: indexd has no record for ``self.file_id``
        InternalError: response was not JSON or was missing ``urls``
    """
    indexd_server = config.get("INDEXD") or config["BASE_URL"] + "/index"
    url = indexd_server + "/index/"
    try:
        res = requests.get(url + self.file_id)
    except Exception as e:
        logger.error(
            "failed to reach indexd at {0}: {1}".format(url + self.file_id, e)
        )
        raise UnavailableError("Fail to reach id service to find data location")
    if res.status_code == 200:
        # keep the try narrow: only the JSON decode should be caught here.
        # (previously the InternalError raised for a missing "urls" field was
        # itself caught by the broad except and re-raised with a misleading
        # "missing JSON field" log message)
        try:
            json_response = res.json()
        except Exception as e:
            logger.error(
                "indexd response missing JSON field {}".format(url + self.file_id)
            )
            raise InternalError("internal error from indexd: {}".format(e))
        if "urls" not in json_response:
            logger.error(
                "URLs are not included in response from "
                "indexd: {}".format(url + self.file_id)
            )
            raise InternalError("URLs and metadata not found")
        # return the already-parsed body instead of re-parsing with res.json()
        return json_response
    elif res.status_code == 404:
        logger.error(
            "Not Found. indexd could not find {}: {}".format(
                url + self.file_id, res.text
            )
        )
        raise NotFound("No indexed document found with id {}".format(self.file_id))
    else:
        raise UnavailableError(res.text)
def delete_data_file(self, bucket, guid):
    """
    Delete the single S3 object whose key is prefixed by ``guid``.

    We use buckets with versioning disabled. See AWS docs here:
    https://docs.aws.amazon.com/AmazonS3/latest/dev/DeletingObjectsfromVersioningSuspendedBuckets.html
    """
    try:
        listing = self.s3_client.list_objects_v2(
            Bucket=bucket, Prefix=guid, Delimiter="/"
        )
        matches = listing.get("Contents")
        if not matches:
            # file not found in the bucket
            self.logger.info(
                "tried to delete GUID {} but didn't find in bucket {}".format(
                    guid, bucket
                )
            )
            return
        if len(matches) > 1:
            raise InternalError("multiple files found with GUID {}".format(guid))
        self.s3_client.delete_object(Bucket=bucket, Key=matches[0]["Key"])
        self.logger.info(
            "deleted file for GUID {} in bucket {}".format(guid, bucket)
        )
    except (KeyError, Boto3Error) as e:
        self.logger.exception(e)
        raise InternalError("Failed to delete file: {}".format(str(e)))
def get_credential_to_access_bucket(self, aws_creds, expires_in):
    """
    Resolve the AWS credential used to sign URLs for this file's bucket.

    Args:
        aws_creds (dict): configured AWS credentials by name
        expires_in (int): credential duration in seconds (for assumed roles)

    Returns:
        dict: credential dict; ``{"aws_access_key_id": "*"}`` marks a
            public bucket that needs no signing

    Raises:
        InternalError: configuration is missing or inconsistent
        Unauthorized: no configured bucket pattern matches this URL
    """
    s3_buckets = get_value(
        flask.current_app.config,
        "S3_BUCKETS",
        InternalError("buckets not configured"),
    )
    if not aws_creds and not s3_buckets:
        raise InternalError("no bucket is configured")
    if not aws_creds and s3_buckets:
        raise InternalError("credential for buckets is not configured")

    # bucket keys are regex patterns; first full match against the netloc wins
    bucket_cred = next(
        (
            s3_buckets[pattern]
            for pattern in s3_buckets
            if re.match("^" + pattern + "$", self.parsed_url.netloc)
        ),
        None,
    )
    if bucket_cred is None:
        raise Unauthorized("permission denied for bucket")

    cred_key = get_value(
        bucket_cred, "cred", InternalError("credential of that bucket is missing")
    )
    # "*" is the public-bucket marker: skip signing entirely
    if cred_key == "*":
        return {"aws_access_key_id": "*"}

    if "role-arn" in bucket_cred:
        return S3IndexedFileLocation.assume_role(
            aws_creds, bucket_cred, cred_key, expires_in
        )
    return get_value(
        aws_creds,
        cred_key,
        InternalError("aws credential of that bucket is not found"),
    )
def assume_role(cls, bucket_cred, expires_in, aws_creds_config):
    """
    Assume the role configured for a bucket and return temporary AWS creds.

    Args:
        bucket_cred (dict): bucket config; must contain "role-arn"
        expires_in (int): credential duration in seconds
        aws_creds_config: config passed through to ``boto.assume_role``

    Returns:
        dict: aws_access_key_id / aws_secret_access_key / aws_session_token

    Raises:
        InternalError: role-arn missing, role assumption failed, or the
            STS response has an unexpected format
    """
    role_arn = get_value(
        bucket_cred, "role-arn", InternalError("role-arn of that bucket is missing")
    )
    assumed_role = flask.current_app.boto.assume_role(
        role_arn, expires_in, aws_creds_config
    )
    cred = get_value(
        assumed_role, "Credentials", InternalError("fail to assume role")
    )
    return {
        "aws_access_key_id": get_value(
            cred,
            "AccessKeyId",
            InternalError("outdated format. AccessKeyId missing"),
        ),
        "aws_secret_access_key": get_value(
            cred,
            "SecretAccessKey",
            InternalError("outdated format. SecretAccessKey missing"),
        ),
        # fixed typo in the error message ("Sesssion" -> "Session")
        "aws_session_token": get_value(
            cred,
            "SessionToken",
            InternalError("outdated format. Session token missing"),
        ),
    }
def get_index_document(file_id):
    """
    Fetch the indexd record for ``file_id``.

    Args:
        file_id (str): GUID of the record

    Returns:
        dict: indexd record containing ``urls`` and ``metadata``

    Raises:
        UnavailableError: indexd unreachable or unexpected status code
        NotFound: no record for this id
        InternalError: response not JSON or missing required fields
    """
    indexd_server = (current_app.config.get('INDEXD')
                     or current_app.config['BASE_URL'] + '/index')
    url = indexd_server + '/index/'
    try:
        res = requests.get(url + file_id)
    except Exception as e:
        current_app.logger.error("failed to reach indexd at {0}: {1}".format(
            url + file_id, e))
        raise UnavailableError(
            "Fail to reach id service to find data location")
    if res.status_code == 200:
        # keep the try narrow: only the JSON decode should be caught here.
        # (previously the InternalError raised for missing fields was itself
        # caught by the broad except and re-raised with a misleading
        # "missing JSON field" log message)
        try:
            json_response = res.json()
        except Exception as e:
            flask.current_app.logger.error(
                'indexd response missing JSON field {}'.format(url + file_id))
            raise InternalError('internal error from indexd: {}'.format(e))
        if 'urls' not in json_response or 'metadata' not in json_response:
            current_app.logger.error(
                'URLs and metadata are not included in response from indexd: {}'
                .format(url + file_id))
            raise InternalError('URLs and metadata not found')
        # return the already-parsed body instead of re-parsing with res.json()
        return json_response
    elif res.status_code == 404:
        # fixed duplicated word in the log message ("find find")
        flask.current_app.logger.error(
            'indexd did not find {}; {}'.format(url + file_id, res.text))
        raise NotFound("Can't find a location for the data")
    else:
        raise UnavailableError(res.text)
def get_signed_url(
    self, action, expires_in, public_data=False, force_signed_url=True, **kwargs
):
    """
    Build a presigned URL for this file, or the raw URL when signing is
    not wanted (public bucket marker, or public data without forced signing).

    Args:
        action (str): key into ``ACTION_DICT["s3"]`` (e.g. upload/download)
        expires_in (int): URL expiration in seconds
        public_data (bool): whether the data is public
        force_signed_url (bool): sign even when the data is public

    Returns:
        str: URL

    Raises:
        InternalError: credentials/buckets misconfigured
    """
    aws_creds = get_value(
        config, "AWS_CREDENTIALS", InternalError("credentials not configured")
    )
    s3_buckets = get_value(
        config, "S3_BUCKETS", InternalError("buckets not configured")
    )
    bucket_name = self.bucket_name()
    # may be None when no configured bucket pattern matches this URL
    bucket = s3_buckets.get(bucket_name)

    if bucket and bucket.get("endpoint_url"):
        http_url = bucket["endpoint_url"].strip("/") + "/{}/{}".format(
            self.parsed_url.netloc, self.parsed_url.path.strip("/")
        )
    else:
        http_url = "https://{}.s3.amazonaws.com/{}".format(
            self.parsed_url.netloc, self.parsed_url.path.strip("/")
        )

    credential = S3IndexedFileLocation.get_credential_to_access_bucket(
        bucket_name, aws_creds, expires_in
    )

    # if it's public and we don't need to force the signed url, just return
    # the raw s3 url
    aws_access_key_id = get_value(
        credential,
        "aws_access_key_id",
        InternalError("aws configuration not found"),
    )
    # `aws_access_key_id == "*"` is a special case to support public buckets
    # where we do *not* want to try signing at all. the other case is that the
    # data is public and user requested to not sign the url
    if aws_access_key_id == "*" or (public_data and not force_signed_url):
        return http_url

    region = self.get_bucket_region()
    # guard `bucket` here too: it can be None, and the unguarded
    # bucket.get("endpoint_url") previously raised AttributeError
    if not region and not (bucket and bucket.get("endpoint_url")):
        region = flask.current_app.boto.get_bucket_region(
            self.parsed_url.netloc, credential
        )

    user_info = _get_user_info()

    url = generate_aws_presigned_url(
        http_url,
        ACTION_DICT["s3"][action],
        credential,
        "s3",
        region,
        expires_in,
        user_info,
    )
    return url
def make_signed_url(self, file_name, expires_in=None):
    """
    Create a presigned URL for uploading this record's file.

    Works for upload only; S3 only (only supported case for data upload
    flow currently).

    Args:
        file_name (str)
        expires_in (int)

    Return:
        str: presigned S3 upload URL
    """
    try:
        bucket = flask.current_app.config["DATA_UPLOAD_BUCKET"]
    except KeyError:
        raise InternalError(
            "fence not configured with data upload bucket; can't create signed URL"
        )
    location = S3IndexedFileLocation(
        "s3://{}/{}/{}".format(bucket, self.guid, file_name)
    )
    url = location.get_signed_url("upload", expires_in)
    self.logger.info(
        "created presigned URL to upload file {} with ID {}".format(
            file_name, self.guid
        )
    )
    return url
def generate_presigned_url_for_uploading_part(
    bucket, key, credentials, uploadId, partNumber, region, expires
):
    """
    Generate a presigned url for uploading one part of a multipart upload.

    Args:
        bucket(str): bucket
        key(str): key
        credentials(dict): dictionary of aws credentials
        uploadId(str): uploadID of the multipart upload
        partNumber(int): part number
        region(str): bucket region
        expires(int): expiration time

    Returns:
        presigned_url(str)
    """
    object_url = "https://{}.s3.amazonaws.com/{}".format(bucket, key)
    # part number and upload id must be part of the signed query string
    extra_qs = {"partNumber": str(partNumber), "uploadId": uploadId}
    try:
        return generate_aws_presigned_url(
            object_url, "PUT", credentials, "s3", region, expires, extra_qs
        )
    except Exception as e:
        raise InternalError(
            "Can not generate presigned url for part number {} of key {}. Detail {}".format(
                partNumber, key, e
            )
        )
def index_document(self):
    """
    Create a blank record in indexd for this upload.

    Return:
        dict: response from indexd (the contents of the record), containing
            ``guid`` and ``url``
    """
    index_url = self.indexd.rstrip("/") + "/index/blank/"
    payload = {"uploader": self.uploader, "file_name": self.file_name}
    basic_auth = (config["INDEXD_USERNAME"], config["INDEXD_PASSWORD"])
    indexd_response = requests.post(index_url, json=payload, auth=basic_auth)
    if indexd_response.status_code not in (200, 201):
        # log whatever detail indexd gave us (JSON body if possible)
        try:
            details = indexd_response.json()
        except ValueError:
            details = indexd_response.text
        self.logger.error(
            "could not create new record in indexd; got response: {}".format(
                details
            )
        )
        raise InternalError(
            "received error from indexd trying to create blank record"
        )
    document = indexd_response.json()
    guid = document["did"]
    self.logger.info(
        "created blank index record with GUID {} for upload".format(guid)
    )
    return document
def default_login():
    """
    The default root login route.
    """

    def absolute_login_url(provider_id):
        # e.g. https://host/login/google
        return config["BASE_URL"].rstrip("/") + "/login/{}".format(
            IDP_URL_MAP[provider_id]
        )

    def provider_info(idp_id):
        # a missing/None idp yields an all-empty descriptor
        if not idp_id:
            return {
                "id": None,
                "name": None,
                "url": None,
                "desc": None,
                "secondary": False,
            }
        details = idps[idp_id]
        return {
            "id": idp_id,
            "name": details["name"],
            "url": absolute_login_url(idp_id),
            "desc": details.get("desc", None),
            "secondary": details.get("secondary", False),
        }

    try:
        all_provider_info = [provider_info(idp_id) for idp_id in idps]
        default_provider_info = provider_info(default_idp)
    except KeyError as e:
        raise InternalError("identity providers misconfigured: {}".format(str(e)))

    return flask.jsonify(
        {"default_provider": default_provider_info, "providers": all_provider_info}
    )
def generate_presigne_url_for_part_upload(self, uploadId, partNumber, expires_in):
    """
    Generate a presigned url for uploading one part of a multipart upload.

    Args:
        uploadId(str): uploadID of the multipart upload
        partNumber(int): part number
        expires_in(int): expiration time

    Returns:
        presigned_url(str)
    """
    aws_creds = get_value(
        config, "AWS_CREDENTIALS", InternalError("credentials not configured")
    )
    credential = S3IndexedFileLocation.get_credential_to_access_bucket(
        self.bucket_name(), aws_creds, expires_in
    )
    # fall back to asking AWS for the region when config doesn't provide it
    region = self.get_bucket_region() or flask.current_app.boto.get_bucket_region(
        self.parsed_url.netloc, credential
    )
    return multipart_upload.generate_presigned_url_for_uploading_part(
        self.parsed_url.netloc,
        self.parsed_url.path.strip("/"),
        credential,
        uploadId,
        partNumber,
        region,
        expires_in,
    )
def get_user_accesses():
    # Query for the current user joined to their research groups.
    user = (current_session.query(User).join(
        User.research_groups).filter(User.id == flask.g.user.id))
    # NOTE(review): no .first()/.one_or_none() is applied, so `user` is a
    # Query object, which is always truthy — this emptiness check can never
    # fire as written. Confirm whether callers expect a Query or a User row
    # before changing it.
    if not user:
        raise InternalError('Error: %s user does not exist' % flask.g.user.username)
    return user
def initilize_multipart_upload(bucket, key, credentials):
    """
    Initialize a multipart upload for an S3 object.

    Args:
        bucket(str): bucket name
        key(str): object key
        credentials(dict): credential dictionary

    Returns:
        UploadId(str): uploadId
    """
    boto_session = boto3.Session(
        aws_access_key_id=credentials["aws_access_key_id"],
        aws_secret_access_key=credentials["aws_secret_access_key"],
        aws_session_token=credentials.get("aws_session_token"),
    )
    s3 = boto_session.client("s3")
    try:
        # retry transient S3 failures before giving up
        response = retry_call(
            s3.create_multipart_upload,
            fkwargs={"Bucket": bucket, "Key": key},
            tries=MAX_TRIES,
            jitter=10,
        )
    except botocore.exceptions.ClientError as error:
        logger.error(
            "Error when create multiple part upload for object with uuid {}. Detail {}".format(
                key, error
            )
        )
        raise InternalError("Can not initilize multipart upload for {}".format(key))
    return response.get("UploadId")
def absolute_login_url(provider_id, fence_idp=None, shib_idp=None):
    """
    Args:
        provider_id (str): provider to log in with; an IDP_URL_MAP key.
        fence_idp (str, optional): if provider_id is "fence"
            (multi-tenant Fence setup), fence_idp can be any of the
            providers supported by the other Fence. If not specified,
            will default to NIH login.
        shib_idp (str, optional): if provider_id is "fence" and
            fence_idp is "shibboleth", shib_idp can be any Shibboleth/
            InCommon provider. If not specified, will default to NIH
            login.

    Returns:
        str: login URL for this provider, including extra query
            parameters if fence_idp and/or shib_idp are specified.
    """
    try:
        login_url = "{}/login/{}".format(
            config["BASE_URL"].rstrip("/"), IDP_URL_MAP[provider_id]
        )
    except KeyError as e:
        raise InternalError(
            "identity provider misconfigured: {}".format(str(e)))

    # extra query parameters forwarded to the downstream fence/shibboleth
    extra_params = {}
    if fence_idp:
        extra_params["idp"] = fence_idp
    if shib_idp:
        extra_params["shib_idp"] = shib_idp
    return add_params_to_uri(login_url, extra_params)
def get(self):
    """
    Complete the shibboleth login.
    """
    shib_header = config.get("SHIBBOLETH_HEADER")
    if not shib_header:
        raise InternalError("Missing shibboleth header configuration")

    # eppn stands for eduPersonPrincipalName
    username = flask.request.headers.get("eppn")
    entityID = flask.session.get("entityID")

    # if eppn not available or logging in through NIH
    if not username or not entityID or entityID == "urn:mace:incommon:nih.gov":
        persistent_id = flask.request.headers.get(shib_header)
        username = persistent_id.split("!")[-1] if persistent_id else None
        if not username:
            # some inCommon providers are not returning eppn
            # or persistent_id. See PXP-4309
            raise Unauthorized("Unable to retrieve username")

    idp = entityID if entityID else IdentityProvider.itrust
    login_user(flask.request, username, idp)

    redirect_to = flask.session.get("redirect")
    if redirect_to:
        return flask.redirect(redirect_to)
    return "logged in"
def get_login_providers_info():
    """
    Determine the default login provider and the list of enabled login
    options, supporting both the current LOGIN_OPTIONS/DEFAULT_LOGIN_IDP
    config and the legacy ENABLED_IDENTITY_PROVIDERS format.

    Returns:
        tuple: (default_provider_info (dict), all_provider_info (list))

    Raises:
        InternalError: LOGIN_OPTIONS misconfigured, or the default IDP is
            not present in the login options
    """
    # default login option
    if config.get("DEFAULT_LOGIN_IDP"):
        default_idp = config["DEFAULT_LOGIN_IDP"]
    elif "default" in (config.get("ENABLED_IDENTITY_PROVIDERS") or {}):
        # fall back on ENABLED_IDENTITY_PROVIDERS.default
        default_idp = config["ENABLED_IDENTITY_PROVIDERS"]["default"]
    else:
        logger.warning("DEFAULT_LOGIN_IDP not configured")
        default_idp = None

    # other login options
    # use .get() so a missing LOGIN_OPTIONS key falls through to the legacy
    # fallback below instead of raising KeyError (matches the .get() style
    # used for the other config lookups here)
    if config.get("LOGIN_OPTIONS"):
        login_options = config["LOGIN_OPTIONS"]
    elif "providers" in (config.get("ENABLED_IDENTITY_PROVIDERS") or {}):
        # fall back on "providers" and convert to "login_options" format
        enabled_providers = config["ENABLED_IDENTITY_PROVIDERS"]["providers"]
        login_options = [
            {
                "name": details.get("name"),
                "idp": idp,
                "desc": details.get("desc"),
                "secondary": details.get("secondary"),
            }
            for idp, details in enabled_providers.items()
        ]
    else:
        logger.warning("LOGIN_OPTIONS not configured or empty")
        login_options = []

    try:
        all_provider_info = [
            provider_info(login_details) for login_details in login_options
        ]
    except KeyError as e:
        raise InternalError(
            "LOGIN_OPTIONS misconfigured: cannot find key {}".format(e))

    # if several login_options are defined for this default IDP, will
    # default to the first one:
    default_provider_info = next(
        (info for info in all_provider_info if info["idp"] == default_idp), None)
    if not default_provider_info:
        raise InternalError(
            "default provider misconfigured: DEFAULT_LOGIN_IDP is set to {}, which is not configured in LOGIN_OPTIONS"
            .format(default_idp))

    return default_provider_info, all_provider_info
def get_endpoints_descriptions(providers, session):
    """
    Map each provider to a short description of its endpoint.

    Args:
        providers (iterable): provider names
        session: DB session used to look up CloudProvider rows

    Returns:
        dict: "/<provider>" -> description string

    Raises:
        InternalError: a provider has no CloudProvider row
    """
    descriptions = {}
    for provider in providers:
        # "cdis" is a built-in pseudo-provider, not stored in the DB
        if provider == "cdis":
            descriptions["/cdis"] = "access to Gen3 APIs"
            continue
        record = session.query(CloudProvider).filter_by(name=provider).first()
        if record is None:
            raise InternalError(
                "{} is not supported by the system!".format(provider))
        descriptions["/" + provider] = record.description or ""
    return descriptions
def check_response(self, resp, body):
    """
    Raise InternalError unless the audit-service accepted the log.
    """
    # The audit-service returns 201 before inserting the log in the DB.
    # This request should only error if the input is incorrect (status
    # code 422) or if the service is unreachable.
    if resp.status_code == 201:
        return
    try:
        err = resp.json()
    except Exception:
        err = resp.text
    self.logger.error(
        f"Unable to POST audit log `{body}`. Details:\n{err}")
    raise InternalError("Unable to create audit log")
def assume_role(cls, bucket_cred, expires_in, aws_creds_config, boto=None):
    """
    Assume the role configured for a bucket and return temporary AWS creds.

    Args:
        bucket_cred (dict): bucket config; must contain "role-arn"
        expires_in (int): credential duration in seconds
        aws_creds_config: config passed through to ``boto.assume_role``
        boto (optional): provide `boto` when calling this function outside
            of application context, to avoid errors when using
            `flask.current_app`.

    Returns:
        dict: aws_access_key_id / aws_secret_access_key / aws_session_token

    Raises:
        InternalError: role-arn missing, role assumption failed, or the
            STS response has an unexpected format
    """
    boto = boto or flask.current_app.boto
    role_arn = get_value(
        bucket_cred, "role-arn", InternalError("role-arn of that bucket is missing")
    )
    assumed_role = boto.assume_role(role_arn, expires_in, aws_creds_config)
    cred = get_value(
        assumed_role, "Credentials", InternalError("fail to assume role")
    )
    return {
        "aws_access_key_id": get_value(
            cred,
            "AccessKeyId",
            InternalError("outdated format. AccessKeyId missing"),
        ),
        "aws_secret_access_key": get_value(
            cred,
            "SecretAccessKey",
            InternalError("outdated format. SecretAccessKey missing"),
        ),
        # fixed typo in the error message ("Sesssion" -> "Session")
        "aws_session_token": get_value(
            cred,
            "SessionToken",
            InternalError("outdated format. Session token missing"),
        ),
    }
def bucket_name(self):
    """
    Return:
        Optional[str]: the configured bucket pattern matching this file's
            URL netloc, or None if no configured pattern matches
    """
    s3_buckets = get_value(
        flask.current_app.config,
        "S3_BUCKETS",
        InternalError("buckets not configured"),
    )
    # bucket names in config are regex patterns; first full match wins
    return next(
        (
            pattern
            for pattern in s3_buckets
            if re.match("^" + pattern + "$", self.parsed_url.netloc)
        ),
        None,
    )
def __init__(self, credentials, logger):
    """
    Build one storage client per configured provider.

    Args:
        credentials (dict): provider name -> provider config; each config
            must contain a "backend" key naming the storage backend
        logger: logger used to report configuration problems

    Raises:
        InternalError: a provider config is missing its "backend" key
    """
    self.logger = logger
    self.clients = {}
    # renamed loop variable from `config` to avoid shadowing the
    # module-level config name
    for provider, provider_config in credentials.items():
        if "backend" not in provider_config:
            self.logger.error(
                "Storage provider {} is not configured with backend".format(
                    provider
                )
            )
            raise InternalError("Something went wrong")
        backend = provider_config["backend"]
        # NOTE(review): the original also built a deepcopy of the config
        # with "backend" removed but never used it (the full config was
        # passed to get_client); the dead copy is removed here. Confirm
        # get_client is meant to receive the config including "backend".
        self.clients[provider] = get_client(
            config=provider_config, backend=backend
        )
def get_bucket_region(self):
    """
    Return:
        Optional[str]: the region configured for this file's bucket, or
            None when buckets/bucket/region are not configured
    """
    s3_buckets = get_value(
        config, "S3_BUCKETS", InternalError("buckets not configured")
    )
    if not s3_buckets:
        return None
    bucket_cred = s3_buckets.get(self.bucket_name())
    if bucket_cred is None:
        return None
    # absent "region" key -> None, same as an explicit membership check
    return bucket_cred.get("region")
def get_signed_url(self, action, expires_in, public_data=False):
    """
    Build a presigned S3 URL for this file, or the raw URL for a public
    bucket (marked by the "*" credential).
    """
    aws_creds = get_value(
        flask.current_app.config,
        "AWS_CREDENTIALS",
        InternalError("credentials not configured"),
    )
    http_url = "https://{}.s3.amazonaws.com/{}".format(
        self.parsed_url.netloc, self.parsed_url.path.strip("/")
    )
    # renamed from `config` to avoid shadowing the module-level name
    credential = self.get_credential_to_access_bucket(aws_creds, expires_in)
    aws_access_key_id = get_value(
        credential,
        "aws_access_key_id",
        InternalError("aws configuration not found"),
    )
    # "*" marks a public bucket: return the unsigned URL
    if aws_access_key_id == "*":
        return http_url

    region = flask.current_app.boto.get_bucket_region(
        self.parsed_url.netloc, credential
    )
    user_info = {} if public_data else S3IndexedFileLocation.get_user_info()
    return generate_aws_presigned_url(
        http_url,
        ACTION_DICT["s3"][action],
        credential,
        "s3",
        region,
        expires_in,
        user_info,
    )
def get_bucket_region(self, bucket, config):
    """
    Look up the AWS region of a bucket.

    Args:
        bucket (str): bucket name
        config (dict): if it contains "aws_access_key_id", a fresh s3
            client is created from it

    Returns:
        str: region name; "us-east-1" when AWS reports no constraint

    Raises:
        InternalError: boto error while fetching the location
        UnavailableError: any other failure reaching AWS
    """
    try:
        if "aws_access_key_id" in config:
            self.s3_client = client("s3", **config)
        response = self.s3_client.get_bucket_location(Bucket=bucket)
        region = response.get("LocationConstraint")
    except Boto3Error as ex:
        self.logger.exception(ex)
        raise InternalError("Fail to get bucket region: {}".format(ex))
    except Exception as ex:
        self.logger.exception(ex)
        raise UnavailableError("Fail to reach AWS: {}".format(ex))
    # AWS returns None for the legacy default region
    return "us-east-1" if region is None else region
def assume_role(self, role_arn, duration_seconds, config=None):
    """
    Assume an AWS role via STS.

    Args:
        role_arn (str): ARN of the role to assume
        duration_seconds (int): credential duration
        config (dict, optional): if it contains "aws_access_key_id", a
            fresh STS client is created from it

    Returns:
        dict: STS assume_role response (contains "Credentials")

    Raises:
        InternalError: boto error while assuming the role
        UnavailableError: any other failure reaching AWS
    """
    try:
        # dict.has_key() was removed in Python 3; use the `in` operator
        if config and "aws_access_key_id" in config:
            self.sts_client = client("sts", **config)
        session_name_postfix = uuid.uuid4()
        return self.sts_client.assume_role(
            RoleArn=role_arn,
            DurationSeconds=duration_seconds,
            RoleSessionName="{}-{}".format("gen3", session_name_postfix),
        )
    except Boto3Error as ex:
        self.logger.exception(ex)
        # `ex.message` does not exist on Python 3 exceptions; format the
        # exception itself instead
        raise InternalError("Fail to assume role: {}".format(ex))
    except Exception as ex:
        self.logger.exception(ex)
        raise UnavailableError("Fail to reach AWS: {}".format(ex))
def get(self):
    """
    Complete the shibboleth login.
    """
    if "SHIBBOLETH_HEADER" not in config:
        raise InternalError("Missing shibboleth header configuration")
    eppn = flask.request.headers.get(config["SHIBBOLETH_HEADER"])

    # the username is the last "!"-separated component of the eppn
    username = eppn.split("!")[-1] if eppn else None
    if not username:
        raise Unauthorized("Please login")

    login_user(flask.request, username, IdentityProvider.itrust)
    redirect_to = flask.session.get("redirect")
    if redirect_to:
        return flask.redirect(redirect_to)
    return "logged in"
def index_document(self):
    """
    Create a blank record in indexd for this upload.

    Return:
        dict: response from indexd (the contents of the record), containing
            ``guid`` and ``url``
    """
    index_url = self.indexd.rstrip("/") + "/index/blank/"
    params = {"uploader": self.uploader, "file_name": self.file_name}

    if self.authz:
        # if attempting to set record's authz field, need to pass the
        # user's token through
        params["authz"] = self.authz
        logger.info("passing users authorization header to create blank record")
        auth = None
        headers = {"Authorization": f"bearer {get_jwt()}"}
    else:
        logger.info("using indexd basic auth to create blank record")
        auth = (config["INDEXD_USERNAME"], config["INDEXD_PASSWORD"])
        headers = {}

    indexd_response = requests.post(
        index_url, json=params, headers=headers, auth=auth
    )
    if indexd_response.status_code not in (200, 201):
        try:
            details = indexd_response.json()
        except ValueError:
            details = indexd_response.text
        self.logger.error(
            "could not create new record in indexd; got response: {}".format(
                details
            )
        )
        raise InternalError(
            "received error from indexd trying to create blank record"
        )

    document = indexd_response.json()
    self.logger.info(
        "created blank index record with GUID {} for upload".format(
            document["did"]
        )
    )
    return document
def init_multipart_upload(self, expires_in):
    """
    Initialize a multipart upload for this file's S3 object.

    Args:
        expires_in(int): expiration time

    Returns:
        UploadId(str)
    """
    creds_config = get_value(
        config, "AWS_CREDENTIALS", InternalError("credentials not configured")
    )
    bucket_credentials = S3IndexedFileLocation.get_credential_to_access_bucket(
        self.bucket_name(), creds_config, expires_in
    )
    target_bucket = self.parsed_url.netloc
    object_key = self.parsed_url.path.strip("/")
    return multipart_upload.initilize_multipart_upload(
        target_bucket, object_key, bucket_credentials
    )
def init_multipart_upload(key, expires_in=None):
    """
    Initialize a multipart upload for the given key in the configured
    data upload bucket.

    Args:
        key(str): object key

    Returns:
        uploadId(str)
    """
    try:
        bucket = flask.current_app.config["DATA_UPLOAD_BUCKET"]
    except KeyError:
        raise InternalError(
            "fence not configured with data upload bucket; can't create signed URL"
        )
    location = S3IndexedFileLocation("s3://{}/{}".format(bucket, key))
    return location.init_multipart_upload(expires_in)
def complete_multipart_upload(bucket, key, credentials, uploadId, parts):
    """
    Complete multipart upload.
    Raise exception if something wrong happens; otherwise success

    Args:
        bucket(str): bucket name
        key(str): object key or `GUID/filename`
        credentials(dict): aws credentials
        uploadId(str): upload id of the current upload
        parts(list(set)): List of part infos
                [{"Etag": "1234567", "PartNumber": 1}, {"Etag": "4321234", "PartNumber": 2}]

    Return:
        None
    """
    boto_session = boto3.Session(
        aws_access_key_id=credentials["aws_access_key_id"],
        aws_secret_access_key=credentials["aws_secret_access_key"],
        aws_session_token=credentials.get("aws_session_token"),
    )
    s3 = boto_session.client("s3")
    try:
        # retry transient S3 failures before giving up
        retry_call(
            s3.complete_multipart_upload,
            fkwargs={
                "Bucket": bucket,
                "Key": key,
                "MultipartUpload": {"Parts": parts},
                "UploadId": uploadId,
            },
            tries=MAX_TRIES,
            jitter=10,
        )
    except botocore.exceptions.ClientError as error:
        logger.error(
            "Error when completing multiple part upload for object with uuid {}. Detail {}".format(
                key, error
            )
        )
        raise InternalError(
            "Can not complete multipart upload for {}. Detail {}".format(key, error)
        )