def _uri_to_url(self, drs_uri: str, access_id: Optional[str] = None) -> str:
    """
    Convert a hostname-based DRS URI into the corresponding DRS URL.

    Query parameters that are part of the DRS URI (eg
    '{drs_uri}?version=123') are carried over to the DRS URL.

    Only the hostname-based form (drs://<hostname>/<id>) is accepted. The
    compact, identifier-based form
    (drs://[provider_code/]namespace:accession) is rejected.

    :param drs_uri: the DRS URI to translate

    :param access_id: optional access ID, handed to `drs_object_url_path`
                      along with the object ID to build the URL path

    :return: the URL, with its scheme set to `https`
    """
    expected_scheme = 'drs'
    uri = furl(drs_uri)
    require(uri.scheme == expected_scheme,
            f'The URI {drs_uri!r} does not have the {expected_scheme!r} scheme')
    # "The colon character is not allowed in a hostname-based DRS URI".
    # https://ga4gh.github.io/data-repository-service-schemas/preview/develop/docs/#_drs_uris
    # Note that a compact identifier-based URI is awkward to parse according
    # to RFC3986: the 'namespace:accession' part matches either the
    # hier-part or the path production, depending on whether the optional
    # provider code and the slash following it are present. That is why
    # both the network location and the path are checked for a colon.
    has_colon = ':' in uri.netloc or ':' in str(uri.path)
    reject(has_colon, f'The DRS URI {drs_uri!r} is not hostname-based')
    uri.scheme = 'https'
    # The path of a hostname-based URI must consist of exactly one segment
    object_id = one(uri.path.segments)
    uri.path.set(drs_object_url_path(object_id, access_id))
    return uri.url
def __attrs_post_init__(self):
    """
    Validate the `common` and `partition` attributes of this instance.

    The common prefix must be a valid UUID prefix without a colon. If a
    non-zero partition length is set, it must be an integer and the sum of
    the common prefix length and the partition length must not exceed 8.
    """
    common = self.common
    validate_uuid_prefix(common)
    assert ':' not in common, common
    partition = self.partition
    if partition:
        assert isinstance(partition, int), partition
        # Version 4 UUIDs have fixed bits in the third dash-separated group.
        # To guarantee that every concatenation of the common prefix and a
        # partition prefix is itself a valid UUID prefix, the concatenation
        # is limited to the characters of the first dash-separated group.
        combined_length = len(common) + partition
        reject(combined_length > 8,
               'Invalid common prefix and partition length',
               self)
def create(cls, req: Requirement) -> Optional['PinnedRequirement']:
    """
    Create an instance from the given requirement.

    A requirement with a specifier must have exactly one spec and that spec
    must use the `==` operator. A VCS requirement must carry a revision.

    :param req: the requirement to convert

    :return: the new instance, or None if the requirement is recursive

    :raise RequirementError: if the requirement is none of the above kinds
    """
    if req.specifier:
        comparison, pinned_version = one(req.specs)
        assert comparison == '=='
        return cls(name=req.name.lower(), versions=Versions(pinned_version))
    if req.vcs:
        reject(req.revision is None,
               'VCS requirements must carry a specific revision',
               req)
        return cls(name=req.name.lower())
    if req.recursive:
        return None
    raise RequirementError('Unable to handle requirement', req)
def check_bundle_manifest(self):
    """
    Verify that the bundle manifest lists the required files and that every
    indexed file has a `describedBy` entry.
    """
    required_files = ('project_0.json', 'links.json')
    absent = [
        file_name
        for file_name in required_files
        if file_name not in self.manifest_entries
    ]
    reject(bool(absent),
           f'File(s) {absent} not found in bundle {self.bundle_fqid}')
    for name, content in self.indexed_files.items():
        require('describedBy' in content,
                '"describedBy" missing from file',
                name,
                self.bundle_fqid)
def _parse_staging_area(self) -> Tuple[str, str]:
    """
    Validate the staging area URL given in the arguments and split it into
    a bucket and a path.

    :return: a tuple of bucket and path. The path never starts with a '/'
             and, unless it is empty, always ends in one.
    """
    url = parse.urlsplit(self.args.staging_area)
    require(url.scheme == 'gs' and url.netloc,
            'Staging area URL must be in gs://<bucket>[/<path>] format')
    url_path = url.path
    reject(url_path.endswith('/'),
           'Staging area URL must not end with a "/"')
    path = (url_path.lstrip('/') + '/') if url_path else ''
    return url.netloc, path
def _parse_file_id_column(self, file_id: Optional[str]) -> Optional[str]:
    """
    Extract the path component from the DRS URI in a `file_id` column.

    :param file_id: the value of the column; must not be None for snapshots

    :return: for snapshots, the path of the DRS URI without leading or
             trailing slashes; None if the source is not a snapshot
    """
    # For datasets the file_id column exists but is usually null, may hold
    # unexpected or unusable values and NEVER yields a usable DRS URL, so
    # the column is not parsed at all for them.
    if not self.fqid.source.spec.is_snapshot:
        return None
    reject(file_id is None)
    # TDR stores the complete DRS URI in the file_id column but only the
    # path component is indexed. The requirements below guard against a
    # mismatch in the DRS domain and make sure that a change to the syntax
    # of the column does not go unnoticed.
    drs_uri = furl(file_id)
    require(drs_uri.scheme == 'drs')
    require(drs_uri.netloc == furl(config.tdr_service_url).netloc)
    return str(drs_uri.path).strip('/')
def parse(cls, prefix: str) -> 'Prefix':
    """
    Parse a string consisting of a common prefix, optionally followed by a
    slash and a partition prefix length.

    >>> Prefix.parse('aa/1')
    Prefix(common='aa', partition=1)

    >>> p = Prefix.parse('a')
    >>> print(p.partition)
    None
    >>> p.effective.partition == config.partition_prefix_length
    True

    >>> Prefix.parse('aa/')
    Traceback (most recent call last):
    ...
    azul.RequirementError: ('Prefix source cannot end in a delimiter.', 'aa/', '/')

    >>> Prefix.parse('8f538f53/1').partition_prefixes() # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
    ...
    azul.RequirementError: ('Invalid common prefix and partition length',
                            Prefix(common='8f538f53', partition=1))

    >>> list(Prefix.parse('8f538f53/0').partition_prefixes())
    ['']
    """
    source_delimiter = '/'
    reject(prefix.endswith(source_delimiter),
           'Prefix source cannot end in a delimiter.',
           prefix,
           source_delimiter)
    # Unless the string splits into exactly two parts, all of it is taken
    # to be the common prefix and no partition length is set. This covers
    # the empty string as well.
    entry, partition = prefix, None
    parts = prefix.split(source_delimiter)
    if len(parts) == 2:
        entry, partition = parts
    if partition:
        try:
            partition = int(partition)
        except ValueError:
            raise ValueError('Partition prefix length must be an integer.',
                             partition)
    validate_uuid_prefix(entry)
    return cls(common=entry, partition=partition)
def _parse_gcs_url(self, gcs_url: str) -> Tuple[gcs.Bucket, str]:
    """
    Split a GCS URL into a bucket and a path.

    :param gcs_url: a URL of the form gs://<bucket>[/<path>] that does not
                    end in a '/'

    :return: a tuple of the bucket object and the path. The path has no
             leading '/' and, unless it is empty, ends in a '/'.
    """
    url = parse.urlsplit(gcs_url)
    require(url.scheme == 'gs' and url.netloc,
            'Google Cloud Storage URL must be in gs://<bucket>[/<path>] format')
    url_path = url.path
    reject(url_path.endswith('/'),
           'Google Cloud Storage URL must not end with a "/"')
    path = (url_path.lstrip('/') + '/') if url_path else ''
    bucket = gcs.Bucket(self.gcs, url.netloc)
    return bucket, path
def validate_uuid_prefix(uuid_prefix: str) -> None:
    """
    Check that the given string is a valid prefix of a UUID.

    # The empty string is a valid prefix
    >>> validate_uuid_prefix('')

    >>> validate_uuid_prefix('8f53')

    # A complete UUID is a valid prefix
    >>> validate_uuid_prefix('8f53d355-b2fa-4bab-a2f2-6852d852d2ec')

    >>> validate_uuid_prefix('8F53')
    Traceback (most recent call last):
    ...
    azul.uuids.InvalidUUIDPrefixError: '8F53' is not a valid UUID prefix.

    >>> validate_uuid_prefix('8')

    >>> validate_uuid_prefix('8f538f53')

    >>> validate_uuid_prefix('8f538f5-')
    Traceback (most recent call last):
    ...
    azul.RequirementError: UUID prefix ends with an invalid character: 8f538f5-

    >>> validate_uuid_prefix('8f538f-')
    Traceback (most recent call last):
    ...
    azul.RequirementError: UUID prefix ends with an invalid character: 8f538f-

    >>> validate_uuid_prefix('8f538f53a')
    Traceback (most recent call last):
    ...
    azul.uuids.InvalidUUIDPrefixError: '8f538f53a' is not a valid UUID prefix.
    """
    reject(uuid_prefix.endswith('-'),
           f'UUID prefix ends with an invalid character: {uuid_prefix}')
    # Complete the prefix with the tail of a known valid UUID and validate
    # the resulting string as a whole.
    template = '26a8fccd-bbd2-4342-9c19-6ed7c9bb9278'
    candidate = uuid_prefix + template[len(uuid_prefix):]
    try:
        validate_uuid(candidate)
    except InvalidUUIDError:
        raise InvalidUUIDPrefixError(uuid_prefix)
def _get_project(self, bundle) -> api.Project:
    """
    Return the one project in the given bundle, rejecting bundles that
    contain more than one.
    """
    sole_project, *extra_projects = bundle.projects.values()
    reject(extra_projects,
           "Azul can currently only handle a single project per bundle")
    assert isinstance(sole_project, api.Project)
    return sole_project
def _reversible_join(joiner: str, parts: Iterable[str]):
    """
    Join the given strings with the given joiner, rejecting input in which
    any of the strings already contains the joiner.
    """
    items = list(parts)
    contains_joiner = False
    for item in items:
        if joiner in item:
            contains_joiner = True
            break
    reject(contains_joiner, items)
    return joiner.join(items)
def __attrs_post_init__(self):
    """
    Run the inherited validation, then reject a prefix length above 32.
    """
    super().__attrs_post_init__()
    # In a v4 or v5 UUID most bits are pseudo-random. That includes the
    # leading 32 bits, but a couple of deterministic bits come right after
    # them. To keep things simple, no more than 2 ** 32 leaf partitions are
    # supported.
    max_prefix_length = 32
    reject(self.prefix_length > max_prefix_length)
def _parse(cls, spec: str) -> Tuple[str, Prefix]:
    """
    Split a source specification at its last colon.

    :param spec: the specification; must contain at least one colon

    :return: a tuple of the text before the last colon and the prefix
             parsed from the text after it
    """
    head, separator, tail = spec.rpartition(':')
    reject(separator == '', 'Invalid source specification', spec)
    return head, Prefix.parse(tail)
def _parse_range_request_header(
    self,
    range_specifier: str
) -> Sequence[Tuple[Optional[int], Optional[int]]]:
    """
    Parse a range specifier into a list of (start, end) tuples. A start or
    end that is absent from a range is represented as None. A specifier with
    a non-empty unit other than `bytes` produces an empty list. Any failure
    to parse the specifier is reported as a BadRequestError.

    >>> rc = RepositoryController(lambda_context=None, file_url_func=None)
    >>> rc._parse_range_request_header('bytes=100-200,300-400')
    [(100, 200), (300, 400)]

    >>> rc._parse_range_request_header('bytes=-100')
    [(None, 100)]

    >>> rc._parse_range_request_header('bytes=100-')
    [(100, None)]

    >>> rc._parse_range_request_header('foo=100')
    []

    >>> rc._parse_range_request_header('')
    Traceback (most recent call last):
    ...
    chalice.app.BadRequestError: BadRequestError: Invalid range specifier ''

    >>> rc._parse_range_request_header('100-200')
    Traceback (most recent call last):
    ...
    chalice.app.BadRequestError: BadRequestError: Invalid range specifier '100-200'

    >>> rc._parse_range_request_header('bytes=')
    Traceback (most recent call last):
    ...
    chalice.app.BadRequestError: BadRequestError: Invalid range specifier 'bytes='

    >>> rc._parse_range_request_header('bytes=100')
    Traceback (most recent call last):
    ...
    chalice.app.BadRequestError: BadRequestError: Invalid range specifier 'bytes=100'

    >>> rc._parse_range_request_header('bytes=-')
    Traceback (most recent call last):
    ...
    chalice.app.BadRequestError: BadRequestError: Invalid range specifier 'bytes=-'

    >>> rc._parse_range_request_header('bytes=--')
    Traceback (most recent call last):
    ...
    chalice.app.BadRequestError: BadRequestError: Invalid range specifier 'bytes=--'
    """

    def parse_bound(text: str) -> Optional[int]:
        return int(text) if text != '' else None

    parsed_ranges = []
    try:
        # Each of the unpacking assignments below doubles as a syntax check:
        # the wrong number of parts raises an exception that is converted
        # into a BadRequestError further down.
        unit, ranges = range_specifier.split('=')
        if unit != 'bytes':
            reject(unit == '', 'Empty range unit')
        else:
            for range_spec in ranges.split(','):
                first, last = range_spec.split('-')
                reject(first == '' and last == '', 'Empty range')
                bounds = (parse_bound(first), parse_bound(last))
                parsed_ranges.append(bounds)
    except Exception as e:
        raise BadRequestError(
            f'Invalid range specifier {range_specifier!r}'
        ) from e
    return parsed_ranges
def __attrs_post_init__(self):
    """
    Validate that `prefix` is a non-negative number that fits into
    `prefix_length` bits, and that it is zero when `prefix_length` is zero.
    """
    prefix_length = self.prefix_length
    prefix = self.prefix
    reject(prefix_length == 0 and prefix != 0)
    require(0 <= prefix < 2 ** prefix_length)
def to_index(self, value_unit: Optional[JSON]) -> str:
    """
    Convert a dictionary with a `value` and a `unit` entry to a string
    consisting of the value, a space and the unit, or of just the value if
    the unit is None. None is converted to the null string.

    >>> a = ValueAndUnit()
    >>> a.to_index({'value': '20', 'unit': 'year'})
    '20 year'

    >>> a.to_index({'value': '20', 'unit': None})
    '20'

    >>> a.to_index(None)
    '~null'

    >>> a.to_index({})
    Traceback (most recent call last):
    ...
    azul.RequirementError: A dictionary with entries for `value` and `unit` is required

    >>> a.to_index({'value': '1', 'unit': 'day', 'foo': 12})
    Traceback (most recent call last):
    ...
    azul.RequirementError: A dictionary with exactly two entries is required

    >>> a.to_index({'unit': 'day'})
    Traceback (most recent call last):
    ...
    azul.RequirementError: A dictionary with entries for `value` and `unit` is required

    >>> a.to_index({'value': '1'})
    Traceback (most recent call last):
    ...
    azul.RequirementError: A dictionary with entries for `value` and `unit` is required

    >>> a.to_index({'value': '', 'unit': 'year'})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `value` entry must not be empty

    >>> a.to_index({'value': '20', 'unit': ''})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `unit` entry must not be empty

    >>> a.to_index({'value': None, 'unit': 'years'})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `value` entry must not be null

    >>> a.to_index({'value': 20, 'unit': None})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `value` entry must be a string

    >>> a.to_index({'value': '20', 'unit': True})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `unit` entry must be a string

    >>> a.to_index({'value': '20 ', 'unit': None})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `value` entry must not contain space characters

    >>> a.to_index({'value': '20', 'unit': 'years '})
    Traceback (most recent call last):
    ...
    azul.RequirementError: The `unit` entry must not contain space characters
    """
    if value_unit is None:
        return NullableString.null_string
    try:
        value, unit = value_unit['value'], value_unit['unit']
    except KeyError:
        reject(True,
               'A dictionary with entries for `value` and `unit` is required')
    else:
        # The order of the checks below determines which message is reported
        # for input that violates more than one of them.
        require(len(value_unit) == 2,
                'A dictionary with exactly two entries is required')
        reject(value == '', 'The `value` entry must not be empty')
        reject(unit == '', 'The `unit` entry must not be empty')
        reject(value is None, 'The `value` entry must not be null')
        require(type(value) is str, 'The `value` entry must be a string')
        reject(' ' in value,
               'The `value` entry must not contain space characters')
        if unit is None:
            return value
        require(type(unit) is str, 'The `unit` entry must be a string')
        reject(' ' in unit,
               'The `unit` entry must not contain space characters')
        return f'{value} {unit}'
def __attrs_post_init__(self):
    """
    Reject an `id` that is longer than 254 characters.
    """
    max_id_length = 254
    id_length = len(self.id)
    reject(id_length > max_id_length,
           'Terra requires IDs be no longer than 254 chars')