def backtick(table_name: str) -> str:
    """
    Return the given string surrounded by backticks if deemed necessary based
    on a simplified interpretation of BigQuery's lexical structure and syntax
    for identifier tokens.

    https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical

    >>> backtick('foo.bar.my_table')
    'foo.bar.my_table'

    >>> backtick('foo2.bar.my_table')
    'foo2.bar.my_table'

    >>> backtick('foo-2.bar.my_table')
    '`foo-2.bar.my_table`'

    >>> backtick('foo-2.bar`s.my_table')
    Traceback (most recent call last):
    ...
    azul.RequirementError: foo-2.bar`s.my_table
    """
    if table_name_re.fullmatch(table_name):
        return table_name
    else:
        require('`' not in table_name, table_name)
        return f'`{table_name}`'
def snapshot_names_by_id(self) -> Dict[str, str]:
    """
    List the TDR snapshots accessible to the current credentials.
    """
    endpoint = self._repository_endpoint('snapshots')
    snapshots = []
    # FIXME: Defend against concurrent changes while listing snapshots
    #        https://github.com/DataBiosphere/azul/issues/3979
    while True:
        response = self._request('GET', endpoint, fields={
            'offset': len(snapshots),
            'limit': self.page_size,
            'sort': 'created_date',
            'direction': 'asc'
        })
        response = self._check_response(endpoint, response)
        new_snapshots = response['items']
        if new_snapshots:
            snapshots += new_snapshots
        else:
            total = response['filteredTotal']
            require(len(snapshots) == total, snapshots, total)
            break
    return {snapshot['id']: snapshot['name'] for snapshot in snapshots}
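
# A minimal, self-contained sketch of the offset/limit pagination pattern used
# above, assuming a hypothetical fetch_page(offset, limit) callable that
# returns (items, filtered_total). It is not Azul code; it only illustrates how
# the loop terminates and why the final total check matters.
from typing import Callable, List, Tuple


def paginate(fetch_page: Callable[[int, int], Tuple[List[dict], int]],
             page_size: int = 100) -> List[dict]:
    items: List[dict] = []
    while True:
        page, total = fetch_page(len(items), page_size)
        if page:
            items += page
        else:
            # An empty page signals the end; if the count disagrees with the
            # reported total, the listing probably changed while we paged.
            assert len(items) == total, (len(items), total)
            return items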
def update(self, plugin: RepositoryPlugin) -> None:
    require(self.replica is None or self.replica == 'gcp')
    assert self.drs_path is not None
    drs_uri = plugin.drs_uri(self.drs_path)
    drs_client = plugin.drs_client()
    access = drs_client.get_object(drs_uri, access_method=AccessMethod.gs)
    assert access.headers is None
    url = furl(access.url)
    blob_name = '/'.join(url.path.segments)
    # https://github.com/databiosphere/azul/issues/2479#issuecomment-733410253
    if url.fragmentstr:
        blob_name += '#' + unquote(url.fragmentstr)
    else:
        # furl does not differentiate between no fragment and empty fragment
        if access.url.endswith('#'):
            blob_name += '#'
    blob = self._get_blob(bucket_name=url.netloc, blob_name=blob_name)
    expiration = int(time.time() + 3600)
    file_name = self.file_name.replace('"', r'\"')
    assert all(0x1f < ord(c) < 0x80 for c in file_name)
    disposition = f"attachment; filename={file_name}"
    signed_url = blob.generate_signed_url(expiration=expiration,
                                          response_disposition=disposition)
    self._location = signed_url
def verify_source_access():
    public_snapshots = set(public_tdr.snapshot_names_by_id())
    all_snapshots = set(tdr.snapshot_names_by_id())
    diff = public_snapshots - all_snapshots
    require(not diff,
            'The public service account can access snapshots that the indexer '
            'service account cannot',
            diff)
def _uri_to_url(self, drs_uri: str, access_id: Optional[str] = None) -> str:
    """
    Translate a DRS URI into a DRS URL. All query params included in the DRS
    URI (e.g. '{drs_uri}?version=123') will be carried over to the DRS URL.

    Only hostname-based DRS URIs (drs://<hostname>/<id>) are supported, while
    compact, identifier-based URIs (drs://[provider_code/]namespace:accession)
    are not.
    """
    parsed = furl(drs_uri)
    scheme = 'drs'
    require(parsed.scheme == scheme,
            f'The URI {drs_uri!r} does not have the {scheme!r} scheme')
    # "The colon character is not allowed in a hostname-based DRS URI".
    # https://ga4gh.github.io/data-repository-service-schemas/preview/develop/docs/#_drs_uris
    # It is worth noting that compact, identifier-based URIs can be hard to
    # parse when following RFC 3986: the 'namespace:accession' part matches
    # either the hier-part or the path production, depending on whether the
    # optional provider code and following slash are included.
    reject(':' in parsed.netloc or ':' in str(parsed.path),
           f'The DRS URI {drs_uri!r} is not hostname-based')
    parsed.scheme = 'https'
    object_id = one(parsed.path.segments)
    parsed.path.set(drs_object_url_path(object_id, access_id))
    return parsed.url
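
# A standalone sketch of the hostname-based translation above, assuming the
# standard GA4GH DRS v1 URL layout (/ga4gh/drs/v1/objects/<id>[/access/<id>]);
# the actual drs_object_url_path helper may render the path differently.
from typing import Optional

from furl import furl


def drs_uri_to_url_sketch(drs_uri: str, access_id: Optional[str] = None) -> str:
    url = furl(drs_uri)
    assert url.scheme == 'drs', drs_uri
    object_id, = url.path.segments
    segments = ['ga4gh', 'drs', 'v1', 'objects', object_id]
    if access_id is not None:
        segments += ['access', access_id]
    url.scheme = 'https'
    url.path.set(segments)
    return url.url


# For example, 'drs://example.org/1234?version=5' would become
# 'https://example.org/ga4gh/drs/v1/objects/1234?version=5'.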
def _donor(self, donor: api.DonorOrganism) -> MutableJSON:
    if donor.organism_age is None:
        require(donor.organism_age_unit is None)
        organism_age = None
    else:
        organism_age = {
            'value': donor.organism_age,
            'unit': donor.organism_age_unit
        }
    return {
        'document_id': str(donor.document_id),
        'biomaterial_id': donor.biomaterial_id,
        'biological_sex': donor.sex,
        'genus_species': sorted(donor.genus_species),
        'development_stage': donor.development_stage,
        'diseases': sorted(donor.diseases),
        'organism_age': organism_age,
        'organism_age_value': donor.organism_age,
        'organism_age_unit': donor.organism_age_unit,
        **({
            'organism_age_range': {
                'gte': donor.organism_age_in_seconds.min,
                'lte': donor.organism_age_in_seconds.max
            }
        } if donor.organism_age_in_seconds else {})
    }
def run(self):
    pip_deps = self.get_direct_reqs(self.pip)
    direct_runtime_reqs = self.get_direct_reqs(self.runtime)
    direct_build_reqs = self.get_direct_reqs(self.build)
    dupes = direct_build_reqs & direct_runtime_reqs
    require(not dupes,
            'Some requirements are declared as both run and build time',
            dupes)
    build_reqs = self.get_reqs(self.build) - pip_deps
    runtime_reqs = self.get_reqs(self.runtime) - pip_deps
    require(runtime_reqs <= build_reqs,
            'Runtime requirements are not a subset of build requirements',
            runtime_reqs - build_reqs)
    overlap = build_reqs & runtime_reqs
    ambiguities = PinnedRequirements(req for req in overlap if len(req.versions) > 1)
    for req in ambiguities:
        build_req = build_reqs[req]
        log.warning('Pinning transitive runtime requirement %s to %s, the '
                    'version resolved at build time.',
                    req, build_req.versions)
        runtime_reqs[req] = build_req
    build_only_reqs = build_reqs - runtime_reqs
    transitive_build_reqs = build_only_reqs - direct_build_reqs
    transitive_runtime_reqs = runtime_reqs - direct_runtime_reqs
    assert not transitive_build_reqs & transitive_runtime_reqs
    self.write_transitive_reqs(transitive_build_reqs, self.build)
    self.write_transitive_reqs(transitive_runtime_reqs, self.runtime)
def parse_stratification(self,
                         line_num: int,
                         species: str,
                         stage: str,
                         organ: str,
                         library: str
                         ) -> List[Mapping[str, List[str]]]:
    """
    >>> file = File('foo.txt', '')

    >>> file.parse_stratification(9, 'human', 'adult', 'blood', '10x')
    [{'species': ['human'], 'stage': ['adult'], 'organ': ['blood'], 'library': ['10x']}]

    >>> file.parse_stratification(9, 'human, mouse', 'adult', 'blood', '10x')
    [{'species': ['human', 'mouse'], 'stage': ['adult'], 'organ': ['blood'], 'library': ['10x']}]

    >>> file.parse_stratification(9, 'human, mouse', 'human: adult, mouse: child', 'blood', '10x')
    [{'species': ['human'], 'stage': ['adult'], 'organ': ['blood'], 'library': ['10x']}, \
     {'species': ['mouse'], 'stage': ['child'], 'organ': ['blood'], 'library': ['10x']}]

    >>> file.parse_stratification(9, 'human, mouse', 'human: adult', 'blood', '10x')
    Traceback (most recent call last):
    ...
    azul.RequirementError: Error with row 9 'stage' keys ['human'].

    >>> file.parse_stratification(9, 'human, mouse', 'human: adult, mouse: child, cat: kitten', 'blood', '10x')
    Traceback (most recent call last):
    ...
    azul.RequirementError: Error with row 9 'stage' keys ['cat', 'human', 'mouse'].
    """
    strats = [{}]
    pairs = (('species', species),
             ('stage', stage),
             ('organ', organ),
             ('library', library))
    for category, value in pairs:
        if value:
            parsed = self.parse_strat(value)
            if None in parsed:
                # value applies to all
                assert len(parsed) == 1, parsed
                for strat in strats:
                    strat[category] = parsed[None]
            else:
                # value applies to one
                # find the dict with a multi-value field we need to split
                keys = list(parsed.keys())
                for strat in strats:
                    for cat, val in strat.items():
                        if set(keys) == set(val):
                            strat[cat] = [keys.pop(0)]
                            while len(keys) > 0:
                                new_strat = deepcopy(strat)
                                new_strat[cat] = [keys.pop(0)]
                                strats.append(new_strat)
                # put each value in the appropriate dict
                keys = set(parsed.keys())
                for strat in strats:
                    for k, v in parsed.items():
                        if [k] in strat.values():
                            strat[category] = v
                            keys -= {k}
                require(len(keys) == 0,
                        f'Error with row {line_num} {category!r} keys {sorted(keys)}.')
    return strats
def _find_upstream_bundles(self,
                           source: TDRSourceRef,
                           outputs: Entities) -> Set[SourcedBundleFQID]:
    """
    Search for bundles containing processes that produce the specified output
    entities.
    """
    output_ids = [output.entity_id for output in outputs]
    output_id = 'JSON_EXTRACT_SCALAR(link_output, "$.output_id")'
    rows = self._run_sql(f'''
        SELECT links_id, version, {output_id} AS output_id
        FROM {backtick(self._full_table_name(source.spec, 'links'))} AS links
            JOIN UNNEST(JSON_EXTRACT_ARRAY(links.content, '$.links')) AS content_links
                ON JSON_EXTRACT_SCALAR(content_links, '$.link_type') = 'process_link'
            JOIN UNNEST(JSON_EXTRACT_ARRAY(content_links, '$.outputs')) AS link_output
                ON {output_id} IN UNNEST({output_ids})
    ''')
    bundles = set()
    outputs_found = set()
    for row in rows:
        bundles.add(SourcedBundleFQID(source=source,
                                      uuid=row['links_id'],
                                      version=self.format_version(row['version'])))
        outputs_found.add(row['output_id'])
    missing = set(output_ids) - outputs_found
    require(not missing,
            f'Dangling inputs not found in any bundle: {missing}')
    return bundles
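
# For orientation, a minimal sketch of the links.json content navigated by the
# query above. The field names follow the JSON paths in the SQL (and the
# parsing in _parse_links); the IDs are made-up placeholders.
example_links_content = {
    'links': [
        {
            'link_type': 'process_link',
            'process_type': 'process',
            'process_id': '00000000-0000-0000-0000-000000000000',
            'inputs': [],
            'outputs': [
                {
                    'output_type': 'analysis_file',
                    'output_id': '11111111-1111-1111-1111-111111111111'
                }
            ],
            'protocols': []
        }
    ]
}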
def get_prefix_list(cls, prefix: str = None, start_prefix: str = None):
    """
    Generate ascending hex prefixes.

    >>> DSSv2Adapter.get_prefix_list(prefix='aa', start_prefix=None)
    ['aa']

    >>> DSSv2Adapter.get_prefix_list(prefix='a', start_prefix='aa')
    ['aa', 'ab', 'ac', 'ad', 'ae', 'af']

    >>> DSSv2Adapter.get_prefix_list(prefix=None, start_prefix='aa')
    ['aa', 'ab', 'ac', 'ad', 'ae', 'af', 'b', 'c', 'd', 'e', 'f']

    >>> DSSv2Adapter.get_prefix_list(prefix=None, start_prefix=None)
    """
    if not start_prefix:
        return [prefix] if prefix else None
    elif prefix:
        require(start_prefix.startswith(prefix),
                f'Start prefix {start_prefix!r} must begin with prefix {prefix!r}')
        require(len(start_prefix) > len(prefix),
                f'Start prefix {start_prefix!r} must be longer than prefix {prefix!r}')
    start = start_prefix or prefix
    end = prefix + 'f' if prefix else 'f'
    prefixes = [start]
    prev = start
    while prev != end:
        if (last_char := prev[-1]) != 'f':
            # Increment the last hex digit and emit the result
            prev = prev[:-1] + hex(int(last_char, 16) + 1)[2:]
            prefixes.append(prev)
        else:
            # Drop the trailing 'f' and continue with the shorter prefix
            prev = prev[:-1]
    return prefixes
def reservoir_sample(k: int, it: Iterable[T], *, random: _random.Random = _random) -> List[T]:
    """
    Return a random choice of a given size from an iterable.

    https://stackoverflow.com/a/35671225/4171119

    >>> r = _random.Random(42)
    >>> reservoir_sample(5, '', random=r)
    []

    >>> reservoir_sample(5, 'abcd', random=r)
    ['c', 'b', 'd', 'a']

    >>> reservoir_sample(0, 'abcd', random=r)
    []

    >>> reservoir_sample(5, 'abcdefghijklmnopqrstuvwxyz', random=r)
    ['x', 'l', 'a', 'n', 'b']
    """
    if k == 0:
        return []
    require(k > 0, 'Sample size must not be negative', k, exception=ValueError)
    it = iter(it)
    sample = list(islice(it, k))
    random.shuffle(sample)
    for i, item in enumerate(it, start=k + 1):
        j = random.randrange(i)
        if j < k:
            sample[j] = item
    return sample
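
# A quick empirical sanity check of reservoir_sample, not part of the original
# module: with k=3 out of n=10 over 10,000 runs, each element should be chosen
# with probability k/n = 0.3, i.e. roughly 3,000 times.
from collections import Counter

_rng = _random.Random(0)
_counts = Counter()
for _ in range(10_000):
    _counts.update(reservoir_sample(3, range(10), random=_rng))
print(sorted(_counts.items()))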
def _get_object(self, drs_uri: str, access_method: AccessMethod) -> Access:
    url = self._uri_to_url(drs_uri)
    while True:
        response = self._request(url)
        if response.status == 200:
            # Bundles are not supported, therefore we can expect 'access_methods'
            response = json.loads(response.data)
            access_methods = response['access_methods']
            method = one(m for m in access_methods if m['type'] == access_method.scheme)
            access_url = method.get('access_url')
            access_id = method.get('access_id')
            if access_url is not None and access_id is not None:
                # TDR quirkily uses the GS access method to provide both a GS
                # access URL *and* an access ID that produces an HTTPS signed
                # URL
                #
                # https://github.com/ga4gh/data-repository-service-schemas/issues/360
                # https://github.com/ga4gh/data-repository-service-schemas/issues/361
                require(access_method is AccessMethod.gs, access_method)
                return self._get_object_access(drs_uri, access_id, AccessMethod.https)
            elif access_id is not None:
                return self._get_object_access(drs_uri, access_id, access_method)
            elif access_url is not None:
                require(furl(access_url['url']).scheme == access_method.scheme)
                return Access(method=access_method, url=access_url['url'])
            else:
                raise RequirementError("'access_url' and 'access_id' are both missing")
        elif response.status == 202:
            wait_time = int(response.headers['retry-after'])
            time.sleep(wait_time)
        else:
            raise DRSError(response)
def format_dcp2_datetime(d: datetime) -> str:
    """
    Convert a tz-aware (UTC) datetime into a '2020-01-01T00:00:00.000000Z'
    formatted string.

    >>> from datetime import timezone

    >>> format_dcp2_datetime(datetime(2020, 12, 31, 23, 59, 59, 1, tzinfo=timezone.utc))
    '2020-12-31T23:59:59.000001Z'

    >>> format_dcp2_datetime(datetime(9999, 1, 1, tzinfo=timezone.utc))
    '9999-01-01T00:00:00.000000Z'

    >>> format_dcp2_datetime(datetime(1, 1, 1, tzinfo=timezone.utc))
    '0001-01-01T00:00:00.000000Z'

    >>> format_dcp2_datetime(datetime(2020, 1, 1))
    Traceback (most recent call last):
    ...
    azul.RequirementError: 2020-01-01 00:00:00
    """
    require(str(d.tzinfo) == 'UTC', d)
    date_string = datetime.strftime(d, dcp2_datetime_format)
    # Work around https://bugs.python.org/issue13305: on some platforms,
    # strftime does not zero-pad years below 1000, so left-pad to the fixed
    # 31-character width of the formatted timestamp.
    date_string = ('0000' + date_string)[-31:]
    assert date_string.endswith('+0000'), date_string
    return date_string[:-5] + 'Z'
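
# A quick standard-library illustration of the zero-padding workaround above.
# The format string is assumed to match dcp2_datetime_format; on platforms
# affected by bpo-13305, strftime renders year 1 as '1' rather than '0001',
# and left-padding to the fixed 31-character width restores the layout before
# '+0000' is swapped for 'Z'.
from datetime import datetime, timezone

_fmt = '%Y-%m-%dT%H:%M:%S.%f%z'
_raw = datetime(1, 1, 1, tzinfo=timezone.utc).strftime(_fmt)
_padded = ('0000' + _raw)[-31:]
print(_padded[:-5] + 'Z')  # 0001-01-01T00:00:00.000000Z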
def in_range(minimum: Optional[N], maximum: Optional[N], type_: Optional[TYPE] = None) -> JSON:
    """
    >>> from azul.doctests import assert_json

    >>> assert_json(in_range(1, 2))
    {
        "type": "integer",
        "format": "int64",
        "minimum": 1,
        "maximum": 2
    }

    >>> assert_json(in_range(.5, None))
    {
        "type": "number",
        "format": "double",
        "minimum": 0.5
    }

    >>> assert_json(in_range(None, 2.0))
    {
        "type": "number",
        "format": "double",
        "maximum": 2.0
    }

    >>> assert_json(in_range(minimum=.5, maximum=2))
    Traceback (most recent call last):
    ...
    azul.RequirementError: ('Mismatched argument types', <class 'float'>, <class 'int'>)

    >>> assert_json(in_range())
    Traceback (most recent call last):
    ...
    TypeError: in_range() missing 2 required positional arguments: 'minimum' and 'maximum'

    >>> assert_json(in_range(None, None))
    Traceback (most recent call last):
    ...
    azul.RequirementError: Must pass at least one bound
    """
    if type_ is None:
        types = (type(minimum), type(maximum))
        set_of_types = set(types)
        set_of_types.discard(type(None))
        require(bool(set_of_types), 'Must pass at least one bound')
        require(len(set_of_types) == 1, 'Mismatched argument types', *types)
        type_ = one(set_of_types)
    return {
        **make_type(type_),
        **({} if minimum is None else {'minimum': minimum}),
        **({} if maximum is None else {'maximum': maximum})
    }
def _source_ref_cls(self) -> Type[SOURCE_REF]:
    cls = type(self)
    base_cls = one(getattr(cls, '__orig_bases__'))
    spec_cls, ref_cls = get_args(base_cls)
    require(issubclass(spec_cls, SourceSpec))
    require(issubclass(ref_cls, SourceRef))
    assert ref_cls.spec_cls() is spec_cls
    return ref_cls
def verify_source(self, ref: SOURCE_REF) -> None:
    """
    Verify that the source's ID matches that defined in the repository for
    the source's spec.
    """
    actual_id = self.lookup_source_id(ref.spec)
    require(ref.id == actual_id,
            'Source ID changed unexpectedly', ref, actual_id)
def handle_notification(self, catalog: CatalogName, action: str, request: Request):
    hmac.verify(current_request=request)
    IndexName.validate_catalog_name(catalog, exception=chalice.BadRequestError)
    require(action in ('add', 'delete'), exception=chalice.BadRequestError)
    notification = request.json_body
    log.info('Received notification %r for catalog %r', notification, catalog)
    self._validate_notification(notification)
    return self._handle_notification(action, notification, catalog)
def accumulate(self, value):
    if self.max_size is None or len(self.value) < self.max_size:
        key = self.key(value)
        try:
            old_value = self.value[key]
        except KeyError:
            self.value[key] = value
        else:
            require(old_value == value, old_value, value)
def as_json(self, keys: Iterable[str]) -> JSON:
    keys = set(keys)
    if keys:
        require(keys.issubset(self.all_keys()))
    else:
        keys = self.all_keys()
    json = {k: getattr(self, k) for k in keys}
    json['up'] = all(v['up'] for v in json.values())
    return json
def parse_stratification(self, points: JSON) -> List[Mapping[str, List[str]]]:
    """
    >>> file = File(name='foo.txt', source='', project_id='1234', row_num=1)

    >>> file.parse_stratification({'species': 'human', 'organ': 'blood'})
    [{'species': ['human'], 'organ': ['blood']}]

    >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'blood'})
    [{'species': ['human', 'mouse'], 'organ': ['blood']}]

    >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'human: blood, mouse: brain'})
    [{'species': ['human'], 'organ': ['blood']}, {'species': ['mouse'], 'organ': ['brain']}]

    >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'human: blood'})
    Traceback (most recent call last):
    ...
    azul.RequirementError: Row 1 'organ' values ['human'] differ from parent dimension.

    >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'human: blood, mouse: brain, cat: brain'})
    Traceback (most recent call last):
    ...
    azul.RequirementError: Row 1 'organ' values ['cat', 'human', 'mouse'] differ from parent dimension.
    """
    strata = [{}]
    for dimension, values in points.items():
        if values:
            parsed_values = self.parse_values(values)
            if None in parsed_values:
                # Add the value to all strata
                assert len(parsed_values) == 1, parsed_values
                for stratum in strata:
                    stratum[dimension] = parsed_values[None]
            else:
                # Each value belongs to a separate stratum. Find the stratum
                # with the matching multi-value point and split it into
                # separate strata.
                parents = list(parsed_values.keys())
                for stratum in strata:
                    for dimension_, values_ in stratum.items():
                        if set(parents) == set(values_):
                            stratum[dimension_] = [parents.pop(0)]
                            while len(parents) > 0:
                                new_stratum = deepcopy(stratum)
                                new_stratum[dimension_] = [parents.pop(0)]
                                strata.append(new_stratum)
                # Put each value in its specified stratum
                parents = set(parsed_values.keys())
                for stratum in strata:
                    for parent, values_ in parsed_values.items():
                        if [parent] in stratum.values():
                            stratum[dimension] = values_
                            parents -= {parent}
                require(len(parents) == 0,
                        f'Row {self.row_num} {dimension!r} values {sorted(parents)} '
                        'differ from parent dimension.')
    return strata
def key_resolver(*, key_id, algorithm):
    require(algorithm == 'hmac-sha256', algorithm)
    key, _ = aws.get_hmac_key_and_id_cached(key_id)
    key = key.encode()
    # Since HTTPSignatureAuth.verify doesn't return anything we need to
    # extract the key ID in this round-about way.
    nonlocal _key_id
    _key_id = key_id
    return key
def taggable_resource_types(self) -> Sequence[str]:
    schema = self.schema.document
    require(schema['format_version'] == '0.1')
    resources = chain.from_iterable(
        schema['provider_schemas'][provider]['resource_schemas'].items()
        for provider in schema['provider_schemas']
    )
    return [
        resource_type
        for resource_type, resource in resources
        if 'tags' in resource['block']['attributes']
    ]
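
# A minimal stand-in for the `terraform providers schema -json` document that
# the accessor chain above expects. Only the keys actually read by the code are
# included; the provider, resource type, and attribute details are examples.
example_schema_document = {
    'format_version': '0.1',
    'provider_schemas': {
        'registry.terraform.io/hashicorp/aws': {
            'resource_schemas': {
                'aws_s3_bucket': {
                    'block': {
                        'attributes': {
                            'tags': {'type': ['map', 'string'], 'optional': True}
                        }
                    }
                }
            }
        }
    }
}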
def verify_source(source_spec: TDRSourceSpec):
    source = tdr.lookup_source(source_spec)
    require(source.project == source_spec.project,
            'Actual Google project of TDR source differs from configured one',
            source.project, source_spec.project)
    # Uppercase is standard for multi-regions in the documentation but TDR
    # returns 'us' in lowercase
    require(source.location.lower() == config.tdr_source_location.lower(),
            'Actual storage location of TDR source differs from configured one',
            source.location, config.tdr_source_location)
def _retrieve_entities(self,
                       source: TDRSourceSpec,
                       entity_type: EntityType,
                       entity_ids: Union[Set[EntityID], Set[BundleFQID]],
                       ) -> List[BigQueryRow]:
    """
    Efficiently retrieve multiple entities from BigQuery in a single query.

    :param source: Snapshot containing the entity table

    :param entity_type: The type of entity, corresponding to the table name

    :param entity_ids: For links, the fully qualified UUID and version of
                       each `links` entity. For other entities, just the
                       UUIDs.
    """
    pk_column = entity_type + '_id'
    version_column = 'version'
    non_pk_columns = (TDRBundle.links_columns
                      if entity_type == 'links'
                      else TDRBundle.data_columns
                      if entity_type.endswith('_file')
                      else TDRBundle.metadata_columns)
    assert version_column in non_pk_columns
    table_name = backtick(self._full_table_name(source, entity_type))
    entity_id_type = one(set(map(type, entity_ids)))

    def quote(s):
        return f"'{s}'"

    if entity_type == 'links':
        assert issubclass(entity_id_type, BundleFQID), entity_id_type
        entity_ids = cast(Set[BundleFQID], entity_ids)
        where_columns = (pk_column, version_column)
        where_values = ((quote(fqid.uuid), f'TIMESTAMP({quote(fqid.version)})')
                        for fqid in entity_ids)
        expected = {fqid.uuid for fqid in entity_ids}
    else:
        assert issubclass(entity_id_type, str), (entity_type, entity_id_type)
        where_columns = (pk_column,)
        where_values = ((quote(entity_id),) for entity_id in entity_ids)
        expected = entity_ids
    query = f'''
        SELECT {', '.join({pk_column, *non_pk_columns})}
        FROM {table_name}
        WHERE {self._in(where_columns, where_values)}
    '''
    log.debug('Retrieving %i entities of type %r ...', len(entity_ids), entity_type)
    rows = self._query_latest_version(source, query, group_by=pk_column)
    log.debug('Retrieved %i entities of type %r', len(rows), entity_type)
    missing = expected - {row[pk_column] for row in rows}
    require(not missing,
            f'Required entities not found in {table_name}: {missing}')
    return rows
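
# A minimal sketch of the multi-column IN clause that the _in helper above is
# assumed to render; the real Azul helper may quote or format values
# differently.
from typing import Iterable, Sequence, Tuple


def in_clause(columns: Sequence[str], values: Iterable[Tuple[str, ...]]) -> str:
    rows = ', '.join('(' + ', '.join(row) + ')' for row in values)
    return '(' + ', '.join(columns) + ') IN (' + rows + ')'


# e.g. in_clause(('links_id', 'version'),
#                [("'1234'", "TIMESTAMP('2021-01-01T00:00:00.000000Z')")])
# returns "(links_id, version) IN (('1234', TIMESTAMP('2021-01-01T00:00:00.000000Z')))"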
def _single_resource(self, resources: ResourcePager) -> Optional[Resource]:
    resources = list(resources)
    try:
        resource, *extras = resources
    except ValueError:
        return None
    else:
        require(not extras,
                'Too many resources in path (should be 0 or 1)',
                self._reservation_parent_path, resources)
        return resource
def _parse_links(self,
                 links: JSONs,
                 project: EntityReference
                 ) -> Tuple[Entities, Entities, Entities]:
    """
    Collects inputs, outputs, and other entities referenced in the subgraph
    links.

    :param links: The "links" property of a links.json file.

    :param project: The project for the bundle defined by these links.

    :return: A tuple of (1) a set of all entities found in the links, (2) the
             subset of those entities that occur as inputs and (3) those that
             occur as outputs.
    """
    entities = set()
    inputs = set()
    outputs = set()
    entities.add(project)
    for link in links:
        link_type = link['link_type']
        if link_type == 'process_link':
            process = EntityReference(entity_type=link['process_type'],
                                      entity_id=link['process_id'])
            entities.add(process)
            for category in ('input', 'output', 'protocol'):
                for entity in cast(JSONs, link[category + 's']):
                    entity = EntityReference(entity_id=entity[category + '_id'],
                                             entity_type=entity[category + '_type'])
                    entities.add(entity)
                    if category == 'input':
                        inputs.add(entity)
                    elif category == 'output':
                        outputs.add(entity)
        elif link_type == 'supplementary_file_link':
            associate = EntityReference(entity_type=link['entity']['entity_type'],
                                        entity_id=link['entity']['entity_id'])
            # For MVP, only project entities can have associated supplementary files.
            require(associate == project,
                    'Supplementary file must be associated with the current project',
                    project, associate)
            for supplementary_file in cast(JSONs, link['files']):
                entities.add(EntityReference(entity_type='supplementary_file',
                                             entity_id=supplementary_file['file_id']))
        else:
            raise RequirementError('Unexpected link_type', link_type)
    return entities, inputs, outputs
def check_bundle_manifest(self):
    """
    Verify that the bundle manifest contains the required files.
    """
    missing_files = []
    if 'project_0.json' not in self.manifest_entries:
        missing_files.append('project_0.json')
    if 'links.json' not in self.manifest_entries:
        missing_files.append('links.json')
    reject(bool(missing_files),
           f'File(s) {missing_files} not found in bundle {self.bundle_fqid}')
    for file_name, file_content in self.indexed_files.items():
        require('describedBy' in file_content,
                '"describedBy" missing from file',
                file_name, self.bundle_fqid)
def _parse_staging_area(self) -> Tuple[str, str]:
    """
    Validate and parse the given staging area URL into bucket and path values.
    The path value will not have a leading '/' and will have a trailing '/'
    if not empty.
    """
    split_url = parse.urlsplit(self.args.staging_area)
    require(split_url.scheme == 'gs' and split_url.netloc,
            'Staging area URL must be in gs://<bucket>[/<path>] format')
    reject(split_url.path.endswith('/'),
           'Staging area URL must not end with a "/"')
    if split_url.path:
        path = split_url.path.lstrip('/') + '/'
    else:
        path = ''
    return split_url.netloc, path
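
# A quick standard-library illustration of the parsing above: urlsplit treats
# 'gs' like any other scheme, so the bucket lands in netloc and the object
# prefix in path. The bucket name here is a made-up example.
from urllib.parse import urlsplit

_url = urlsplit('gs://example-staging-bucket/some/prefix')
assert _url.scheme == 'gs'
assert _url.netloc == 'example-staging-bucket'
assert _url.path == '/some/prefix'
# After normalization this yields ('example-staging-bucket', 'some/prefix/').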
def decrement(self, value, timeout=None):
    require(isinstance(value, int))
    self.condition.acquire()
    try:
        self.value -= value
        if self.value > 0:
            while True:
                if self.condition.wait(timeout=timeout):
                    if self.value <= 0:
                        break
                else:
                    raise TimeoutError
        else:
            self.condition.notifyAll()
    finally:
        self.condition.release()
def _parse_file_id_column(self, file_id: Optional[str]) -> Optional[str]:
    # The file_id column is present for datasets, but is usually null, may
    # contain unexpected/unusable values, and NEVER produces usable DRS URLs,
    # so we avoid parsing the column altogether for datasets.
    if self.fqid.source.spec.is_snapshot:
        reject(file_id is None)
        # TDR stores the complete DRS URI in the file_id column, but we only
        # index the path component. These requirements prevent mismatches in
        # the DRS domain, and ensure that changes to the column syntax don't
        # go undetected.
        file_id = furl(file_id)
        require(file_id.scheme == 'drs')
        require(file_id.netloc == furl(config.tdr_service_url).netloc)
        return str(file_id.path).strip('/')
    else:
        return None
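
# A small illustration of the path extraction above, using a hypothetical TDR
# DRS URI; in the real code the expected host comes from config.tdr_service_url.
from furl import furl

_uri = furl('drs://data.example.org/v1_1234abcd_5678efgh')
assert _uri.scheme == 'drs'
assert str(_uri.path).strip('/') == 'v1_1234abcd_5678efgh'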