Example #1
def backtick(table_name: str) -> str:
    """
    Return the given string surrounded by backticks if deemed necessary based
    on a simplified interpretation of BigQuery's lexical structure and syntax
    for identifier tokens.

    https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical

    >>> backtick('foo.bar.my_table')
    'foo.bar.my_table'

    >>> backtick('foo2.bar.my_table')
    'foo2.bar.my_table'

    >>> backtick('foo-2.bar.my_table')
    '`foo-2.bar.my_table`'

    >>> backtick('foo-2.bar`s.my_table')
    Traceback (most recent call last):
    ...
    azul.RequirementError: foo-2.bar`s.my_table
    """
    if table_name_re.fullmatch(table_name):
        return table_name
    else:
        require('`' not in table_name, table_name)
        return f'`{table_name}`'
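
Every example on this page leans on azul's require and reject helpers and the
RequirementError they raise, but none of the snippets define them. Here is a
minimal sketch of the assumed semantics, written to be consistent with the
doctests above, not the actual definitions in the azul package:

class RequirementError(RuntimeError):
    """
    Raised when an input fails a requirement check.
    """

def require(condition: bool, *args, exception: type = RequirementError):
    # Raise the given exception type with the given arguments unless the
    # condition holds. The backtick() doctest shows how the arguments end up
    # in the message: "azul.RequirementError: foo-2.bar`s.my_table".
    if not condition:
        raise exception(*args)

def reject(condition: bool, *args, exception: type = RequirementError):
    # The inverse of require: raise if the condition holds.
    if condition:
        raise exception(*args)

Example #1 also references table_name_re without defining it. A hypothetical
definition consistent with its doctests:

import re
table_name_re = re.compile(r'[a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*')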
Example #2
 def snapshot_names_by_id(self) -> Dict[str, str]:
     """
     List the TDR snapshots accessible to the current credentials.
     """
     endpoint = self._repository_endpoint('snapshots')
     snapshots = []
     # FIXME: Defend against concurrent changes while listing snapshots
     #        https://github.com/DataBiosphere/azul/issues/3979
     while True:
         response = self._request('GET',
                                  endpoint,
                                  fields={
                                      'offset': len(snapshots),
                                      'limit': self.page_size,
                                      'sort': 'created_date',
                                      'direction': 'asc'
                                  })
         response = self._check_response(endpoint, response)
         new_snapshots = response['items']
         if new_snapshots:
             snapshots += new_snapshots
         else:
             total = response['filteredTotal']
             require(len(snapshots) == total, snapshots, total)
             break
     return {snapshot['id']: snapshot['name'] for snapshot in snapshots}
Example #3
 def update(self, plugin: RepositoryPlugin) -> None:
     require(self.replica is None or self.replica == 'gcp')
     assert self.drs_path is not None
     drs_uri = plugin.drs_uri(self.drs_path)
     drs_client = plugin.drs_client()
     access = drs_client.get_object(drs_uri, access_method=AccessMethod.gs)
     assert access.headers is None
     url = furl(access.url)
     blob_name = '/'.join(url.path.segments)
     # https://github.com/databiosphere/azul/issues/2479#issuecomment-733410253
     if url.fragmentstr:
         blob_name += '#' + unquote(url.fragmentstr)
     else:
         # furl does not differentiate between no fragment and empty
         # fragment
         if access.url.endswith('#'):
             blob_name += '#'
     blob = self._get_blob(bucket_name=url.netloc, blob_name=blob_name)
     expiration = int(time.time() + 3600)
     file_name = self.file_name.replace('"', r'\"')
     assert all(0x1f < ord(c) < 0x80 for c in file_name)
     disposition = f"attachment; filename={file_name}"
     signed_url = blob.generate_signed_url(expiration=expiration,
                                           response_disposition=disposition)
     self._location = signed_url
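
The fragment handling above works around a furl quirk that is worth seeing in
isolation. A quick sketch (behavior observed with furl 2.x; treat it as an
assumption for other versions):

from furl import furl

# Both URLs yield an empty fragment string, which is why the code above has
# to fall back to inspecting the raw URL for a trailing '#':
assert furl('gs://bucket/key').fragmentstr == ''
assert furl('gs://bucket/key#').fragmentstr == ''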
Example #4
def verify_source_access():
    public_snapshots = set(public_tdr.snapshot_names_by_id())
    all_snapshots = set(tdr.snapshot_names_by_id())
    diff = public_snapshots - all_snapshots
    require(not diff,
            'The public service account can access snapshots that the indexer '
            'service account cannot', diff)
Example #5
 def _uri_to_url(self,
                 drs_uri: str,
                 access_id: Optional[str] = None) -> str:
     """
     Translate a DRS URI into a DRS URL. All query params included in the
     DRS URI (e.g. '{drs_uri}?version=123') are carried over to the DRS URL.
     Only hostname-based DRS URIs (drs://<hostname>/<id>) are supported;
     compact, identifier-based URIs (drs://[provider_code/]namespace:accession)
     are not.
     """
     parsed = furl(drs_uri)
     scheme = 'drs'
     require(parsed.scheme == scheme,
             f'The URI {drs_uri!r} does not have the {scheme!r} scheme')
     # "The colon character is not allowed in a hostname-based DRS URI".
     # https://ga4gh.github.io/data-repository-service-schemas/preview/develop/docs/#_drs_uris
     # It is worth noting that compact identifier-based URIs can be hard to
     # parse under RFC 3986: the 'namespace:accession' part matches either
     # the hier-part or the path production, depending on whether the
     # optional provider code and following slash are included.
     reject(':' in parsed.netloc or ':' in str(parsed.path),
            f'The DRS URI {drs_uri!r} is not hostname-based')
     parsed.scheme = 'https'
     object_id = one(parsed.path.segments)
     parsed.path.set(drs_object_url_path(object_id, access_id))
     return parsed.url
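
A hypothetical round trip, assuming that drs_object_url_path(object_id,
access_id) renders the standard DRS request path
'ga4gh/drs/v1/objects/{object_id}':

# client._uri_to_url('drs://repo.example.org/abc123?version=7')
# -> 'https://repo.example.org/ga4gh/drs/v1/objects/abc123?version=7'
#
# client._uri_to_url('drs://repo.example.org/ns:acc')
# -> RequirementError: The DRS URI 'drs://repo.example.org/ns:acc' is not hostname-based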
Example #6
 def _donor(self, donor: api.DonorOrganism) -> MutableJSON:
     if donor.organism_age is None:
         require(donor.organism_age_unit is None)
         organism_age = None
     else:
         organism_age = {
             'value': donor.organism_age,
             'unit': donor.organism_age_unit
         }
     return {
         'document_id': str(donor.document_id),
         'biomaterial_id': donor.biomaterial_id,
         'biological_sex': donor.sex,
         'genus_species': sorted(donor.genus_species),
         'development_stage': donor.development_stage,
         'diseases': sorted(donor.diseases),
         'organism_age': organism_age,
         'organism_age_value': donor.organism_age,
         'organism_age_unit': donor.organism_age_unit,
         **({
             'organism_age_range': {
                 'gte': donor.organism_age_in_seconds.min,
                 'lte': donor.organism_age_in_seconds.max
             }
         } if donor.organism_age_in_seconds else {})
     }
Example #7
    def run(self):
        pip_deps = self.get_direct_reqs(self.pip)
        direct_runtime_reqs = self.get_direct_reqs(self.runtime)
        direct_build_reqs = self.get_direct_reqs(self.build)
        dupes = direct_build_reqs & direct_runtime_reqs
        require(not dupes,
                'Some requirements are declared as both run and build time',
                dupes)

        build_reqs = self.get_reqs(self.build) - pip_deps
        runtime_reqs = self.get_reqs(self.runtime) - pip_deps
        require(runtime_reqs <= build_reqs,
                'Runtime requirements are not a subset of build requirements',
                runtime_reqs - build_reqs)
        overlap = build_reqs & runtime_reqs
        ambiguities = PinnedRequirements(req for req in overlap
                                         if len(req.versions) > 1)
        for req in ambiguities:
            build_req = build_reqs[req]
            log.warning(
                'Pinning transitive runtime requirement %s to %s, the '
                'version resolved at build time.', req, build_req.versions)
            runtime_reqs[req] = build_req

        build_only_reqs = build_reqs - runtime_reqs
        transitive_build_reqs = build_only_reqs - direct_build_reqs
        transitive_runtime_reqs = runtime_reqs - direct_runtime_reqs
        assert not transitive_build_reqs & transitive_runtime_reqs
        self.write_transitive_reqs(transitive_build_reqs, self.build)
        self.write_transitive_reqs(transitive_runtime_reqs, self.runtime)
Example #8
    def parse_stratification(self, line_num: int, species: str, stage: str,
                             organ: str,
                             library: str) -> List[Mapping[str, List[str]]]:
        """
        >>> file = File('foo.txt', '')
        >>> file.parse_stratification(9, 'human', 'adult', 'blood', '10x')
        [{'species': ['human'], 'stage': ['adult'], 'organ': ['blood'], 'library': ['10x']}]

        >>> file.parse_stratification(9, 'human, mouse', 'adult', 'blood', '10x')
        [{'species': ['human', 'mouse'], 'stage': ['adult'], 'organ': ['blood'], 'library': ['10x']}]

        >>> file.parse_stratification(9, 'human, mouse', 'human: adult, mouse: child', 'blood', '10x')
        [{'species': ['human'], 'stage': ['adult'], 'organ': ['blood'], 'library': ['10x']}, \
{'species': ['mouse'], 'stage': ['child'], 'organ': ['blood'], 'library': ['10x']}]

        >>> file.parse_stratification(9, 'human, mouse', 'human: adult', 'blood', '10x')
        Traceback (most recent call last):
        ...
        azul.RequirementError: Error with line 9 'stage' keys ['human'].

        >>> file.parse_stratification(9, 'human, mouse', 'human: adult, mouse: child, cat: kitten', 'blood', '10x')
        Traceback (most recent call last):
        ...
        azul.RequirementError: Error with line 9 'stage' keys ['cat', 'human', 'mouse'].
        """
        strats = [{}]
        pairs = (('species', species), ('stage', stage), ('organ', organ),
                 ('library', library))
        for category, value in pairs:
            if value:
                parsed = self.parse_strat(value)
                if None in parsed:
                    # value applies to all
                    assert len(parsed) == 1, parsed
                    for strat in strats:
                        strat[category] = parsed[None]
                else:
                    # value applies to one
                    # find the dict with a multi-value field we need to split
                    keys = list(parsed.keys())
                    for strat in strats:
                        for cat, val in strat.items():
                            if set(keys) == set(val):
                                strat[cat] = [keys.pop(0)]
                                while len(keys) > 0:
                                    new_strat = deepcopy(strat)
                                    new_strat[cat] = [keys.pop(0)]
                                    strats.append(new_strat)
                    # put each value in the appropriate dict
                    keys = set(parsed.keys())
                    for strat in strats:
                        for k, v in parsed.items():
                            if [k] in strat.values():
                                strat[category] = v
                                keys -= {k}
                    require(
                        len(keys) == 0,
                        f'Error with line {line_num} {category!r} keys {sorted(keys)}.'
                    )
        return strats
Example #9
 def _find_upstream_bundles(self, source: TDRSourceRef,
                            outputs: Entities) -> Set[SourcedBundleFQID]:
     """
     Search for bundles containing processes that produce the specified output
     entities.
     """
     output_ids = [output.entity_id for output in outputs]
     output_id = 'JSON_EXTRACT_SCALAR(link_output, "$.output_id")'
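     # Note that interpolating output_ids, a Python list of strings, renders
     # as e.g. ['id1', 'id2'], which BigQuery also accepts as an array
     # literal for the IN UNNEST(...) condition below.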
     rows = self._run_sql(f'''
         SELECT links_id, version, {output_id} AS output_id
         FROM {backtick(self._full_table_name(source.spec, 'links'))} AS links
             JOIN UNNEST(JSON_EXTRACT_ARRAY(links.content, '$.links')) AS content_links
                 ON JSON_EXTRACT_SCALAR(content_links, '$.link_type') = 'process_link'
             JOIN UNNEST(JSON_EXTRACT_ARRAY(content_links, '$.outputs')) AS link_output
                 ON {output_id} IN UNNEST({output_ids})
     ''')
     bundles = set()
     outputs_found = set()
     for row in rows:
         bundles.add(
             SourcedBundleFQID(source=source,
                               uuid=row['links_id'],
                               version=self.format_version(row['version'])))
         outputs_found.add(row['output_id'])
     missing = set(output_ids) - outputs_found
     require(not missing,
             f'Dangling inputs not found in any bundle: {missing}')
     return bundles
Example #10
    def get_prefix_list(cls, prefix: str = None, start_prefix: str = None):
        """
        Generate ascending hex prefixes.

        >>> DSSv2Adapter.get_prefix_list(prefix='aa', start_prefix=None)
        ['aa']

        >>> DSSv2Adapter.get_prefix_list(prefix='a', start_prefix='aa')
        ['aa', 'ab', 'ac', 'ad', 'ae', 'af']

        >>> DSSv2Adapter.get_prefix_list(prefix=None, start_prefix='aa')
        ['aa', 'ab', 'ac', 'ad', 'ae', 'af', 'b', 'c', 'd', 'e', 'f']

        >>> DSSv2Adapter.get_prefix_list(prefix=None, start_prefix=None)
        """
        if not start_prefix:
            return [prefix] if prefix else None
        elif prefix:
            require(start_prefix.startswith(prefix),
                    f'Start prefix {start_prefix!r} must begin with prefix {prefix!r}')
            require(len(start_prefix) > len(prefix),
                    f'Start prefix {start_prefix!r} must be longer than prefix {prefix!r}')
        start = start_prefix or prefix
        end = prefix + 'f' if prefix else 'f'
        prefixes = [start]
        prev = start
        while prev != end:
            if (last_char := prev[-1]) != 'f':
                prev = prev[:-1] + hex(int(last_char, 16) + 1)[2:]
                prefixes.append(prev)
            else:
                prev = prev[:-1]
        return prefixes
Example #11
def reservoir_sample(k: int,
                     it: Iterable[T],
                     *,
                     random: _random.Random = _random) -> List[T]:
    """
    Return a random choice of a given size from an iterable.

    https://stackoverflow.com/a/35671225/4171119

    >>> r = _random.Random(42)

    >>> reservoir_sample(5, '', random=r)
    []

    >>> reservoir_sample(5, 'abcd', random=r)
    ['c', 'b', 'd', 'a']

    >>> reservoir_sample(0, 'abcd', random=r)
    []

    >>> reservoir_sample(5, 'abcdefghijklmnopqrstuvwxyz', random=r)
    ['x', 'l', 'a', 'n', 'b']
    """
    if k == 0:
        return []
    require(k > 0, 'Sample size must not be negative', k, exception=ValueError)
    it = iter(it)
    sample = list(islice(it, k))
    random.shuffle(sample)
    for i, item in enumerate(it, start=k + 1):
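        # "Algorithm R" step: the i-th item seen (1-based) replaces a random
        # reservoir slot with probability k/i, which gives every item an
        # equal k/n chance of ending up in the sample.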
        j = random.randrange(i)
        if j < k:
            sample[j] = item
    return sample
Example #12
 def _get_object(self, drs_uri: str, access_method: AccessMethod) -> Access:
     url = self._uri_to_url(drs_uri)
     while True:
         response = self._request(url)
         if response.status == 200:
             # Bundles are not supported, so we can expect 'access_methods'
             response = json.loads(response.data)
             access_methods = response['access_methods']
             method = one(m for m in access_methods if m['type'] == access_method.scheme)
             access_url = method.get('access_url')
             access_id = method.get('access_id')
             if access_url is not None and access_id is not None:
                 # TDR quirkily uses the GS access method to provide both a
                 # GS access URL *and* an access ID that produces an HTTPS
                 # signed URL
                 #
                 # https://github.com/ga4gh/data-repository-service-schemas/issues/360
                 # https://github.com/ga4gh/data-repository-service-schemas/issues/361
                 require(access_method is AccessMethod.gs, access_method)
                 return self._get_object_access(drs_uri, access_id, AccessMethod.https)
             elif access_id is not None:
                 return self._get_object_access(drs_uri, access_id, access_method)
             elif access_url is not None:
                 require(furl(access_url['url']).scheme == access_method.scheme)
                 return Access(method=access_method,
                               url=access_url['url'])
             else:
                 raise RequirementError("'access_url' and 'access_id' are both missing")
         elif response.status == 202:
             wait_time = int(response.headers['retry-after'])
             time.sleep(wait_time)
         else:
             raise DRSError(response)
Example #13
def format_dcp2_datetime(d: datetime) -> str:
    """
    Convert a tz-aware (UTC) datetime into a '2020-01-01T00:00:00.000000Z'
    formatted string.

    >>> from datetime import timezone
    >>> format_dcp2_datetime(datetime(2020, 12, 31, 23, 59, 59, 1, tzinfo=timezone.utc))
    '2020-12-31T23:59:59.000001Z'

    >>> format_dcp2_datetime(datetime(9999, 1, 1, tzinfo=timezone.utc))
    '9999-01-01T00:00:00.000000Z'

    >>> format_dcp2_datetime(datetime(1, 1, 1, tzinfo=timezone.utc))
    '0001-01-01T00:00:00.000000Z'

    >>> format_dcp2_datetime(datetime(2020, 1, 1))
    Traceback (most recent call last):
    ...
    azul.RequirementError: 2020-01-01 00:00:00
    """
    require(str(d.tzinfo) == 'UTC', d)
    date_string = datetime.strftime(d, dcp2_datetime_format)
    # Work around https://bugs.python.org/issue13305
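    # (strftime can render years below 1000 without zero padding; the correct
    # result before the suffix swap is exactly 31 characters, e.g.
    # '2020-12-31T23:59:59.000001+0000', hence the left-pad and trim.)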
    date_string = ('0000' + date_string)[-31:]
    assert date_string.endswith('+0000'), date_string
    return date_string[:-5] + 'Z'
Example #14
def in_range(minimum: Optional[N],
             maximum: Optional[N],
             type_: Optional[TYPE] = None) -> JSON:
    """
    >>> from azul.doctests import assert_json

    >>> assert_json(in_range(1, 2))
    {
        "type": "integer",
        "format": "int64",
        "minimum": 1,
        "maximum": 2
    }

    >>> assert_json(in_range(.5, None))
    {
        "type": "number",
        "format": "double",
        "minimum": 0.5
    }

    >>> assert_json(in_range(None, 2.0))
    {
        "type": "number",
        "format": "double",
        "maximum": 2.0
    }

    >>> assert_json(in_range(minimum=.5, maximum=2))
    Traceback (most recent call last):
    ...
    azul.RequirementError: ('Mismatched argument types', <class 'float'>, <class 'int'>)

    >>> assert_json(in_range())
    Traceback (most recent call last):
    ...
    TypeError: in_range() missing 2 required positional arguments: 'minimum' and 'maximum'

    >>> assert_json(in_range(None, None))
    Traceback (most recent call last):
    ...
    azul.RequirementError: Must pass at least one bound
    """
    if type_ is None:
        types = (type(minimum), type(maximum))
        set_of_types = set(types)
        set_of_types.discard(type(None))
        require(bool(set_of_types), 'Must pass at least one bound')
        require(len(set_of_types) == 1, 'Mismatched argument types', *types)
        type_ = one(set_of_types)
    return {
        **make_type(type_),
        **({} if minimum is None else {
               'minimum': minimum
           }),
        **({} if maximum is None else {
               'maximum': maximum
           })
    }
Example #15
 def _source_ref_cls(self) -> Type[SOURCE_REF]:
     cls = type(self)
     base_cls = one(getattr(cls, '__orig_bases__'))
     spec_cls, ref_cls = get_args(base_cls)
     require(issubclass(spec_cls, SourceSpec))
     require(issubclass(ref_cls, SourceRef))
     assert ref_cls.spec_cls() is spec_cls
     return ref_cls
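
For orientation, the kind of subclass declaration this introspection expects
looks roughly like the following; the concrete names are hypothetical:

# class TDRPlugin(RepositoryPlugin[TDRSourceSpec, TDRSourceRef]):
#     ...
#
# For such a class, one(cls.__orig_bases__) is the parameterized base
# RepositoryPlugin[TDRSourceSpec, TDRSourceRef], and typing.get_args() on it
# yields the pair (TDRSourceSpec, TDRSourceRef).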
Example #16
 def verify_source(self, ref: SOURCE_REF) -> None:
     """
     Verify that the source's ID matches that defined in the
     repository for the source's spec.
     """
     actual_id = self.lookup_source_id(ref.spec)
     require(ref.id == actual_id, 'Source ID changed unexpectedly', ref,
             actual_id)
Example #17
 def handle_notification(self, catalog: CatalogName, action: str, request: Request):
     hmac.verify(current_request=request)
     IndexName.validate_catalog_name(catalog, exception=chalice.BadRequestError)
     require(action in ('add', 'delete'), exception=chalice.BadRequestError)
     notification = request.json_body
     log.info('Received notification %r for catalog %r', notification, catalog)
     self._validate_notification(notification)
     return self._handle_notification(action, notification, catalog)
Example #18
 def accumulate(self, value):
     if self.max_size is None or len(self.value) < self.max_size:
         key = self.key(value)
         try:
             old_value = self.value[key]
         except KeyError:
             self.value[key] = value
         else:
             require(old_value == value, old_value, value)
Example #19
 def as_json(self, keys: Iterable[str]) -> JSON:
     keys = set(keys)
     if keys:
         require(keys.issubset(self.all_keys()))
     else:
         keys = self.all_keys()
     json = {k: getattr(self, k) for k in keys}
     json['up'] = all(v['up'] for v in json.values())
     return json
Example #20
    def parse_stratification(self,
                             points: JSON) -> List[Mapping[str, List[str]]]:
        """
        >>> file = File(name='foo.txt', source='', project_id='1234', row_num=1)
        >>> file.parse_stratification({'species': 'human', 'organ': 'blood'})
        [{'species': ['human'], 'organ': ['blood']}]

        >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'blood'})
        [{'species': ['human', 'mouse'], 'organ': ['blood']}]

        >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'human: blood, mouse: brain'})
        [{'species': ['human'], 'organ': ['blood']}, {'species': ['mouse'], 'organ': ['brain']}]

        >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'human: blood'})
        Traceback (most recent call last):
        ...
        azul.RequirementError: Row 1 'organ' values ['human'] differ from parent dimension.

        >>> file.parse_stratification({'species': 'human, mouse', 'organ': 'human: blood, mouse: brain, cat: brain'})
        Traceback (most recent call last):
        ...
        azul.RequirementError: Row 1 'organ' values ['cat', 'human', 'mouse'] differ from parent dimension.
        """
        strata = [{}]
        for dimension, values in points.items():
            if values:
                parsed_values = self.parse_values(values)
                if None in parsed_values:
                    # Add the value to all strata
                    assert len(parsed_values) == 1, parsed_values
                    for stratum in strata:
                        stratum[dimension] = parsed_values[None]
                else:
                    # Each value belongs to a separate stratum. Find the
                    # stratum with the matching multi-value point and split
                    # it into separate strata.
                    parents = list(parsed_values.keys())
                    for stratum in strata:
                        for dimension_, values_ in stratum.items():
                            if set(parents) == set(values_):
                                stratum[dimension_] = [parents.pop(0)]
                                while len(parents) > 0:
                                    new_stratum = deepcopy(stratum)
                                    new_stratum[dimension_] = [parents.pop(0)]
                                    strata.append(new_stratum)
                    # Put each value in its specified stratum
                    parents = set(parsed_values.keys())
                    for stratum in strata:
                        for parent, values_ in parsed_values.items():
                            if [parent] in stratum.values():
                                stratum[dimension] = values_
                                parents -= {parent}
                    require(
                        len(parents) == 0,
                        f'Row {self.row_num} {dimension!r} values {sorted(parents)} '
                        'differ from parent dimension.')
        return strata
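
To make the splitting step concrete, here is how the third doctest above
evolves internally, traced by hand from the code:

# points: {'species': 'human, mouse', 'organ': 'human: blood, mouse: brain'}
# after 'species' is applied:  [{'species': ['human', 'mouse']}]
# 'organ' parses to {'human': ['blood'], 'mouse': ['brain']}; the stratum
# whose 'species' equals the parent keys is split in two:
#                              [{'species': ['human']}, {'species': ['mouse']}]
# each organ value then lands in its matching stratum:
#                              [{'species': ['human'], 'organ': ['blood']},
#                               {'species': ['mouse'], 'organ': ['brain']}]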
Example #21
 def key_resolver(*, key_id, algorithm):
     require(algorithm == 'hmac-sha256', algorithm)
     key, _ = aws.get_hmac_key_and_id_cached(key_id)
     key = key.encode()
     # Since HTTPSignatureAuth.verify doesn't return anything, we need to
     # extract the key ID in this roundabout way.
     nonlocal _key_id
     _key_id = key_id
     return key
Example #22
 def taggable_resource_types(self) -> Sequence[str]:
     schema = self.schema.document
     require(schema['format_version'] == '0.1')
     resources = chain.from_iterable(
         schema['provider_schemas'][provider]['resource_schemas'].items()
         for provider in schema['provider_schemas'])
     return [
         resource_type for resource_type, resource in resources
         if 'tags' in resource['block']['attributes']
     ]
Example #23
def verify_source(source_spec: TDRSourceSpec):
    source = tdr.lookup_source(source_spec)
    require(source.project == source_spec.project,
            'Actual Google project of TDR source differs from configured one',
            source.project, source_spec.project)
    # Uppercase is standard for multi-regions in the documentation but TDR
    # returns 'us' in lowercase
    require(source.location.lower() == config.tdr_source_location.lower(),
            'Actual storage location of TDR source differs from configured one',
            source.location, config.tdr_source_location)
Example #24
    def _retrieve_entities(
        self,
        source: TDRSourceSpec,
        entity_type: EntityType,
        entity_ids: Union[Set[EntityID], Set[BundleFQID]],
    ) -> List[BigQueryRow]:
        """
        Efficiently retrieve multiple entities from BigQuery in a single query.

        :param source: Snapshot containing the entity table

        :param entity_type: The type of entity, corresponding to the table name

        :param entity_ids: For links, the fully qualified UUID and version of
                           each `links` entity. For other entities, just the UUIDs.
        """
        pk_column = entity_type + '_id'
        version_column = 'version'
        if entity_type == 'links':
            non_pk_columns = TDRBundle.links_columns
        elif entity_type.endswith('_file'):
            non_pk_columns = TDRBundle.data_columns
        else:
            non_pk_columns = TDRBundle.metadata_columns
        assert version_column in non_pk_columns
        table_name = backtick(self._full_table_name(source, entity_type))
        entity_id_type = one(set(map(type, entity_ids)))

        def quote(s):
            return f"'{s}'"

        if entity_type == 'links':
            assert issubclass(entity_id_type, BundleFQID), entity_id_type
            entity_ids = cast(Set[BundleFQID], entity_ids)
            where_columns = (pk_column, version_column)
            where_values = ((quote(fqid.uuid),
                             f'TIMESTAMP({quote(fqid.version)})')
                            for fqid in entity_ids)
            expected = {fqid.uuid for fqid in entity_ids}
        else:
            assert issubclass(entity_id_type, str), (entity_type, entity_id_type)
            where_columns = (pk_column,)
            where_values = ((quote(entity_id),) for entity_id in entity_ids)
            expected = entity_ids
        query = f'''
            SELECT {', '.join({pk_column, *non_pk_columns})}
            FROM {table_name}
            WHERE {self._in(where_columns, where_values)}
        '''
        log.debug('Retrieving %i entities of type %r ...', len(entity_ids),
                  entity_type)
        rows = self._query_latest_version(source, query, group_by=pk_column)
        log.debug('Retrieved %i entities of type %r', len(rows), entity_type)
        missing = expected - {row[pk_column] for row in rows}
        require(not missing,
                f'Required entities not found in {table_name}: {missing}')
        return rows
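
The _in helper is not shown here; judging from its arguments, the rendered
WHERE clause for the links case presumably looks like:

#     WHERE (links_id, version) IN (
#         ('uuid-1', TIMESTAMP('2020-01-01T00:00:00.000000Z')),
#         ('uuid-2', TIMESTAMP('2020-01-02T00:00:00.000000Z'))
#     )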
Example #25
 def _single_resource(self, resources: ResourcePager) -> Optional[Resource]:
     resources = list(resources)
     try:
         resource, *extras = resources
     except ValueError:
         return None
     else:
         require(not extras,
                 'Too many resources in path (should be 0 or 1)',
                 self._reservation_parent_path, resources)
         return resource
Example #26
    def _parse_links(
            self, links: JSONs,
            project: EntityReference) -> Tuple[Entities, Entities, Entities]:
        """
        Collects inputs, outputs, and other entities referenced in the subgraph
        links.

        :param links: The "links" property of a links.json file.

        :param project: The project for the bundle defined by these links.

        :return: A tuple of (1) a set of all entities found in the links, (2)
                 the subset of those entities that occur as inputs and (3)
                 those that occur as outputs.
        """
        entities = set()
        inputs = set()
        outputs = set()
        entities.add(project)
        for link in links:
            link_type = link['link_type']
            if link_type == 'process_link':
                process = EntityReference(entity_type=link['process_type'],
                                          entity_id=link['process_id'])
                entities.add(process)
                for category in ('input', 'output', 'protocol'):
                    for entity in cast(JSONs, link[category + 's']):
                        entity = EntityReference(
                            entity_id=entity[category + '_id'],
                            entity_type=entity[category + '_type'])
                        entities.add(entity)
                        if category == 'input':
                            inputs.add(entity)
                        elif category == 'output':
                            outputs.add(entity)
            elif link_type == 'supplementary_file_link':
                associate = EntityReference(
                    entity_type=link['entity']['entity_type'],
                    entity_id=link['entity']['entity_id'])
                # For MVP, only project entities can have associated supplementary files.
                require(
                    associate == project,
                    'Supplementary file must be associated with the current project',
                    project, associate)
                for supplementary_file in cast(JSONs, link['files']):
                    entities.add(
                        EntityReference(
                            entity_type='supplementary_file',
                            entity_id=supplementary_file['file_id']))
            else:
                raise RequirementError('Unexpected link_type', link_type)
        return entities, inputs, outputs
Example #27
 def check_bundle_manifest(self):
     """
     Verify that the bundle manifest contains the required files.
     """
     missing_files = []
     if 'project_0.json' not in self.manifest_entries:
         missing_files.append('project_0.json')
     if 'links.json' not in self.manifest_entries:
         missing_files.append('links.json')
     reject(bool(missing_files),
            f'File(s) {missing_files} not found in bundle {self.bundle_fqid}')
     for file_name, file_content in self.indexed_files.items():
         require('describedBy' in file_content,
                 '"describedBy" missing from file', file_name, self.bundle_fqid)
Example #28
 def _parse_staging_area(self) -> Tuple[str, str]:
     """
     Validate and parse the given staging area URL into bucket and path
     values. The path value will have no leading '/' and a trailing '/'
     unless it is empty.
     """
     split_url = parse.urlsplit(self.args.staging_area)
     require(split_url.scheme == 'gs' and split_url.netloc,
             'Staging area URL must be in gs://<bucket>[/<path>] format')
     reject(split_url.path.endswith('/'),
            'Staging area URL must not end with a "/"')
     if split_url.path:
         path = split_url.path.lstrip('/') + '/'
     else:
         path = ''
     return split_url.netloc, path
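
Hypothetical inputs and results, derived from the code above:

# 'gs://my-bucket'            -> ('my-bucket', '')
# 'gs://my-bucket/some/path'  -> ('my-bucket', 'some/path/')
# 'gs://my-bucket/some/path/' -> RequirementError (URL ends with '/')
# 'https://my-bucket/path'    -> RequirementError (scheme is not 'gs')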
Example #29
 def decrement(self, value, timeout=None):
     require(isinstance(value, int))
     self.condition.acquire()
     try:
         self.value -= value
         if self.value > 0:
             while True:
                 if self.condition.wait(timeout=timeout):
                     if self.value <= 0:
                         break
                 else:
                     raise TimeoutError
         else:
             self.condition.notify_all()
     finally:
         self.condition.release()
Example #30
 def _parse_file_id_column(self, file_id: Optional[str]) -> Optional[str]:
     # The file_id column is present for datasets, but is usually null, may
     # contain unexpected/unusable values, and NEVER produces usable DRS URLs,
     # so we avoid parsing the column altogether for datasets.
     if self.fqid.source.spec.is_snapshot:
         reject(file_id is None)
         # TDR stores the complete DRS URI in the file_id column, but we only
         # index the path component. These requirements prevent mismatches in
         # the DRS domain, and ensure that changes to the column syntax don't
         # go undetected.
         file_id = furl(file_id)
         require(file_id.scheme == 'drs')
         require(file_id.netloc == furl(config.tdr_service_url).netloc)
         return str(file_id.path).strip('/')
     else:
         return None
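
For example, assuming config.tdr_service_url points at
'https://data.terra.bio', the parse behaves as follows:

# file_id = 'drs://data.terra.bio/v1_1234_5678' in a snapshot -> 'v1_1234_5678'
# any file_id in a dataset -> None, the column is not parsed at all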