Example No. 1
    def test_remote_reindex(self):
        with patch.dict(
                os.environ,
                dict(AZUL_DSS_QUERY_PREFIX='ff',
                     AZUL_DSS_SOURCE='foo_source:/0')):
            source = DSSSourceRef.for_dss_source(config.dss_source)
            self.index_service.repository_plugin(
                self.catalog)._assert_source(source)
            self._create_mock_queues()
            self.client.remote_reindex(self.catalog, {str(source.spec)})
            notification = one(self._read_queue(self._notifications_queue))
            expected_notification = dict(action='reindex',
                                         catalog='test',
                                         source=source.to_json(),
                                         prefix='')
            self.assertEqual(expected_notification, notification)
            event = [self._mock_sqs_record(notification)]

            bundle_fqids = [
                SourcedBundleFQID(source=source,
                                  uuid='ffa338fe-7554-4b5d-96a2-7df127a7640b',
                                  version='2018-03-28T15:10:23.074974Z')
            ]

            with patch.object(Plugin,
                              'list_bundles',
                              return_value=bundle_fqids):
                self.controller.contribute(event)

            notification = one(self._read_queue(self._notifications_queue))
            expected_source = dict(id=source.id, spec=str(source.spec))
            source = notification['notification']['source']
            self.assertEqual(expected_source, source)
Example No. 2
 def _find_upstream_bundles(self, source: TDRSourceRef,
                            outputs: Entities) -> Set[SourcedBundleFQID]:
     """
     Search for bundles containing processes that produce the specified output
     entities.
     """
     output_ids = [output.entity_id for output in outputs]
     output_id = 'JSON_EXTRACT_SCALAR(link_output, "$.output_id")'
     rows = self._run_sql(f'''
         SELECT links_id, version, {output_id} AS output_id
         FROM {backtick(self._full_table_name(source.spec, 'links'))} AS links
             JOIN UNNEST(JSON_EXTRACT_ARRAY(links.content, '$.links')) AS content_links
                 ON JSON_EXTRACT_SCALAR(content_links, '$.link_type') = 'process_link'
             JOIN UNNEST(JSON_EXTRACT_ARRAY(content_links, '$.outputs')) AS link_output
                 ON {output_id} IN UNNEST({output_ids})
     ''')
     bundles = set()
     outputs_found = set()
     for row in rows:
         bundles.add(
             SourcedBundleFQID(source=source,
                               uuid=row['links_id'],
                               version=self.format_version(row['version'])))
         outputs_found.add(row['output_id'])
     missing = set(output_ids) - outputs_found
     require(not missing,
             f'Dangling inputs not found in any bundle: {missing}')
     return bundles
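
For reference, a rough sketch of the links.content document that the query above walks; the field names come straight from the JSON paths in the SQL, while the values are hypothetical.

# Hypothetical links.content document, matching the JSON paths used in the
# query above ('$.links', '$.link_type', '$.outputs', '$.output_id'):
links_content = {
    'links': [
        {
            'link_type': 'process_link',
            'outputs': [
                {'output_id': '11111111-2222-3333-4444-555555555555'},
            ],
            # other link fields are not touched by the query
        },
    ],
}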
Example No. 3
 def fetch_bundle(self, catalog: CatalogName, source: JSON,
                  bundle_uuid: str, bundle_version: str) -> Bundle:
     plugin = self.repository_plugin(catalog)
     source = plugin.source_from_json(source)
     bundle_fqid = SourcedBundleFQID(source=source,
                                     uuid=bundle_uuid,
                                     version=bundle_version)
     return plugin.fetch_bundle(bundle_fqid)
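
A minimal usage sketch, assuming the method above lives on a service object and that the source JSON uses the id/spec shape seen in the other examples; the catalog name and bundle coordinates are made up.

bundle = service.fetch_bundle(
    catalog='test',
    source={'id': '11111111-1111-1111-1111-111111111111',
            'spec': 'foo_source:/0'},
    bundle_uuid='ffa338fe-7554-4b5d-96a2-7df127a7640b',
    bundle_version='2018-03-28T15:10:23.074974Z')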
Example No. 4
 def list_bundles(self, source: CannedSourceRef, prefix: str) -> List[CannedBundleFQID]:
     self._assert_source(source)
     prefix = source.spec.prefix.common + prefix
     validate_uuid_prefix(prefix)
     log.info('Listing bundles with prefix %r in source %r.', prefix, source)
     bundle_fqids = []
     for link in self.staging_area(source.spec).links.values():
         if link.uuid.startswith(prefix):
             bundle_fqids.append(SourcedBundleFQID(source=source,
                                                   uuid=link.uuid,
                                                   version=link.version))
     log.info('There are %i bundle(s) with prefix %r in source %r.',
              len(bundle_fqids), prefix, source)
     return bundle_fqids
Example No. 5
 def fetch_bundle(self, bundle_fqid: CannedBundleFQID) -> Bundle:
     self._assert_source(bundle_fqid.source)
     now = time.time()
     staging_area = self.staging_area(bundle_fqid.source.spec)
     version, manifest, metadata = staging_area.get_bundle(bundle_fqid.uuid)
     if bundle_fqid.version is None:
         bundle_fqid = SourcedBundleFQID(source=bundle_fqid.source,
                                         uuid=bundle_fqid.uuid,
                                         version=version)
     bundle = CannedBundle(fqid=bundle_fqid,
                           manifest=cast(MutableJSONs, manifest),
                           metadata_files=cast(MutableJSON, metadata))
     assert version == bundle.version, (version, bundle)
     log.info("It took %.003fs to download bundle %s.%s",
              time.time() - now, bundle.uuid, bundle.version)
     return bundle
Example No. 6
 def list_bundles(self,
                  source: LocalSourceRef,
                  prefix: str
                  ) -> List[SourcedBundleFQID[LocalSourceRef]]:
     source_prefix = source.spec.prefix.common
     validate_uuid_prefix(source_prefix + prefix)
     directory = self.local_path / source.spec.name
     files = directory.glob(f'{source_prefix}{prefix}*{self._manifest_ext}')
     bundle_fqids = []
     for file in files:
         fqid, _, suffix = file.name.rpartition(self._manifest_ext)
         assert suffix == ''
         uuid, version = fqid.split('.', 1)
         bundle_fqids.append(SourcedBundleFQID(uuid=uuid,
                                               version=version,
                                               source=source))
     return bundle_fqids
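
A small sketch of the file-name layout that the glob and the parsing above assume; the extension value is an assumption and the UUID and version are made up. The split works because UUIDs contain no dots, so the first dot separates the UUID from the version.

manifest_ext = '.manifest.json'  # assumed value of self._manifest_ext
uuid = '1b6d8348-d6e9-406a-aa6a-7ee886e52bf9'
version = '2018-03-28T15:10:23.074974Z'
file_name = f'{uuid}.{version}{manifest_ext}'
fqid, _, _ = file_name.rpartition(manifest_ext)
assert fqid.split('.', 1) == [uuid, version]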
Example No. 7
 def test_subgraph_stitching(self, _mock_find_upstream_bundles):
     downstream_uuid = '4426adc5-b3c5-5aab-ab86-51d8ce44dfbe'
     upstream_uuids = [
         'b0c2c714-45ee-4759-a32b-8ccbbcf911d4',
         'bd4939c1-a078-43bd-8477-99ae59ceb555',
     ]
     # TinyQuery/legacy SQL has no support for BQ Arrays, so it's difficult
     # to test the query in this method.
     _mock_find_upstream_bundles.side_effect = [{
         SourcedBundleFQID(source=self.source,
                           uuid=uuid,
                           version='2020-08-10T21:24:26.174274Z')
     } for uuid in upstream_uuids]
     bundle = self._canned_bundle(self.source, downstream_uuid)
     assert any(e['is_stitched'] for e in bundle.manifest)
     self._test_fetch_bundle(bundle, load_tables=True)
     self.assertEqual(_mock_find_upstream_bundles.call_count,
                      len(upstream_uuids))
Example No. 8
    def _list_links_ids(self, source: TDRSourceRef,
                        prefix: str) -> List[TDRBundleFQID]:
        source_prefix = source.spec.prefix.common
        validate_uuid_prefix(source_prefix + prefix)
        current_bundles = self._query_latest_version(source.spec, f'''
            SELECT links_id, version
            FROM {backtick(self._full_table_name(source.spec, 'links'))}
            WHERE STARTS_WITH(links_id, '{source_prefix + prefix}')
        ''', group_by='links_id')
        return [
            SourcedBundleFQID(source=source,
                              uuid=row['links_id'],
                              version=self.format_version(row['version']))
            for row in current_bundles
        ]
Example No. 9
 def list_bundles(self, source: DSSSourceRef,
                  prefix: str) -> List[DSSBundleFQID]:
     self._assert_source(source)
     prefix = source.spec.prefix.common + prefix
     validate_uuid_prefix(prefix)
     log.info('Listing bundles with prefix %r in source %r.', prefix,
              source)
     bundle_fqids = []
     response = self.dss_client.get_bundles_all.iterate(prefix=prefix,
                                                        replica='aws',
                                                        per_page=500)
     for bundle in response:
         bundle_fqids.append(
             SourcedBundleFQID(source=source,
                               uuid=bundle['uuid'],
                               version=bundle['version']))
     log.info('There are %i bundle(s) with prefix %r in source %r.',
              len(bundle_fqids), prefix, source)
     return bundle_fqids
Example No. 10
def fetch_bundle(source: str, bundle_uuid: str, bundle_version: str) -> Bundle:
    for catalog in config.catalogs:
        plugin = plugin_for(catalog)
        sources = set(map(str, plugin.sources))
        try:
            parsed_source = plugin.resolve_source(source)
        except Exception:
            pass
        else:
            for configured_source in sources:
                configured_source = plugin.resolve_source(configured_source)
                if parsed_source.spec.contains(configured_source.spec):
                    fqid = SourcedBundleFQID(source=configured_source,
                                             uuid=bundle_uuid,
                                             version=bundle_version)
                    bundle = plugin.fetch_bundle(fqid)
                    logger.info('Fetched bundle %r version %r from catalog %r.',
                                fqid.uuid, fqid.version, catalog)
                    return bundle
    raise ValueError('No repository using this source')
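
A hypothetical invocation; the source spec string follows the simple 'name:/prefix' style used elsewhere in these examples and the bundle coordinates are made up.

bundle = fetch_bundle(source='foo_source:/0',
                      bundle_uuid='56a338fe-7554-4b5d-96a2-7df127a7640b',
                      bundle_version='2018-03-28T15:10:23.074974Z')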
Example No. 11
    def test_contribute_and_aggregate(self):
        """
        Contribution and aggregation of two bundles

        Index two bundles that contribute to the same project. Verify that
        the contributions match the tallies that are sent to SQS. During
        aggregation, only the project entity is deferred, because it receives
        multiple contributions.
        """
        self.maxDiff = None
        self._create_mock_queues()
        source = DSSSourceRef.for_dss_source('foo_source:/0')
        fqids = [
            SourcedBundleFQID(source=source,
                              uuid='56a338fe-7554-4b5d-96a2-7df127a7640b',
                              version='2018-03-28T15:10:23.074974Z'),
            SourcedBundleFQID(source=source,
                              uuid='b2216048-7eaa-45f4-8077-5a3fb4204953',
                              version='2018-03-29T10:40:41.822717Z')
        ]

        # Load canned bundles
        bundles = {fqid: self._load_canned_bundle(fqid) for fqid in fqids}

        # Synthesize initial notifications
        notifications = [
            dict(action='add',
                 catalog=self.catalog,
                 notification=self.client.synthesize_notification(fqid))
            for fqid in fqids
        ]

        # Invoke the service once to produce a set of expected entities so we
        # don't need to hard-code them. Keep in mind that this test is not
        # intended to cover the service, only the controller.
        expected_digest = defaultdict(list)
        for fqid, bundle in bundles.items():
            contributions = self.index_service.transform(self.catalog,
                                                         bundle,
                                                         delete=False)
            for contribution in contributions:
                assert isinstance(contribution, Contribution)
                # Initially, each entity gets a tally of 1
                expected_digest[contribution.entity.entity_type].append(1)

        # Prove that we have two contributions per "container" type, for when we
        # test poison tallies and deferrals below. Note that the two project
        # contributions are to the same entity, while the bundle contributions
        # are not.
        for entity_type in ['projects', 'bundles']:
            self.assertEqual([1, 1], expected_digest[entity_type])

        # Test partitioning and contribution
        for i in range(2):
            mock_plugin = MagicMock()
            notified_fqids = list(
                map(self._fqid_from_notification, notifications))
            notified_bundles = [bundles[fqid] for fqid in notified_fqids]
            mock_plugin.fetch_bundle.side_effect = notified_bundles
            mock_plugin.source_from_json.return_value = source
            mock_plugin.sources = [source]
            with patch.object(IndexService,
                              'repository_plugin',
                              return_value=mock_plugin):
                with patch.object(BundlePartition, 'max_partition_size', 4):
                    event = list(map(self._mock_sqs_record, notifications))
                    self.controller.contribute(event)

            # Assert plugin calls by controller
            expected_calls = [call(source.to_json())] * len(notified_fqids)
            self.assertEqual(expected_calls,
                             mock_plugin.source_from_json.mock_calls)
            expected_calls = list(map(call, notified_fqids))
            self.assertEqual(expected_calls,
                             mock_plugin.fetch_bundle.mock_calls)

            # Assert partitioned notifications, straight from the retry queue
            notifications = self._read_queue(self._notifications_retry_queue)
            if i == 0:
                # Fingerprint the partitions from the resulting notifications
                partitions = defaultdict(set)
                for n in notifications:
                    fqid = self._fqid_from_notification(n)
                    partition = BundlePartition.from_json(
                        n['notification']['partition'])
                    partitions[fqid].add(partition)
                # Assert that each bundle was partitioned ...
                self.assertEqual(partitions.keys(), set(fqids))
                # ... into two partitions. The number of partitions depends on
                # the patched max_partition_size above and the number of
                # entities in the canned bundles.
                self.assertEqual([2] * len(fqids),
                                 list(map(len, partitions.values())))
            else:
                # The partitions resulting from the first iteration should not
                # need to be partitioned again
                self.assertEqual([], notifications)

        # We got a tally of one for each contribution
        tallies = self._read_queue(self._tallies_queue)
        digest = self._digest_tallies(tallies)
        self.assertEqual(expected_digest, digest)

        # Test aggregation
        notifications = map(partial(self._mock_sqs_record), tallies)
        with patch.object(IndexWriter, 'write', side_effect=TransportError):
            try:
                self.controller.aggregate(notifications)
            except TransportError:
                pass
            else:
                self.fail()

        self.assertEqual([], self._read_queue(self._tallies_queue))

        # Poison the two project and the two bundle tallies, by simulating
        # a number of failed attempts at processing them
        attempts = self.controller.num_batched_aggregation_attempts
        # While 0 is a valid value, the test logic below wouldn't work with it
        self.assertGreater(attempts, 0)
        notifications = [
            self._mock_sqs_record(
                tally,
                attempts=(attempts + 1 if tally['entity_type']
                          in {'bundles', 'projects'} else 1))
            for tally in tallies
        ]
        self.controller.aggregate(notifications, retry=True)

        tallies = self._read_queue(self._tallies_retry_queue)
        digest = self._digest_tallies(tallies)
        # The two project tallies were consolidated (despite being poisoned) and
        # the resulting tally was deferred
        expected_digest['projects'] = [2]
        # One of the poisoned bundle tallies was referred. Since it was
        # poisoned, all other tallies were deferred
        expected_digest['bundles'] = [1]
        self.assertEqual(expected_digest, digest)

        # Aggregate the remaining deferred tallies
        notifications = map(self._mock_sqs_record, tallies)
        self.controller.aggregate(notifications, retry=True)

        # All tallies were referred
        self.assertEqual([], self._read_queue(self._tallies_retry_queue))
        self.assertEqual([], self._read_queue(self._tallies_queue))
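
A sketch of the digest shape this test compares against, assuming that _digest_tallies groups tally counts by entity type in the same way expected_digest is built above; the entity types other than 'projects' and 'bundles' depend on the canned bundles.

digest = {
    'projects': [1, 1],  # two contributions to the same project entity
    'bundles': [1, 1],   # one contribution per bundle entity
    # ... one list of tally counts per entity type in the canned bundles
}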
Example No. 12
 def _fqid_from_notification(self, notification):
     return SourcedBundleFQID(
         uuid=notification['notification']['match']['bundle_uuid'],
         version=notification['notification']['match']['bundle_version'],
         source=DSSSourceRef.from_json(
             notification['notification']['source']))
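
Pieced together from the other examples, a hedged sketch of the notification shape this helper expects; all values are hypothetical.

notification = {
    'action': 'add',
    'catalog': 'test',
    'notification': {
        'source': {'id': '11111111-1111-1111-1111-111111111111',
                   'spec': 'foo_source:/0'},
        'match': {
            'bundle_uuid': '56a338fe-7554-4b5d-96a2-7df127a7640b',
            'bundle_version': '2018-03-28T15:10:23.074974Z'
        }
    }
}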
Example No. 13
 def bundle_fqid(cls, *, uuid, version):
     return SourcedBundleFQID(source=cls.source, uuid=uuid, version=version)
Example No. 14
def main(argv):
    """
    Load a canned bundle from DCP/1 and write *.manifest.tdr and *.metadata.tdr
    files showing the desired output for DCP/2.
    """
    default_version = datetime(year=2021, month=1, day=17, hour=0)

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--bundle-uuid',
                        '-b',
                        default=TestTDRPlugin.bundle_uuid,
                        help='The UUID of the existing DCP/1 canned bundle.')
    parser.add_argument(
        '--source-id',
        '-s',
        default=TestTDRPlugin.snapshot_id,
        help='The UUID of the snapshot/dataset to contain '
             'the canned DCP/2 bundle.')
    parser.add_argument(
        '--version',
        '-v',
        default=tdr.Plugin.format_version(default_version),
        help='The version for any mock entities synthesized by the script.')
    parser.add_argument('--input-dir',
                        '-I',
                        default=os.path.join(config.project_root, 'test',
                                             'indexer', 'data'),
                        help='The path to the input directory.')
    parser.add_argument(
        '--mock-supplementary-files',
        '-S',
        type=int,
        default=0,
        help='The number of mock supplementary files to add to the output.')
    args = parser.parse_args(argv)

    paths = file_paths(args.input_dir, args.bundle_uuid)

    log.debug('Reading canned bundle %r from %r', args.bundle_uuid,
              paths['dss'])
    with open(paths['dss']['manifest']) as f:
        manifest = json.load(f)
    with open(paths['dss']['metadata']) as f:
        metadata = json.load(f)

    dss_source = DSSSourceRef(id='',
                              spec=SimpleSourceSpec(
                                  prefix=Prefix.of_everything,
                                  name=config.dss_endpoint))
    dss_bundle = DSSBundle(fqid=SourcedBundleFQID(source=dss_source,
                                                  uuid=args.bundle_uuid,
                                                  version=''),
                           manifest=manifest,
                           metadata_files=metadata)

    tdr_source = TDRSourceRef(id=args.source_id,
                              spec=TDRSourceSpec(prefix=Prefix.of_everything,
                                                 project='test_project',
                                                 name='test_name',
                                                 is_snapshot=True))
    tdr_bundle = dss_bundle_to_tdr(dss_bundle, tdr_source)

    add_supp_files(tdr_bundle,
                   num_files=args.mock_supplementary_files,
                   version=args.version)

    log.debug('Writing converted bundle %r to %r', args.bundle_uuid,
              paths['tdr'])
    with write_file_atomically(paths['tdr']['result']) as f:
        json.dump(
            {
                'manifest': tdr_bundle.manifest,
                'metadata': tdr_bundle.metadata_files
            },
            f,
            indent=4)

    with write_file_atomically(paths['tdr']['tables']) as f:
        json.dump(dump_tables(tdr_bundle), f, indent=4)
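
A hypothetical direct call to main() with explicit arguments; the flag names match the argparse definitions above, the UUIDs are made up, and omitted flags fall back to the TestTDRPlugin defaults.

main(['--bundle-uuid', '1b6d8348-d6e9-406a-aa6a-7ee886e52bf9',
      '--source-id', '2f3e6f22-7b7a-4b56-9c7e-123456789abc',
      '--mock-supplementary-files', '2'])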
Example No. 15
def dss_bundle_to_tdr(bundle: Bundle, source: TDRSourceRef) -> TDRBundle:
    metadata = copy_json(bundle.metadata_files)

    # Order entities by UUID for consistency with Plugin output.
    entities_by_type: Mapping[str, MutableJSONs] = defaultdict(list)
    for k, v in bundle.metadata_files.items():
        if k != 'links.json':
            entity_type = k.rsplit('_', 1)[0]
            entities_by_type[entity_type].append(v)
    for (entity_type, entities) in entities_by_type.items():
        entities.sort(key=lambda e: e['provenance']['document_id'])
        for i, entity in enumerate(entities):
            name = f'{entity_type}_{i}.json'
            bundle.metadata_files[name] = entity
            manifest_entry = find_manifest_entry(
                bundle, entity['provenance']['document_id'])
            manifest_entry['name'] = name

    bundle.manifest.sort(key=itemgetter('uuid'))

    links_json = metadata['links.json']
    links_json['schema_type'] = 'links'  # DCP/1 uses 'link_bundle'
    for link in links_json['links']:
        process_id = link.pop('process')
        link['process_id'] = process_id
        link['process_type'] = find_concrete_type(
            bundle, find_file_name(bundle, process_id))
        # No supplementary files in DCP/1 bundles
        link['link_type'] = 'process_link'
        # Protocols are already in the desired format
        for component in ('input', 'output'):
            # Replace the abstract type with concrete types
            del link[f'{component}_type']
            component_list = link[f'{component}s']
            component_list[:] = [
                {
                    f'{component}_id': component_id,
                    f'{component}_type': find_concrete_type(
                        bundle, find_file_name(bundle, component_id))
                }
                for component_id in component_list
            ]

    manifest: MutableJSONs = copy_jsons(bundle.manifest)
    links_entry = None
    for entry in manifest:
        entry['version'] = convert_version(entry['version'])
        entry['is_stitched'] = False
        if entry['name'] == 'links.json':
            links_entry = entry
        if entry['indexed']:
            entity_json = metadata[entry['name']]
            # Size of the entity JSON in TDR, not the size of pretty-printed
            # output file.
            entry['size'] = content_length(entity_json)
            # Only include mandatory checksums
            del entry['sha1']
            del entry['s3_etag']
            entry['crc32c'] = ''
            entry['sha256'] = ''
        else:
            entry['drs_path'] = drs_path(
                source.id, deterministic_uuid(bundle.uuid, entry['uuid']))
    manifest.sort(key=itemgetter('uuid'))

    assert links_entry is not None
    # In TDR, links.json has no FQID of its own; its UUID and version double
    # as the FQID of the entire bundle.
    links_entry['uuid'] = bundle.uuid
    return TDRBundle(fqid=SourcedBundleFQID(source=source,
                                            uuid=links_entry['uuid'],
                                            version=links_entry['version']),
                     manifest=manifest,
                     metadata_files=metadata)
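
To make the link rewrite above concrete, a hedged before/after sketch with made-up UUIDs; the concrete entity types are assumptions, but the renamed fields follow directly from the code.

# DCP/1 link as consumed by the loop above (hypothetical values):
dcp1_link = {
    'process': '33333333-3333-3333-3333-333333333333',
    'input_type': 'biomaterial',
    'inputs': ['44444444-4444-4444-4444-444444444444'],
    'output_type': 'file',
    'outputs': ['55555555-5555-5555-5555-555555555555'],
}
# Roughly what the same link looks like after conversion:
dcp2_link = {
    'process_id': '33333333-3333-3333-3333-333333333333',
    'process_type': 'analysis_process',  # assumed concrete type
    'link_type': 'process_link',
    'inputs': [{'input_id': '44444444-4444-4444-4444-444444444444',
                'input_type': 'cell_suspension'}],  # assumed concrete type
    'outputs': [{'output_id': '55555555-5555-5555-5555-555555555555',
                 'output_type': 'sequence_file'}],  # assumed concrete type
}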