def main(argv):
    parser = argparse.ArgumentParser(
        description='Delete bundles from Azul index.')
    parser.add_argument('--catalog',
                        metavar='NAME',
                        default=config.default_catalog,
                        choices=config.catalogs,
                        help='The name of the catalog to delete the bundles from.')
    parser.add_argument('bundles',
                        metavar='UUID.VERSION',
                        type=parse_fqid,
                        nargs='+',
                        help='One or more references of the bundles to be deleted.')
    args = parser.parse_args(argv)
    bundles = args.bundles
    azul_client = AzulClient()
    for bundle in bundles:
        try:
            bundle_uuid, bundle_version = bundle
        except ValueError:
            parser.parse_args(['--help'])
        else:
            azul_client.delete_bundle(args.catalog, bundle_uuid, bundle_version)
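A minimal sketch of how a script like this is commonly wired up as a command-line entry point. This is an illustrative assumption, not part of the script above; it only assumes that `main()` is defined at module level as shown.

import sys

if __name__ == '__main__':
    # Forward the command-line arguments, minus the program name, to main().
    main(sys.argv[1:])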
def main(argv: List[str]):
    args = parser.parse_args(argv)
    if args.verbose:
        config.debug = 1
    configure_script_logging(logger)
    azul_client = AzulClient(prefix=args.prefix,
                             num_workers=args.num_workers)
    azul_client.reset_indexer(args.catalogs,
                              purge_queues=args.purge,
                              delete_indices=args.delete,
                              create_indices=args.create or args.index and args.delete)
    if args.index:
        logger.info('Queuing notifications for reindexing ...')
        for catalog in args.catalogs:
            if args.partition_prefix_length:
                azul_client.remote_reindex(catalog, args.partition_prefix_length)
            else:
                azul_client.reindex(catalog)
        if args.wait:
            # Match max_timeout to reindex job timeout in `.gitlab-ci.yml`
            azul_client.wait_for_indexer(min_timeout=20 * 60 if config.dss_query_prefix else None,
                                         max_timeout=13 * 60 * 60)
def contribute(self, event: Iterable[SQSRecord], *, retry=False):
    for record in event:
        message = json.loads(record.body)
        attempts = record.to_dict()['attributes']['ApproximateReceiveCount']
        log.info('Worker handling message %r, attempt #%r (approx).',
                 message, attempts)
        start = time.time()
        try:
            action = Action[message['action']]
            if action is Action.reindex:
                AzulClient().remote_reindex_partition(message)
            else:
                notification = message['notification']
                catalog = message['catalog']
                assert catalog is not None
                delete = action.is_delete()
                contributions = self.transform(catalog, notification, delete)
                log.info('Writing %i contributions to index.', len(contributions))
                tallies = self.index_service.contribute(catalog, contributions)
                tallies = [DocumentTally.for_entity(catalog, entity, num_contributions)
                           for entity, num_contributions in tallies.items()]
                log.info('Queueing %i entities for aggregating a total of %i contributions.',
                         len(tallies), sum(tally.num_contributions for tally in tallies))
                for batch in chunked(tallies, self.document_batch_size):
                    entries = [dict(tally.to_message(), Id=str(i))
                               for i, tally in enumerate(batch)]
                    self._tallies_queue().send_messages(Entries=entries)
        except BaseException:
            log.warning(f'Worker failed to handle message {message}.', exc_info=True)
            raise
        else:
            duration = time.time() - start
            log.info(f'Worker successfully handled message {message} in {duration:.3f}s.')
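The lookup `Action[message['action']]` above resolves the message's `action` field by enum member name. A minimal sketch of the kind of enum this assumes, with member names taken from the actions seen in the messages elsewhere in this section ('add', 'delete', 'reindex'); the project's actual definition may differ in detail.

from enum import Enum, auto


class Action(Enum):
    # Member names must match the 'action' field of queued messages,
    # since contribute() resolves them with Action[message['action']].
    add = auto()
    delete = auto()
    reindex = auto()

    def is_delete(self) -> bool:
        # contribute() passes this flag to transform() to distinguish
        # deletions from additions.
        return self is Action.delete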
class TestIndexController(IndexerTestCase):
    partition_prefix_length = 0

    def setUp(self) -> None:
        super().setUp()
        self.index_service.create_indices(self.catalog)
        self.client = AzulClient()
        self.controller = IndexController()
        # noinspection PyPropertyAccess
        self.controller.index_service = self.index_service
        self.queue_manager = queues.Queues(delete=True)

    def tearDown(self):
        self.index_service.delete_indices(self.catalog)
        super().tearDown()

    def _create_mock_queues(self):
        sqs = aws.resource('sqs')
        for queue_name in config.all_queue_names:
            sqs.create_queue(QueueName=queue_name)

    def _mock_sqs_record(self, body, *, attempts: int = 1):
        event_dict = {
            'body': json.dumps(body),
            'receiptHandle': 'ThisWasARandomString',
            'attributes': {
                'ApproximateReceiveCount': attempts
            }
        }
        return SQSRecord(event_dict=event_dict, context='controller_test')

    @property
    def _notifications_queue(self):
        return self.controller._notifications_queue()

    @property
    def _notifications_retry_queue(self):
        return self.controller._notifications_queue(retry=True)

    @property
    def _tallies_queue(self):
        return self.controller._tallies_queue()

    @property
    def _tallies_retry_queue(self):
        return self.controller._tallies_queue(retry=True)

    def _read_queue(self, queue) -> JSONs:
        messages = self.queue_manager.read_messages(queue)
        tallies = [json.loads(m.body) for m in messages]
        return tallies

    def _fqid_from_notification(self, notification):
        return SourcedBundleFQID(uuid=notification['notification']['match']['bundle_uuid'],
                                 version=notification['notification']['match']['bundle_version'],
                                 source=DSSSourceRef.from_json(notification['notification']['source']))

    def test_invalid_notification(self):
        event = [
            self._mock_sqs_record(dict(action='foo',
                                       source='foo_source',
                                       notification='bar',
                                       catalog=self.catalog))
        ]
        self.assertRaises(KeyError, self.controller.contribute, event)

    def test_remote_reindex(self):
        with patch.dict(os.environ,
                        dict(AZUL_DSS_QUERY_PREFIX='ff',
                             AZUL_DSS_SOURCE='foo_source:/0')):
            source = DSSSourceRef.for_dss_source(config.dss_source)
            self.index_service.repository_plugin(self.catalog)._assert_source(source)
            self._create_mock_queues()
            self.client.remote_reindex(self.catalog, {str(source.spec)})
            notification = one(self._read_queue(self._notifications_queue))
            expected_notification = dict(action='reindex',
                                         catalog='test',
                                         source=source.to_json(),
                                         prefix='')
            self.assertEqual(expected_notification, notification)
            event = [self._mock_sqs_record(notification)]
            bundle_fqids = [
                SourcedBundleFQID(source=source,
                                  uuid='ffa338fe-7554-4b5d-96a2-7df127a7640b',
                                  version='2018-03-28T15:10:23.074974Z')
            ]
            with patch.object(Plugin, 'list_bundles', return_value=bundle_fqids):
                self.controller.contribute(event)
            notification = one(self._read_queue(self._notifications_queue))
            expected_source = dict(id=source.id, spec=str(source.spec))
            source = notification['notification']['source']
            self.assertEqual(expected_source, source)

    def test_contribute_and_aggregate(self):
        """
        Contribution and aggregation of two bundles

        Index two bundles that make contributions to the same project. Verify
        that the contributions match the tallies that are returned to SQS.
        During aggregation, only the project entity is deferred due to
        multiple contributions.
""" self.maxDiff = None self._create_mock_queues() source = DSSSourceRef.for_dss_source('foo_source:/0') fqids = [ SourcedBundleFQID(source=source, uuid='56a338fe-7554-4b5d-96a2-7df127a7640b', version='2018-03-28T15:10:23.074974Z'), SourcedBundleFQID(source=source, uuid='b2216048-7eaa-45f4-8077-5a3fb4204953', version='2018-03-29T10:40:41.822717Z') ] # Load canned bundles bundles = {fqid: self._load_canned_bundle(fqid) for fqid in fqids} # Synthesize initial notifications notifications = [ dict(action='add', catalog=self.catalog, notification=self.client.synthesize_notification(fqid)) for fqid in fqids ] # Invoke the service once to produce a set of expected entities so we # don't need to hard-code them. Keep in mind that this test is not # intended to cover the service, only the controller. expected_digest = defaultdict(list) for fqid, bundle in bundles.items(): contributions = self.index_service.transform(self.catalog, bundle, delete=False) for contribution in contributions: assert isinstance(contribution, Contribution) # Initially, each entity gets a tally of 1 expected_digest[contribution.entity.entity_type].append(1) # Prove that we have two contributions per "container" type, for when we # test poison tallies and deferrals below. Note that the two project # contributions are to the same entity, the bundle contributions are not. for entity_type in ['projects', 'bundles']: self.assertEqual([1, 1], expected_digest[entity_type]) # Test partitioning and contribution for i in range(2): mock_plugin = MagicMock() notified_fqids = list( map(self._fqid_from_notification, notifications)) notified_bundles = [bundles[fqid] for fqid in notified_fqids] mock_plugin.fetch_bundle.side_effect = notified_bundles mock_plugin.source_from_json.return_value = source mock_plugin.sources = [source] with patch.object(IndexService, 'repository_plugin', return_value=mock_plugin): with patch.object(BundlePartition, 'max_partition_size', 4): event = list(map(self._mock_sqs_record, notifications)) self.controller.contribute(event) # Assert plugin calls by controller expected_calls = [call(source.to_json())] * len(notified_fqids) self.assertEqual(expected_calls, mock_plugin.source_from_json.mock_calls) expected_calls = list(map(call, notified_fqids)) self.assertEqual(expected_calls, mock_plugin.fetch_bundle.mock_calls) # Assert partitioned notifications, straight from the retry queue notifications = self._read_queue(self._notifications_retry_queue) if i == 0: # Fingerprint the partitions from the resulting notifications partitions = defaultdict(set) for n in notifications: fqid = self._fqid_from_notification(n) partition = BundlePartition.from_json( n['notification']['partition']) partitions[fqid].add(partition) # Assert that each bundle was paritioned ... self.assertEqual(partitions.keys(), set(fqids)) # ... into two partitions. The number of partitions depends on # the patched max_partition_size above and the number of # entities in the canned bundles. 
                self.assertEqual([2] * len(fqids),
                                 list(map(len, partitions.values())))
            else:
                # The partitions resulting from the first iteration should not
                # need to be partitioned again
                self.assertEqual([], notifications)
        # We got a tally of one for each
        tallies = self._read_queue(self._tallies_queue)
        digest = self._digest_tallies(tallies)
        self.assertEqual(expected_digest, digest)
        # Test aggregation
        notifications = map(partial(self._mock_sqs_record), tallies)
        with patch.object(IndexWriter, 'write', side_effect=TransportError):
            try:
                self.controller.aggregate(notifications)
            except TransportError:
                pass
            else:
                self.fail()
        self.assertEqual([], self._read_queue(self._tallies_queue))
        # Poison the two project and the two bundle tallies, by simulating a
        # number of failed attempts at processing them
        attempts = self.controller.num_batched_aggregation_attempts
        # While 0 is a valid value, the test logic below wouldn't work with it
        self.assertGreater(attempts, 0)
        notifications = [
            self._mock_sqs_record(tally,
                                  attempts=(attempts + 1
                                            if tally['entity_type'] in {'bundles', 'projects'}
                                            else 1))
            for tally in tallies
        ]
        self.controller.aggregate(notifications, retry=True)
        tallies = self._read_queue(self._tallies_retry_queue)
        digest = self._digest_tallies(tallies)
        # The two project tallies were consolidated (despite being poisoned)
        # and the resulting tally was deferred
        expected_digest['projects'] = [2]
        # One of the poisoned bundle tallies was referred. Since it was
        # poisoned, all other tallies were deferred
        expected_digest['bundles'] = [1]
        self.assertEqual(expected_digest, digest)
        # Aggregate the remaining deferred tallies
        notifications = map(self._mock_sqs_record, tallies)
        self.controller.aggregate(notifications, retry=True)
        # All tallies were referred
        self.assertEqual([], self._read_queue(self._tallies_retry_queue))
        self.assertEqual([], self._read_queue(self._tallies_queue))

    def _digest_tallies(self, tallies):
        entities = defaultdict(list)
        for tally in tallies:
            insort(entities[tally['entity_type']], tally['num_contributions'])
        return entities
import argparse
import logging
import shutil
import sys
from typing import (
    List,
)

from azul import (
    config,
)
from azul.azulclient import (
    AzulClient,
)
from azul.logging import (
    configure_script_logging,
)

logger = logging.getLogger(__name__)

defaults = AzulClient()


def my_formatter(prog: str):
    # This should be a subclass of ArgumentDefaultsHelpFormatter instead of a
    # factory function, but doing so causes a false type-check warning in
    # PyCharm because it uses a typeshed stub for argparse which may be buggy
    # or something PyCharm doesn't understand.
    return argparse.ArgumentDefaultsHelpFormatter(prog,
                                                  max_help_position=50,
                                                  width=min(shutil.get_terminal_size((80, 25)).columns, 120))


parser = argparse.ArgumentParser(description=__doc__, formatter_class=my_formatter)
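The parser above is created without any arguments, while the main() functions in this section read attributes such as args.verbose, args.prefix and args.num_workers. The following is an illustrative sketch of how such common options could be registered on the shared parser, using the `defaults` client for default values; the option names and help texts are assumptions, as is the use of AzulClient's prefix and num_workers attributes as defaults.

parser.add_argument('--verbose',
                    action='store_true',
                    help='Enable debug logging.')
parser.add_argument('--prefix',
                    metavar='HEX',
                    default=defaults.prefix,
                    help='Restrict (re)indexing to bundles whose UUID starts with this prefix.')
parser.add_argument('--num-workers',
                    metavar='N',
                    type=int,
                    default=defaults.num_workers,
                    help='Number of worker threads used to queue notifications.')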
class TestIndexController(IndexerTestCase):

    def setUp(self) -> None:
        super().setUp()
        self.index_service.create_indices(self.catalog)
        self.client = AzulClient()
        self.controller = IndexController()
        # noinspection PyPropertyAccess
        self.controller.index_service = self.index_service
        self.queue_manager = queues.Queues(delete=True)

    def tearDown(self):
        self.index_service.delete_indices(self.catalog)
        super().tearDown()

    def _create_mock_queues(self):
        sqs = boto3.resource('sqs')
        for queue_name in config.all_queue_names:
            sqs.create_queue(QueueName=queue_name)

    def _mock_sqs_record(self, **body):
        event_dict = {
            'body': json.dumps(body),
            'receiptHandle': 'ThisWasARandomString',
            'attributes': {'ApproximateReceiveCount': 1}
        }
        return SQSRecord(event_dict=event_dict, context='controller_test')

    def test_invalid_notification(self):
        event = [
            self._mock_sqs_record(action='foo',
                                  notification='bar',
                                  catalog=self.catalog)
        ]
        self.assertRaises(AssertionError, self.controller.contribute, event)

    def test_remote_reindex(self):
        event = [self._mock_sqs_record(action='reindex', prefix='ff')]
        with mock.patch.object(AzulClient, 'do_remote_reindex') as mock_reindex:
            mock_reindex.return_value = True
            self.controller.contribute(event)
            mock_reindex.assert_called_once_with(dict(action='reindex', prefix='ff'))

    def test_contribute_and_aggregate(self):
        """
        Contribution and aggregation of two bundles

        Index two bundles that make contributions to the same project. Verify
        that the contributions match the tallies that are returned to SQS.
        During aggregation, only the project entity is deferred due to
        multiple contributions.
        """
        self._create_mock_queues()
        event = []
        bundles = []
        expected_entities = set()
        bundle_fqids = [
            BundleFQID('56a338fe-7554-4b5d-96a2-7df127a7640b', '2018-03-28T151023.074974Z'),
            BundleFQID('b2216048-7eaa-45f4-8077-5a3fb4204953', '2018-03-29T104041.822717Z')
        ]
        for bundle_fqid in bundle_fqids:
            notification = self.client.synthesize_notification(self.catalog, bundle_fqid)
            event.append(self._mock_sqs_record(action='add',
                                               catalog=self.catalog,
                                               notification=notification))
            bundle = self._load_canned_bundle(bundle_fqid)
            bundles.append(bundle)
            # Invoke the service once to produce a set of expected entities so
            # we don't need to hard-code them. Keep in mind that this test is
            # not covering the service, only the controller.
            contributions = self.index_service.transform(self.catalog, bundle, delete=False)
            expected_entities.update(
                (c.entity.entity_id, c.entity.entity_type)
                for c in contributions
            )
        mock_plugin = mock.MagicMock()
        mock_plugin.fetch_bundle.side_effect = bundles
        with mock.patch.object(IndexController, 'repository_plugin', return_value=mock_plugin):
            # Test contribution
            self.controller.contribute(event)
            tallies = [
                json.loads(m.body)
                for m in self.queue_manager.read_messages(self.controller._tallies_queue())
            ]
            entities_from_tallies = {
                (t['entity_id'], t['entity_type'])
                for t in tallies
            }
            self.assertSetEqual(expected_entities, entities_from_tallies)
            self.assertListEqual([mock.call(f) for f in bundle_fqids],
                                 mock_plugin.fetch_bundle.mock_calls)
            # Test aggregation for tallies, inspect for deferred tallies
            event = [self._mock_sqs_record(**t) for t in tallies]
            self.controller.aggregate(event)
            messages = self.queue_manager.read_messages(self.controller._tallies_queue())
            # Check that aggregation of the project entity was deferred
            project_tally = json.loads(one(messages).body)
            expected_tally = {
                'catalog': 'test',
                'entity_type': 'projects',
                'entity_id': '93f6a42f-1790-4af4-b5d1-8c436cb6feae',
                'num_contributions': 2
            }
            self.assertDictEqual(project_tally, expected_tally)
            # Test aggregation of the deferred project entity
            event = [self._mock_sqs_record(**project_tally)]
            self.controller.aggregate(event)
            messages = self.queue_manager.read_messages(self.controller._tallies_queue())
            self.assertEqual(0, len(messages))
def main(argv: List[str]):
    args = parser.parse_args(argv)
    if args.verbose:
        config.debug = 1
    configure_script_logging(logger)
    azul = AzulClient(num_workers=args.num_workers)
    source_globs = set(args.sources)
    if not args.local or args.deindex:
        sources_by_catalog = defaultdict(set)
        globs_matched = set()
        for catalog in args.catalogs:
            sources = azul.catalog_sources(catalog)
            for source_glob in source_globs:
                matches = fnmatch.filter(sources, source_glob)
                if matches:
                    globs_matched.add(source_glob)
                    logger.debug('Source glob %r matched sources %r in catalog %r',
                                 source_glob, matches, catalog)
                    sources_by_catalog[catalog].update(matches)
        unmatched = source_globs - globs_matched
        if unmatched:
            logger.warning('Source(s) not found in any catalog: %r', unmatched)
        require(any(sources_by_catalog.values()),
                'No valid sources specified for any catalog')
    else:
        if source_globs == {'*'}:
            sources_by_catalog = {
                catalog: azul.catalog_sources(catalog)
                for catalog in args.catalogs
            }
        else:
            parser.error('Cannot specify sources when performing a local reindex')
            assert False
    if args.deindex:
        require(not any((args.index, args.delete, args.create)),
                '--deindex is incompatible with --index, --create, and --delete.')
        require('*' not in source_globs,
                '--deindex is incompatible with source `*`. Use --delete instead.')
        for catalog, sources in sources_by_catalog.items():
            if sources:
                azul.deindex(catalog, sources)
    azul.reset_indexer(args.catalogs,
                       purge_queues=args.purge,
                       delete_indices=args.delete,
                       create_indices=args.create or args.index and args.delete)
    if args.index:
        logger.info('Queuing notifications for reindexing ...')
        reservation = None
        num_notifications = 0
        for catalog, sources in sources_by_catalog.items():
            if sources:
                if (
                    args.manage_slots
                    and reservation is None
                    and isinstance(azul.repository_plugin(catalog), tdr.Plugin)
                ):
                    reservation = BigQueryReservation()
                    reservation.activate()
                if not args.local:
                    azul.remote_reindex(catalog, sources)
                    num_notifications = None
                else:
                    num_notifications += azul.reindex(catalog, args.prefix)
            else:
                logger.info('Skipping catalog %r (no matching sources)', catalog)
        if args.wait:
            if num_notifications == 0:
                logger.warning('No notifications for prefix %r and catalogs %r were sent',
                               args.prefix, args.catalogs)
            else:
                azul.wait_for_indexer()
def azul_client(self):
    return AzulClient(prefix=self.bundle_uuid_prefix)