Example no. 1
def main(argv):
    parser = argparse.ArgumentParser(
        description='Delete bundles from Azul index.')
    parser.add_argument(
        '--catalog',
        metavar='NAME',
        default=config.default_catalog,
        choices=config.catalogs,
        help='The name of the catalog to delete the bundles from.')
    parser.add_argument(
        'bundles',
        metavar='UUID.VERSION',
        type=parse_fqid,
        nargs='+',
        help='One or more references of the bundles to be deleted.')
    args = parser.parse_args(argv)
    bundles = args.bundles
    azul_client = AzulClient()
    for bundle in bundles:
        try:
            bundle_uuid, bundle_version = bundle
        except ValueError:
            parser.parse_args(['--help'])
        else:
            azul_client.delete_bundle(args.catalog, bundle_uuid,
                                      bundle_version)
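The parse_fqid converter is not shown in this excerpt. A minimal sketch of what it might look like, consistent with the UUID.VERSION metavar and with the ValueError handling above, is the following; it is an assumption, not the actual Azul implementation:

def parse_fqid(fqid: str) -> tuple:
    # Hypothetical helper: split a 'UUID.VERSION' reference on the first dot.
    # A reference without a dot yields a 1-tuple, so the caller's unpacking
    # raises ValueError and the script falls back to printing the help text.
    return tuple(fqid.split('.', 1))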
Example no. 2
def main(argv: List[str]):
    args = parser.parse_args(argv)

    if args.verbose:
        config.debug = 1

    configure_script_logging(logger)

    azul_client = AzulClient(prefix=args.prefix, num_workers=args.num_workers)

    azul_client.reset_indexer(args.catalogs,
                              purge_queues=args.purge,
                              delete_indices=args.delete,
                              create_indices=args.create
                              or (args.index and args.delete))

    if args.index:
        logger.info('Queuing notifications for reindexing ...')
        for catalog in args.catalogs:
            if args.partition_prefix_length:
                azul_client.remote_reindex(catalog,
                                           args.partition_prefix_length)
            else:
                azul_client.reindex(catalog)
        if args.wait:
            # Match max_timeout to reindex job timeout in `.gitlab-ci.yml`
            azul_client.wait_for_indexer(
                min_timeout=20 * 60 if config.dss_query_prefix else None,
                max_timeout=13 * 60 * 60)
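This excerpt omits the module-level parser and the script's entry point. Assuming the usual convention for such scripts (not shown here, and sys must be imported at module level), the file would end with something like:

if __name__ == '__main__':
    main(sys.argv[1:])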
Example no. 3
    def setUp(self) -> None:
        super().setUp()
        self.index_service.create_indices(self.catalog)
        self.client = AzulClient()
        self.controller = IndexController()
        # noinspection PyPropertyAccess
        self.controller.index_service = self.index_service
        self.queue_manager = queues.Queues(delete=True)
Example no. 4
    def contribute(self, event: Iterable[SQSRecord], *, retry=False):
        for record in event:
            message = json.loads(record.body)
            attempts = record.to_dict()['attributes']['ApproximateReceiveCount']
            log.info('Worker handling message %r, attempt #%r (approx).',
                     message, attempts)
            start = time.time()
            try:
                action = Action[message['action']]
                if action is Action.reindex:
                    AzulClient().remote_reindex_partition(message)
                else:
                    notification = message['notification']
                    catalog = message['catalog']
                    assert catalog is not None
                    delete = action.is_delete()
                    contributions = self.transform(catalog, notification, delete)
                    log.info('Writing %i contributions to index.', len(contributions))
                    tallies = self.index_service.contribute(catalog, contributions)
                    tallies = [DocumentTally.for_entity(catalog, entity, num_contributions)
                               for entity, num_contributions in tallies.items()]

                    log.info('Queueing %i entities for aggregating a total of %i contributions.',
                             len(tallies), sum(tally.num_contributions for tally in tallies))
                    for batch in chunked(tallies, self.document_batch_size):
                        entries = [dict(tally.to_message(), Id=str(i)) for i, tally in enumerate(batch)]
                        self._tallies_queue().send_messages(Entries=entries)
            except BaseException:
                log.warning(f'Worker failed to handle message {message}.', exc_info=True)
                raise
            else:
                duration = time.time() - start
                log.info(f'Worker successfully handled message {message} in {duration:.3f}s.')
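For reference, the handler above only accesses a few keys of each message. Based on those accesses and on the notifications synthesized in the tests below, a contribution message looks roughly like the following; the payload details are illustrative, not authoritative:

example_message = {
    'action': 'add',  # an Action name; 'reindex' takes the remote-reindex branch
    'catalog': 'test',
    'notification': {
        'match': {
            'bundle_uuid': '56a338fe-7554-4b5d-96a2-7df127a7640b',
            'bundle_version': '2018-03-28T15:10:23.074974Z'
        }
        # ... plus a 'source' entry describing the repository source
    }
}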
Example no. 5
class TestIndexController(IndexerTestCase):
    partition_prefix_length = 0

    def setUp(self) -> None:
        super().setUp()
        self.index_service.create_indices(self.catalog)
        self.client = AzulClient()
        self.controller = IndexController()
        # noinspection PyPropertyAccess
        self.controller.index_service = self.index_service
        self.queue_manager = queues.Queues(delete=True)

    def tearDown(self):
        self.index_service.delete_indices(self.catalog)
        super().tearDown()

    def _create_mock_queues(self):
        sqs = aws.resource('sqs')
        for queue_name in config.all_queue_names:
            sqs.create_queue(QueueName=queue_name)

    def _mock_sqs_record(self, body, *, attempts: int = 1):
        event_dict = {
            'body': json.dumps(body),
            'receiptHandle': 'ThisWasARandomString',
            'attributes': {
                'ApproximateReceiveCount': attempts
            }
        }
        return SQSRecord(event_dict=event_dict, context='controller_test')

    @property
    def _notifications_queue(self):
        return self.controller._notifications_queue()

    @property
    def _notifications_retry_queue(self):
        return self.controller._notifications_queue(retry=True)

    @property
    def _tallies_queue(self):
        return self.controller._tallies_queue()

    @property
    def _tallies_retry_queue(self):
        return self.controller._tallies_queue(retry=True)

    def _read_queue(self, queue) -> JSONs:
        messages = self.queue_manager.read_messages(queue)
        tallies = [json.loads(m.body) for m in messages]
        return tallies

    def _fqid_from_notification(self, notification):
        return SourcedBundleFQID(
            uuid=notification['notification']['match']['bundle_uuid'],
            version=notification['notification']['match']['bundle_version'],
            source=DSSSourceRef.from_json(
                notification['notification']['source']))

    def test_invalid_notification(self):
        event = [
            self._mock_sqs_record(
                dict(action='foo',
                     source='foo_source',
                     notification='bar',
                     catalog=self.catalog))
        ]
        self.assertRaises(KeyError, self.controller.contribute, event)

    def test_remote_reindex(self):
        with patch.dict(
                os.environ,
                dict(AZUL_DSS_QUERY_PREFIX='ff',
                     AZUL_DSS_SOURCE='foo_source:/0')):
            source = DSSSourceRef.for_dss_source(config.dss_source)
            self.index_service.repository_plugin(
                self.catalog)._assert_source(source)
            self._create_mock_queues()
            self.client.remote_reindex(self.catalog, {str(source.spec)})
            notification = one(self._read_queue(self._notifications_queue))
            expected_notification = dict(action='reindex',
                                         catalog='test',
                                         source=source.to_json(),
                                         prefix='')
            self.assertEqual(expected_notification, notification)
            event = [self._mock_sqs_record(notification)]

            bundle_fqids = [
                SourcedBundleFQID(source=source,
                                  uuid='ffa338fe-7554-4b5d-96a2-7df127a7640b',
                                  version='2018-03-28T15:10:23.074974Z')
            ]

            with patch.object(Plugin,
                              'list_bundles',
                              return_value=bundle_fqids):
                self.controller.contribute(event)

            notification = one(self._read_queue(self._notifications_queue))
            expected_source = dict(id=source.id, spec=str(source.spec))
            source = notification['notification']['source']
            self.assertEqual(expected_source, source)

    def test_contribute_and_aggregate(self):
        """
        Contribution and aggregation of two bundles

        Index two bundles that make contributions to the same project. Check
        that the contributions match the tallies that are returned to SQS.
        During aggregation only the project entity is deferred due to
        multiple contributions.
        """
        self.maxDiff = None
        self._create_mock_queues()
        source = DSSSourceRef.for_dss_source('foo_source:/0')
        fqids = [
            SourcedBundleFQID(source=source,
                              uuid='56a338fe-7554-4b5d-96a2-7df127a7640b',
                              version='2018-03-28T15:10:23.074974Z'),
            SourcedBundleFQID(source=source,
                              uuid='b2216048-7eaa-45f4-8077-5a3fb4204953',
                              version='2018-03-29T10:40:41.822717Z')
        ]

        # Load canned bundles
        bundles = {fqid: self._load_canned_bundle(fqid) for fqid in fqids}

        # Synthesize initial notifications
        notifications = [
            dict(action='add',
                 catalog=self.catalog,
                 notification=self.client.synthesize_notification(fqid))
            for fqid in fqids
        ]

        # Invoke the service once to produce a set of expected entities so we
        # don't need to hard-code them. Keep in mind that this test is not
        # intended to cover the service, only the controller.
        expected_digest = defaultdict(list)
        for fqid, bundle in bundles.items():
            contributions = self.index_service.transform(self.catalog,
                                                         bundle,
                                                         delete=False)
            for contribution in contributions:
                assert isinstance(contribution, Contribution)
                # Initially, each entity gets a tally of 1
                expected_digest[contribution.entity.entity_type].append(1)

        # Prove that we have two contributions per "container" type, for when we
        # test poison tallies and deferrals below. Note that the two project
        # contributions are to the same entity, while the bundle contributions
        # are not.
        for entity_type in ['projects', 'bundles']:
            self.assertEqual([1, 1], expected_digest[entity_type])

        # Test partitioning and contribution
        for i in range(2):
            mock_plugin = MagicMock()
            notified_fqids = list(
                map(self._fqid_from_notification, notifications))
            notified_bundles = [bundles[fqid] for fqid in notified_fqids]
            mock_plugin.fetch_bundle.side_effect = notified_bundles
            mock_plugin.source_from_json.return_value = source
            mock_plugin.sources = [source]
            with patch.object(IndexService,
                              'repository_plugin',
                              return_value=mock_plugin):
                with patch.object(BundlePartition, 'max_partition_size', 4):
                    event = list(map(self._mock_sqs_record, notifications))
                    self.controller.contribute(event)

            # Assert plugin calls by controller
            expected_calls = [call(source.to_json())] * len(notified_fqids)
            self.assertEqual(expected_calls,
                             mock_plugin.source_from_json.mock_calls)
            expected_calls = list(map(call, notified_fqids))
            self.assertEqual(expected_calls,
                             mock_plugin.fetch_bundle.mock_calls)

            # Assert partitioned notifications, straight from the retry queue
            notifications = self._read_queue(self._notifications_retry_queue)
            if i == 0:
                # Fingerprint the partitions from the resulting notifications
                partitions = defaultdict(set)
                for n in notifications:
                    fqid = self._fqid_from_notification(n)
                    partition = BundlePartition.from_json(
                        n['notification']['partition'])
                    partitions[fqid].add(partition)
                # Assert that each bundle was partitioned ...
                self.assertEqual(partitions.keys(), set(fqids))
                # ... into two partitions. The number of partitions depends on
                # the patched max_partition_size above and the number of
                # entities in the canned bundles.
                self.assertEqual([2] * len(fqids),
                                 list(map(len, partitions.values())))
            else:
                # The partitions resulting from the first iteration should not
                # need to be partitioned again
                self.assertEqual([], notifications)

        # We got a tally of one for each
        tallies = self._read_queue(self._tallies_queue)
        digest = self._digest_tallies(tallies)
        self.assertEqual(expected_digest, digest)

        # Test aggregation
        notifications = map(self._mock_sqs_record, tallies)
        with patch.object(IndexWriter, 'write', side_effect=TransportError):
            try:
                self.controller.aggregate(notifications)
            except TransportError:
                pass
            else:
                self.fail()

        self.assertEqual([], self._read_queue(self._tallies_queue))

        # Poison the two project and the two bundle tallies, by simulating
        # a number of failed attempts at processing them
        attempts = self.controller.num_batched_aggregation_attempts
        # While 0 is a valid value, the test logic below wouldn't work with it
        self.assertGreater(attempts, 0)
        notifications = [
            self._mock_sqs_record(
                tally,
                attempts=(attempts + 1 if tally['entity_type']
                          in {'bundles', 'projects'} else 1))
            for tally in tallies
        ]
        self.controller.aggregate(notifications, retry=True)

        tallies = self._read_queue(self._tallies_retry_queue)
        digest = self._digest_tallies(tallies)
        # The two project tallies were consolidated (despite being poisoned) and
        # the resulting tally was deferred
        expected_digest['projects'] = [2]
        # One of the poisoned bundle tallies was referred. Since it was
        # poisoned, all other tallies were deferred
        expected_digest['bundles'] = [1]
        self.assertEqual(expected_digest, digest)

        # Aggregate the remaining deferred tallies
        notifications = map(self._mock_sqs_record, tallies)
        self.controller.aggregate(notifications, retry=True)

        # All tallies were referred
        self.assertEqual([], self._read_queue(self._tallies_retry_queue))
        self.assertEqual([], self._read_queue(self._tallies_queue))

    def _digest_tallies(self, tallies):
        entities = defaultdict(list)
        for tally in tallies:
            insort(entities[tally['entity_type']], tally['num_contributions'])
        return entities
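As a quick illustration of what _digest_tallies computes (hypothetical input, consistent with the assertions in the test above): tallies are grouped by entity type and their contribution counts are kept in sorted order via insort.

example_tallies = [
    {'entity_type': 'projects', 'num_contributions': 1},
    {'entity_type': 'projects', 'num_contributions': 1},
    {'entity_type': 'bundles', 'num_contributions': 1}
]
# _digest_tallies(example_tallies) == {'projects': [1, 1], 'bundles': [1]}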
Example no. 6
import argparse
import logging
import shutil
import sys
from typing import (
    List,
)

from azul import (
    config,
)
from azul.azulclient import (
    AzulClient,
)
from azul.logging import (
    configure_script_logging,
)

logger = logging.getLogger(__name__)

defaults = AzulClient()


def my_formatter(prog: str):
    # This should be a subclass of ArgumentDefaultsHelpFormatter instead of a
    # factory function, but doing so causes a false type check warning in
    # PyCharm because it uses a typeshed stub for argparse which may be buggy
    # or something PyCharm doesn't understand.
    return argparse.ArgumentDefaultsHelpFormatter(
        prog,
        max_help_position=50,
        width=min(shutil.get_terminal_size((80, 25)).columns, 120))


parser = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=my_formatter)
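A hypothetical illustration of how the defaults instance and ArgumentDefaultsHelpFormatter work together (the argument below is made up, and it assumes AzulClient exposes num_workers as an attribute): the formatter appends each argument's default to its help text, so defaults taken from the client show up in --help automatically.

parser.add_argument('--num-workers',
                    metavar='NUM',
                    type=int,
                    default=defaults.num_workers,
                    help='The number of workers to use.')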
Example no. 7
class TestIndexController(IndexerTestCase):

    def setUp(self) -> None:
        super().setUp()
        self.index_service.create_indices(self.catalog)
        self.client = AzulClient()
        self.controller = IndexController()
        # noinspection PyPropertyAccess
        self.controller.index_service = self.index_service
        self.queue_manager = queues.Queues(delete=True)

    def tearDown(self):
        self.index_service.delete_indices(self.catalog)
        super().tearDown()

    def _create_mock_queues(self):
        sqs = boto3.resource('sqs')
        for queue_name in config.all_queue_names:
            sqs.create_queue(QueueName=queue_name)

    def _mock_sqs_record(self, **body):
        event_dict = {
            'body': json.dumps(body),
            'receiptHandle': 'ThisWasARandomString',
            'attributes': {'ApproximateReceiveCount': 1}
        }
        return SQSRecord(event_dict=event_dict, context='controller_test')

    def test_invalid_notification(self):
        event = [
            self._mock_sqs_record(action='foo',
                                  notification='bar',
                                  catalog=self.catalog)
        ]
        self.assertRaises(AssertionError, self.controller.contribute, event)

    def test_remote_reindex(self):
        event = [self._mock_sqs_record(action='reindex', prefix='ff')]
        with mock.patch.object(AzulClient, 'do_remote_reindex') as mock_reindex:
            mock_reindex.return_value = True
            self.controller.contribute(event)
            mock_reindex.assert_called_once_with(dict(action='reindex', prefix='ff'))

    def test_contribute_and_aggregate(self):
        """
        Contribution and aggregation of two bundles

        Index two bundles that make contributions to the same project. Check
        that the contributions match the tallies that are returned to SQS.
        During aggregation only the project entity is deferred due to
        multiple contributions.
        """
        self._create_mock_queues()
        event = []
        bundles = []
        expected_entities = set()
        bundle_fqids = [
            BundleFQID('56a338fe-7554-4b5d-96a2-7df127a7640b', '2018-03-28T151023.074974Z'),
            BundleFQID('b2216048-7eaa-45f4-8077-5a3fb4204953', '2018-03-29T104041.822717Z')
        ]
        for bundle_fqid in bundle_fqids:
            notification = self.client.synthesize_notification(self.catalog, bundle_fqid)
            event.append(self._mock_sqs_record(action='add',
                                               catalog=self.catalog,
                                               notification=notification))
            bundle = self._load_canned_bundle(bundle_fqid)
            bundles.append(bundle)
            # Invoke the service once to produce a set of expected entities so
            # we don't need to hard-code them. Keep in mind that this test is
            # not covering the service, only the controller.
            contributions = self.index_service.transform(self.catalog, bundle, delete=False)
            expected_entities.update(
                (c.entity.entity_id, c.entity.entity_type)
                for c in contributions
            )

        mock_plugin = mock.MagicMock()
        mock_plugin.fetch_bundle.side_effect = bundles
        with mock.patch.object(IndexController,
                               'repository_plugin',
                               return_value=mock_plugin):
            # Test contribution
            self.controller.contribute(event)
            tallies = [
                json.loads(m.body)
                for m in self.queue_manager.read_messages(self.controller._tallies_queue())
            ]
            entities_from_tallies = {
                (t['entity_id'], t['entity_type'])
                for t in tallies
            }
            self.assertSetEqual(expected_entities, entities_from_tallies)
            self.assertListEqual([mock.call(f) for f in bundle_fqids],
                                 mock_plugin.fetch_bundle.mock_calls)

            # Test aggregation for tallies, inspect for deferred tallies
            event = [self._mock_sqs_record(**t) for t in tallies]
            self.controller.aggregate(event)
            messages = self.queue_manager.read_messages(self.controller._tallies_queue())

            # Check that aggregation of project entity was deferred
            project_tally = json.loads(one(messages).body)
            expected_tally = {
                'catalog': 'test',
                'entity_type': 'projects',
                'entity_id': '93f6a42f-1790-4af4-b5d1-8c436cb6feae',
                'num_contributions': 2
            }
            self.assertDictEqual(project_tally, expected_tally)

            # Test aggregation of deferred project entity
            event = [self._mock_sqs_record(**project_tally)]
            self.controller.aggregate(event)
            messages = self.queue_manager.read_messages(self.controller._tallies_queue())
            self.assertEqual(0, len(messages))
Example no. 8
def main(argv: List[str]):
    args = parser.parse_args(argv)

    if args.verbose:
        config.debug = 1

    configure_script_logging(logger)

    azul = AzulClient(num_workers=args.num_workers)

    source_globs = set(args.sources)
    if not args.local or args.deindex:
        sources_by_catalog = defaultdict(set)
        globs_matched = set()
        for catalog in args.catalogs:
            sources = azul.catalog_sources(catalog)
            for source_glob in source_globs:
                matches = fnmatch.filter(sources, source_glob)
                if matches:
                    globs_matched.add(source_glob)
                logger.debug('Source glob %r matched sources %r in catalog %r',
                             source_glob, matches, catalog)
                sources_by_catalog[catalog].update(matches)
        unmatched = source_globs - globs_matched
        if unmatched:
            logger.warning('Source(s) not found in any catalog: %r', unmatched)
        require(any(sources_by_catalog.values()),
                'No valid sources specified for any catalog')
    else:
        if source_globs == {'*'}:
            sources_by_catalog = {
                catalog: azul.catalog_sources(catalog)
                for catalog in args.catalogs
            }
        else:
            parser.error('Cannot specify sources when performing a local reindex')
            assert False

    if args.deindex:
        require(not any((args.index, args.delete, args.create)),
                '--deindex is incompatible with --index, --create, and --delete.')
        require('*' not in source_globs, '--deindex is incompatible with source `*`. '
                                         'Use --delete instead.')

        for catalog, sources in sources_by_catalog.items():
            if sources:
                azul.deindex(catalog, sources)

    azul.reset_indexer(args.catalogs,
                       purge_queues=args.purge,
                       delete_indices=args.delete,
                       create_indices=args.create or (args.index and args.delete))

    if args.index:
        logger.info('Queuing notifications for reindexing ...')
        reservation = None
        num_notifications = 0
        for catalog, sources in sources_by_catalog.items():
            if sources:
                if (
                    args.manage_slots
                    and reservation is None
                    and isinstance(azul.repository_plugin(catalog), tdr.Plugin)
                ):
                    reservation = BigQueryReservation()
                    reservation.activate()
                if not args.local:
                    azul.remote_reindex(catalog, sources)
                    num_notifications = None
                else:
                    num_notifications += azul.reindex(catalog, args.prefix)
            else:
                logger.info('Skipping catalog %r (no matching sources)', catalog)
        if args.wait:
            if num_notifications == 0:
                logger.warning('No notifications for prefix %r and catalogs %r were sent',
                               args.prefix, args.catalogs)
            else:
                azul.wait_for_indexer()
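The source filtering above relies on fnmatch, so source specs are matched with shell-style wildcards rather than regular expressions. A standalone illustration (the spec strings below are made up):

import fnmatch

sources = ['tdr:project:snapshot/foo:/0', 'tdr:project:snapshot/bar:/0']
print(fnmatch.filter(sources, '*snapshot/foo*'))  # only the first spec
print(fnmatch.filter(sources, '*'))               # every spec matches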
Example no. 9
    def azul_client(self):
        return AzulClient(prefix=self.bundle_uuid_prefix)