Ejemplo n.º 1
0
 def test_mapreduce_over_queryset(self):
     """Run a full map/reduce over a queryset and verify the pipeline finalizes."""
     started = map_reduce_queryset(
         TestModel.objects.all(),
         yield_letters,
         reduce_count,
         output_writers.GoogleCloudStorageKeyValueOutputWriter,
         _output_writer_kwargs={'bucket_name': 'test-bucket'},
     )
     self.process_task_queues()
     # Reload the pipeline record to observe its post-run state.
     finished = get_pipeline_by_id(started.pipeline_id)
     self.assertTrue(finished.has_finalized)
Ejemplo n.º 2
0
 def test_filtering(self):
     """A map over a filtered queryset should only touch matching rows.

     Bug fix: the original rebound ``counter`` to a brand-new Counter *after*
     starting the pipeline, so ``refresh_from_db``/``assertEqual`` inspected
     a counter that was never passed via ``counter_id`` and was trivially 0.
     Assert on the counter actually wired into the pipeline instead.
     NOTE(review): this assumes the fixture contains no ``is_true=True``
     rows, so the real counter also ends at 0 — confirm against setUp.
     """
     counter = Counter.objects.create()
     pipeline = map_queryset(TestModel.objects.filter(is_true=True),
                             count,
                             finalize_func=delete,
                             counter_id=counter.pk)
     self.process_task_queues()
     # Re-fetch to see the finalized state.
     pipeline = get_pipeline_by_id(pipeline.pipeline_id)
     self.assertTrue(pipeline.has_finalized)
     counter.refresh_from_db()
     self.assertEqual(0, counter.count)
Ejemplo n.º 3
0
 def test_mapreduce_over_entities(self):
     """Map/reduce directly over raw datastore entities and await finalization."""
     started = map_reduce_entities(
         TestModel._meta.db_table,
         connection.settings_dict["NAMESPACE"],
         yield_letters,
         reduce_count,
         output_writers.GoogleCloudStorageKeyValueOutputWriter,
         _output_writer_kwargs={'bucket_name': 'test-bucket'},
     )
     self.process_task_queues()
     # Refetch the pipeline record
     refreshed = get_pipeline_by_id(started.pipeline_id)
     self.assertTrue(refreshed.has_finalized)
Ejemplo n.º 4
0
 def test_mapreduce_over_queryset(self):
     """Map/reduce across every TestModel row, then check finalization."""
     writer_kwargs = {'bucket_name': 'test-bucket'}
     pipeline = map_reduce_queryset(
         TestModel.objects.all(),
         yield_letters,
         reduce_count,
         output_writers.GoogleCloudStorageKeyValueOutputWriter,
         _output_writer_kwargs=writer_kwargs,
     )
     self.process_task_queues()
     pipeline = get_pipeline_by_id(pipeline.pipeline_id)
     self.assertTrue(pipeline.has_finalized)
Ejemplo n.º 5
0
 def test_filtering(self):
     """A map over a filtered queryset should only touch matching rows.

     Bug fix: the original rebound ``counter`` to a fresh Counter after the
     pipeline had been started with the first counter's pk, making the final
     assertion vacuous (a never-used counter is always 0).  Keep the single
     counter that was actually passed via ``counter_id``.
     NOTE(review): assumes the fixture has no ``is_true=True`` rows so the
     wired-in counter also finishes at 0 — confirm against setUp.
     """
     counter = Counter.objects.create()
     pipeline = map_queryset(
         TestModel.objects.filter(is_true=True),
         count,
         finalize_func=delete,
         counter_id=counter.pk
     )
     self.process_task_queues()
     # Re-fetch to see the finalized state.
     pipeline = get_pipeline_by_id(pipeline.pipeline_id)
     self.assertTrue(pipeline.has_finalized)
     counter.refresh_from_db()
     self.assertEqual(0, counter.count)
Ejemplo n.º 6
0
 def test_mapreduce_over_entities(self):
     """Map/reduce over raw entities (table + namespace) and verify finalization."""
     writer_kwargs = {'bucket_name': 'test-bucket'}
     job = map_reduce_entities(
         TestModel._meta.db_table,
         connection.settings_dict["NAMESPACE"],
         yield_letters,
         reduce_count,
         output_writers.GoogleCloudStorageKeyValueOutputWriter,
         _output_writer_kwargs=writer_kwargs,
     )
     self.process_task_queues()
     # Refetch the pipeline record
     job = get_pipeline_by_id(job.pipeline_id)
     self.assertTrue(job.has_finalized)
Ejemplo n.º 7
0
    def test_filters_apply(self):
        """A ``pk__gt`` filter should restrict the map to matching rows only."""
        tally = Counter.objects.create()

        job = map_queryset(
            TestModel.objects.filter(pk__gt=2),
            count,
            finalize_func=delete,
            counter_id=tally.pk,
        )

        self.process_task_queues()
        job = get_pipeline_by_id(job.pipeline_id)
        self.assertTrue(job.has_finalized)
        tally.refresh_from_db()

        # Three rows have pk > 2, and the finalizer empties the table.
        self.assertEqual(3, tally.count)
        self.assertFalse(TestModel.objects.count())
Ejemplo n.º 8
0
    def test_mapping_over_entities(self):
        """Mapping over raw entities should visit every row, then finalize."""
        tally = Counter.objects.create()

        job = map_entities(
            TestModel._meta.db_table,
            connection.settings_dict['NAMESPACE'],
            count_entity,
            finalize_func=delete,
            counter_id=tally.pk,
        )

        self.process_task_queues()
        job = get_pipeline_by_id(job.pipeline_id)
        self.assertTrue(job.has_finalized)
        tally.refresh_from_db()

        # All five entities were counted; the finalizer wiped the table.
        self.assertEqual(5, tally.count)
        self.assertFalse(TestModel.objects.count())
Ejemplo n.º 9
0
    def test_filters_apply(self):
        """Only rows matching the queryset filter should be mapped over."""
        counter = Counter.objects.create()

        pipeline = map_queryset(TestModel.objects.filter(pk__gt=2),
                                count,
                                finalize_func=delete,
                                counter_id=counter.pk)

        self.process_task_queues()
        pipeline = get_pipeline_by_id(pipeline.pipeline_id)
        self.assertTrue(pipeline.has_finalized)
        counter.refresh_from_db()

        # pk > 2 matches three rows; delete() then clears the table.
        self.assertEqual(3, counter.count)
        self.assertFalse(TestModel.objects.count())
Ejemplo n.º 10
0
    def test_map_over_files(self):
        """Only files matching the 'a/*' pattern should be processed."""
        storage = CloudStorage()
        # Two files under a/ total 12 bytes; the third does not match.
        storage.save('a/b/c/tmp1', ContentFile('abcdefghi'))
        storage.save('c/tmp2', ContentFile('not matching pattern'))
        storage.save('a/d/tmp3', ContentFile('xxx'))

        tally = Counter.objects.create()
        job = map_files(
            'test_bucket',
            count_contents,
            filenames=['a/*'],
            counter_id=tally.pk,
        )

        self.process_task_queues()
        job = get_pipeline_by_id(job.pipeline_id)
        self.assertTrue(job.has_finalized)
        tally.refresh_from_db()
        self.assertEqual(12, tally.count)
Ejemplo n.º 11
0
    def test_mapping_over_entities(self):
        """Every raw entity in the namespace should be visited by the mapper."""
        counter = Counter.objects.create()

        pipeline = map_entities(TestModel._meta.db_table,
                                connection.settings_dict['NAMESPACE'],
                                count_entity,
                                finalize_func=delete,
                                counter_id=counter.pk)

        self.process_task_queues()
        pipeline = get_pipeline_by_id(pipeline.pipeline_id)
        self.assertTrue(pipeline.has_finalized)
        counter.refresh_from_db()

        # Five entities counted; the delete finalizer emptied the table.
        self.assertEqual(5, counter.count)
        self.assertFalse(TestModel.objects.count())
Ejemplo n.º 12
0
    def test_map_over_files(self):
        """map_files with a glob should sum only the matching files' bytes."""
        storage = CloudStorage()
        storage.save('a/b/c/tmp1', ContentFile('abcdefghi'))
        storage.save('c/tmp2', ContentFile('not matching pattern'))
        storage.save('a/d/tmp3', ContentFile('xxx'))

        counter = Counter.objects.create()
        pipeline = map_files('test_bucket',
                             count_contents,
                             filenames=['a/*'],
                             counter_id=counter.pk)

        self.process_task_queues()
        pipeline = get_pipeline_by_id(pipeline.pipeline_id)
        self.assertTrue(pipeline.has_finalized)
        counter.refresh_from_db()
        # 9 bytes from tmp1 + 3 from tmp3; tmp2 is outside the pattern.
        self.assertEqual(12, counter.count)
Ejemplo n.º 13
0
 def test_filters(self):
     """Passing the ``_filters`` kwarg to ``map_reduce_entities`` should allow
     only some entities to be processed.
     """
     tally = Counter.objects.create()
     job = map_reduce_entities(
         TestModel._meta.db_table,
         connection.settings_dict["NAMESPACE"],
         count_entity_to_default_counter,
         reduce_count,  # This is a no-op because count_entity doesn't return anything
         output_writers.GoogleCloudStorageKeyValueOutputWriter,
         _output_writer_kwargs={'bucket_name': 'test-bucket'},
         _filters=[("text", "=", "abcc-3")],
     )
     self.process_task_queues()
     # Refetch the pipeline record
     job = get_pipeline_by_id(job.pipeline_id)
     self.assertTrue(job.has_finalized)
     # We expect only the one entity to have been counted
     tally.refresh_from_db()
     self.assertEqual(tally.count, 1)
Ejemplo n.º 14
0
    def test_slicing(self):
        """A deliberately slow mapper on one shard must be split across slices."""
        tally = Counter.objects.create()

        # mapreduce's default slice budget is 15 seconds; sleeping 4s per row
        # forces the single shard to be processed in more than one slice.
        job = map_queryset(
            TestModel.objects.all(),
            slow_count,
            finalize_func=delete,
            counter_id=tally.pk,
            sleep_duration=4,
            _shards=1,
        )

        self.process_task_queues()
        job = get_pipeline_by_id(job.pipeline_id)
        self.assertTrue(job.has_finalized)
        tally.refresh_from_db()

        # All five rows still get counted despite the slicing.
        self.assertEqual(5, tally.count)
        self.assertFalse(TestModel.objects.count())
Ejemplo n.º 15
0
    def test_slicing(self):
        """Slow per-row work should span multiple slices without losing counts."""
        counter = Counter.objects.create()

        pipeline = map_queryset(TestModel.objects.all(),
                                slow_count,
                                finalize_func=delete,
                                counter_id=counter.pk,
                                # mapreduce default slice duration is 15 seconds
                                # slow down processing enough to split into two slices
                                sleep_duration=4,
                                _shards=1)

        self.process_task_queues()
        pipeline = get_pipeline_by_id(pipeline.pipeline_id)
        self.assertTrue(pipeline.has_finalized)
        counter.refresh_from_db()

        # Every one of the five rows was counted; the table is now empty.
        self.assertEqual(5, counter.count)
        self.assertFalse(TestModel.objects.count())
Ejemplo n.º 16
0
 def test_filters(self):
     """Passing the ``_filters`` kwarg to ``map_reduce_entities`` should allow
     only some entities to be processed.
     """
     counter = Counter.objects.create()
     writer_kwargs = {'bucket_name': 'test-bucket'}
     pipeline = map_reduce_entities(
         TestModel._meta.db_table,
         connection.settings_dict["NAMESPACE"],
         count_entity_to_default_counter,
         reduce_count,  # This is a no-op because count_entity doesn't return anything
         output_writers.GoogleCloudStorageKeyValueOutputWriter,
         _output_writer_kwargs=writer_kwargs,
         _filters=[("text", "=", "abcc-3")],
     )
     self.process_task_queues()
     # Refetch the pipeline record
     pipeline = get_pipeline_by_id(pipeline.pipeline_id)
     self.assertTrue(pipeline.has_finalized)
     # We expect only the one entity to have been counted
     counter.refresh_from_db()
     self.assertEqual(counter.count, 1)