Example #1
0
    def test_simple_items(self):
        """Run four items through a pipeline with a single ``MyItemTask``.

        Once processing finishes, the items are verified with
        ``_check_item_values``.
        """
        work_items = self._new_items(4)
        source = MySource(work_items)
        tasks = [MyItemTask()]
        simple_pipeline = Pipeline(source, tasks)

        yield from simple_pipeline.process()

        self._check_item_values(work_items)
Example #2
0
    def test_simple_items(self):
        """Process four fresh items with one ``MyItemTask`` and check them.

        The check is delegated to ``_check_item_values`` after the pipeline
        has finished processing.
        """
        fresh_items = self._new_items(4)
        item_source = MySource(fresh_items)
        task_list = [MyItemTask()]
        item_pipeline = Pipeline(item_source, task_list)

        yield from item_pipeline.process()

        self._check_item_values(fresh_items)
Example #3
0
    def test_pipeline_skipping(self):
        """Stop the app from the first pipeline's task and inspect sources.

        Three pipelines run in series; the second one is flagged
        ``skippable``. The first pipeline's task calls ``app.stop()`` from
        its callback. Afterwards the first and second sources must still
        hold values, and the third source must be empty.
        """
        first_source = MyItemSource([1, 2, 3])
        second_source = MyItemSource([4, 5, 6])
        third_source = MyItemSource([7, 8, 9])

        stopping_task = MyItemTask()

        first_pipeline = Pipeline(first_source, [stopping_task])
        second_pipeline = Pipeline(second_source, [MyItemTask()])
        third_pipeline = Pipeline(third_source, [MyItemTask()])

        second_pipeline.skippable = True

        series = PipelineSeries(
            [first_pipeline, second_pipeline, third_pipeline])
        app = Application(series)

        def stop_app(work_item):
            # The work item itself is irrelevant; any callback invocation
            # requests the stop.
            app.stop()

        stopping_task.callback = stop_app

        yield from app.run()

        self.assertTrue(first_source.values, msg='unprocessed')
        self.assertTrue(second_source.values, msg='skipped')
        self.assertFalse(third_source.values, msg='processed')
Example #4
0
    def test_concurrency_under(self):
        """Process 100 items at concurrency 2 and check the task's peak.

        After the run the items are verified with ``_check_item_values``
        and the task's ``peak_work`` must equal 2.
        """
        work_items = self._new_items(100)
        queue = ItemQueue()
        tracked_task = MyItemTask()
        source = MySource(work_items)

        pipeline = Pipeline(source, [tracked_task], queue)
        pipeline.concurrency = 2

        yield from pipeline.process()

        self._check_item_values(work_items)
        self.assertEqual(2, tracked_task.peak_work)
Example #5
0
    def test_concurrency_under(self):
        """Run 100 items through a pipeline whose concurrency is set to 2.

        The items are checked with ``_check_item_values`` and the task's
        ``peak_work`` is expected to be exactly 2.
        """
        all_items = self._new_items(100)
        shared_queue = ItemQueue()
        worker_task = MyItemTask()
        item_source = MySource(all_items)

        limited_pipeline = Pipeline(item_source, [worker_task], shared_queue)
        limited_pipeline.concurrency = 2

        yield from limited_pipeline.process()

        self._check_item_values(all_items)
        self.assertEqual(2, worker_task.peak_work)
Example #6
0
    def _build_pipelines(self) -> PipelineSeries:
        """Assemble the application's pipelines into a ``PipelineSeries``.

        Five pipelines are built around one shared ``AppSession``:
        app start, download, download stop, conversion and app stop.
        The download-stop and conversion pipelines are flagged
        ``skippable``, and the download pipeline is added to the series'
        ``concurrency_pipelines``.

        Returns:
            The object created by ``self._factory.new('PipelineSeries', ...)``.
        """
        session = AppSession(self._factory, self._args, self.get_stderr())

        # Each source is created before its task list so objects are
        # constructed in the same order as the pipelines are declared.
        start_source = AppSource(session)
        start_tasks = [
            LoggingSetupTask(),
            DatabaseSetupTask(),
            ParserSetupTask(),
            WARCVisitsTask(),
            SSLContextTask(),
            ResmonSetupTask(),
            StatsStartTask(),
            URLFiltersSetupTask(),
            NetworkSetupTask(),
            ClientSetupTask(),
            WARCRecorderSetupTask(),
            FileWriterSetupTask(),
            ProcessorSetupTask(),
            ProxyServerSetupTask(),
            CoprocessorSetupTask(),
            LinkConversionSetupTask(),
            PluginSetupTask(),
            InputURLTask(),
            URLFiltersPostURLImportSetupTask(),
        ]
        app_start = Pipeline(start_source, start_tasks)

        url_source = URLItemSource(session)
        download_tasks = [
            ProcessTask(),
            ResmonSleepTask(),
            BackgroundAsyncTask(),
            CheckQuotaTask(),
        ]
        download = Pipeline(url_source, download_tasks)

        download_stop = Pipeline(AppSource(session), [StatsStopTask()])
        download_stop.skippable = True

        file_source = QueuedFileSource(session)
        conversion = Pipeline(file_source, [LinkConversionTask()])
        conversion.skippable = True

        stop_source = AppSource(session)
        stop_tasks = [
            BackgroundAsyncCleanupTask(),
            AppStopTask(),
            WARCRecorderTeardownTask(),
            CookieJarTeardownTask(),
            LoggingShutdownTask(),
        ]
        app_stop = Pipeline(stop_source, stop_tasks)

        series = self._factory.new(
            'PipelineSeries',
            (app_start, download, download_stop, conversion, app_stop),
        )
        series.concurrency_pipelines.add(download)

        return series
Example #7
0
    def test_simple(self):
        """Run two pipelines in series and expect exit code 0."""
        source_a = MyItemSource([1, 2, 3])
        source_b = MyItemSource([4, 5, 6])

        pipelines = [
            Pipeline(source_a, [MyItemTask()]),
            Pipeline(source_b, [MyItemTask()]),
        ]
        app = Application(PipelineSeries(pipelines))

        result_code = yield from app.run()

        self.assertEqual(0, result_code)
Example #8
0
    def test_stopping(self):
        """Stop the pipeline when the task's item count reaches 5.

        With ten items queued, the last item's ``processed_value`` must
        still be ``None`` after the run.
        """
        work_items = self._new_items(10)
        counting_task = MyItemTask()
        pipeline = Pipeline(MySource(work_items), [counting_task])

        def stop_at_five():
            if counting_task.item_count != 5:
                return

            pipeline.stop()

        counting_task.callback = stop_at_five

        yield from pipeline.process()

        last_item = work_items[-1]
        self.assertIsNone(last_item.processed_value)
Example #9
0
    def test_stopping(self):
        """Request a pipeline stop once the task reports an item count of 5.

        Ten items are supplied; the final one is expected to keep a
        ``processed_value`` of ``None``.
        """
        queued_items = self._new_items(10)
        item_task = MyItemTask()
        stoppable = Pipeline(MySource(queued_items), [item_task])

        def halt_when_halfway():
            reached_target = item_task.item_count == 5

            if reached_target:
                stoppable.stop()

        item_task.callback = halt_when_halfway

        yield from stoppable.process()

        final_item = queued_items[-1]
        self.assertIsNone(final_item.processed_value)
Example #10
0
    def test_concurrency_step_up(self):
        """Raise concurrency to 10 when the task's item count reaches 20.

        After processing 100 items, the items are checked with
        ``_check_item_values`` and the task's ``peak_work`` must be 10.
        """
        work_items = self._new_items(100)
        tracked_task = MyItemTask()
        pipeline = Pipeline(MySource(work_items), [tracked_task], ItemQueue())

        def raise_concurrency():
            if tracked_task.item_count != 20:
                return

            _logger.debug('Set concurrency 10')
            pipeline.concurrency = 10

        tracked_task.callback = raise_concurrency

        yield from pipeline.process()

        self._check_item_values(work_items)
        self.assertEqual(10, tracked_task.peak_work)
Example #11
0
    def test_concurrency_step_up(self):
        """Step the pipeline's concurrency up to 10 part-way through a run.

        The change is made from the task callback when ``item_count`` is
        20. All 100 items are then checked and ``peak_work`` must be 10.
        """
        all_items = self._new_items(100)
        worker_task = MyItemTask()
        source = MySource(all_items)
        growing_pipeline = Pipeline(source, [worker_task], ItemQueue())

        def step_up():
            at_threshold = worker_task.item_count == 20

            if at_threshold:
                _logger.debug('Set concurrency 10')
                growing_pipeline.concurrency = 10

        worker_task.callback = step_up

        yield from growing_pipeline.process()

        self._check_item_values(all_items)
        self.assertEqual(10, worker_task.peak_work)
Example #12
0
    def test_pipeline_series(self):
        """Check which pipelines receive the series' concurrency value.

        Only the second pipeline is added to ``concurrency_pipelines``.
        After setting the series' concurrency to 2, the series and the
        second pipeline must report 2 while the first still reports 1.
        """
        work_items = self._new_items(100)
        shared_queue = ItemQueue()
        shared_task = MyItemTask()
        first = Pipeline(MySource(work_items), [shared_task], shared_queue)
        second = Pipeline(MySource(work_items), [shared_task], shared_queue)

        series = PipelineSeries((first, second))
        series.concurrency_pipelines.add(second)

        self.assertEqual(1, series.concurrency)

        series.concurrency = 2

        # Checked in the order: series, first pipeline, second pipeline.
        expectations = ((2, series), (1, first), (2, second))
        for expected, holder in expectations:
            self.assertEqual(expected, holder.concurrency)
Example #13
0
    def test_concurrency_zero(self):
        """Drop concurrency to 0 mid-run, then set it to 10 half a second later.

        The pipeline starts at concurrency 5. When the task's item count
        is 10, concurrency is set to 0 and an event-loop timer is
        scheduled to set it to 10. All 100 items are then checked and the
        task's ``peak_work`` must be 10.
        """
        work_items = self._new_items(100)
        tracked_task = MyItemTask()
        pipeline = Pipeline(MySource(work_items), [tracked_task], ItemQueue())
        pipeline.concurrency = 5

        def resume():
            _logger.debug('Set concurrency to 10')
            pipeline.concurrency = 10

        def pause_then_resume():
            if tracked_task.item_count != 10:
                return

            _logger.debug('Set concurrency to 0')
            pipeline.concurrency = 0

            loop = asyncio.get_event_loop()
            loop.call_later(0.5, resume)

        tracked_task.callback = pause_then_resume

        yield from pipeline.process()

        self._check_item_values(work_items)
        self.assertEqual(10, tracked_task.peak_work)
Example #14
0
    def test_concurrency_zero(self):
        """Set concurrency to 0 during a run and restore it with a timer.

        Starting from concurrency 5, the task callback sets concurrency
        to 0 when ``item_count`` is 10 and schedules a 0.5 second
        ``call_later`` that sets it to 10. Afterwards the items are
        checked and ``peak_work`` is expected to be 10.
        """
        all_items = self._new_items(100)
        worker_task = MyItemTask()
        source = MySource(all_items)
        paused_pipeline = Pipeline(source, [worker_task], ItemQueue())
        paused_pipeline.concurrency = 5

        def set_to_ten():
            _logger.debug('Set concurrency to 10')
            paused_pipeline.concurrency = 10

        def on_item():
            at_threshold = worker_task.item_count == 10

            if at_threshold:
                _logger.debug('Set concurrency to 0')
                paused_pipeline.concurrency = 0
                asyncio.get_event_loop().call_later(0.5, set_to_ten)

        worker_task.callback = on_item

        yield from paused_pipeline.process()

        self._check_item_values(all_items)
        self.assertEqual(10, worker_task.peak_work)
Example #15
0
    def test_exit_codes(self):
        """Check the exit code returned for each mapped error class.

        For every entry in ``Application.ERROR_CODE_MAP``, a task callback
        raises that error class and the value returned by ``app.run()`` is
        compared with the mapped exit code. Each entry runs in its own
        sub-test.
        """
        code_map = Application.ERROR_CODE_MAP

        for exc_type, wanted_code in code_map.items():
            with self.subTest(exc_type):
                item_source = MyItemSource([1, 2, 3])

                def raise_error(work_item):
                    # The closure is used within this iteration only, so it
                    # always sees the current exc_type.
                    raise exc_type(work_item)

                failing_task = MyItemTask(callback=raise_error)
                series = PipelineSeries([Pipeline(item_source, [failing_task])])
                app = Application(series)

                actual_code = yield from app.run()

                self.assertEqual(wanted_code, actual_code)
Example #16
0
    def test_item_task_error(self):
        """Expect ``MyItemTaskError`` from a task built with ``test_error``."""
        work_items = self._new_items(4)
        source = MySource(work_items)
        failing_task = MyItemTask(test_error=True)
        pipeline = Pipeline(source, [failing_task])

        with self.assertRaises(MyItemTaskError):
            yield from pipeline.process()
Example #17
0
    def test_item_task_error(self):
        """Processing with a ``test_error=True`` task must raise.

        The expected exception type is ``MyItemTaskError``.
        """
        four_items = self._new_items(4)
        item_source = MySource(four_items)
        task_list = [MyItemTask(test_error=True)]
        broken_pipeline = Pipeline(item_source, task_list)

        with self.assertRaises(MyItemTaskError):
            yield from broken_pipeline.process()
Example #18
0
    def _build_pipelines(self) -> PipelineSeries:
        """Create every pipeline and combine them into one series.

        A single ``AppSession`` is shared by all sources. The pipelines,
        in series order, are: app start, download, download stop,
        conversion and app stop. Download stop and conversion are marked
        ``skippable``; the download pipeline is added to the series'
        ``concurrency_pipelines``.

        Returns:
            The object created by ``self._factory.new('PipelineSeries', ...)``.
        """
        session = AppSession(self._factory, self._args, self.get_stderr())

        # Sources are instantiated ahead of their task lists, matching the
        # order in which the pipelines are declared.
        setup_source = AppSource(session)
        setup_tasks = [
            LoggingSetupTask(),
            DatabaseSetupTask(),
            ParserSetupTask(),
            WARCVisitsTask(),
            SSLContextTask(),
            ResmonSetupTask(),
            StatsStartTask(),
            URLFiltersSetupTask(),
            NetworkSetupTask(),
            ClientSetupTask(),
            WARCRecorderSetupTask(),
            FileWriterSetupTask(),
            ProcessorSetupTask(),
            ProxyServerSetupTask(),
            CoprocessorSetupTask(),
            LinkConversionSetupTask(),
            PluginSetupTask(),
            InputURLTask(),
            URLFiltersPostURLImportSetupTask(),
        ]
        start_pipeline = Pipeline(setup_source, setup_tasks)

        item_source = URLItemSource(session)
        fetch_tasks = [
            ProcessTask(),
            ResmonSleepTask(),
            BackgroundAsyncTask(),
            CheckQuotaTask(),
        ]
        fetch_pipeline = Pipeline(item_source, fetch_tasks)

        stats_stop_pipeline = Pipeline(AppSource(session), [StatsStopTask()])
        stats_stop_pipeline.skippable = True

        queued_files = QueuedFileSource(session)
        convert_pipeline = Pipeline(queued_files, [LinkConversionTask()])
        convert_pipeline.skippable = True

        teardown_source = AppSource(session)
        teardown_tasks = [
            BackgroundAsyncCleanupTask(),
            AppStopTask(),
            WARCRecorderTeardownTask(),
            CookieJarTeardownTask(),
            LoggingShutdownTask(),
        ]
        stop_pipeline = Pipeline(teardown_source, teardown_tasks)

        ordered = (
            start_pipeline,
            fetch_pipeline,
            stats_stop_pipeline,
            convert_pipeline,
            stop_pipeline,
        )
        result = self._factory.new('PipelineSeries', ordered)
        result.concurrency_pipelines.add(fetch_pipeline)

        return result