Esempio n. 1
0
 def __init__(self, args, unit_test=False):
     '''Keep the options and set up the factory's class registrations.

     Args:
         args: Options object; stored unchanged as ``_args``.
         unit_test (bool): Flag stored unchanged as ``_unit_test``.
     '''
     # Each name is the lookup key handed to the factory; several keys
     # deliberately differ from the name of the class they point to.
     component_classes = {
         'Application': Application,
         'BatchDocumentConverter': BatchDocumentConverter,
         'BandwidthLimiter': BandwidthLimiter,
         'HTTPClient': HTTPClient,
         'CookieJar': CookieJar,
         'CookieJarWrapper': CookieJarWrapper,
         'CookiePolicy': DeFactoCookiePolicy,
         'ConnectionPool': ConnectionPool,
         'CSSScraper': CSSScraper,
         'DemuxDocumentScraper': DemuxDocumentScraper,
         'DemuxURLFilter': DemuxURLFilter,
         'FTPProcessor': FTPProcessor,
         'ElementWalker': ElementWalker,
         'FetchRule': FetchRule,
         'FileWriter': NullWriter,
         'FTPClient': FTPClient,
         'FTPProcessorFetchParams': FTPProcessorFetchParams,
         'HTTPProxyServer': HTTPProxyServer,
         # NotImplemented is a placeholder value here, not a class.
         # NOTE(review): presumably replaced during parser setup - confirm.
         'HTMLParser': NotImplemented,
         'HTMLScraper': HTMLScraper,
         'JavaScriptScraper': JavaScriptScraper,
         'PathNamer': PathNamer,
         'PhantomJSDriver': PhantomJSDriver,
         'PhantomJSCoprocessor': PhantomJSCoprocessor,
         'PipelineSeries': PipelineSeries,
         'ProcessingRule': ProcessingRule,
         'Processor': DelegateProcessor,
         'ProxyCoprocessor': ProxyCoprocessor,
         'ProxyHostFilter': ProxyHostFilter,
         'RedirectTracker': RedirectTracker,
         'Request': Request,
         'Resolver': Resolver,
         'ResourceMonitor': ResourceMonitor,
         'ResultRule': ResultRule,
         'RobotsTxtChecker': RobotsTxtChecker,
         'RobotsTxtPool': RobotsTxtPool,
         'SitemapScraper': SitemapScraper,
         'Statistics': Statistics,
         'URLInfo': URLInfo,
         'URLTable': URLTableHookWrapper,
         'URLTableImplementation': SQLURLTable,
         'URLRewriter': URLRewriter,
         'Waiter': LinearWaiter,
         'WARCRecorder': WARCRecorder,
         'WebClient': WebClient,
         'WebProcessor': WebProcessor,
         'WebProcessorFetchParams': WebProcessorFetchParams,
         'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
     }

     self._unit_test = unit_test
     self._args = args
     self._factory = Factory(component_classes)
Esempio n. 2
0
 def __init__(self, args, unit_test=False):
     '''Record the options and create the factory with its class map.

     Args:
         args: Options object; kept as-is in ``_args``.
         unit_test (bool): Flag kept as-is in ``_unit_test``.
     '''
     self._args = args
     self._unit_test = unit_test

     # Keys are the names used to ask the factory for a component; the
     # values are what gets registered under those names.
     registrations = {
         'Application': Application,
         'BatchDocumentConverter': BatchDocumentConverter,
         'BandwidthLimiter': BandwidthLimiter,
         'HTTPClient': HTTPClient,
         'CookieJar': CookieJar,
         'CookieJarWrapper': CookieJarWrapper,
         'CookiePolicy': DeFactoCookiePolicy,
         'ConnectionPool': ConnectionPool,
         'CSSScraper': CSSScraper,
         'DemuxDocumentScraper': DemuxDocumentScraper,
         'DemuxURLFilter': DemuxURLFilter,
         'FTPProcessor': FTPProcessor,
         'ElementWalker': ElementWalker,
         'FetchRule': FetchRule,
         'FileWriter': NullWriter,
         'FTPClient': FTPClient,
         'FTPProcessorFetchParams': FTPProcessorFetchParams,
         'HTTPProxyServer': HTTPProxyServer,
         # The built-in NotImplemented singleton stands in for a class.
         'HTMLParser': NotImplemented,
         'HTMLScraper': HTMLScraper,
         'JavaScriptScraper': JavaScriptScraper,
         'PathNamer': PathNamer,
         'PhantomJSDriver': PhantomJSDriver,
         'PhantomJSCoprocessor': PhantomJSCoprocessor,
         'PipelineSeries': PipelineSeries,
         'ProcessingRule': ProcessingRule,
         'Processor': DelegateProcessor,
         'ProxyCoprocessor': ProxyCoprocessor,
         'ProxyHostFilter': ProxyHostFilter,
         'RedirectTracker': RedirectTracker,
         'Request': Request,
         'Resolver': Resolver,
         'ResourceMonitor': ResourceMonitor,
         'ResultRule': ResultRule,
         'RobotsTxtChecker': RobotsTxtChecker,
         'RobotsTxtPool': RobotsTxtPool,
         'SitemapScraper': SitemapScraper,
         'Statistics': Statistics,
         'URLInfo': URLInfo,
         'URLTable': URLTableHookWrapper,
         'URLTableImplementation': SQLURLTable,
         'URLRewriter': URLRewriter,
         'Waiter': LinearWaiter,
         'WARCRecorder': WARCRecorder,
         'WebClient': WebClient,
         'WebProcessor': WebProcessor,
         'WebProcessorFetchParams': WebProcessorFetchParams,
         'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
     }
     self._factory = Factory(registrations)
Esempio n. 3
0
    def test_factory(self):
        '''Check class registration, instance creation and lookups.'''
        registry = Factory()
        registry.set('dict', dict)

        # Only a class has been registered so far; no instance exists yet.
        self.assertNotIn('dict', registry)

        self.assertFalse(registry.is_all_initialized())

        created = registry.new('dict', [('hi', 'hello')])

        # The new instance is reachable through every lookup path.
        self.assertIn('dict', registry)
        self.assertEqual(created, registry['dict'])
        self.assertTrue(registry.is_all_initialized())
        self.assertEqual(1, len(registry))
        self.assertEqual(['dict'], list(iter(registry)))
        self.assertEqual(created, registry.instance_map['dict'])

        # Asking for a second instance under the same name must fail.
        with self.assertRaises(ValueError):
            registry.new('dict', [('hi', 'hello')])
Esempio n. 4
0
class Builder(object):
    '''Assemble the application from pipelines and component classes.

    Args:
        args: Options from :class:`argparse.ArgumentParser`
        unit_test (bool): When true, :meth:`get_stderr` returns
            ``sys.stdout`` instead of ``sys.stderr``.
    '''
    def __init__(self, args, unit_test=False):
        # Keys are the names used to ask the factory for a component;
        # several keys deliberately differ from the registered class name.
        component_classes = {
            'Application': Application,
            'BandwidthLimiter': BandwidthLimiter,
            'HTTPClient': HTTPClient,
            'CookieJar': CookieJar,
            'CookieJarWrapper': CookieJarWrapper,
            'CookiePolicy': DeFactoCookiePolicy,
            'ConnectionPool': ConnectionPool,
            'CSSScraper': CSSScraper,
            'DemuxDocumentScraper': DemuxDocumentScraper,
            'DemuxURLFilter': DemuxURLFilter,
            'FTPProcessor': FTPProcessor,
            'ElementWalker': ElementWalker,
            'FetchRule': FetchRule,
            'FileWriter': NullWriter,
            'FTPClient': FTPClient,
            'FTPProcessorFetchParams': FTPProcessorFetchParams,
            'HTTPProxyServer': HTTPProxyServer,
            # NotImplemented is a placeholder value here, not a class.
            # NOTE(review): presumably filled in by ParserSetupTask - confirm.
            'HTMLParser': NotImplemented,
            'HTMLScraper': HTMLScraper,
            'JavaScriptScraper': JavaScriptScraper,
            'PathNamer': PathNamer,
            'PipelineSeries': PipelineSeries,
            'ProcessingRule': ProcessingRule,
            'Processor': DelegateProcessor,
            'ProxyCoprocessor': ProxyCoprocessor,
            'ProxyHostFilter': ProxyHostFilter,
            'RedirectTracker': RedirectTracker,
            'Request': Request,
            'Resolver': Resolver,
            'ResourceMonitor': ResourceMonitor,
            'ResultRule': ResultRule,
            'RobotsTxtChecker': RobotsTxtChecker,
            'RobotsTxtPool': RobotsTxtPool,
            'SitemapScraper': SitemapScraper,
            'Statistics': Statistics,
            'URLInfo': URLInfo,
            'URLTable': URLTableHookWrapper,
            'URLTableImplementation': SQLURLTable,
            'URLRewriter': URLRewriter,
            'Waiter': LinearWaiter,
            'WARCRecorder': WARCRecorder,
            'WebClient': WebClient,
            'WebProcessor': WebProcessor,
            'WebProcessorFetchParams': WebProcessorFetchParams,
            'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
        }

        self._unit_test = unit_test
        self._args = args
        self._factory = Factory(component_classes)

    @property
    def factory(self):
        '''The factory created by this builder.

        Returns:
            Factory: An :class:`.factory.Factory` instance.
        '''
        return self._factory

    def build(self) -> Application:
        '''Create the pipelines and the application that receives them.

        Returns:
            Application: The object stored in the factory under
            ``'Application'``.
        '''
        series = self._build_pipelines()
        factory = self._factory
        factory.new('Application', series)
        return factory['Application']

    def _build_pipelines(self) -> PipelineSeries:
        '''Create the start, download, download-stop and stop pipelines.

        Returns:
            PipelineSeries: The series created through the factory, with
            the download pipeline added to ``concurrency_pipelines``.
        '''
        session = AppSession(self._factory, self._args, self.get_stderr())

        # Application start-up: tasks are listed in the order given to
        # the pipeline.
        start_source = AppSource(session)
        start_tasks = [
            LoggingSetupTask(),
            PluginSetupTask(),
            DatabaseSetupTask(),
            ParserSetupTask(),
            WARCVisitsTask(),
            SSLContextTask(),
            ResmonSetupTask(),
            StatsStartTask(),
            URLFiltersSetupTask(),
            NetworkSetupTask(),
            ClientSetupTask(),
            WARCRecorderSetupTask(),
            FileWriterSetupTask(),
            ProcessorSetupTask(),
            ProxyServerSetupTask(),
            CoprocessorSetupTask(),
            InputURLTask(),
            URLFiltersPostURLImportSetupTask(),
        ]
        start_pipeline = Pipeline(start_source, start_tasks)

        # Per-URL download work.
        item_source = URLItemSource(session)
        download_tasks = [
            ProcessTask(),
            ResmonSleepTask(),
            BackgroundAsyncTask(),
            CheckQuotaTask(),
        ]
        download_pipeline = Pipeline(item_source, download_tasks)

        download_stop_source = AppSource(session)
        download_stop_pipeline = Pipeline(
            download_stop_source, [StatsStopTask()])
        download_stop_pipeline.skippable = True

        # Application shutdown.
        stop_source = AppSource(session)
        stop_tasks = [
            BackgroundAsyncCleanupTask(),
            AppStopTask(),
            WARCRecorderTeardownTask(),
            CookieJarTeardownTask(),
            LoggingShutdownTask(),
        ]
        stop_pipeline = Pipeline(stop_source, stop_tasks)

        ordered_pipelines = (
            start_pipeline,
            download_pipeline,
            download_stop_pipeline,
            stop_pipeline,
        )
        series = self._factory.new('PipelineSeries', ordered_pipelines)
        series.concurrency_pipelines.add(download_pipeline)

        return series

    def build_and_run(self):
        '''Build the application and run it synchronously.

        Returns:
            int: The exit status.
        '''
        return self.build().run_sync()

    def get_stderr(self):
        '''Pick the error stream: stdout when unit testing, else stderr.'''
        return sys.stdout if self._unit_test else sys.stderr
Esempio n. 5
0
class Builder(object):
    '''Assemble the application from pipelines and component classes.

    Args:
        args: Options from :class:`argparse.ArgumentParser`
        unit_test (bool): When true, :meth:`get_stderr` returns
            ``sys.stdout`` instead of ``sys.stderr``.
    '''
    def __init__(self, args, unit_test=False):
        # Keys are the names used to ask the factory for a component;
        # several keys deliberately differ from the registered class name.
        component_classes = {
            'Application': Application,
            'BatchDocumentConverter': BatchDocumentConverter,
            'BandwidthLimiter': BandwidthLimiter,
            'HTTPClient': HTTPClient,
            'CookieJar': CookieJar,
            'CookieJarWrapper': CookieJarWrapper,
            'CookiePolicy': DeFactoCookiePolicy,
            'ConnectionPool': ConnectionPool,
            'CSSScraper': CSSScraper,
            'DemuxDocumentScraper': DemuxDocumentScraper,
            'DemuxURLFilter': DemuxURLFilter,
            'FTPProcessor': FTPProcessor,
            'ElementWalker': ElementWalker,
            'FetchRule': FetchRule,
            'FileWriter': NullWriter,
            'FTPClient': FTPClient,
            'FTPProcessorFetchParams': FTPProcessorFetchParams,
            'HTTPProxyServer': HTTPProxyServer,
            # NotImplemented is a placeholder value here, not a class.
            # NOTE(review): presumably filled in by ParserSetupTask - confirm.
            'HTMLParser': NotImplemented,
            'HTMLScraper': HTMLScraper,
            'JavaScriptScraper': JavaScriptScraper,
            'PathNamer': PathNamer,
            'PhantomJSDriver': PhantomJSDriver,
            'PhantomJSCoprocessor': PhantomJSCoprocessor,
            'PipelineSeries': PipelineSeries,
            'ProcessingRule': ProcessingRule,
            'Processor': DelegateProcessor,
            'ProxyCoprocessor': ProxyCoprocessor,
            'ProxyHostFilter': ProxyHostFilter,
            'RedirectTracker': RedirectTracker,
            'Request': Request,
            'Resolver': Resolver,
            'ResourceMonitor': ResourceMonitor,
            'ResultRule': ResultRule,
            'RobotsTxtChecker': RobotsTxtChecker,
            'RobotsTxtPool': RobotsTxtPool,
            'SitemapScraper': SitemapScraper,
            'Statistics': Statistics,
            'URLInfo': URLInfo,
            'URLTable': URLTableHookWrapper,
            'URLTableImplementation': SQLURLTable,
            'URLRewriter': URLRewriter,
            'Waiter': LinearWaiter,
            'WARCRecorder': WARCRecorder,
            'WebClient': WebClient,
            'WebProcessor': WebProcessor,
            'WebProcessorFetchParams': WebProcessorFetchParams,
            'YoutubeDlCoprocessor': YoutubeDlCoprocessor,
        }

        self._unit_test = unit_test
        self._args = args
        self._factory = Factory(component_classes)

    @property
    def factory(self):
        '''The factory created by this builder.

        Returns:
            Factory: An :class:`.factory.Factory` instance.
        '''
        return self._factory

    def build(self) -> Application:
        '''Create the pipelines and the application that receives them.

        Returns:
            Application: The object stored in the factory under
            ``'Application'``.
        '''
        series = self._build_pipelines()
        factory = self._factory
        factory.new('Application', series)
        return factory['Application']

    def _build_pipelines(self) -> PipelineSeries:
        '''Create the start, download, conversion and stop pipelines.

        Returns:
            PipelineSeries: The series created through the factory, with
            the download pipeline added to ``concurrency_pipelines``.
        '''
        session = AppSession(self._factory, self._args, self.get_stderr())

        # Application start-up: tasks are listed in the order given to
        # the pipeline.
        start_source = AppSource(session)
        start_tasks = [
            LoggingSetupTask(),
            DatabaseSetupTask(),
            ParserSetupTask(),
            WARCVisitsTask(),
            SSLContextTask(),
            ResmonSetupTask(),
            StatsStartTask(),
            URLFiltersSetupTask(),
            NetworkSetupTask(),
            ClientSetupTask(),
            WARCRecorderSetupTask(),
            FileWriterSetupTask(),
            ProcessorSetupTask(),
            ProxyServerSetupTask(),
            CoprocessorSetupTask(),
            LinkConversionSetupTask(),
            PluginSetupTask(),
            InputURLTask(),
            URLFiltersPostURLImportSetupTask(),
        ]
        start_pipeline = Pipeline(start_source, start_tasks)

        # Per-URL download work.
        item_source = URLItemSource(session)
        download_tasks = [
            ProcessTask(),
            ResmonSleepTask(),
            BackgroundAsyncTask(),
            CheckQuotaTask(),
        ]
        download_pipeline = Pipeline(item_source, download_tasks)

        download_stop_source = AppSource(session)
        download_stop_pipeline = Pipeline(
            download_stop_source, [StatsStopTask()])
        download_stop_pipeline.skippable = True

        # Link conversion over the queued files.
        file_source = QueuedFileSource(session)
        conversion_pipeline = Pipeline(file_source, [LinkConversionTask()])
        conversion_pipeline.skippable = True

        # Application shutdown.
        stop_source = AppSource(session)
        stop_tasks = [
            BackgroundAsyncCleanupTask(),
            AppStopTask(),
            WARCRecorderTeardownTask(),
            CookieJarTeardownTask(),
            LoggingShutdownTask(),
        ]
        stop_pipeline = Pipeline(stop_source, stop_tasks)

        ordered_pipelines = (
            start_pipeline,
            download_pipeline,
            download_stop_pipeline,
            conversion_pipeline,
            stop_pipeline,
        )
        series = self._factory.new('PipelineSeries', ordered_pipelines)
        series.concurrency_pipelines.add(download_pipeline)

        return series

    def build_and_run(self):
        '''Build the application and run it synchronously.

        Returns:
            int: The exit status.
        '''
        return self.build().run_sync()

    def get_stderr(self):
        '''Pick the error stream: stdout when unit testing, else stderr.'''
        return sys.stdout if self._unit_test else sys.stderr