Example #1
0
 def setUpClass(self):
     """Create the fixtures shared by the tests of this class.

     Builds a ProjectDB from the ``data_fetcher_processor_handler.py``
     script located next to this module, a Fetcher, three queues, a
     Processor wired to those queues, and launches two helper services:
     httpbin on port 14887 and an authenticating ``pyproxy`` on port 14830.

     NOTE(review): the receiver is named ``self``; presumably a
     ``@classmethod`` decorator sits just above this definition, in which
     case the attributes below are set on the class -- confirm.
     """
     script_path = os.path.join(os.path.dirname(__file__),
                                'data_fetcher_processor_handler.py')
     self.projectdb = ProjectDB([script_path])
     self.project_name = 'data_fetcher_processor_handler'

     # ``async`` is a reserved keyword since Python 3.7, so spelling the
     # argument as ``async=False`` is a SyntaxError there.  Unpacking a dict
     # passes exactly the same keyword argument and parses everywhere.
     self.fetcher = Fetcher(None, None, **{'async': False})

     self.status_queue = Queue()
     self.newtask_queue = Queue()
     self.result_queue = Queue()

     # httpbin.app.run is handed to utils.run_in_subprocess together with
     # the port the tests address through ``self.httpbin``.
     self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                   port=14887,
                                                   passthrough_errors=False)
     self.httpbin = 'http://127.0.0.1:14887'

     # Proxy requiring the credentials binux/123456, reachable through
     # ``self.proxy``.
     self.proxy_thread = subprocess.Popen(
         [
             'pyproxy', '--username=binux', '--password=123456',
             '--port=14830', '--debug'
         ],
         close_fds=True)
     self.proxy = '127.0.0.1:14830'

     self.processor = Processor(projectdb=self.projectdb,
                                inqueue=None,
                                status_queue=self.status_queue,
                                newtask_queue=self.newtask_queue,
                                result_queue=self.result_queue)

     # Presumably gives the freshly launched httpbin and proxy processes
     # time to start listening before the first test runs -- confirm.
     time.sleep(0.5)
Example #2
0
 def setUpClass(self):
     """Create the fixtures shared by the tests of this class.

     Builds a ProjectDB from the ``data_fetcher_processor_handler.py``
     script located next to this module, a Fetcher, three queues and a
     Processor wired to those queues, and launches httpbin on port 14887.

     NOTE(review): the receiver is named ``self``; presumably a
     ``@classmethod`` decorator sits just above this definition, in which
     case the attributes below are set on the class -- confirm.
     """
     script_path = os.path.join(os.path.dirname(__file__),
                                'data_fetcher_processor_handler.py')
     self.projectdb = ProjectDB([script_path])
     self.project_name = 'data_fetcher_processor_handler'

     # ``async`` is a reserved keyword since Python 3.7, so spelling the
     # argument as ``async=False`` is a SyntaxError there.  Unpacking a dict
     # passes exactly the same keyword argument and parses everywhere.
     self.fetcher = Fetcher(None, None, **{'async': False})

     self.status_queue = Queue()
     self.newtask_queue = Queue()
     self.result_queue = Queue()

     # httpbin.app.run is handed to utils.run_in_subprocess together with
     # the port the tests address through ``self.httpbin``.
     self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                   port=14887)
     self.httpbin = 'http://127.0.0.1:14887'

     self.processor = Processor(projectdb=self.projectdb,
                                inqueue=None,
                                status_queue=self.status_queue,
                                newtask_queue=self.newtask_queue,
                                result_queue=self.result_queue)

     # Presumably gives the freshly launched httpbin process time to start
     # listening before the first test runs -- confirm.
     time.sleep(0.5)
Example #3
0
def one(ctx, interactive, enable_phantomjs, enable_puppeteer, scripts):
    """
    One mode not only means all-in-one, it runs every thing in one process over
    tornado.ioloop, for debug purpose
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably surfaced as the command's help text -- confirm before
    # rewording it.
    #
    # Parameters:
    #   ctx               context object exposing ``obj`` (shared settings)
    #                     and ``invoke`` (runs another command callable).
    #   interactive       forwarded unchanged to ``scheduler_obj.init_one``.
    #   enable_phantomjs  when truthy, the phantomjs component is started.
    #   enable_puppeteer  when truthy, the puppeteer component is started.
    #   scripts           when truthy, used to build a local ProjectDB whose
    #                     projects are triggered once everything is wired up.

    ctx.obj['debug'] = False
    g = ctx.obj
    g['testing_mode'] = True

    if scripts:
        # Imported lazily: only needed when projects are loaded from files.
        from pyspider.database.local.projectdb import ProjectDB
        g['projectdb'] = ProjectDB(scripts)
        if g.get('is_taskdb_default'):
            g['taskdb'] = connect_database('sqlite+taskdb://')
        if g.get('is_resultdb_default'):
            g['resultdb'] = None

    if enable_phantomjs:
        phantomjs_config = g.config.get('phantomjs', {})
        phantomjs_obj = ctx.invoke(phantomjs, **phantomjs_config)
        if phantomjs_obj:
            g.setdefault('phantomjs_proxy', '127.0.0.1:%s' % phantomjs_obj.port)
    else:
        phantomjs_obj = None

    if enable_puppeteer:
        puppeteer_config = g.config.get('puppeteer', {})
        puppeteer_obj = ctx.invoke(puppeteer, **puppeteer_config)
        if puppeteer_obj:
            # Read the port from the object returned by the invocation, as
            # the phantomjs branch does; ``puppeteer`` itself is the command
            # callable handed to ctx.invoke, not the started instance.
            g.setdefault('puppeteer_proxy',
                         '127.0.0.1:%s' % puppeteer_obj.port)
    else:
        puppeteer_obj = None

    result_worker_config = g.config.get('result_worker', {})
    if g.resultdb is None:
        # No result database configured: default to the one-mode worker.
        result_worker_config.setdefault('result_cls',
                                        'pyspider.result.OneResultWorker')
    result_worker_obj = ctx.invoke(result_worker, **result_worker_config)

    processor_config = g.config.get('processor', {})
    processor_config.setdefault('enable_stdout_capture', False)
    processor_obj = ctx.invoke(processor, **processor_config)

    fetcher_config = g.config.get('fetcher', {})
    fetcher_config.setdefault('xmlrpc', False)
    fetcher_obj = ctx.invoke(fetcher, **fetcher_config)

    scheduler_config = g.config.get('scheduler', {})
    scheduler_config.setdefault('xmlrpc', False)
    scheduler_config.setdefault('scheduler_cls',
                                'pyspider.scheduler.OneScheduler')
    scheduler_obj = ctx.invoke(scheduler, **scheduler_config)

    # Wire every component into the scheduler, sharing the fetcher's ioloop.
    scheduler_obj.init_one(ioloop=fetcher_obj.ioloop,
                           fetcher=fetcher_obj,
                           processor=processor_obj,
                           result_worker=result_worker_obj,
                           interactive=interactive)
    if scripts:
        for project in g.projectdb.projects:
            scheduler_obj.trigger_on_start(project)

    try:
        scheduler_obj.run()
    finally:
        # Always shut down the scheduler and any helper that was started,
        # even when run() exits with an exception.
        scheduler_obj.quit()
        if phantomjs_obj:
            phantomjs_obj.quit()
        if puppeteer_obj:
            puppeteer_obj.quit()