Beispiel #1
0
def run_fetcher(g=g):
    """Create a Fetcher on the shared queues held by ``g`` and run it.

    The fetcher reads from ``g.scheduler2fetcher`` and writes to
    ``g.fetcher2processor``. It is stored on ``g.fetcher``, its
    ``xmlrpc_run`` is handed to ``run_in_thread``, and ``run()`` is then
    called in the calling thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    g.fetcher = worker

    # Start the XML-RPC endpoint first, then run the fetcher itself.
    run_in_thread(worker.xmlrpc_run)
    worker.run()
Beispiel #2
0
    def setUpClass(self):
        """Bring up the helper services used by the fetcher tests.

        In order: httpbin is started in a subprocess on port 14887, a
        Fetcher is created on two size-10 queues, its ``xmlrpc_run``
        (port 24444) and ``run`` are started through
        ``utils.run_in_thread``, and a ``phantomjs`` process is spawned with
        ``phantomjs_fetcher.js`` and the argument ``'25555'``. If spawning
        phantomjs raises ``OSError``, ``self.phantomjs`` is set to ``None``.

        NOTE(review): named like a unittest class fixture -- presumably
        decorated with ``@classmethod`` outside this view; confirm.
        """
        # NOTE(review): the imported name is not used below -- presumably
        # imported for its side effects; confirm before removing.
        import tests.data_test_webpage
        import httpbin

        xmlrpc_port = 24444

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue, self.outqueue = Queue(10), Queue(10)
        fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.fetcher = fetcher

        self.rpc = xmlrpc_client.ServerProxy(
            'http://localhost:%d' % xmlrpc_port)
        self.xmlrpc_thread = utils.run_in_thread(
            fetcher.xmlrpc_run, port=xmlrpc_port)
        self.thread = utils.run_in_thread(fetcher.run)

        # Path of the phantomjs script, relative to this test module.
        script = os.path.join(os.path.dirname(__file__),
                              '../pyspider/fetcher/phantomjs_fetcher.js')
        try:
            self.phantomjs = subprocess.Popen(['phantomjs', script, '25555'])
        except OSError:
            # The phantomjs executable could not be launched.
            self.phantomjs = None

        # Fixed pause after starting the helper processes and threads.
        time.sleep(0.5)
Beispiel #3
0
    def setUpClass(self):
        """Bring up httpbin, a splash-configured Fetcher and a proxy process.

        httpbin is started in a subprocess bound to ``0.0.0.0:14887`` and
        ``self.httpbin`` is built from the address that
        ``socket.gethostbyname(socket.gethostname())`` returns. A Fetcher on
        two size-10 queues gets its ``splash_endpoint`` set, and its
        ``xmlrpc_run`` (port 24444) and ``run`` are started through
        ``utils.run_in_thread``. Finally a ``pyproxy`` process is spawned
        with ``--port=14830``.

        NOTE(review): named like a unittest class fixture -- presumably
        decorated with ``@classmethod`` outside this view; confirm.
        """
        # NOTE(review): the imported name is not used below -- presumably
        # imported for its side effects; confirm before removing.
        import tests.data_test_webpage
        import httpbin

        xmlrpc_port = 24444

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run,
            host='0.0.0.0',
            port=14887,
            passthrough_errors=False,
        )
        local_address = socket.gethostbyname(socket.gethostname())
        self.httpbin = 'http://' + local_address + ':14887'

        self.inqueue, self.outqueue = Queue(10), Queue(10)
        fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.fetcher = fetcher

        self.rpc = xmlrpc_client.ServerProxy(
            'http://localhost:%d' % xmlrpc_port)
        self.xmlrpc_thread = utils.run_in_thread(
            fetcher.xmlrpc_run, port=xmlrpc_port)
        self.thread = utils.run_in_thread(fetcher.run)

        proxy_command = [
            'pyproxy', '--username=binux', '--bind=0.0.0.0',
            '--password=123456', '--port=14830', '--debug'
        ]
        self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
        self.proxy = '127.0.0.1:14830'
Beispiel #4
0
    def setUpClass(self):
        """Bring up httpbin, a Fetcher, a proxy process and phantomjs.

        In order: httpbin is started in a subprocess on port 14887, a
        Fetcher is created on two size-10 queues, its ``xmlrpc_run``
        (port 24444) and ``run`` are started through
        ``utils.run_in_thread``, a ``pyproxy`` process is spawned with
        ``--port=14830``, and a ``phantomjs`` process is spawned with
        ``phantomjs_fetcher.js`` and the argument ``'25555'``. If spawning
        phantomjs raises ``OSError``, ``self.phantomjs`` is set to ``None``.

        NOTE(review): named like a unittest class fixture -- presumably
        decorated with ``@classmethod`` outside this view; confirm.
        """
        # NOTE(review): the imported name is not used below -- presumably
        # imported for its side effects; confirm before removing.
        import tests.data_test_webpage
        import httpbin

        xmlrpc_port = 24444

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue, self.outqueue = Queue(10), Queue(10)
        fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.fetcher = fetcher

        self.rpc = xmlrpc_client.ServerProxy(
            'http://localhost:%d' % xmlrpc_port)
        self.xmlrpc_thread = utils.run_in_thread(
            fetcher.xmlrpc_run, port=xmlrpc_port)
        self.thread = utils.run_in_thread(fetcher.run)

        proxy_command = [
            'pyproxy', '--username=binux', '--password=123456', '--port=14830',
            '--debug'
        ]
        self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
        self.proxy = '127.0.0.1:14830'

        # Path of the phantomjs script, relative to this test module.
        script = os.path.join(os.path.dirname(__file__),
                              '../pyspider/fetcher/phantomjs_fetcher.js')
        try:
            self.phantomjs = subprocess.Popen(['phantomjs', script, '25555'])
        except OSError:
            # The phantomjs executable could not be launched.
            self.phantomjs = None

        # Fixed pause after starting the helper processes and threads.
        time.sleep(0.5)
Beispiel #5
0
def fetcher(ctx,
            xmlrpc,
            xmlrpc_host,
            xmlrpc_port,
            poolsize,
            proxy,
            user_agent,
            timeout,
            Fetcher=Fetcher):
    """Build a Fetcher from the shared context object and run it.

    Args:
        ctx: context whose ``obj`` attribute carries the shared state
            (queues, ``phantomjs_proxy``, ``instances``).
        xmlrpc: when truthy, ``xmlrpc_run`` is started via ``run_in_thread``.
        xmlrpc_host: passed to ``xmlrpc_run`` as ``bind``.
        xmlrpc_port: passed to ``xmlrpc_run`` as ``port``.
        poolsize: forwarded to the ``Fetcher`` constructor.
        proxy: forwarded to the ``Fetcher`` constructor.
        user_agent: when truthy, assigned to the fetcher's ``user_agent``.
        timeout: when truthy, stored under ``'timeout'`` in a copy of the
            fetcher's ``default_options``.
        Fetcher: class used to construct the fetcher.

    Returns:
        The fetcher when ``ctx.obj.get('testing_mode')`` is truthy;
        otherwise nothing is returned and ``run()`` is called.
    """
    shared = ctx.obj
    instance = Fetcher(
        inqueue=shared.scheduler2fetcher,
        outqueue=shared.fetcher2processor,
        poolsize=poolsize,
        proxy=proxy,
    )
    instance.phantomjs_proxy = shared.phantomjs_proxy

    if user_agent:
        instance.user_agent = user_agent
    if timeout:
        # Work on a copy so the options mapping the fetcher started with is
        # not mutated in place.
        options = dict(instance.default_options)
        options['timeout'] = timeout
        instance.default_options = options

    shared.instances.append(instance)
    if shared.get('testing_mode'):
        return instance

    if xmlrpc:
        run_in_thread(instance.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    instance.run()
Beispiel #6
0
 def setUpClass(self):
     """Create a queue-less Fetcher and start httpbin for the tests.

     The Fetcher is built with ``None`` for both queues and
     ``async_mode=False``; httpbin is started in a subprocess on port
     14887, followed by a fixed 0.5 second pause.

     NOTE(review): named like a unittest class fixture -- presumably
     decorated with ``@classmethod`` outside this view; confirm.
     """
     self.fetcher = Fetcher(None, None, async_mode=False)

     httpbin_process = utils.run_in_subprocess(
         httpbin.app.run, port=14887, passthrough_errors=False)
     self.httpbin_thread = httpbin_process
     self.httpbin = 'http://127.0.0.1:14887'

     time.sleep(0.5)
Beispiel #7
0
 def setUpClass(self):
     """Create a Fetcher on two size-10 queues and start its threads.

     ``xmlrpc_run`` (port 24444) and ``run`` are both started through
     ``utils.run_in_thread``; ``self.rpc`` is an XML-RPC proxy pointed at
     the same port on localhost.

     NOTE(review): named like a unittest class fixture -- presumably
     decorated with ``@classmethod`` outside this view; confirm.
     """
     xmlrpc_port = 24444

     self.inqueue, self.outqueue = Queue(10), Queue(10)
     fetcher = Fetcher(self.inqueue, self.outqueue)
     self.fetcher = fetcher

     self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % xmlrpc_port)
     self.xmlrpc_thread = utils.run_in_thread(
         fetcher.xmlrpc_run, port=xmlrpc_port)
     self.thread = utils.run_in_thread(fetcher.run)
Beispiel #8
0
def webui(ctx,
          host,
          port,
          cdn,
          scheduler_rpc,
          fetcher_rpc,
          max_rate,
          max_burst,
          username,
          password,
          need_auth,
          app=app):
    """Configure the web UI application from the shared context and serve it.

    Args:
        ctx: context whose ``obj`` attribute carries the shared state
            (databases, ``phantomjs_proxy``, ``debug``, ``instances``).
        host: passed to ``app.run`` as ``host``.
        port: passed to ``app.run`` as ``port``.
        cdn: stored under ``app.config['cdn']``.
        scheduler_rpc: an RPC object, a URL string (connected through
            ``connect_rpc``), or ``None`` to fall back to the environment or
            ``http://localhost:23333/``.
        fetcher_rpc: an RPC object, a URL string (connected through
            ``connect_rpc``), or ``None`` to fetch with a local ``Fetcher``.
        max_rate: when truthy, stored under ``app.config['max_rate']``.
        max_burst: when truthy, stored under ``app.config['max_burst']``.
        username: when truthy, stored under ``app.config['webui_username']``.
        password: when truthy, stored under ``app.config['webui_password']``.
        need_auth: stored under ``app.config['need_auth']``.
        app: the application object to configure and run.

    Returns:
        ``app`` when ``ctx.obj.get('testing_mode')`` is truthy; otherwise
        nothing is returned and ``app.run(host=host, port=port)`` is called.
    """
    g = ctx.obj
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    # Optional settings only override the existing config when a truthy
    # value was supplied.
    optional_settings = (
        ('max_rate', max_rate),
        ('max_burst', max_burst),
        ('webui_username', username),
        ('webui_password', password),
    )
    for key, value in optional_settings:
        if value:
            app.config[key] = value

    # NOTE(review): `need_auth` used to be accepted and silently dropped.
    # It is now recorded in the config; confirm the app reads this key.
    app.config['need_auth'] = need_auth

    # fetcher rpc
    if isinstance(fetcher_rpc, six.string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        # `async` is a reserved keyword from Python 3.7 on, so writing
        # `async=False` is a SyntaxError there. NOTE(review): this requires a
        # Fetcher whose constructor accepts `async_mode`; confirm against the
        # installed version.
        fetcher = Fetcher(inqueue=None, outqueue=None, async_mode=False)
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(
            fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Strip the leading 'tcp://' from the environment value to get the
        # host:port part of the scheduler address.
        app.config['scheduler_rpc'] = connect_rpc(
            ctx, None, 'http://%s/' %
            (os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None,
                                                  'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    g.instances.append(app)
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
Beispiel #9
0
def run_fetcher(g=g):
    """Create a Fetcher on the shared queues held by ``g`` and run it.

    The fetcher reads from ``g.scheduler2fetcher``, writes to
    ``g.fetcher2processor`` and takes its ``phantomjs_proxy`` from ``g``.
    Its ``xmlrpc_run`` is started via ``run_in_thread`` with
    ``port=g.fetcher_xmlrpc_port`` and ``bind=g.webui_host``; ``run()`` is
    then called in the calling thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    worker.phantomjs_proxy = g.phantomjs_proxy

    rpc_options = dict(port=g.fetcher_xmlrpc_port, bind=g.webui_host)
    run_in_thread(worker.xmlrpc_run, **rpc_options)
    worker.run()
Beispiel #10
0
 def setUpClass(self):
     """Create a Fetcher, start its threads and spawn phantomjs.

     A Fetcher on two size-10 queues gets ``phantomjs_proxy`` set to
     ``'localhost:25555'``; ``xmlrpc_run`` (port 24444) and ``run`` are
     started through ``utils.run_in_thread``. A ``phantomjs`` process is
     then spawned with ``phantomjs_fetcher.js`` and the argument
     ``'25555'``; a failure to launch it is not caught here.

     NOTE(review): named like a unittest class fixture -- presumably
     decorated with ``@classmethod`` outside this view; confirm.
     """
     xmlrpc_port = 24444

     self.inqueue, self.outqueue = Queue(10), Queue(10)
     fetcher = Fetcher(self.inqueue, self.outqueue)
     fetcher.phantomjs_proxy = 'localhost:25555'
     self.fetcher = fetcher

     self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % xmlrpc_port)
     self.xmlrpc_thread = utils.run_in_thread(
         fetcher.xmlrpc_run, port=xmlrpc_port)
     self.thread = utils.run_in_thread(fetcher.run)

     # Path of the phantomjs script, relative to this test module.
     script = os.path.join(os.path.dirname(__file__),
                           '../pyspider/fetcher/phantomjs_fetcher.js')
     self.phantomjs = subprocess.Popen(['phantomjs', script, '25555'])
Beispiel #11
0
def run_webui(g=g):
    """Configure the web UI application from ``g`` and serve it.

    A local, queue-less ``Fetcher`` backs ``app.config['fetch']``; the
    databases and scheduler RPC object are copied from ``g`` into the app
    config. Rate limits are set when ``g.demo_mode`` is truthy, and the
    login credentials are taken from the ``WEBUI_USERNAME`` /
    ``WEBUI_PASSWORD`` environment variables when the former is present.
    ``app.debug`` is set from ``g.debug`` unless ``g.all_in_one`` is truthy.
    Finally ``app.run`` is called with ``g.webui_host`` / ``g.webui_port``.
    """
    # The unused function-local `import cPickle as pickle` was removed: the
    # module exists only on Python 2 and nothing in this function used it.
    from pyspider.fetcher.tornado_fetcher import Fetcher
    # `async` is a reserved keyword from Python 3.7 on, so writing
    # `async=False` is a SyntaxError there. NOTE(review): this requires a
    # Fetcher whose constructor accepts `async_mode`; confirm against the
    # installed version.
    fetcher = Fetcher(inqueue=None, outqueue=None, async_mode=False)
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    # Only the second element of the fetch result is handed to the web UI.
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    #app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'
    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0
    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        # A missing password variable falls back to the empty string.
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')
    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug
    app.run(host=g.webui_host, port=g.webui_port)