Beispiel #1
0
def fetcher(ctx,
            xmlrpc,
            xmlrpc_host,
            xmlrpc_port,
            poolsize,
            proxy,
            user_agent,
            timeout,
            Fetcher=Fetcher):
    """Create a fetcher wired to the shared queues and run it.

    The fetcher reads from ``ctx.obj.scheduler2fetcher`` and writes to
    ``ctx.obj.fetcher2processor``. ``user_agent`` and ``timeout`` are only
    applied when they are truthy. The new instance is always appended to
    ``ctx.obj.instances``.

    When ``ctx.obj.get('testing_mode')`` is truthy the configured fetcher is
    returned without being started. Otherwise ``fetcher.xmlrpc_run`` is handed
    to ``run_in_thread`` (only if ``xmlrpc`` is truthy) and ``fetcher.run()``
    is called.
    """
    shared = ctx.obj

    worker = Fetcher(inqueue=shared.scheduler2fetcher,
                     outqueue=shared.fetcher2processor,
                     poolsize=poolsize,
                     proxy=proxy)
    worker.phantomjs_proxy = shared.phantomjs_proxy

    if user_agent:
        worker.user_agent = user_agent

    if timeout:
        # Work on a copy so the dict that was reachable as `default_options`
        # before this call is left unmodified.
        options = dict(worker.default_options)
        options['timeout'] = timeout
        worker.default_options = options

    shared.instances.append(worker)

    if shared.get('testing_mode'):
        return worker

    if xmlrpc:
        run_in_thread(worker.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    worker.run()
Beispiel #2
0
def run_fetcher(g=g):
    """Start a fetcher using the settings and queues held on ``g``.

    ``fetcher.xmlrpc_run`` is handed to ``run_in_thread`` with
    ``g.fetcher_xmlrpc_port`` / ``g.webui_host`` before ``fetcher.run()``
    is called.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    queues = dict(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    worker = Fetcher(**queues)
    worker.phantomjs_proxy = g.phantomjs_proxy

    run_in_thread(
        worker.xmlrpc_run,
        port=g.fetcher_xmlrpc_port,
        bind=g.webui_host)
    worker.run()
Beispiel #3
0
def webui(ctx,
          host,
          port,
          cdn,
          scheduler_rpc,
          fetcher_rpc,
          max_rate,
          max_burst,
          username,
          password,
          need_auth,
          app=app):
    """Configure the web UI application and run it.

    Databases and the ``cdn`` value are always copied into ``app.config``.
    ``max_rate``, ``max_burst``, ``username`` and ``password`` are only
    stored when truthy. ``need_auth`` is always stored.

    ``fetcher_rpc`` and ``scheduler_rpc`` may each be a string (passed to
    ``connect_rpc``), an already-built RPC object, or ``None``:

    * ``fetcher_rpc is None``: a local ``Fetcher`` is created and
      ``app.config['fetch']`` returns ``fetcher.fetch(x)[1]``.
    * otherwise ``app.config['fetch']`` returns
      ``umsgpack.unpackb(fetcher_rpc.fetch(x).data)``.
    * ``scheduler_rpc is None``: the address is taken from the
      ``SCHEDULER_PORT_23333_TCP`` environment variable when
      ``SCHEDULER_NAME`` is set, else ``http://localhost:23333/`` is used.

    The app is appended to ``ctx.obj.instances``. When
    ``ctx.obj.get('testing_mode')`` is truthy the app is returned without
    being started; otherwise ``app.run(host=host, port=port)`` is called.
    """
    g = ctx.obj
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password
    # This parameter used to be accepted and then silently dropped.
    # NOTE(review): assumes the web app consults config['need_auth'] - confirm.
    app.config['need_auth'] = need_auth

    # fetcher rpc
    if isinstance(fetcher_rpc, six.string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        # `async` is a reserved word from Python 3.7 on, so writing it as a
        # literal keyword argument is a SyntaxError there. Unpacking a dict
        # passes the very same keyword and parses on every Python version.
        fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(
            fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None:
        if os.environ.get('SCHEDULER_NAME'):
            # Drops the first len('tcp://') characters of the variable;
            # presumably the value looks like 'tcp://host:port' - confirm.
            address = 'http://%s/' % (
                os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):])
        else:
            address = 'http://localhost:23333/'
        scheduler_rpc = connect_rpc(ctx, None, address)
    app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    g.instances.append(app)
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
Beispiel #4
0
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port):
    """Create a fetcher on the shared queues, register it and run it.

    The instance is appended to ``ctx.obj.instances``. If ``xmlrpc`` is
    truthy, ``fetcher.xmlrpc_run`` is handed to ``run_in_thread`` with the
    given port and bind host before ``fetcher.run()`` is called.
    """
    shared = ctx.obj
    from pyspider.fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=shared.scheduler2fetcher,
        outqueue=shared.fetcher2processor)
    worker.phantomjs_proxy = shared.phantomjs_proxy
    shared.instances.append(worker)

    if xmlrpc:
        run_in_thread(
            worker.xmlrpc_run,
            port=xmlrpc_port,
            bind=xmlrpc_host)
    worker.run()
Beispiel #5
0
def run_fetcher(g=g):
    """Build a fetcher from the queues on ``g`` and run it.

    Before ``fetcher.run()`` is called, ``fetcher.xmlrpc_run`` is passed to
    ``run_in_thread`` together with ``g.fetcher_xmlrpc_port`` (as ``port``)
    and ``g.webui_host`` (as ``bind``).
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    engine = Fetcher(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    engine.phantomjs_proxy = g.phantomjs_proxy

    serve_xmlrpc = engine.xmlrpc_run
    xmlrpc_options = {'port': g.fetcher_xmlrpc_port, 'bind': g.webui_host}
    run_in_thread(serve_xmlrpc, **xmlrpc_options)
    engine.run()
Beispiel #6
0
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc,
          max_rate, max_burst, username, password):
    """Configure the pyspider web UI application and run it.

    Databases and the ``cdn`` value are always copied into ``app.config``.
    ``max_rate``, ``max_burst``, ``username`` and ``password`` are only
    stored when truthy.

    ``fetcher_rpc`` and ``scheduler_rpc`` may each be a string (passed to
    ``connect_rpc``), an already-built RPC object, or ``None``:

    * ``fetcher_rpc is None``: a local ``Fetcher`` is created and
      ``app.config['fetch']`` returns ``fetcher.fetch(x)[1]``.
    * otherwise ``app.config['fetch']`` returns
      ``umsgpack.unpackb(fetcher_rpc.fetch(x).data)``.
    * ``scheduler_rpc is None``: the address is taken from the
      ``SCHEDULER_PORT_23333_TCP`` environment variable when
      ``SCHEDULER_NAME`` is set, else ``http://localhost:23333/`` is used.

    When ``ctx.obj.get('testing_mode')`` is truthy the app is returned
    without being started; otherwise ``app.run(host=host, port=port)`` is
    called.
    """
    # `basestring` only exists on Python 2; fall back to `str` elsewhere so
    # the isinstance checks below do not raise NameError on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    g = ctx.obj
    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password

    # fetcher rpc
    if isinstance(fetcher_rpc, string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        from pyspider.fetcher.tornado_fetcher import Fetcher
        # `async` is a reserved word from Python 3.7 on, so writing it as a
        # literal keyword argument is a SyntaxError there. Unpacking a dict
        # passes the very same keyword and parses on every Python version.
        fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        app.config['fetch'] = lambda x: umsgpack.unpackb(fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Drops the first len('tcp://') characters of the variable;
        # presumably the value looks like 'tcp://host:port' - confirm.
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://%s/' % (
            os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
Beispiel #7
0
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent, timeout, Fetcher=Fetcher):
    """Instantiate ``Fetcher`` on the shared queues, configure it and run it.

    ``user_agent`` and ``timeout`` override the fetcher's settings only when
    truthy. The instance is appended to ``ctx.obj.instances``; if
    ``ctx.obj.get('testing_mode')`` is truthy it is returned unstarted.
    Otherwise ``fetcher.xmlrpc_run`` is passed to ``run_in_thread`` when
    ``xmlrpc`` is truthy, and ``fetcher.run()`` is called.
    """
    state = ctx.obj

    construct_args = {
        'inqueue': state.scheduler2fetcher,
        'outqueue': state.fetcher2processor,
        'poolsize': poolsize,
        'proxy': proxy,
    }
    engine = Fetcher(**construct_args)
    engine.phantomjs_proxy = state.phantomjs_proxy

    if user_agent:
        engine.user_agent = user_agent
    if timeout:
        # dict(...) builds a new dict, so the object previously reachable as
        # `default_options` is not mutated by the timeout override.
        engine.default_options = dict(engine.default_options, timeout=timeout)

    state.instances.append(engine)
    if state.get('testing_mode'):
        return engine

    if xmlrpc:
        run_in_thread(engine.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    engine.run()
Beispiel #8
0
def run_webui(g=g):
    """Configure the pyspider web UI from ``g`` and the environment, then run it.

    A local ``Fetcher`` backs ``app.config['fetch']``, which returns
    ``fetcher.fetch(x)[1]``. Rate limits are set only when ``g.demo_mode`` is
    truthy. Credentials are read from ``WEBUI_USERNAME`` / ``WEBUI_PASSWORD``
    only when ``WEBUI_USERNAME`` is present in the environment; the password
    defaults to an empty string. ``app.debug`` is set from ``g.debug`` unless
    ``g.all_in_one`` is truthy.
    """
    # The unused `import cPickle as pickle` was dropped: nothing here used it
    # and the module does not exist on Python 3.
    from pyspider.fetcher.tornado_fetcher import Fetcher
    # `async` is a reserved word from Python 3.7 on, so writing it as a
    # literal keyword argument is a SyntaxError there. Unpacking a dict
    # passes the very same keyword and parses on every Python version.
    fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    # 'cdn' is deliberately not configured here (a cdnjs override was disabled).
    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0
    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')
    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug
    app.run(host=g.webui_host, port=g.webui_port)
Beispiel #9
0
def run_webui(g=g):
    """Set up the pyspider web UI application from ``g`` and start it.

    ``app.config['fetch']`` is backed by a locally created ``Fetcher`` and
    returns ``fetcher.fetch(x)[1]``. ``max_rate`` / ``max_burst`` are set only
    when ``g.demo_mode`` is truthy. ``webui_username`` / ``webui_password``
    come from the ``WEBUI_USERNAME`` / ``WEBUI_PASSWORD`` environment
    variables and are set only when ``WEBUI_USERNAME`` is defined (password
    defaults to ``''``). ``app.debug`` follows ``g.debug`` unless
    ``g.all_in_one`` is truthy. Finally ``app.run`` is called with
    ``g.webui_host`` and ``g.webui_port``.
    """
    # The unused `import cPickle as pickle` was dropped: nothing here used it
    # and the module does not exist on Python 3.
    from pyspider.fetcher.tornado_fetcher import Fetcher
    # `async` became a reserved word in Python 3.7; spelling it as a literal
    # keyword argument no longer parses. Dict unpacking sends the identical
    # keyword to Fetcher while staying valid syntax on all versions.
    fetcher = Fetcher(inqueue=None, outqueue=None, **{'async': False})
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    # 'cdn' is deliberately not configured here (a cdnjs override was disabled).
    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0
    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')
    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug
    app.run(host=g.webui_host, port=g.webui_port)