def run_fetcher(g=g):
    """Create the fetcher component, publish it on ``g`` and run it.

    The fetcher reads from ``g.scheduler2fetcher`` and writes to
    ``g.fetcher2processor``.  Its ``xmlrpc_run`` method is handed to
    ``run_in_thread`` before ``run()`` is invoked in the calling thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    queues = dict(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    instance = Fetcher(**queues)
    # Expose the instance so other components can reach it through ``g``.
    g.fetcher = instance

    run_in_thread(instance.xmlrpc_run)
    instance.run()
def setUpClass(self):
    """Start the helper services used by the fetcher tests.

    Launches the httpbin app in a subprocess on port 14887, builds a
    ``Fetcher`` between two ``Queue(10)`` instances, starts its XML-RPC
    server on port 24444 and its main loop through ``utils.run_in_thread``,
    and tries to spawn a phantomjs process on port 25555.  When the
    phantomjs executable cannot be started (``OSError``),
    ``self.phantomjs`` is set to ``None``.
    """
    import tests.data_test_webpage
    import httpbin

    # httpbin test server
    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
    self.httpbin = 'http://127.0.0.1:14887'

    # fetcher under test and the queues around it
    self.inqueue, self.outqueue = Queue(10), Queue(10)
    fetcher = Fetcher(self.inqueue, self.outqueue)
    fetcher.phantomjs_proxy = '127.0.0.1:25555'
    self.fetcher = fetcher

    # rpc client plus the threads serving the fetcher
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(fetcher.run)

    # phantomjs is optional: a missing binary must not break the setup
    script = os.path.join(os.path.dirname(__file__),
                          '../pyspider/fetcher/phantomjs_fetcher.js')
    try:
        self.phantomjs = subprocess.Popen(['phantomjs', script, '25555'])
    except OSError:
        self.phantomjs = None

    # short pause after launching the helpers
    time.sleep(0.5)
def setUpClass(self):
    """Start the helper services used by the splash fetcher tests.

    Launches httpbin bound to ``0.0.0.0:14887`` and addresses it through
    the IP that the local hostname resolves to, builds a ``Fetcher`` whose
    ``splash_endpoint`` points at ``http://127.0.0.1:8050/execute``, starts
    the fetcher's XML-RPC server (port 24444) and main loop through
    ``utils.run_in_thread``, and spawns a ``pyproxy`` process on port 14830.
    """
    import tests.data_test_webpage
    import httpbin

    # httpbin test server, reachable from outside the loopback interface
    httpbin_options = dict(host='0.0.0.0', port=14887, passthrough_errors=False)
    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, **httpbin_options)
    local_ip = socket.gethostbyname(socket.gethostname())
    self.httpbin = 'http://' + local_ip + ':14887'

    # fetcher under test and the queues around it
    self.inqueue, self.outqueue = Queue(10), Queue(10)
    fetcher = Fetcher(self.inqueue, self.outqueue)
    fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
    self.fetcher = fetcher

    # rpc client plus the threads serving the fetcher
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(fetcher.run)

    # authenticated proxy used by the proxy tests
    proxy_command = [
        'pyproxy',
        '--username=binux',
        '--bind=0.0.0.0',
        '--password=123456',
        '--port=14830',
        '--debug',
    ]
    self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
    self.proxy = '127.0.0.1:14830'
def setUpClass(self):
    """Start the helper services used by the fetcher tests.

    Launches httpbin on port 14887, builds a ``Fetcher`` between two
    ``Queue(10)`` instances with ``phantomjs_proxy`` set to
    ``127.0.0.1:25555``, starts the fetcher's XML-RPC server (port 24444)
    and main loop through ``utils.run_in_thread``, spawns a ``pyproxy``
    process on port 14830 and tries to spawn phantomjs on port 25555.
    When phantomjs cannot be started (``OSError``), ``self.phantomjs`` is
    set to ``None``.
    """
    import tests.data_test_webpage
    import httpbin

    # httpbin test server
    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run, port=14887, passthrough_errors=False)
    self.httpbin = 'http://127.0.0.1:14887'

    # fetcher under test and the queues around it
    self.inqueue, self.outqueue = Queue(10), Queue(10)
    fetcher = Fetcher(self.inqueue, self.outqueue)
    fetcher.phantomjs_proxy = '127.0.0.1:25555'
    self.fetcher = fetcher

    # rpc client plus the threads serving the fetcher
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(fetcher.run)

    # authenticated proxy used by the proxy tests
    proxy_command = [
        'pyproxy',
        '--username=binux',
        '--password=123456',
        '--port=14830',
        '--debug',
    ]
    self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
    self.proxy = '127.0.0.1:14830'

    # phantomjs is optional: a missing binary must not break the setup
    script = os.path.join(os.path.dirname(__file__),
                          '../pyspider/fetcher/phantomjs_fetcher.js')
    try:
        self.phantomjs = subprocess.Popen(['phantomjs', script, '25555'])
    except OSError:
        self.phantomjs = None

    # short pause after launching the helpers
    time.sleep(0.5)
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy,
            user_agent, timeout, Fetcher=Fetcher):
    """Build a fetcher from the command-line options and run it.

    The instance reads from ``g.scheduler2fetcher``, writes to
    ``g.fetcher2processor`` and is appended to ``g.instances``.  A truthy
    ``user_agent`` or ``timeout`` overrides the corresponding fetcher
    setting.  In testing mode the instance is returned without being run;
    otherwise the XML-RPC server is started when ``xmlrpc`` is truthy and
    ``run()`` is called.
    """
    g = ctx.obj
    instance = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
        poolsize=poolsize,
        proxy=proxy,
    )
    instance.phantomjs_proxy = g.phantomjs_proxy

    if user_agent:
        instance.user_agent = user_agent
    if timeout:
        # Work on a copy so the options object the instance started with
        # is not mutated in place.
        instance.default_options = dict(instance.default_options)
        instance.default_options['timeout'] = timeout

    g.instances.append(instance)
    if g.get('testing_mode'):
        return instance

    if xmlrpc:
        run_in_thread(instance.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    instance.run()
def setUpClass(self):
    """Create a queue-less, non-async fetcher and start the httpbin server.

    httpbin is launched in a subprocess on port 14887 and its base URL is
    stored on ``self.httpbin``.
    """
    self.fetcher = Fetcher(None, None, async_mode=False)

    httpbin_options = dict(port=14887, passthrough_errors=False)
    self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, **httpbin_options)
    self.httpbin = 'http://127.0.0.1:14887'

    # short pause after launching the subprocess
    time.sleep(0.5)
def setUpClass(self):
    """Create a fetcher between two ``Queue(10)`` instances and start it.

    The XML-RPC server (port 24444) and the fetcher main loop are both
    started through ``utils.run_in_thread``; ``self.rpc`` is a client
    proxy pointing at that port.
    """
    self.inqueue, self.outqueue = Queue(10), Queue(10)
    fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher = fetcher

    self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(fetcher.run)
def webui(ctx, host, port, cdn, scheduler_rpc, fetcher_rpc, max_rate, max_burst,
          username, password, need_auth, app=app):
    """Configure the web UI application and serve it.

    Databases are taken from ``ctx.obj`` and stored in ``app.config``.
    ``max_rate``, ``max_burst``, ``username`` and ``password`` are only
    written to the config when truthy.

    ``fetcher_rpc`` and ``scheduler_rpc`` may each be an RPC object, a URL
    string (connected through ``connect_rpc``) or ``None``:

    * without a fetcher RPC, a local non-async ``Fetcher`` serves
      ``app.config['fetch']``;
    * without a scheduler RPC, the address is taken from the
      ``SCHEDULER_PORT_23333_TCP`` environment variable when
      ``SCHEDULER_NAME`` is set, else ``http://localhost:23333/`` is used.

    In testing mode the configured ``app`` is returned instead of being run.

    NOTE(review): ``need_auth`` is accepted but not used anywhere in this
    function - confirm whether it should be stored in ``app.config``.
    """
    g = ctx.obj
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['cdn'] = cdn

    if max_rate:
        app.config['max_rate'] = max_rate
    if max_burst:
        app.config['max_burst'] = max_burst
    if username:
        app.config['webui_username'] = username
    if password:
        app.config['webui_password'] = password

    # fetcher rpc
    if isinstance(fetcher_rpc, six.string_types):
        fetcher_rpc = connect_rpc(ctx, None, fetcher_rpc)
    if fetcher_rpc is None:
        # ``async`` is a reserved keyword since Python 3.7, so passing
        # ``async=False`` is a SyntaxError there; ``async_mode`` is the
        # keyword already used for Fetcher elsewhere in this file.
        fetcher = Fetcher(inqueue=None, outqueue=None, async_mode=False)
        fetcher.phantomjs_proxy = g.phantomjs_proxy
        app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    else:
        import umsgpack
        # The remote fetcher returns a msgpack payload in ``.data``.
        app.config['fetch'] = lambda x: umsgpack.unpackb(
            fetcher_rpc.fetch(x).data)

    # scheduler rpc
    if isinstance(scheduler_rpc, six.string_types):
        scheduler_rpc = connect_rpc(ctx, None, scheduler_rpc)
    if scheduler_rpc is None and os.environ.get('SCHEDULER_NAME'):
        # Strip the leading ``tcp://`` to obtain ``host:port``.
        app.config['scheduler_rpc'] = connect_rpc(
            ctx, None,
            'http://%s/' % (os.environ['SCHEDULER_PORT_23333_TCP'][len('tcp://'):]))
    elif scheduler_rpc is None:
        app.config['scheduler_rpc'] = connect_rpc(ctx, None, 'http://localhost:23333/')
    else:
        app.config['scheduler_rpc'] = scheduler_rpc

    app.debug = g.debug
    g.instances.append(app)
    if g.get('testing_mode'):
        return app

    app.run(host=host, port=port)
def run_fetcher(g=g):
    """Create the fetcher component from the settings on ``g`` and run it.

    The fetcher reads from ``g.scheduler2fetcher`` and writes to
    ``g.fetcher2processor``.  Its XML-RPC server is started through
    ``run_in_thread`` on ``g.fetcher_xmlrpc_port`` bound to
    ``g.webui_host`` before ``run()`` is invoked in the calling thread.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher

    queues = dict(inqueue=g.scheduler2fetcher, outqueue=g.fetcher2processor)
    instance = Fetcher(**queues)
    instance.phantomjs_proxy = g.phantomjs_proxy

    run_in_thread(instance.xmlrpc_run, port=g.fetcher_xmlrpc_port, bind=g.webui_host)
    instance.run()
def setUpClass(self):
    """Create a fetcher with a phantomjs proxy and start its helpers.

    Builds a ``Fetcher`` between two ``Queue(10)`` instances with
    ``phantomjs_proxy`` set to ``localhost:25555``, starts its XML-RPC
    server (port 24444) and main loop through ``utils.run_in_thread``, and
    spawns a phantomjs process running ``phantomjs_fetcher.js`` on port
    25555.
    """
    self.inqueue, self.outqueue = Queue(10), Queue(10)
    fetcher = Fetcher(self.inqueue, self.outqueue)
    fetcher.phantomjs_proxy = 'localhost:25555'
    self.fetcher = fetcher

    self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
    self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
    self.thread = utils.run_in_thread(fetcher.run)

    # The script path is resolved relative to this file.
    script = os.path.join(os.path.dirname(__file__),
                          '../pyspider/fetcher/phantomjs_fetcher.js')
    self.phantomjs = subprocess.Popen(['phantomjs', script, '25555'])
def run_webui(g=g):
    """Configure the web UI application from ``g`` and serve it.

    A local non-async ``Fetcher`` backs ``app.config['fetch']``.  The
    databases and scheduler RPC are taken from ``g``.  In demo mode the
    rate limits are set to 0.2 / 3.0, and the ``WEBUI_USERNAME`` /
    ``WEBUI_PASSWORD`` environment variables supply optional credentials.
    ``app.debug`` is only set when ``g.all_in_one`` is absent or falsy.
    The app is then run on ``g.webui_host``:``g.webui_port``.
    """
    from pyspider.fetcher.tornado_fetcher import Fetcher
    # ``async`` is a reserved keyword since Python 3.7, so passing
    # ``async=False`` is a SyntaxError there; ``async_mode`` is the keyword
    # already used for Fetcher elsewhere in this file.
    fetcher = Fetcher(inqueue=None, outqueue=None, async_mode=False)
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from pyspider.webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    #app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'

    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0

    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')

    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug

    app.run(host=g.webui_host, port=g.webui_port)