def setUpClass(self):
    """Bring up the helper services used by the fetcher tests.

    In order, this starts:

    * the httpbin app on port 14887 through ``utils.run_in_subprocess``;
    * a ``Fetcher`` wired to two queues of size 10, with its XML-RPC
      server (port 24444) and its main loop started through
      ``utils.run_in_thread``;
    * a ``pyproxy`` process on port 14830;
    * a ``phantomjs`` process running the fetcher script on port 25555.
      ``self.phantomjs`` is set to ``None`` when launching it raises
      ``OSError``.

    A short sleep at the end gives the background services time to start.
    """
    # Not referenced below -- presumably imported for its side effects.
    import tests.data_test_webpage
    import httpbin

    xmlrpc_port = 24444

    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run,
        port=14887,
        passthrough_errors=False,
    )
    self.httpbin = 'http://127.0.0.1:14887'

    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % xmlrpc_port)
    self.xmlrpc_thread = utils.run_in_thread(
        self.fetcher.xmlrpc_run, port=xmlrpc_port)
    self.thread = utils.run_in_thread(self.fetcher.run)

    proxy_command = [
        'pyproxy',
        '--username=binux',
        '--password=123456',
        '--port=14830',
        '--debug',
    ]
    self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
    self.proxy = '127.0.0.1:14830'

    phantomjs_script = os.path.join(
        os.path.dirname(__file__),
        '../pyspider/fetcher/phantomjs_fetcher.js',
    )
    try:
        self.phantomjs = subprocess.Popen(
            ['phantomjs', phantomjs_script, '25555'])
    except OSError:
        # The launch failed; record that no phantomjs process is running.
        self.phantomjs = None
    time.sleep(0.5)
def setUpClass(self):
    """Bring up the helper services used by the splash fetcher tests.

    In order, this starts:

    * the httpbin app bound to ``0.0.0.0:14887`` through
      ``utils.run_in_subprocess``; ``self.httpbin`` is built from the
      address that ``socket.gethostbyname(socket.gethostname())`` returns;
    * a ``Fetcher`` wired to two queues of size 10, pointed at a splash
      endpoint on ``127.0.0.1:8050``, with its XML-RPC server (port 24444)
      and its main loop started through ``utils.run_in_thread``;
    * a ``pyproxy`` process bound to ``0.0.0.0`` on port 14830.
    """
    # Not referenced below -- presumably imported for its side effects.
    import tests.data_test_webpage
    import httpbin

    xmlrpc_port = 24444

    self.httpbin_thread = utils.run_in_subprocess(
        httpbin.app.run,
        host='0.0.0.0',
        port=14887,
        passthrough_errors=False,
    )
    local_address = socket.gethostbyname(socket.gethostname())
    self.httpbin = 'http://' + local_address + ':14887'

    self.inqueue = Queue(10)
    self.outqueue = Queue(10)
    self.fetcher = Fetcher(self.inqueue, self.outqueue)
    self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
    self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % xmlrpc_port)
    self.xmlrpc_thread = utils.run_in_thread(
        self.fetcher.xmlrpc_run, port=xmlrpc_port)
    self.thread = utils.run_in_thread(self.fetcher.run)

    proxy_command = [
        'pyproxy',
        '--username=binux',
        '--bind=0.0.0.0',
        '--password=123456',
        '--port=14830',
        '--debug',
    ]
    self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
    self.proxy = '127.0.0.1:14830'
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue
    url:
        connection url; the scheme selects the backend (see below).
        A false value (None or '') selects the builtin queue.
    maxsize:
        forwarded to the selected Queue implementation
    lazy_limit:
        forwarded to every Queue implementation except the builtin one

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    redis:
        redis://host:6379/db
        redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None

    Returns the Queue instance built by the selected backend module.
    Raises Exception when the url matches none of the supported schemes.
    """
    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        if ',' in parsed.netloc:
            # redis in cluster mode (there is no concept of 'db' in cluster mode)
            # ex. redis://host1:port1,host2:port2,...,hostn:portn
            cluster_nodes = []
            for netloc in parsed.netloc.split(','):
                fields = netloc.split(':')
                cluster_nodes.append({'host': fields[0], 'port': int(fields[1])})

            return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit,
                         cluster_nodes=cluster_nodes)
        else:
            # The first path segment is the database index: /3 -> 3.
            db_segment = parsed.path.lstrip('/').split('/')[0]
            try:
                db = int(db_segment)
            except ValueError:
                # Missing or non-numeric index: fall back to database 0.
                logging.warning('redis DB must zero-based numeric index, using 0 instead')
                db = 0

            password = parsed.password or None

            return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db,
                         maxsize=maxsize, password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        # Strip the 'kombu+' prefix; the remainder is handed to the kombu queue.
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        # Interpolate explicitly: Exception() does not apply %-formatting
        # to its arguments the way the logging functions do.
        raise Exception('unknown connection url: %s' % url)
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue
    url:
        connection url; the scheme selects the backend (see below).
        A false value (None or '') selects the builtin queue.
    maxsize:
        forwarded to the selected Queue implementation
    lazy_limit:
        forwarded to the rabbitmq, redis and kombu Queue implementations

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    beanstalk:
        beanstalk://host:11300/
    redis:
        redis://host:6379/db
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None

    Returns the Queue instance built by the selected backend module.
    Raises Exception when the url matches none of the supported schemes.
    """
    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'beanstalk':
        from .beanstalk import Queue
        # The whole netloc ("host:port") is passed through as the host.
        return Queue(name, host=parsed.netloc, maxsize=maxsize)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        # The first path segment is the database index: /3 -> 3.
        db_segment = parsed.path.lstrip('/').split('/')[0]
        try:
            db = int(db_segment)
        except ValueError:
            # Missing or non-numeric index: fall back to database 0.
            logging.warning(
                'redis DB must zero-based numeric index, using 0 instead')
            db = 0

        password = parsed.password or None

        return Queue(name, parsed.hostname, parsed.port, db=db, maxsize=maxsize,
                     password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        # Strip the 'kombu+' prefix; the remainder is handed to the kombu queue.
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        # Interpolate explicitly: Exception() does not apply %-formatting
        # to its arguments the way the logging functions do.
        raise Exception('unknown connection url: %s' % url)
def setUpClass(self):
    """Recreate ./data/tests and start a Scheduler in a background thread.

    Opens the task, project and result databases from the paths stored on
    ``self``, creates the three queues (size 10) and an XML-RPC client for
    ``self.scheduler_xmlrpc_port``, then runs a scheduler through
    ``run_in_thread``.  The scheduler gets its own database handles and
    shortened intervals.  Sleeps one second at the end so the background
    thread has time to start.
    """
    shutil.rmtree('./data/tests', ignore_errors=True)
    os.makedirs('./data/tests')

    def open_taskdb():
        return taskdb.TaskDB(self.taskdb_path)

    def open_projectdb():
        return projectdb.ProjectDB(self.projectdb_path)

    def open_resultdb():
        return resultdb.ResultDB(self.resultdb_path)

    self.taskdb = open_taskdb()
    self.projectdb = open_projectdb()
    self.resultdb = open_resultdb()

    self.newtask_queue = Queue(10)
    self.status_queue = Queue(10)
    self.scheduler2fetcher = Queue(10)
    self.rpc = xmlrpc_client.ServerProxy(
        'http://localhost:%d' % self.scheduler_xmlrpc_port)

    def scheduler_main():
        # Fresh database handles are opened for the scheduler rather than
        # reusing the ones stored on self.
        options = {
            'taskdb': open_taskdb(),
            'projectdb': open_projectdb(),
            'newtask_queue': self.newtask_queue,
            'status_queue': self.status_queue,
            'out_queue': self.scheduler2fetcher,
            'data_path': "./data/tests/",
            'resultdb': open_resultdb(),
        }
        scheduler = Scheduler(**options)
        scheduler.UPDATE_PROJECT_INTERVAL = 0.1
        scheduler.LOOP_INTERVAL = 0.1
        scheduler.INQUEUE_LIMIT = 10
        scheduler.DELETE_TIME = 0
        scheduler.DEFAULT_RETRY_DELAY = {'': 5}
        # not dispatch cronjob
        scheduler._last_tick = int(time.time())
        self.xmlrpc_thread = run_in_thread(
            scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
        scheduler.run()

    self.process = run_in_thread(scheduler_main)
    time.sleep(1)
def setUpClass(self):
    """Recreate ./data/tests/ and start a Processor in a background thread.

    Opens the project database from ``self.projectdb_path``, creates the
    four queues (size 10) and runs a ``Processor`` through
    ``run_in_thread``.  The processor gets its own project database handle
    and a 0.1 ``CHECK_PROJECTS_INTERVAL`` on its project manager.  Sleeps
    one second at the end so the background thread has time to start.
    """
    shutil.rmtree('./data/tests/', ignore_errors=True)
    os.makedirs('./data/tests/')

    def open_projectdb():
        return projectdb.ProjectDB(self.projectdb_path)

    self.projectdb = open_projectdb()
    self.in_queue = Queue(10)
    self.status_queue = Queue(10)
    self.newtask_queue = Queue(10)
    self.result_queue = Queue(10)

    def processor_main():
        # A fresh database handle is opened for the processor rather than
        # reusing the one stored on self.
        self.processor = Processor(
            open_projectdb(),
            self.in_queue,
            self.status_queue,
            self.newtask_queue,
            self.result_queue,
        )
        self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
        self.processor.run()

    self.process = run_in_thread(processor_main)
    time.sleep(1)
def setUpClass(self):
    """Recreate ./data/tests/ and start a ResultWorker in a background thread.

    Opens the result database from ``self.resultdb_path``, creates the
    input queue (size 10) and runs a ``ResultWorker`` through
    ``run_in_thread``.  The worker gets its own result database handle.
    Sleeps one second at the end so the background thread has time to
    start.
    """
    shutil.rmtree('./data/tests/', ignore_errors=True)
    os.makedirs('./data/tests/')

    def open_resultdb():
        return resultdb.ResultDB(self.resultdb_path)

    self.resultdb = open_resultdb()
    self.inqueue = Queue(10)

    def result_worker_main():
        # A fresh database handle is opened for the worker rather than
        # reusing the one stored on self.
        self.result_worker = ResultWorker(open_resultdb(), self.inqueue)
        self.result_worker.run()

    self.process = run_in_thread(result_worker_main)
    time.sleep(1)