Example #1
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                      port=14887,
                                                      passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run,
                                                 port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(
            ['pyproxy', '--username=binux', '--password=123456',
             '--port=14830', '--debug'],
            close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen([
                'phantomjs',
                os.path.join(os.path.dirname(__file__),
                             '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'
            ])
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)
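With this fixture in place, a test can push a task onto self.inqueue and read the fetch result back from self.outqueue. The sketch below is a minimal illustration, not part of the original test: the task fields ('taskid', 'project', 'url', 'fetch') and the (task, result) tuple coming out of the out queue follow pyspider's usual fetcher interface and should be treated as assumptions.

def fetch_once(inqueue, outqueue, httpbin_url):
    # Hedged sketch: hand one task to the fetcher started in setUpClass and
    # wait for its result. The exact task keys are assumed from pyspider's
    # task model, not shown in the snippet above.
    task = {
        'taskid': 'test_taskid',
        'project': 'test_project',
        'url': httpbin_url + '/get',
        'fetch': {'method': 'GET'},
    }
    inqueue.put(task)                        # fetcher.run() consumes the in queue
    task, result = outqueue.get(timeout=10)  # and is assumed to emit (task, result)
    return result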
Example #2
    def setUpClass(self):
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                      host='0.0.0.0',
                                                      port=14887,
                                                      passthrough_errors=False)
        self.httpbin = 'http://' + socket.gethostbyname(
            socket.gethostname()) + ':14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.splash_endpoint = 'http://127.0.0.1:8050/execute'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run,
                                                 port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen(
            ['pyproxy', '--username=binux', '--bind=0.0.0.0',
             '--password=123456', '--port=14830', '--debug'],
            close_fds=True)
        self.proxy = '127.0.0.1:14830'
Example #3
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    redis:
        redis://host:6379/db
        redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None
    """

    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        if ',' in parsed.netloc:
            """
            redis in cluster mode (there is no concept of 'db' in cluster mode)
            ex. redis://host1:port1,host2:port2,...,hostn:portn
            """
            cluster_nodes = []
            for netloc in parsed.netloc.split(','):
                cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])})

            return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes)

        else:
            db = parsed.path.lstrip('/').split('/')
            try:
                db = int(db[0])
            except ValueError:
                logging.warning('redis DB must be a zero-based numeric index, using 0 instead')
                db = 0

            password = parsed.password or None

            return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        raise Exception('unknown connection url: %s' % url)
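
A short usage sketch for the helper above. The URLs are placeholders, and the import path assumes the function lives in pyspider's message_queue package, as the relative imports suggest:

from pyspider.message_queue import connect_message_queue  # assumed package path

# builtin multiprocessing queue when no url is given
local_q = connect_message_queue('newtask_queue', None, maxsize=100)

# single redis instance, db 0
redis_q = connect_message_queue('status_queue', 'redis://127.0.0.1:6379/0', maxsize=100)

# redis 3.x cluster: comma-separated host:port pairs, no db component
cluster_q = connect_message_queue(
    'fetcher2processor',
    'redis://10.0.0.1:7000,10.0.0.2:7000,10.0.0.3:7000')

# rabbitmq via AMQP
amqp_q = connect_message_queue(
    'scheduler2fetcher',
    'amqp://guest:guest@127.0.0.1:5672/%2F')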
Example #4

def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    name:
        name of message queue

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    beanstalk:
        beanstalk://host:11300/
    redis:
        redis://host:6379/db
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None
    """

    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'beanstalk':
        from .beanstalk import Queue
        return Queue(name, host=parsed.netloc, maxsize=maxsize)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        db = parsed.path.lstrip('/').split('/')
        try:
            db = int(db[0])
        except ValueError:
            logging.warning(
                'redis DB must be a zero-based numeric index, using 0 instead')
            db = 0

        password = parsed.password or None

        return Queue(name,
                     parsed.hostname,
                     parsed.port,
                     db=db,
                     maxsize=maxsize,
                     password=password,
                     lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        raise Exception('unknown connection url: %s' % url)
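
This variant adds a beanstalk scheme on top of the previous version; a minimal sketch of that extra case (host and port are placeholders, import path assumed as above):

from pyspider.message_queue import connect_message_queue  # assumed package path

# the beanstalk branch passes the whole netloc ('host:port') through as `host`
beanstalk_q = connect_message_queue('result_queue', 'beanstalk://127.0.0.1:11300/')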
Example #5

    def setUpClass(self):
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)

        self.taskdb = get_taskdb()

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)

        self.projectdb = get_projectdb()

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)

        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' %
                                             self.scheduler_xmlrpc_port)

        def run_scheduler():
            scheduler = Scheduler(taskdb=get_taskdb(),
                                  projectdb=get_projectdb(),
                                  newtask_queue=self.newtask_queue,
                                  status_queue=self.status_queue,
                                  out_queue=self.scheduler2fetcher,
                                  data_path="./data/tests/",
                                  resultdb=get_resultdb())
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            scheduler.DELETE_TIME = 0
            scheduler.DEFAULT_RETRY_DELAY = {'': 5}
            scheduler._last_tick = int(time.time())  # don't dispatch cronjobs during the test
            self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run,
                                               port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        time.sleep(1)
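
Tests built on this fixture usually register a project through self.projectdb and then hand tasks to the running scheduler via self.newtask_queue. A hedged sketch of the task shape; the keys follow pyspider's usual task dict and are an assumption, not something shown above:

def submit_task(newtask_queue, project='test_project'):
    # Hedged sketch: the project is assumed to already exist in projectdb.
    newtask_queue.put({
        'taskid': 'taskid-1',
        'project': project,
        'url': 'http://example.org/',          # placeholder URL
        'fetch': {},
        'process': {'callback': 'on_start'},
        'schedule': {'age': 0},
    })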
Example #6
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()
        self.in_queue = Queue(10)
        self.status_queue = Queue(10)
        self.newtask_queue = Queue(10)
        self.result_queue = Queue(10)

        def run_processor():
            self.processor = Processor(get_projectdb(), self.in_queue,
                                       self.status_queue, self.newtask_queue, self.result_queue)
            self.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            self.processor.run()
        self.process = run_in_thread(run_processor)
        time.sleep(1)
Example #7
    def setUpClass(self):
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()
        self.inqueue = Queue(10)

        def run_result_worker():
            self.result_worker = ResultWorker(get_resultdb(), self.inqueue)
            self.result_worker.run()
        self.process = run_in_thread(run_result_worker)
        time.sleep(1)
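
The result worker is then exercised by putting (task, result) pairs on self.inqueue; the tuple shape and the minimal task keys ('project', 'taskid', 'url') are assumptions based on what a result store needs, not part of the snippet above:

def save_one(inqueue):
    # Hedged sketch: feed one (task, result) pair to the running result worker.
    task = {'project': 'test_project', 'taskid': 'taskid-1', 'url': 'http://example.org/'}
    result = {'a': 1, 'b': 2}
    inqueue.put((task, result))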