Example #1
0
    def setUpClass(self):
        """Bring up the services used by the fetcher tests.

        Started here, in order: httpbin (port 14887), a ``Fetcher`` fed by two
        bounded queues plus its XML-RPC server (port 24444), a ``pyproxy``
        process (port 14830) and a PhantomJS process (port 25555). When the
        PhantomJS process cannot be spawned, ``self.phantomjs`` is ``None``.
        """
        # Not referenced by name below -- presumably imported for side effects.
        import tests.data_test_webpage
        import httpbin

        httpbin_process = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin_thread = httpbin_process
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.fetcher = fetcher

        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(fetcher.run)

        proxy_command = ['pyproxy', '--username=binux', '--password=123456', '--port=14830', '--debug']
        self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
        self.proxy = '127.0.0.1:14830'

        fetcher_script = os.path.join(os.path.dirname(__file__), '../pyspider/fetcher/phantomjs_fetcher.js')
        try:
            self.phantomjs = subprocess.Popen(['phantomjs', fetcher_script, '25555'])
        except OSError:
            # Popen could not start the binary; leave a marker instead of failing.
            self.phantomjs = None
        time.sleep(0.5)
Example #2
0
    def setUpClass(self):
        """Start httpbin, the fetcher and a pyproxy process for the tests.

        httpbin listens on 0.0.0.0:14887 and is addressed through the IP that
        this host's name resolves to. The fetcher is pointed at a Splash
        endpoint on 127.0.0.1:8050; nothing in this method starts that service.
        """
        # Not referenced by name below -- presumably imported for side effects.
        import tests.data_test_webpage
        import httpbin

        httpbin_options = {"host": "0.0.0.0", "port": 14887, "passthrough_errors": False}
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, **httpbin_options)
        local_ip = socket.gethostbyname(socket.gethostname())
        self.httpbin = "http://" + local_ip + ":14887"

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.splash_endpoint = "http://127.0.0.1:8050/execute"
        self.fetcher = fetcher

        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(fetcher.run)

        proxy_command = [
            "pyproxy",
            "--username=binux",
            "--bind=0.0.0.0",
            "--password=123456",
            "--port=14830",
            "--debug",
        ]
        self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
        self.proxy = "127.0.0.1:14830"
Example #3
0
    def setUpClass(self):
        """Launch httpbin, the fetcher, pyproxy and PhantomJS for the tests.

        Ports used: 14887 (httpbin), 24444 (fetcher XML-RPC), 14830 (pyproxy)
        and 25555 (PhantomJS). ``self.phantomjs`` ends up ``None`` when the
        PhantomJS process cannot be started.
        """
        # Not referenced by name below -- presumably imported for side effects.
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = "http://127.0.0.1:14887"

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.phantomjs_proxy = "127.0.0.1:25555"
        self.fetcher = fetcher
        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(fetcher.run)

        pyproxy_args = ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"]
        self.proxy_thread = subprocess.Popen(pyproxy_args, close_fds=True)
        self.proxy = "127.0.0.1:14830"

        tests_dir = os.path.dirname(__file__)
        phantomjs_args = [
            "phantomjs",
            os.path.join(tests_dir, "../pyspider/fetcher/phantomjs_fetcher.js"),
            "25555",
        ]
        try:
            self.phantomjs = subprocess.Popen(phantomjs_args)
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)
Example #4
0
    def setUpClass(self):
        """Prepare the fetcher test environment.

        Starts httpbin in a subprocess, runs a ``Fetcher`` and its XML-RPC
        server in threads, and spawns ``pyproxy`` and ``phantomjs`` as child
        processes. A missing or unstartable ``phantomjs`` leaves
        ``self.phantomjs`` set to ``None``.
        """
        # Not referenced by name below -- presumably imported for side effects.
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run,
            port=14887,
            passthrough_errors=False,
        )
        self.httpbin = "http://127.0.0.1:14887"

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = "127.0.0.1:25555"
        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(
            self.fetcher.xmlrpc_run,
            port=24444,
        )
        self.thread = utils.run_in_thread(self.fetcher.run)

        self.proxy_thread = subprocess.Popen(
            [
                "pyproxy",
                "--username=binux",
                "--password=123456",
                "--port=14830",
                "--debug",
            ],
            close_fds=True,
        )
        self.proxy = "127.0.0.1:14830"

        try:
            script_path = os.path.join(
                os.path.dirname(__file__),
                "../pyspider/fetcher/phantomjs_fetcher.js",
            )
            self.phantomjs = subprocess.Popen(["phantomjs", script_path, "25555"])
        except OSError:
            self.phantomjs = None
        time.sleep(0.5)
Example #5
0
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    The backend is selected from ``url``; the matching Queue class is imported
    lazily so only the chosen backend's module is loaded.

    name:
        name of message queue
    url:
        one of the forms below; a falsy value selects the builtin queue
    maxsize:
        passed through to the selected Queue class
    lazy_limit:
        passed through to every Queue class except the builtin one

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    redis:
        redis://host:6379/db
        redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode)
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None

    Returns the Queue instance of the selected backend.
    Raises Exception when ``url`` matches none of the supported forms.
    """

    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        if ',' in parsed.netloc:
            # redis in cluster mode (there is no concept of 'db' in cluster mode)
            # ex. redis://host1:port1,host2:port2,...,hostn:portn
            cluster_nodes = []
            for netloc in parsed.netloc.split(','):
                host_port = netloc.split(':')
                cluster_nodes.append({'host': host_port[0], 'port': int(host_port[1])})

            return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes)

        else:
            db = parsed.path.lstrip('/').split('/')
            try:
                db = int(db[0])
            except ValueError:
                # the first path segment is missing or not an integer
                logging.warning('redis DB must zero-based numeric index, using 0 instead')
                db = 0

            password = parsed.password or None

            return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        # format the url into the message; passing it as a second argument
        # would leave the '%s' placeholder unexpanded
        raise Exception('unknown connection url: %s' % url)
Example #6
0
def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True):
    """
    create connection to message queue

    The backend is selected from ``url``; the matching Queue class is imported
    lazily so only the chosen backend's module is loaded.

    name:
        name of message queue
    url:
        one of the forms below; a falsy value selects the builtin queue
    maxsize:
        passed through to the selected Queue class
    lazy_limit:
        passed through to the rabbitmq, redis and kombu Queue classes

    rabbitmq:
        amqp://username:password@host:5672/%2F
        see https://www.rabbitmq.com/uri-spec.html
    beanstalk:
        beanstalk://host:11300/
    redis:
        redis://host:6379/db
    kombu:
        kombu+transport://userid:password@hostname:port/virtual_host
        see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls
    builtin:
        None

    Returns the Queue instance of the selected backend.
    Raises Exception when ``url`` matches none of the supported forms.
    """

    if not url:
        from pyspider.libs.multiprocessing_queue import Queue
        return Queue(maxsize=maxsize)

    parsed = urlparse.urlparse(url)
    if parsed.scheme == 'amqp':
        from .rabbitmq import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    elif parsed.scheme == 'beanstalk':
        from .beanstalk import Queue
        return Queue(name, host=parsed.netloc, maxsize=maxsize)
    elif parsed.scheme == 'redis':
        from .redis_queue import Queue
        db = parsed.path.lstrip('/').split('/')
        try:
            db = int(db[0])
        except ValueError:
            # the first path segment is missing or not an integer
            logging.warning(
                'redis DB must zero-based numeric index, using 0 instead')
            db = 0

        password = parsed.password or None

        return Queue(name,
                     parsed.hostname,
                     parsed.port,
                     db=db,
                     maxsize=maxsize,
                     password=password,
                     lazy_limit=lazy_limit)
    elif url.startswith('kombu+'):
        url = url[len('kombu+'):]
        from .kombu_queue import Queue
        return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit)
    else:
        # format the url into the message; passing it as a second argument
        # would leave the '%s' placeholder unexpanded
        raise Exception('unknown connection url: %s' % url)
Example #7
0
    def setUpClass(self):
        """Reset the test data directory and run a ``ResultWorker`` in a thread.

        The worker gets its own ``ResultDB`` handle; ``self.resultdb`` is a
        second handle on the same path for use by the tests.
        """
        data_dir = './data/tests/'
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

        def open_resultdb():
            return resultdb.ResultDB(self.resultdb_path)

        self.resultdb = open_resultdb()
        self.inqueue = Queue(10)

        def worker_main():
            worker = ResultWorker(open_resultdb(), self.inqueue)
            self.result_worker = worker
            worker.run()

        self.process = run_in_thread(worker_main)
        time.sleep(1)
Example #8
0
    def setUpClass(self):
        '''
        Create fresh databases and queues, then run a scheduler in a thread.

        The scheduler and its XML-RPC server are both started from the
        background thread; ``self.rpc`` is the client proxy for that server.
        '''
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        # Each factory is called once here and once more for the scheduler,
        # which therefore works on its own database handles.
        def open_taskdb():
            return taskdb.TaskDB(self.taskdb_path)

        def open_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)

        def open_resultdb():
            return resultdb.ResultDB(self.resultdb_path)

        self.taskdb = open_taskdb()
        self.projectdb = open_projectdb()
        self.resultdb = open_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        rpc_url = 'http://localhost:%d' % self.scheduler_xmlrpc_port
        self.rpc = xmlrpc_client.ServerProxy(rpc_url)

        def scheduler_main():
            sched = Scheduler(taskdb=open_taskdb(), projectdb=open_projectdb(),
                              newtask_queue=self.newtask_queue, status_queue=self.status_queue,
                              out_queue=self.scheduler2fetcher, data_path='./data/tests/',
                              resultdb=open_resultdb())
            overrides = (
                ('UPDATE_PROJECT_INTERVAL', 0.1),
                ('LOOP_INTERVAL', 0.1),
                ('INQUEUE_LIMIT', 10),
                ('DELETE_TIME', 0),
                ('DEFAULT_RETRY_DELAY', {'': 5}),
                ('_last_tick', int(time.time())),  # not dispatch cronjob
            )
            for attribute, value in overrides:
                setattr(sched, attribute, value)
            self.xmlrpc_thread = run_in_thread(sched.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            sched.run()

        self.process = run_in_thread(scheduler_main)
        time.sleep(1)
Example #9
0
    def setUpClass(self):
        """Reset the test data directory and run a ``Processor`` in a thread.

        Four bounded queues are created for the processor; the processor
        itself is built inside the thread with its own ``ProjectDB`` handle.
        """
        data_dir = './data/tests/'
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

        def open_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)

        self.projectdb = open_projectdb()
        self.in_queue = Queue(10)
        self.status_queue = Queue(10)
        self.newtask_queue = Queue(10)
        self.result_queue = Queue(10)

        def processor_main():
            worker = Processor(open_projectdb(), self.in_queue, self.status_queue,
                               self.newtask_queue, self.result_queue)
            self.processor = worker
            worker.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            worker.run()

        self.process = run_in_thread(processor_main)
        time.sleep(1)
Example #10
0
    def setUpClass(self):
        """Start a ``Processor`` thread on a clean ``./data/tests/`` directory.

        ``self.projectdb`` and the processor each hold their own ``ProjectDB``
        handle on ``self.projectdb_path``.
        """
        shutil.rmtree("./data/tests/", ignore_errors=True)
        os.makedirs("./data/tests/")

        new_projectdb = lambda: projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = new_projectdb()

        # one bounded queue per processor channel, created in this order
        for queue_attr in ("in_queue", "status_queue", "newtask_queue", "result_queue"):
            setattr(self, queue_attr, Queue(10))

        def run_processor():
            self.processor = Processor(
                new_projectdb(),
                self.in_queue,
                self.status_queue,
                self.newtask_queue,
                self.result_queue,
            )
            manager = self.processor.project_manager
            manager.CHECK_PROJECTS_INTERVAL = 0.1
            self.processor.run()

        self.process = run_in_thread(run_processor)
        time.sleep(1)
    def setUpClass(self):
        """Start a ``ResultWorker`` thread on a clean ``./data/tests/`` directory."""
        shutil.rmtree("./data/tests/", ignore_errors=True)
        os.makedirs("./data/tests/")

        new_resultdb = lambda: resultdb.ResultDB(self.resultdb_path)
        self.resultdb = new_resultdb()
        self.inqueue = Queue(10)

        def run_result_worker():
            # built inside the thread, with a database handle of its own
            self.result_worker = ResultWorker(new_resultdb(), self.inqueue)
            self.result_worker.run()

        worker_thread = run_in_thread(run_result_worker)
        self.process = worker_thread
        time.sleep(1)
Example #12
0
    def setUpClass(self):
        '''
        Create fresh databases and queues, then run a scheduler in a thread.

        The scheduler and its XML-RPC server are both started from the
        background thread; ``self.rpc`` is the client proxy for that server and
        ``self.xmlrpc_thread`` is the thread the server runs in.
        '''
        shutil.rmtree('./data/tests', ignore_errors=True)
        os.makedirs('./data/tests')

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)
        self.taskdb = get_taskdb()

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)
        self.projectdb = get_projectdb()

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % self.scheduler_xmlrpc_port)

        def run_scheduler():
            # the scheduler gets database handles of its own
            scheduler = Scheduler(taskdb=get_taskdb(), projectdb=get_projectdb(),
                                  newtask_queue=self.newtask_queue, status_queue=self.status_queue,
                                  out_queue=self.scheduler2fetcher, data_path="./data/tests/",
                                  resultdb=get_resultdb())
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            # Override on the instance only: assigning to the Scheduler class
            # would change DELETE_TIME for every scheduler created afterwards
            # in this process.
            scheduler.DELETE_TIME = 0
            scheduler._last_tick = int(time.time())  # not dispatch cronjob
            # keep the handle so the XML-RPC server thread can be joined later
            self.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        time.sleep(1)
Example #13
0
class TestScheduler(unittest.TestCase):
    taskdb_path = './data/tests/task.db'
    projectdb_path = './data/tests/project.db'
    resultdb_path = './data/tests/result.db'
    check_project_time = 1
    scheduler_xmlrpc_port = 23333

    @classmethod
    def setUpClass(self):
        '''
        Create fresh databases and queues, then run a scheduler in a thread.

        The scheduler and its XML-RPC server are both started from the
        background thread; ``self.rpc`` is the client proxy for that server.
        '''
        data_dir = './data/tests'
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

        # Each factory is called once here and once more for the scheduler,
        # which therefore works on its own database handles.
        def open_taskdb():
            return taskdb.TaskDB(self.taskdb_path)

        def open_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)

        def open_resultdb():
            return resultdb.ResultDB(self.resultdb_path)

        self.taskdb = open_taskdb()
        self.projectdb = open_projectdb()
        self.resultdb = open_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        rpc_url = 'http://localhost:%d' % self.scheduler_xmlrpc_port
        self.rpc = xmlrpc_client.ServerProxy(rpc_url)

        def scheduler_main():
            sched = Scheduler(taskdb=open_taskdb(), projectdb=open_projectdb(),
                              newtask_queue=self.newtask_queue, status_queue=self.status_queue,
                              out_queue=self.scheduler2fetcher, data_path="./data/tests/",
                              resultdb=open_resultdb())
            sched.UPDATE_PROJECT_INTERVAL = 0.1
            sched.LOOP_INTERVAL = 0.1
            sched.INQUEUE_LIMIT = 10
            sched.DELETE_TIME = 0
            sched.DEFAULT_RETRY_DELAY = {'': 5}
            sched._last_tick = int(time.time())  # not dispatch cronjob
            self.xmlrpc_thread = run_in_thread(sched.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            sched.run()

        self.process = run_in_thread(scheduler_main)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        '''Stop the scheduler, remove the test data and check the ports are closed.'''
        scheduler_thread = self.process
        if scheduler_thread.is_alive():
            # ask the scheduler to quit over XML-RPC, then wait up to 5 seconds
            self.rpc._quit()
            scheduler_thread.join(5)
        self.xmlrpc_thread.join()
        assert not scheduler_thread.is_alive()
        shutil.rmtree('./data/tests', ignore_errors=True)
        time.sleep(1)

        for port in (5000, self.scheduler_xmlrpc_port, 24444, 25555):
            assert not utils.check_port_open(port)

    def test_10_new_task_ignore(self):
        '''
        task_queue = [ ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url'
        })  # unknown project: test_project
        self.assertEqual(self.rpc.size(), 0)
        self.assertEqual(len(self.rpc.get_active_tasks()), 0)

    def test_20_new_project(self):
        '''
        task_queue = [ ]
        '''
        self.projectdb.insert(
            'test_project', {
                'name': 'test_project',
                'group': 'group',
                'status': 'TODO',
                'script': 'import time\nprint(time.time())',
                'comments': 'test project',
                'rate': 1.0,
                'burst': 10,
            })

    def test_30_update_project(self):
        '''
        Switch ``test_project`` to DEBUG and expect its ``_on_get_info`` task.

        task_queue = [ ]
        '''
        from six.moves import queue as Queue

        # the fetcher queue is expected to stay empty until the status changes
        self.assertRaises(Queue.Empty, self.scheduler2fetcher.get, timeout=1)

        self.projectdb.update('test_project', status="DEBUG")
        time.sleep(0.1)
        self.rpc.update_project()

        dispatched = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(dispatched)
        # select test_project:_on_get_info data:,_on_get_info
        self.assertEqual(dispatched['taskid'], '_on_get_info')

    def test_32_get_info(self):
        self.status_queue.put({
            'taskid': '_on_get_info',
            'project': 'test_project',
            'track': {
                'save': {}
            }
        })
        # test_project on_get_info {}

    def test_34_new_not_used_project(self):
        '''
        task_queue = []
        '''
        self.projectdb.insert(
            'test_project_not_started', {
                'name': 'test_project_not_started',
                'group': 'group',
                'status': 'RUNNING',
                'script': 'import time\nprint(time.time())',
                'comments': 'test project',
                'rate': 1.0,
                'burst': 10,
            })
        task = self.scheduler2fetcher.get(
            timeout=5
        )  # select test_project_not_started:_on_get_info data:,_on_get_info
        self.assertEqual(task['taskid'], '_on_get_info')

    def test_35_new_task(self):
        '''
        Submit a task for ``test_project`` and check what reaches the fetcher queue.

        task_queue = [ ]
        '''
        time.sleep(0.2)
        new_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {'data': 'abc'},
            'process': {'data': 'abc'},
            'schedule': {'age': 0},
        }
        self.newtask_queue.put(new_task)  # new task test_project:taskid url
        # task_queue = [ test_project:taskid ]

        time.sleep(0.5)
        dispatched = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid
        self.assertGreater(len(self.rpc.get_active_tasks()), 0)
        self.assertIsNotNone(dispatched)
        self.assertEqual(dispatched['taskid'], 'taskid')
        self.assertEqual(dispatched['project'], 'test_project')
        for section in ('schedule', 'fetch', 'process', 'track'):
            self.assertIn(section, dispatched)
        self.assertEqual(dispatched['fetch']['data'], 'abc')

    def test_37_force_update_processing_task(self):
        '''
        Re-submit ``taskid`` with ``force_update`` set and a different url.

        processing = [ test_project:taskid ]
        '''
        forced_schedule = {'age': 10, 'force_update': True}
        forced_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url_force_update',
            'schedule': forced_schedule,
        }
        # restart task test_project:taskid url_force_update
        self.newtask_queue.put(forced_task)
        time.sleep(0.2)
        # it should not block next

    def test_40_taskdone_error_no_project(self):
        '''
        Send a status pack naming ``no_project`` and expect the size to stay 1.

        processing = [ test_project:taskid ]
        '''
        stray_status = {'taskid': 'taskid', 'project': 'no_project', 'url': 'url'}
        self.status_queue.put(stray_status)  # unknown project: no_project
        time.sleep(0.1)
        pending = self.rpc.size()
        self.assertEqual(pending, 1)

    def test_50_taskdone_error_no_track(self):
        '''
        Send two malformed status packs and expect the size to stay 1 each time.

        processing = [ test_project:taskid ]
        '''
        without_track = {'taskid': 'taskid', 'project': 'test_project', 'url': 'url'}
        self.status_queue.put(without_track)  # Bad status pack: 'track'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

        # same pack with an empty track appended as the last key
        with_empty_track = dict(without_track, track={})
        self.status_queue.put(with_empty_track)  # Bad status pack: 'process'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_60_taskdone_failed_retry(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': False
                },
            }
        })  # task retry 0/3 test_project:taskid url
        from six.moves import queue as Queue
        # with self.assertRaises(Queue.Empty):
        # task = self.scheduler2fetcher.get(timeout=4)
        task = self.scheduler2fetcher.get(
            timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)

    def test_70_taskdone_ok(self):
        '''
        Report ``taskid`` as fetched and processed successfully.

        processing = [ test_project:taskid ]
        '''
        success_track = {'fetch': {'ok': True}, 'process': {'ok': True}}
        done_status = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': success_track,
        }
        self.status_queue.put(done_status)  # task done test_project:taskid url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_75_on_finished_msg(self):
        '''Take the ``on_finished`` task from the fetcher queue and report it done.'''
        # select test_project:on_finished data:,on_finished
        finished_task = self.scheduler2fetcher.get(timeout=5)
        self.assertEqual(finished_task['taskid'], 'on_finished')

        success_track = {'fetch': {'ok': True}, 'process': {'ok': True}}
        done_status = {
            'taskid': 'on_finished',
            'project': 'test_project',
            'url': 'url',
            'track': success_track,
        }
        self.status_queue.put(done_status)  # task done test_project:on_finished url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_80_newtask_age_ignore(self):
        '''
        Re-submit ``taskid`` with ``age`` 30 through the queue; expect size 0.

        processing = [ ]
        '''
        aged_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {'data': 'abc'},
            'process': {'data': 'abc'},
            'schedule': {'age': 30},
        }
        self.newtask_queue.put(aged_task)
        time.sleep(0.1)
        queued = self.rpc.size()
        self.assertEqual(queued, 0)

    def test_82_newtask_via_rpc(self):
        '''
        Re-submit ``taskid`` with ``age`` 30 through XML-RPC; expect size 0.

        processing = [ ]
        '''
        aged_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {'data': 'abc'},
            'process': {'data': 'abc'},
            'schedule': {'age': 30},
        }
        self.rpc.newtask(aged_task)
        time.sleep(0.1)
        queued = self.rpc.size()
        self.assertEqual(queued, 0)

    def test_90_newtask_with_itag(self):
        '''
        Re-submit ``taskid`` with an ``itag`` and run it through to completion.

        task_queue = [ ]
        processing = [ ]
        '''
        time.sleep(0.1)
        tagged_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {'data': 'abc'},
            'process': {'data': 'abc'},
            'schedule': {'itag': "abc", 'retries': 1},
        }
        self.newtask_queue.put(tagged_task)  # restart task test_project:taskid url

        dispatched = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(dispatched)
        self.assertEqual(dispatched['taskid'], 'taskid')

        # reuse the earlier steps to finish the task
        self.test_70_taskdone_ok()  # task done test_project:taskid url
        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

    def test_a10_newtask_restart_by_age(self):
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1
            },
        })  # restart task test_project:taskid url
        task = self.scheduler2fetcher.get(
            timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task['taskid'], 'taskid')

    def test_a20_failed_retry(self):
        '''
        Report two failures for ``taskid`` and expect nothing after ``on_finished``.

        processing: [ test_project:taskid ]
        '''
        process_failed = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {'fetch': {'ok': True}, 'process': {'ok': False}},
        }
        self.status_queue.put(process_failed)  # task retry 0/1 test_project:taskid url
        retried = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(retried)
        self.assertEqual(retried['taskid'], 'taskid')

        all_failed = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {'fetch': {'ok': False}, 'process': {'ok': False}},
        }
        self.status_queue.put(all_failed)  # task failed test_project:taskid url

        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

        from six.moves import queue as Queue
        # the fetcher queue is expected to stay empty for the next 5 seconds
        self.assertRaises(Queue.Empty, self.scheduler2fetcher.get, timeout=5)

    def test_a30_task_verify(self):
        self.assertFalse(
            self.rpc.newtask({
                #'taskid': 'taskid#',
                'project': 'test_project',
                'url': 'url',
            })
        )  # taskid not in task: {'project': 'test_project', 'url': 'url'}
        self.assertFalse(
            self.rpc.newtask({
                'taskid': 'taskid#',
                #'project': 'test_project',
                'url': 'url',
            }))  # project not in task: {'url': 'url', 'taskid': 'taskid#'}
        self.assertFalse(
            self.rpc.newtask({
                'taskid': 'taskid#',
                'project': 'test_project',
                #'url': 'url',
            })
        )  # url not in task: {'project': 'test_project', 'taskid': 'taskid#'}
        self.assertFalse(
            self.rpc.newtask({
                'taskid': 'taskid#',
                'project': 'not_exist_project',
                'url': 'url',
            }))  # unknown project: not_exist_project
        self.assertTrue(
            self.rpc.newtask({
                'taskid': 'taskid#',
                'project': 'test_project',
                'url': 'url',
            }))  # new task test_project:taskid# url

    def test_a40_success_recrawl(self):
        '''
        task_queue = [ test_project:taskid# ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {
                'data': 'abc',
            },
            'process': {
                'data': 'abc',
            },
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
        })  # restart task test_project:taskid url
        task1 = self.scheduler2fetcher.get(
            timeout=10)  # select test_project:taskid# url
        task2 = self.scheduler2fetcher.get(
            timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task1)
        self.assertIsNotNone(task2)
        self.assertTrue(task1['taskid'] == 'taskid#'
                        or task2['taskid'] == 'taskid#')

        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
            'track': {
                'fetch': {
                    'ok': True
                },
                'process': {
                    'ok': True
                },
            }
        })  # task done test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_a50_failed_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        '''
        for i in range(3):
            self.status_queue.put({
                'taskid': 'taskid',
                'project': 'test_project',
                'url': 'url',
                'schedule': {
                    'age': 0,
                    'retries': 1,
                    'auto_recrawl': True,
                },
                'track': {
                    'fetch': {
                        'ok': True
                    },
                    'process': {
                        'ok': False
                    },
                }
            })
            # not processing pack: test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            task = self.scheduler2fetcher.get(timeout=10)
            self.assertIsNotNone(task)
            self.assertEqual(task['taskid'], 'taskid')

    def test_a60_disable_recrawl(self):
        '''
        Report success without 'auto_recrawl' in the schedule and check that
        nothing reaches the fetcher queue within five seconds.

        Queue state noted for this step:
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        '''
        done_pack = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {'age': 0, 'retries': 1},
            'track': {'fetch': {'ok': True}, 'process': {'ok': True}},
        }
        self.status_queue.put(done_pack)  # task done test_project:taskid url

        # Imported under a distinct alias so it is not confused with any
        # queue class the module itself uses.
        from six.moves import queue as queue_module
        with self.assertRaises(queue_module.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_38_cancel_task(self):
        '''
        Queue a task scheduled 30 seconds ahead, then send a cancel for the
        same taskid; rpc.size() must rise by one and fall back again.
        '''
        def pack_with(schedule):
            # Both packs differ only in their 'schedule' section.
            return {
                'taskid': 'taskid_to_cancel',
                'project': 'test_project',
                'url': 'url',
                'fetch': {'data': 'abc'},
                'process': {'data': 'abc'},
                'schedule': schedule,
            }

        size_before = self.rpc.size()

        delayed = pack_with({'age': 0, 'exetime': time.time() + 30})
        self.newtask_queue.put(delayed)  # new task test_project:taskid_to_cancel url
        # task_queue = [ test_project:taskid_to_cancel ]
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), size_before + 1)

        cancel = pack_with({'force_update': True, 'age': 0, 'cancel': True})
        self.newtask_queue.put(cancel)  # new cancel test_project:taskid_to_cancel url
        # task_queue = [ ]
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), size_before)

    def test_x10_inqueue_limit(self):
        '''
        Create a rate-0/burst-0 project, submit 20 tasks to it and check that
        rpc.size() grows by exactly 10.

        NOTE(review): the 10 presumably mirrors the scheduler's INQUEUE_LIMIT
        configured in setUpClass -- confirm against the enclosing class.
        '''
        project_info = {
            'name': 'test_inqueue_project',
            'group': 'group',
            'status': 'DEBUG',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 0,
            'burst': 0,
        }
        self.projectdb.insert('test_inqueue_project', project_info)
        time.sleep(0.1)

        size_before = self.rpc.size()
        for number in range(20):
            new_task = {
                'taskid': 'taskid%d' % number,
                'project': 'test_inqueue_project',
                'url': 'url',
                'schedule': {'age': 3000, 'force_update': True},
            }
            self.newtask_queue.put(new_task)
        time.sleep(1)

        accepted = self.rpc.size() - size_before
        self.assertEqual(accepted, 10)

    def test_x20_delete_project(self):
        '''
        Mark test_inqueue_project as STOP with group "lock,delete", wait one
        second, and check the project, its task 'taskid1' and its counters
        are gone.
        '''
        doomed = 'test_inqueue_project'
        self.assertIsNotNone(self.projectdb.get(doomed))
        # Disabled in the original:
        # self.assertIsNotNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))
        self.projectdb.update(doomed, status="STOP", group="lock,delete")
        time.sleep(1)

        self.assertIsNone(self.projectdb.get(doomed))
        # Refresh this handle's project list before looking the task up.
        self.taskdb._list_project()
        leftover_task = self.taskdb.get_task(doomed, 'taskid1')
        self.assertIsNone(leftover_task)
        counters = self.rpc.counter('5m', 'sum')
        self.assertNotIn(doomed, counters)

    def test_z10_startup(self):
        self.assertTrue(self.process.is_alive())

    def test_z20_quit(self):
        '''
        Ask the scheduler to quit over rpc, then check the runner has stopped
        and test_project:taskid is stored with status SUCCESS.
        '''
        self.rpc._quit()
        time.sleep(0.2)
        runner_alive = self.process.is_alive()
        self.assertFalse(runner_alive)

        stored_task = self.taskdb.get_task('test_project', 'taskid')
        stored_status = stored_task['status']
        self.assertEqual(stored_status, self.taskdb.SUCCESS)
Example #14
0
class TestProcessor(unittest.TestCase):
    """Integration test for a ResultWorker writing into a ResultDB.

    setUpClass starts one ResultWorker through run_in_thread; the tests feed
    it ``(task, result)`` pairs via ``inqueue`` and inspect the database
    through ``resultdb``. The tests share that worker and database, so they
    depend on running in name order (test_30 overwrites the row that
    test_20 inserted).
    """
    # Database file used by both the test handle and the worker's own handle.
    resultdb_path = './data/tests/result.db'

    @classmethod
    def setUpClass(cls):
        """Recreate ./data/tests, open the result database and start the worker."""
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(cls.resultdb_path)

        cls.resultdb = get_resultdb()
        cls.inqueue = Queue(10)

        def run_result_worker():
            # The worker gets its own ResultDB instance, separate from the
            # one the assertions use.
            cls.result_worker = ResultWorker(get_resultdb(), cls.inqueue)
            cls.result_worker.run()

        cls.process = run_in_thread(run_result_worker)
        # Wait for the worker to come up before the first test runs.
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Stop the worker if it is still running and remove ./data/tests."""
        if cls.process.is_alive():
            cls.result_worker.quit()
            cls.process.join(2)
        assert not cls.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_bad_result(self):
        """An empty result must not create a project table or any rows."""
        self.inqueue.put(({'project': 'test_project'}, {}))
        # Let the worker consume the item first; without the pause the
        # assertions would most likely run before it has been processed.
        time.sleep(0.1)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_10_bad_result_2(self):
        """A result whose task has no taskid/url must not be stored either."""
        self.inqueue.put(({'project': 'test_project'}, {'a': 'b'}))
        # Same settle pause as in test_10_bad_result.
        time.sleep(0.1)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_20_insert_result(self):
        """A complete task plus a dict result is stored under its taskid."""
        data = {
            'a': 'b'
        }
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, data))
        time.sleep(0.5)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 1)
        self.assertEqual(self.resultdb.count('test_project'), 1)

        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], data)

    def test_30_overwrite(self):
        """A second result for taskid 'id1' replaces the stored one."""
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, "abc"))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], "abc")

    def test_40_insert_list(self):
        """A list result is stored and read back unchanged."""
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id2',
            'url': 'url1'
        }, ['a', 'b']))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id2')
        self.assertEqual(result['result'], ['a', 'b'])
Example #15
0
class TestScheduler(unittest.TestCase):
    """Ordered integration test of a Scheduler backed by databases in ./data/tests.

    setUpClass starts one Scheduler and its XML-RPC server through
    run_in_thread. The tests drive it via ``newtask_queue`` and
    ``status_queue`` and observe it via ``scheduler2fetcher`` and the
    ``rpc`` proxy. Every test builds on the state left by the previous one;
    unittest runs test methods sorted by name, which is what the
    ``test_NN_`` / ``test_aNN_`` / ``test_xNN_`` / ``test_zNN_`` prefixes
    rely on. The trailing ``# ...`` comments quote the scheduler log line
    each step is expected to produce.
    """

    taskdb_path = "./data/tests/task.db"
    projectdb_path = "./data/tests/project.db"
    resultdb_path = "./data/tests/result.db"
    check_project_time = 1
    scheduler_xmlrpc_port = 23333

    @classmethod
    def setUpClass(cls):
        """Recreate ./data/tests, open the databases and start the scheduler."""
        shutil.rmtree("./data/tests", ignore_errors=True)
        os.makedirs("./data/tests")

        def get_taskdb():
            return taskdb.TaskDB(cls.taskdb_path)

        def get_projectdb():
            return projectdb.ProjectDB(cls.projectdb_path)

        def get_resultdb():
            return resultdb.ResultDB(cls.resultdb_path)

        # Handles used by the assertions; run_scheduler() opens its own.
        cls.taskdb = get_taskdb()
        cls.projectdb = get_projectdb()
        cls.resultdb = get_resultdb()

        cls.newtask_queue = Queue(10)
        cls.status_queue = Queue(10)
        cls.scheduler2fetcher = Queue(10)
        cls.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % cls.scheduler_xmlrpc_port)

        def run_scheduler():
            scheduler = Scheduler(
                taskdb=get_taskdb(),
                projectdb=get_projectdb(),
                newtask_queue=cls.newtask_queue,
                status_queue=cls.status_queue,
                out_queue=cls.scheduler2fetcher,
                data_path="./data/tests/",
                resultdb=get_resultdb(),
            )
            # Short intervals and small limits so the tests finish quickly.
            scheduler.UPDATE_PROJECT_INTERVAL = 0.1
            scheduler.LOOP_INTERVAL = 0.1
            scheduler.INQUEUE_LIMIT = 10
            scheduler.DELETE_TIME = 0
            scheduler.DEFAULT_RETRY_DELAY = {"": 5}
            scheduler._last_tick = int(time.time())  # not dispatch cronjob
            cls.xmlrpc_thread = run_in_thread(scheduler.xmlrpc_run, port=cls.scheduler_xmlrpc_port)
            scheduler.run()

        cls.process = run_in_thread(run_scheduler)
        # Give the scheduler and its XML-RPC server time to come up.
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Stop the scheduler, remove ./data/tests and check the ports are closed."""
        if cls.process.is_alive():
            cls.rpc._quit()
            cls.process.join(5)
        cls.xmlrpc_thread.join()
        assert not cls.process.is_alive()
        shutil.rmtree("./data/tests", ignore_errors=True)
        time.sleep(1)

        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(cls.scheduler_xmlrpc_port)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)

    @staticmethod
    def _new_task(schedule, taskid="taskid"):
        """Build a new-task pack for test_project with the given schedule."""
        return {
            "taskid": taskid,
            "project": "test_project",
            "url": "url",
            "fetch": {"data": "abc"},
            "process": {"data": "abc"},
            "schedule": schedule,
        }

    @staticmethod
    def _status(fetch_ok, process_ok, schedule=None):
        """Build a status pack for test_project:taskid.

        ``schedule`` is included only when given, so packs without it keep
        exactly the keys taskid/project/url/track.
        """
        pack = {"taskid": "taskid", "project": "test_project", "url": "url"}
        if schedule is not None:
            pack["schedule"] = schedule
        pack["track"] = {"fetch": {"ok": fetch_ok}, "process": {"ok": process_ok}}
        return pack

    def test_10_new_task_ignore(self):
        """
        A task for a project that does not exist yet leaves the queue empty.

        task_queue = [ ]
        """
        self.newtask_queue.put(
            {"taskid": "taskid", "project": "test_project", "url": "url"}
        )  # unknown project: test_project
        self.assertEqual(self.rpc.size(), 0)
        self.assertEqual(len(self.rpc.get_active_tasks()), 0)

    def test_20_new_project(self):
        """
        Insert test_project with status TODO.

        task_queue = [ ]
        """
        self.projectdb.insert(
            "test_project",
            {
                "name": "test_project",
                "group": "group",
                "status": "TODO",
                "script": "import time\nprint(time.time())",
                "comments": "test project",
                "rate": 1.0,
                "burst": 10,
            },
        )

    def test_30_update_project(self):
        """
        Nothing is dispatched while the project is TODO; after switching it
        to DEBUG the scheduler emits an _on_get_info task.

        task_queue = [ ]
        """
        from six.moves import queue as queue_module

        with self.assertRaises(queue_module.Empty):
            self.scheduler2fetcher.get(timeout=1)
        self.projectdb.update("test_project", status="DEBUG")
        time.sleep(0.1)
        self.rpc.update_project()

        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)
        self.assertEqual(task["taskid"], "_on_get_info")  # select test_project:_on_get_info data:,_on_get_info

    def test_32_get_info(self):
        """Answer the _on_get_info task with an empty 'save' track."""
        self.status_queue.put({"taskid": "_on_get_info", "project": "test_project", "track": {"save": {}}})
        # test_project on_get_info {}

    def test_34_new_not_used_project(self):
        """
        A project inserted as RUNNING also triggers an _on_get_info task.

        task_queue = []
        """
        self.projectdb.insert(
            "test_project_not_started",
            {
                "name": "test_project_not_started",
                "group": "group",
                "status": "RUNNING",
                "script": "import time\nprint(time.time())",
                "comments": "test project",
                "rate": 1.0,
                "burst": 10,
            },
        )
        task = self.scheduler2fetcher.get(timeout=1)  # select test_project_not_started:_on_get_info data:,_on_get_info
        self.assertEqual(task["taskid"], "_on_get_info")

    def test_35_new_task(self):
        """
        A new task for test_project is dispatched to the fetcher with its
        schedule/fetch/process/track sections.

        task_queue = [ ]
        """
        time.sleep(0.2)
        self.newtask_queue.put(self._new_task({"age": 0}))  # new task test_project:taskid url
        # task_queue = [ test_project:taskid ]

        time.sleep(0.5)
        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid
        self.assertGreater(len(self.rpc.get_active_tasks()), 0)
        self.assertIsNotNone(task)
        self.assertEqual(task["taskid"], "taskid")
        self.assertEqual(task["project"], "test_project")
        self.assertIn("schedule", task)
        self.assertIn("fetch", task)
        self.assertIn("process", task)
        self.assertIn("track", task)
        self.assertEqual(task["fetch"]["data"], "abc")

    def test_37_force_update_processing_task(self):
        """
        A force_update for a task that is being processed must not block the
        steps that follow.

        processing = [ test_project:taskid ]
        """
        self.newtask_queue.put(
            {
                "taskid": "taskid",
                "project": "test_project",
                "url": "url_force_update",
                "schedule": {"age": 10, "force_update": True},
            }
        )  # restart task test_project:taskid url_force_update
        time.sleep(0.2)
        # it should not block next

    def test_38_cancel_task(self):
        """
        Queue a task scheduled 30 seconds ahead, then cancel it; rpc.size()
        must rise by one and fall back again.
        """
        current_size = self.rpc.size()
        self.newtask_queue.put(
            self._new_task({"age": 0, "exetime": time.time() + 30}, taskid="taskid_to_cancel")
        )  # new task test_project:taskid_to_cancel url
        # task_queue = [ test_project:taskid_to_cancel ]

        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), current_size + 1)

        self.newtask_queue.put(
            self._new_task({"force_update": True, "age": 0, "cancel": True}, taskid="taskid_to_cancel")
        )  # new cancel test_project:taskid_to_cancel url
        # task_queue = [ ]

        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), current_size)

    def test_40_taskdone_error_no_project(self):
        """
        A status pack for an unknown project leaves the queue size at 1.

        processing = [ test_project:taskid ]
        """
        self.status_queue.put(
            {"taskid": "taskid", "project": "no_project", "url": "url"}
        )  # unknown project: no_project
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_50_taskdone_error_no_track(self):
        """
        Status packs without 'track', or with an empty 'track', leave the
        queue size at 1.

        processing = [ test_project:taskid ]
        """
        self.status_queue.put({"taskid": "taskid", "project": "test_project", "url": "url"})  # Bad status pack: 'track'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)
        self.status_queue.put(
            {"taskid": "taskid", "project": "test_project", "url": "url", "track": {}}
        )  # Bad status pack: 'process'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_60_taskdone_failed_retry(self):
        """
        A failed process step makes the scheduler dispatch the task again.

        processing = [ test_project:taskid ]
        """
        self.status_queue.put(self._status(fetch_ok=True, process_ok=False))  # task retry 0/3 test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)

    def test_70_taskdone_ok(self):
        """
        A successful status pack empties the queue. Also called directly by
        test_90_newtask_with_itag.

        processing = [ test_project:taskid ]
        """
        self.status_queue.put(self._status(fetch_ok=True, process_ok=True))  # task done test_project:taskid url
        time.sleep(0.2)
        self.assertEqual(self.rpc.size(), 0)

    def test_75_on_finished_msg(self):
        """
        The next dispatched task is 'on_finished'. Also called directly by
        test_90_newtask_with_itag and test_a20_failed_retry.
        """
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:on_finished data:,on_finished

        self.assertEqual(task["taskid"], "on_finished")

    def test_80_newtask_age_ignore(self):
        """
        Re-submitting the finished task with age=30 through the queue does
        not enqueue it.

        processing = [ ]
        """
        self.newtask_queue.put(self._new_task({"age": 30}))
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_82_newtask_via_rpc(self):
        """
        Same as test_80, but the task is submitted through rpc.newtask.

        processing = [ ]
        """
        self.rpc.newtask(self._new_task({"age": 30}))
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 0)

    def test_90_newtask_with_itag(self):
        """
        A task carrying an itag is dispatched again, then completed.

        task_queue = [ ]
        processing = [ ]
        """
        time.sleep(0.1)
        self.newtask_queue.put(self._new_task({"itag": "abc", "retries": 1}))  # restart task test_project:taskid url

        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task["taskid"], "taskid")

        self.test_70_taskdone_ok()  # task done test_project:taskid url
        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

    def test_a10_newtask_restart_by_age(self):
        """A task re-submitted with age=0 is dispatched again."""
        self.newtask_queue.put(self._new_task({"age": 0, "retries": 1}))  # restart task test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task["taskid"], "taskid")

    def test_a20_failed_retry(self):
        """
        With retries=1 the first failure is retried; the second one ends the
        task, after which only 'on_finished' is dispatched.

        processing: [ test_project:taskid ]
        """
        self.status_queue.put(self._status(fetch_ok=True, process_ok=False))  # task retry 0/1 test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=5)  # select test_project:taskid url
        self.assertIsNotNone(task)
        self.assertEqual(task["taskid"], "taskid")

        self.status_queue.put(self._status(fetch_ok=False, process_ok=False))  # task failed test_project:taskid url

        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

        from six.moves import queue as queue_module

        with self.assertRaises(queue_module.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_a30_task_verify(self):
        """
        rpc.newtask returns a false value for packs missing taskid, project
        or url, or naming an unknown project, and a true value for a
        complete pack.
        """
        # taskid not in task: {'project': 'test_project', 'url': 'url'}
        self.assertFalse(self.rpc.newtask({"project": "test_project", "url": "url"}))
        # project not in task: {'url': 'url', 'taskid': 'taskid#'}
        self.assertFalse(self.rpc.newtask({"taskid": "taskid#", "url": "url"}))
        # url not in task: {'project': 'test_project', 'taskid': 'taskid#'}
        self.assertFalse(self.rpc.newtask({"taskid": "taskid#", "project": "test_project"}))
        # unknown project: not_exist_project
        self.assertFalse(
            self.rpc.newtask({"taskid": "taskid#", "project": "not_exist_project", "url": "url"})
        )
        # new task test_project:taskid# url
        self.assertTrue(
            self.rpc.newtask({"taskid": "taskid#", "project": "test_project", "url": "url"})
        )

    def test_a40_success_recrawl(self):
        """
        With auto_recrawl set, a successful task is dispatched again after
        it is reported done.

        task_queue = [ test_project:taskid# ]
        """
        recrawl_schedule = {"age": 0, "retries": 1, "auto_recrawl": True}
        self.newtask_queue.put(self._new_task(recrawl_schedule))  # restart task test_project:taskid url
        task1 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid# url
        task2 = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(task1)
        self.assertIsNotNone(task2)
        # The two tasks may arrive in either order.
        self.assertTrue(task1["taskid"] == "taskid#" or task2["taskid"] == "taskid#")

        self.status_queue.put(
            self._status(
                fetch_ok=True,
                process_ok=True,
                schedule={"age": 0, "retries": 1, "auto_recrawl": True},
            )
        )  # task done test_project:taskid url
        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)

    def test_a50_failed_recrawl(self):
        """
        With auto_recrawl set, each of three failure reports is followed by
        another dispatch of 'taskid'.

        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        """
        for _ in range(3):
            self.status_queue.put(
                self._status(
                    fetch_ok=True,
                    process_ok=False,
                    schedule={"age": 0, "retries": 1, "auto_recrawl": True},
                )
            )
            # not processing pack: test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            # task retry 0/1 test_project:taskid url
            # select test_project:taskid url
            task = self.scheduler2fetcher.get(timeout=10)
            self.assertIsNotNone(task)
            self.assertEqual(task["taskid"], "taskid")

    def test_a60_disable_recrawl(self):
        """
        A success report without auto_recrawl stops the recrawl loop: nothing
        is dispatched for five seconds.

        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]
        """
        self.status_queue.put(
            self._status(fetch_ok=True, process_ok=True, schedule={"age": 0, "retries": 1})
        )  # task done test_project:taskid url

        from six.moves import queue as queue_module

        with self.assertRaises(queue_module.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_x10_inqueue_limit(self):
        """
        Of 20 tasks submitted to a rate-0 project only 10 are accepted,
        matching the INQUEUE_LIMIT of 10 set in setUpClass.
        """
        self.projectdb.insert(
            "test_inqueue_project",
            {
                "name": "test_inqueue_project",
                "group": "group",
                "status": "DEBUG",
                "script": "import time\nprint(time.time())",
                "comments": "test project",
                "rate": 0,
                "burst": 0,
            },
        )
        time.sleep(0.1)
        pre_size = self.rpc.size()
        for i in range(20):
            self.newtask_queue.put(
                {
                    "taskid": "taskid%d" % i,
                    "project": "test_inqueue_project",
                    "url": "url",
                    "schedule": {"age": 3000, "force_update": True},
                }
            )
        time.sleep(1)
        self.assertEqual(self.rpc.size() - pre_size, 10)

    def test_x20_delete_project(self):
        """
        A project set to STOP with group "lock,delete" is removed together
        with its tasks and counters.
        """
        self.assertIsNotNone(self.projectdb.get("test_inqueue_project"))
        self.projectdb.update("test_inqueue_project", status="STOP", group="lock,delete")
        time.sleep(1)
        self.assertIsNone(self.projectdb.get("test_inqueue_project"))
        # Refresh this handle's project list before looking the task up.
        self.taskdb._list_project()
        self.assertIsNone(self.taskdb.get_task("test_inqueue_project", "taskid1"))
        self.assertNotIn("test_inqueue_project", self.rpc.counter("5m", "sum"))

    def test_z10_startup(self):
        """The scheduler runner is still alive after all previous steps."""
        self.assertTrue(self.process.is_alive())

    def test_z20_quit(self):
        """
        After rpc._quit() the runner stops and test_project:taskid is stored
        with status SUCCESS.
        """
        self.rpc._quit()
        time.sleep(0.2)
        self.assertFalse(self.process.is_alive())
        self.assertEqual(self.taskdb.get_task("test_project", "taskid")["status"], self.taskdb.SUCCESS)
Example #16
0
class TestScheduler(unittest.TestCase):
    taskdb_path = './data/tests/task.db'
    projectdb_path = './data/tests/project.db'
    resultdb_path = './data/tests/result.db'
    check_project_time = 1
    scheduler_xmlrpc_port = 23333

    @classmethod
    def setUpClass(self):
        '''
        Recreate ./data/tests, open the three databases, create the queues
        and the XML-RPC proxy, then start a Scheduler through run_in_thread
        and wait one second for it to come up.
        '''
        data_dir = './data/tests'
        shutil.rmtree(data_dir, ignore_errors=True)
        os.makedirs(data_dir)

        def get_taskdb():
            return taskdb.TaskDB(self.taskdb_path)

        def get_projectdb():
            return projectdb.ProjectDB(self.projectdb_path)

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)

        # Handles used by the assertions; run_scheduler() opens its own.
        self.taskdb = get_taskdb()
        self.projectdb = get_projectdb()
        self.resultdb = get_resultdb()

        self.newtask_queue = Queue(10)
        self.status_queue = Queue(10)
        self.scheduler2fetcher = Queue(10)
        rpc_url = 'http://localhost:%d' % self.scheduler_xmlrpc_port
        self.rpc = xmlrpc_client.ServerProxy(rpc_url)

        def run_scheduler():
            scheduler = Scheduler(
                taskdb=get_taskdb(),
                projectdb=get_projectdb(),
                newtask_queue=self.newtask_queue,
                status_queue=self.status_queue,
                out_queue=self.scheduler2fetcher,
                data_path="./data/tests/",
                resultdb=get_resultdb(),
            )
            # Short intervals and small limits for the test run.
            fast_settings = (
                ('UPDATE_PROJECT_INTERVAL', 0.1),
                ('LOOP_INTERVAL', 0.1),
                ('INQUEUE_LIMIT', 10),
                ('DELETE_TIME', 0),
                ('DEFAULT_RETRY_DELAY', {'': 5}),
            )
            for setting, value in fast_settings:
                setattr(scheduler, setting, value)
            scheduler._last_tick = int(time.time())  # not dispatch cronjob
            self.xmlrpc_thread = run_in_thread(
                scheduler.xmlrpc_run, port=self.scheduler_xmlrpc_port)
            scheduler.run()

        self.process = run_in_thread(run_scheduler)
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        '''
        Stop the scheduler if it is still running, wait for the XML-RPC
        thread, remove ./data/tests and check that the listed ports are
        closed.
        '''
        runner = self.process
        if runner.is_alive():
            self.rpc._quit()
            runner.join(5)
        self.xmlrpc_thread.join()
        assert not runner.is_alive()

        shutil.rmtree('./data/tests', ignore_errors=True)
        time.sleep(1)

        ports_to_check = (5000, self.scheduler_xmlrpc_port, 24444, 25555)
        for port in ports_to_check:
            assert not utils.check_port_open(port)

    def test_10_new_task_ignore(self):
        '''
        A task for a project that does not exist yet leaves the queue empty.

        task_queue = [ ]
        '''
        orphan_task = {'taskid': 'taskid', 'project': 'test_project', 'url': 'url'}
        self.newtask_queue.put(orphan_task)  # unknown project: test_project

        queue_size = self.rpc.size()
        self.assertEqual(queue_size, 0)
        active_tasks = self.rpc.get_active_tasks()
        self.assertEqual(len(active_tasks), 0)

    def test_20_new_project(self):
        '''
        Insert test_project with status TODO.

        task_queue = [ ]
        '''
        project_info = {
            'name': 'test_project',
            'group': 'group',
            'status': 'TODO',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        }
        self.projectdb.insert('test_project', project_info)

    def test_30_update_project(self):
        '''
        Nothing is dispatched while the project is TODO; after switching it
        to DEBUG the next dispatched task is _on_get_info.

        task_queue = [ ]
        '''
        from six.moves import queue as queue_module
        with self.assertRaises(queue_module.Empty):
            task = self.scheduler2fetcher.get(timeout=1)

        self.projectdb.update('test_project', status="DEBUG")
        time.sleep(0.1)
        self.rpc.update_project()

        task = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(task)
        # select test_project:_on_get_info data:,_on_get_info
        dispatched_id = task['taskid']
        self.assertEqual(dispatched_id, '_on_get_info')

    def test_34_new_not_used_project(self):
        '''
        A project inserted as RUNNING triggers an _on_get_info task within
        one second.

        task_queue = []
        '''
        project_info = {
            'name': 'test_project_not_started',
            'group': 'group',
            'status': 'RUNNING',
            'script': 'import time\nprint(time.time())',
            'comments': 'test project',
            'rate': 1.0,
            'burst': 10,
        }
        self.projectdb.insert('test_project_not_started', project_info)

        # select test_project_not_started:_on_get_info data:,_on_get_info
        info_task = self.scheduler2fetcher.get(timeout=1)
        self.assertEqual(info_task['taskid'], '_on_get_info')

    def test_35_new_task(self):
        '''
        A new task for test_project is dispatched to the fetcher with its
        schedule/fetch/process/track sections.

        task_queue = [ ]
        '''
        time.sleep(0.2)
        new_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {'data': 'abc'},
            'process': {'data': 'abc'},
            'schedule': {'age': 0},
        }
        self.newtask_queue.put(new_task)  # new task test_project:taskid url
        # task_queue = [ test_project:taskid ]

        time.sleep(0.5)
        dispatched = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid
        active_count = len(self.rpc.get_active_tasks())
        self.assertGreater(active_count, 0)
        self.assertIsNotNone(dispatched)
        self.assertEqual(dispatched['taskid'], 'taskid')
        self.assertEqual(dispatched['project'], 'test_project')
        for section in ('schedule', 'fetch', 'process', 'track'):
            self.assertIn(section, dispatched)
        self.assertEqual(dispatched['fetch']['data'], 'abc')

    def test_37_force_update_processing_task(self):
        '''
        processing = [ test_project:taskid ]
        '''
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url_force_update',
            'schedule': {
                'age': 10,
                'force_update': True,
            },
        })  # restart task test_project:taskid url_force_update
        time.sleep(0.2)
        # it should not block next

    def test_40_taskdone_error_no_project(self):
        '''
        A status pack for an unknown project leaves the queue size at 1.

        processing = [ test_project:taskid ]
        '''
        stray_status = {'taskid': 'taskid', 'project': 'no_project', 'url': 'url'}
        self.status_queue.put(stray_status)  # unknown project: no_project
        time.sleep(0.1)
        queue_size = self.rpc.size()
        self.assertEqual(queue_size, 1)

    def test_50_taskdone_error_no_track(self):
        '''
        Status packs without 'track', or with an empty 'track', leave the
        queue size at 1.

        processing = [ test_project:taskid ]
        '''
        no_track = {'taskid': 'taskid', 'project': 'test_project', 'url': 'url'}
        self.status_queue.put(no_track)  # Bad status pack: 'track'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

        empty_track = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': {},
        }
        self.status_queue.put(empty_track)  # Bad status pack: 'process'
        time.sleep(0.1)
        self.assertEqual(self.rpc.size(), 1)

    def test_60_taskdone_failed_retry(self):
        '''
        processing = [ test_project:taskid ]

        Report a processing failure. Nothing may reach the fetcher queue
        during the first 4 seconds; a task must arrive within the 5 seconds
        after that.
        '''
        failure_track = {'fetch': {'ok': True}, 'process': {'ok': False}}
        failed_status = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'track': failure_track,
        }
        # expected log: task retry 0/3 test_project:taskid url
        self.status_queue.put(failed_status)

        from six.moves import queue as std_queue
        # The first wait has to time out.
        with self.assertRaises(std_queue.Empty):
            self.scheduler2fetcher.get(timeout=4)
        # expected log: select test_project:taskid url
        retried = self.scheduler2fetcher.get(timeout=5)
        self.assertIsNotNone(retried)

    def test_70_taskdone_ok(self):
        '''
        processing = [ test_project:taskid ]

        Report the task as fetched and processed successfully; the size
        reported over RPC is then expected to be 0.
        '''
        success_track = {
            'fetch': {'ok': True},
            'process': {'ok': True},
        }
        done_status = dict(
            taskid='taskid',
            project='test_project',
            url='url',
            track=success_track,
        )
        self.status_queue.put(done_status)  # task done test_project:taskid url
        time.sleep(0.2)
        reported = self.rpc.size()
        self.assertEqual(reported, 0)

    def test_75_on_finished_msg(self):
        '''
        The next task taken from the fetcher queue must have the taskid
        on_finished.
        '''
        # expected log: select test_project:on_finished data:,on_finished
        finished = self.scheduler2fetcher.get(timeout=5)
        finished_id = finished['taskid']
        self.assertEqual(finished_id, 'on_finished')

    def test_80_newtask_age_ignore(self):
        '''
        processing = [ ]

        Queue a task with the same taskid and a schedule age of 30; the
        size reported over RPC is expected to stay at 0.
        '''
        resubmitted = dict(
            taskid='taskid',
            project='test_project',
            url='url',
            fetch={'data': 'abc'},
            process={'data': 'abc'},
            schedule={'age': 30},
        )
        self.newtask_queue.put(resubmitted)
        time.sleep(0.1)
        reported = self.rpc.size()
        self.assertEqual(reported, 0)

    def test_82_newtask_via_rpc(self):
        '''
        processing = [ ]

        Submit a task with schedule age 30 through the RPC newtask call
        instead of the queue; the size reported over RPC is expected to
        stay at 0.
        '''
        payload = {'data': 'abc'}
        rpc_task = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': dict(payload),
            'process': dict(payload),
            'schedule': {'age': 30},
        }
        self.rpc.newtask(rpc_task)
        time.sleep(0.1)
        reported = self.rpc.size()
        self.assertEqual(reported, 0)

    def test_90_newtask_with_itag(self):
        '''
        task_queue = [ ]
        processing = [ ]

        Queue the task with an itag in its schedule and expect it on the
        fetcher queue, then replay the "task done" and "on_finished" steps.
        '''
        time.sleep(0.1)
        tagged_task = dict(
            taskid='taskid',
            project='test_project',
            url='url',
            fetch={'data': 'abc'},
            process={'data': 'abc'},
            schedule={'itag': "abc", 'retries': 1},
        )
        # expected log: restart task test_project:taskid url
        self.newtask_queue.put(tagged_task)

        # expected log: select test_project:taskid url
        selected = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(selected)
        self.assertEqual(selected['taskid'], 'taskid')

        # Reuse the earlier test bodies as steps of this scenario.
        self.test_70_taskdone_ok()  # task done test_project:taskid url
        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

    def test_a10_newtask_restart_by_age(self):
        '''
        Queue the task with schedule age 0 and expect it to show up on the
        fetcher queue again.
        '''
        payload = {'data': 'abc'}
        restarted = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': dict(payload),
            'process': dict(payload),
            'schedule': {'age': 0, 'retries': 1},
        }
        # expected log: restart task test_project:taskid url
        self.newtask_queue.put(restarted)
        # expected log: select test_project:taskid url
        selected = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(selected)
        self.assertEqual(selected['taskid'], 'taskid')

    def test_a20_failed_retry(self):
        '''
        processing: [ test_project:taskid ]

        Fail the task once and expect it back on the fetcher queue, fail it
        a second time and expect the on_finished task, after which the
        fetcher queue must stay empty for 5 seconds.
        '''
        def report_failure(fetch_ok):
            # Both reports mark processing as failed; only fetch differs.
            self.status_queue.put({
                'taskid': 'taskid',
                'project': 'test_project',
                'url': 'url',
                'track': {
                    'fetch': {'ok': fetch_ok},
                    'process': {'ok': False},
                },
            })

        report_failure(True)  # task retry 0/1 test_project:taskid url
        # expected log: select test_project:taskid url
        retried = self.scheduler2fetcher.get(timeout=5)
        self.assertIsNotNone(retried)
        self.assertEqual(retried['taskid'], 'taskid')

        report_failure(False)  # task failed test_project:taskid url

        self.test_75_on_finished_msg()  # select test_project:on_finished data:,on_finished

        from six.moves import queue as std_queue
        with self.assertRaises(std_queue.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_a30_task_verify(self):
        '''
        The RPC newtask call must return a false value for a task that
        lacks taskid, project or url, or that names an unknown project,
        and a true value for a complete task.
        '''
        complete = {
            'taskid': 'taskid#',
            'project': 'test_project',
            'url': 'url',
        }

        # expected logs, in order:
        #   taskid not in task: {'project': 'test_project', 'url': 'url'}
        #   project not in task: {'url': 'url', 'taskid': 'taskid#'}
        #   url not in task: {'project': 'test_project', 'taskid': 'taskid#'}
        for missing in ('taskid', 'project', 'url'):
            partial = {key: value for key, value in complete.items()
                       if key != missing}
            self.assertFalse(self.rpc.newtask(partial))

        wrong_project = dict(complete, project='not_exist_project')
        # expected log: unknown project: not_exist_project
        self.assertFalse(self.rpc.newtask(wrong_project))

        # expected log: new task test_project:taskid# url
        self.assertTrue(self.rpc.newtask(complete))

    def test_a40_success_recrawl(self):
        '''
        task_queue = [ test_project:taskid# ]

        Queue the task with auto_recrawl enabled, take two tasks from the
        fetcher queue (one of them must be taskid#), report success for the
        task and expect another task on the fetcher queue.
        '''
        recrawl_schedule = {
            'age': 0,
            'retries': 1,
            'auto_recrawl': True,
        }
        # expected log: restart task test_project:taskid url
        self.newtask_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'fetch': {'data': 'abc'},
            'process': {'data': 'abc'},
            'schedule': dict(recrawl_schedule),
        })
        first = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid# url
        second = self.scheduler2fetcher.get(timeout=10)  # select test_project:taskid url
        self.assertIsNotNone(first)
        self.assertIsNotNone(second)
        # The order of the two tasks is not fixed, so accept either.
        self.assertTrue(
            any(picked['taskid'] == 'taskid#' for picked in (first, second)))

        # expected log: task done test_project:taskid url
        self.status_queue.put({
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': dict(recrawl_schedule),
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': True},
            },
        })
        recrawled = self.scheduler2fetcher.get(timeout=10)
        self.assertIsNotNone(recrawled)

    def test_a50_failed_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]

        Three times in a row: report a processing failure for the
        auto_recrawl task and expect test_project:taskid on the fetcher
        queue afterwards.
        '''
        failed_status = {
            'taskid': 'taskid',
            'project': 'test_project',
            'url': 'url',
            'schedule': {
                'age': 0,
                'retries': 1,
                'auto_recrawl': True,
            },
            'track': {
                'fetch': {'ok': True},
                'process': {'ok': False},
            },
        }
        # expected logs over the three rounds:
        #   not processing pack: test_project:taskid url
        #   select test_project:taskid url
        #   task retry 0/1 test_project:taskid url
        #   select test_project:taskid url
        #   task retry 0/1 test_project:taskid url
        #   select test_project:taskid url
        for _ in range(3):
            self.status_queue.put(failed_status)
            selected = self.scheduler2fetcher.get(timeout=10)
            self.assertIsNotNone(selected)
            self.assertEqual(selected['taskid'], 'taskid')

    def test_a60_disable_recrawl(self):
        '''
        time_queue = [ test_project:taskid ]
        scheduler2fetcher = [ test_project:taskid# ]
        processing = [ test_project:taskid# ]

        Report success with a schedule that no longer carries auto_recrawl;
        the fetcher queue must then stay empty for 5 seconds.
        '''
        plain_schedule = {'age': 0, 'retries': 1}
        success_track = {'fetch': {'ok': True}, 'process': {'ok': True}}
        # expected log: task done test_project:taskid url
        self.status_queue.put(dict(
            taskid='taskid',
            project='test_project',
            url='url',
            schedule=plain_schedule,
            track=success_track,
        ))

        from six.moves import queue as std_queue
        with self.assertRaises(std_queue.Empty):
            self.scheduler2fetcher.get(timeout=5)

    def test_x10_inqueue_limit(self):
        '''
        Create test_inqueue_project with rate 0 and burst 0, push 20 tasks
        for it and expect the size reported over RPC to grow by exactly 10.
        '''
        project_name = 'test_inqueue_project'
        self.projectdb.insert(project_name, dict(
            name='test_inqueue_project',
            group='group',
            status='DEBUG',
            script='import time\nprint(time.time())',
            comments='test project',
            rate=0,
            burst=0,
        ))
        time.sleep(0.1)
        size_before = self.rpc.size()
        for index in range(20):
            queued = {
                'taskid': 'taskid%d' % index,
                'project': 'test_inqueue_project',
                'url': 'url',
                'schedule': {'age': 3000, 'force_update': True},
            }
            self.newtask_queue.put(queued)
        time.sleep(1)
        growth = self.rpc.size() - size_before
        self.assertEqual(growth, 10)

    def test_x20_delete_project(self):
        '''
        Set test_inqueue_project to STOP with group "lock,delete" and, one
        second later, expect the project, its task taskid1 and its entry in
        the 5m sum counter to be gone.
        '''
        project_name = 'test_inqueue_project'
        self.assertIsNotNone(self.projectdb.get(project_name))
        #self.assertIsNotNone(self.taskdb.get_task('test_inqueue_project', 'taskid1'))
        self.projectdb.update(project_name, status="STOP", group="lock,delete")
        time.sleep(1)
        self.assertIsNone(self.projectdb.get(project_name))
        self.taskdb._list_project()
        leftover_task = self.taskdb.get_task(project_name, 'taskid1')
        self.assertIsNone(leftover_task)
        counters = self.rpc.counter('5m', 'sum')
        self.assertNotIn(project_name, counters)

    def test_z10_startup(self):
        '''The object held in self.process must still report being alive.'''
        still_running = self.process.is_alive()
        self.assertTrue(still_running)

    def test_z20_quit(self):
        '''
        Ask for shutdown through the RPC _quit call, expect self.process to
        have stopped 0.2 seconds later, and expect test_project:taskid to be
        stored with status SUCCESS.
        '''
        self.rpc._quit()
        time.sleep(0.2)
        self.assertFalse(self.process.is_alive())
        stored_task = self.taskdb.get_task('test_project', 'taskid')
        self.assertEqual(stored_task['status'], self.taskdb.SUCCESS)
Example #17
0
class TestProcessor(unittest.TestCase):
    """Exercise a ``Processor`` that runs in a background thread.

    Tasks are pushed through ``in_queue`` and the tests inspect what comes
    back on ``status_queue`` and ``newtask_queue``.  The numeric prefixes in
    the test names matter: later tests read state left behind by earlier
    ones (for example ``test_40_index_page`` consumes the new tasks that
    ``test_30_new_task`` leaves on ``newtask_queue``).
    """

    projectdb_path = './data/tests/project.db'

    @classmethod
    def setUpClass(cls):
        """Recreate ./data/tests/ and start the processor thread."""
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(cls.projectdb_path)

        cls.projectdb = get_projectdb()
        cls.in_queue = Queue(10)
        cls.status_queue = Queue(10)
        cls.newtask_queue = Queue(10)
        cls.result_queue = Queue(10)

        def run_processor():
            # The processor gets its own ProjectDB handle instead of
            # sharing cls.projectdb with the test code.
            cls.processor = Processor(get_projectdb(), cls.in_queue,
                                      cls.status_queue, cls.newtask_queue,
                                      cls.result_queue)
            cls.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            cls.processor.run()

        cls.process = run_in_thread(run_processor)
        # Presumably gives the thread time to create cls.processor before
        # the first test reads it.
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Stop the processor thread and remove ./data/tests/.

        The directory is removed even when the thread fails to stop, so a
        failed shutdown does not leave test data behind.
        """
        try:
            if cls.process.is_alive():
                cls.processor.quit()
                cls.process.join(2)
            assert not cls.process.is_alive()
        finally:
            shutil.rmtree('./data/tests/', ignore_errors=True)

    @staticmethod
    def _drain_queue(q):
        """Remove every item from *q* and return the last one (None if empty)."""
        last = None
        while not q.empty():
            last = q.get()
        return last

    def _submit(self, task, fetch_result):
        """Put one (task, fetch_result) pair on in_queue and wait a second."""
        self.in_queue.put((task, fetch_result))
        time.sleep(1)

    @staticmethod
    def _on_start_task(project):
        """Return a fresh on_start task dict for *project*."""
        return {
            "process": {
                "callback": "on_start"
            },
            "project": project,
            "taskid": "data:,on_start",
            "url": "data:,on_start"
        }

    @staticmethod
    def _on_start_fetch_result():
        """Return a fresh, successful fetch result for the on_start task."""
        return {
            "orig_url": "data:,on_start",
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": "data:,on_start",
            "time": 0,
        }

    def test_10_update_project(self):
        """A newly inserted project becomes visible; unknown projects fail."""
        self.assertIsNone(self.processor.project_manager.get('test_project'))
        self.projectdb.insert(
            'test_project', {
                'name': 'test_project',
                'group': 'group',
                'status': 'TODO',
                'script': inspect.getsource(sample_handler),
                'comments': 'test project',
                'rate': 1.0,
                'burst': 10,
            })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(
            self.processor.project_manager.get('test_project'))

        # A task for a project that does not exist must be reported as a
        # processing failure.
        self._submit(self._on_start_task("not_exists"), {})
        self.assertFalse(self.status_queue.empty())
        status = self._drain_queue(self.status_queue)
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertIsNone(self.processor.project_manager.get('not_exists'))

    def test_20_broken_project(self):
        """A project with a truncated script loads but records an exception."""
        self.assertIsNone(
            self.processor.project_manager.get('test_broken_project'))
        self.projectdb.insert(
            'test_broken_project', {
                'name': 'test_broken_project',
                'group': 'group',
                'status': 'DEBUG',
                # Only the first 10 characters of the handler source.
                'script': inspect.getsource(sample_handler)[:10],
                'comments': 'test project',
                'rate': 1.0,
                'burst': 10,
            })
        self.assertIsNone(self.processor.project_manager.get('not_exists'))
        self.assertIsNotNone(
            self.processor.project_manager.get('test_broken_project'))
        project_data = self.processor.project_manager.get(
            'test_broken_project')
        self.assertIsNotNone(project_data.get('exception'))

    def test_30_new_task(self):
        """Processing on_start yields a status and at least one new task."""
        self.assertTrue(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        self._submit(self._on_start_task("test_project"),
                     self._on_start_fetch_result())
        self.assertFalse(self.status_queue.empty())
        self._drain_queue(self.status_queue)
        # newtask_queue is left untouched on purpose: test_40 consumes it.
        self.assertFalse(self.newtask_queue.empty())

    def test_40_index_page(self):
        """Process an index page and check the status and followed links."""
        # Use the first task of the last batch waiting on newtask_queue.
        batch = self._drain_queue(self.newtask_queue)
        self.assertIsNotNone(batch)
        task = batch[0]

        fetch_result = {
            "orig_url": task['url'],
            "content": ("<html><body>"
                        "<a href='http://binux.me'>binux</a>"
                        "<a href='http://binux.me/中文'>binux</a>"
                        "<a href='http://binux.me/1'>1</a>"
                        "<a href='http://binux.me/1'>2</a>"
                        "</body></html>"),
            "headers": {
                'a': 'b',
                'etag': 'tag'
            },
            "status_code": 200,
            "url": task['url'],
            "time": 0,
        }
        self._submit(task, fetch_result)
        self.assertFalse(self.status_queue.empty())
        self.assertFalse(self.newtask_queue.empty())

        status = self.status_queue.get()
        fetch_track = status['track']['fetch']
        process_track = status['track']['process']
        self.assertEqual(fetch_track['ok'], True)
        self.assertEqual(fetch_track['time'], 0)
        self.assertEqual(fetch_track['status_code'], 200)
        self.assertEqual('tag', fetch_track['headers']['etag'])
        self.assertIsNone(fetch_track['content'])
        self.assertEqual(process_track['ok'], True)
        self.assertGreater(process_track['time'], 0)
        # Four anchors, but two of them share the same href.
        self.assertEqual(process_track['follows'], 3)
        self.assertIsNone(process_track['result'])
        self.assertEqual(process_track['logs'], '')
        self.assertIsNone(process_track['exception'])

        tasks = self.newtask_queue.get()
        self.assertEqual(len(tasks), 3)
        self.assertEqual(tasks[0]['url'], 'http://binux.me/')
        # On failure, show the URL that was actually checked.
        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'),
                        tasks[1]['url'])

    def test_50_fetch_error(self):
        """A fetch result carrying an error is reported as a failure."""
        self._drain_queue(self.newtask_queue)
        self._drain_queue(self.status_queue)

        task = {
            "process": {
                "callback": "index_page"
            },
            "project": "test_project",
            "taskid": "data:,test_fetch_error",
            "url": "data:,test_fetch_error"
        }

        fetch_result = {
            "orig_url": task['url'],
            "content": "test_fetch_error",
            "error": "test_fetch_error",
            "headers": {
                'a': 'b',
                'last-modified': '123'
            },
            "status_code": 598,
            "url": task['url'],
            "time": 0,
        }

        self._submit(task, fetch_result)
        self.assertFalse(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        status = self.status_queue.get()
        fetch_track = status['track']['fetch']
        process_track = status['track']['process']
        self.assertEqual(fetch_track['ok'], False)
        self.assertEqual(fetch_track['time'], 0)
        self.assertEqual(fetch_track['status_code'], 598)
        self.assertEqual('123', fetch_track['headers']['last-modified'])
        self.assertIsNotNone(fetch_track['content'])
        self.assertEqual(process_track['ok'], False)
        self.assertGreater(process_track['time'], 0)
        self.assertEqual(process_track['follows'], 0)
        self.assertIsNone(process_track['result'])
        self.assertGreater(len(process_track['logs']), 0)
        self.assertIsNotNone(process_track['exception'])

    def test_60_call_broken_project(self):
        """Running a task of the broken project fails in the process step."""
        self._drain_queue(self.newtask_queue)
        self._drain_queue(self.status_queue)

        self._submit(self._on_start_task("test_broken_project"),
                     self._on_start_fetch_result())
        self.assertFalse(self.status_queue.empty())
        status = self._drain_queue(self.status_queue)
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], False)
        self.assertGreater(len(status['track']['process']['logs']), 0)
        self.assertIsNotNone(status['track']['process']['exception'])
        self.assertTrue(self.newtask_queue.empty())

    def test_70_update_project(self):
        """Check when a changed script is picked up with polling disabled.

        With the check interval set very high, the test expects the repaired
        script to be ignored until a task carries ``project_updatetime``,
        and the re-broken script to take effect once a task carries a
        ``project_md5sum``.
        """
        manager = self.processor.project_manager
        manager.CHECK_PROJECTS_INTERVAL = 1000000
        # Put the short interval back even when an assertion below fails,
        # so later tests are not affected.
        self.addCleanup(setattr, manager, 'CHECK_PROJECTS_INTERVAL', 0.1)
        manager._check_projects()
        self.assertIsNotNone(manager.get('test_broken_project'))
        self._drain_queue(self.newtask_queue)
        self._drain_queue(self.status_queue)

        # The same task dict is resubmitted and mutated between rounds.
        task = self._on_start_task("test_broken_project")
        fetch_result = self._on_start_fetch_result()

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler),
        })

        # not update
        self._submit(task, fetch_result)
        self.assertFalse(self.status_queue.empty())
        status = self._drain_queue(self.status_queue)
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], False)

        # updated
        task['project_updatetime'] = time.time()
        self._submit(task, fetch_result)
        self.assertFalse(self.status_queue.empty())
        status = self._drain_queue(self.status_queue)
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], True)

        self.projectdb.update('test_broken_project', {
            'script': inspect.getsource(sample_handler)[:10],
        })

        # update with md5
        task['project_md5sum'] = 'testmd5'
        del task['project_updatetime']
        self._submit(task, fetch_result)
        self.assertFalse(self.status_queue.empty())
        status = self._drain_queue(self.status_queue)
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['process']['ok'], False)

    @unittest.skipIf(six.PY3, "deprecated feature, not work for PY3")
    def test_80_import_project(self):
        """Projects stored in the database can be imported from ``projects``."""
        for project_name in ('test_project2', 'test_project3'):
            self.projectdb.insert(
                project_name, {
                    'name': 'test_project',
                    'group': 'group',
                    'status': 'TODO',
                    'script': inspect.getsource(sample_handler),
                    'comments': 'test project',
                    'rate': 1.0,
                    'burst': 10,
                })

        # Exercise the three import spellings.
        from projects import test_project
        self.assertIsNotNone(test_project)
        self.assertIsNotNone(test_project.Handler)

        from projects.test_project2 import Handler
        self.assertIsNotNone(Handler)

        import projects.test_project3
        self.assertIsNotNone(projects.test_project3.Handler)
Example #18
0
class TestProcessor(unittest.TestCase):
    projectdb_path = './data/tests/project.db'

    @classmethod
    def setUpClass(cls):
        """Recreate ./data/tests/ and start a Processor in a background thread.

        Everything is stored on the class so that all test methods share the
        same queues, project database handle and processor.
        """
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_projectdb():
            return projectdb.ProjectDB(cls.projectdb_path)
        cls.projectdb = get_projectdb()
        cls.in_queue = Queue(10)
        cls.status_queue = Queue(10)
        cls.newtask_queue = Queue(10)
        cls.result_queue = Queue(10)

        def run_processor():
            # The processor gets its own ProjectDB handle instead of
            # sharing cls.projectdb with the test code.
            cls.processor = Processor(get_projectdb(), cls.in_queue,
                                      cls.status_queue, cls.newtask_queue, cls.result_queue)
            cls.processor.project_manager.CHECK_PROJECTS_INTERVAL = 0.1
            cls.processor.run()
        cls.process = run_in_thread(run_processor)
        # Presumably gives the thread time to create cls.processor before
        # the first test reads it.
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        """Stop the processor thread and remove ./data/tests/.

        The directory is removed even when the thread fails to stop, so a
        failed shutdown does not leave test data behind.
        """
        try:
            if cls.process.is_alive():
                cls.processor.quit()
                # Wait at most 2 seconds for the thread to finish.
                cls.process.join(2)
            assert not cls.process.is_alive()
        finally:
            shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_update_project(self):
        """A newly inserted project becomes visible; unknown projects fail."""
        manager = self.processor.project_manager
        self.assertIsNone(manager.get('test_project'))
        project_row = dict(
            name='test_project',
            group='group',
            status='TODO',
            script=inspect.getsource(sample_handler),
            comments='test project',
            rate=1.0,
            burst=10,
        )
        self.projectdb.insert('test_project', project_row)
        self.assertIsNone(manager.get('not_exists'))
        self.assertIsNotNone(manager.get('test_project'))

        # A task for a project that does not exist, with an empty fetch
        # result, must be reported as a processing failure.
        orphan_task = {
            "process": {"callback": "on_start"},
            "project": "not_exists",
            "taskid": "data:,on_start",
            "url": "data:,on_start",
        }
        self.in_queue.put((orphan_task, {}))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        # Keep only the most recent status.
        while not self.status_queue.empty():
            latest = self.status_queue.get()
        self.assertEqual(latest['track']['process']['ok'], False)
        self.assertIsNone(manager.get('not_exists'))

    def test_20_broken_project(self):
        """A project with a truncated script loads but records an exception."""
        manager = self.processor.project_manager
        broken_name = 'test_broken_project'
        self.assertIsNone(manager.get(broken_name))
        # Only the first 10 characters of the handler source are stored.
        truncated_script = inspect.getsource(sample_handler)[:10]
        self.projectdb.insert(broken_name, dict(
            name='test_broken_project',
            group='group',
            status='DEBUG',
            script=truncated_script,
            comments='test project',
            rate=1.0,
            burst=10,
        ))
        self.assertIsNone(manager.get('not_exists'))
        self.assertIsNotNone(manager.get(broken_name))
        loaded = manager.get(broken_name)
        self.assertIsNotNone(loaded.get('exception'))

    def test_30_new_task(self):
        """Processing on_start yields a status and at least one new task."""
        for idle_queue in (self.status_queue, self.newtask_queue):
            self.assertTrue(idle_queue.empty())

        start_url = "data:,on_start"
        start_task = {
            "process": {"callback": "on_start"},
            "project": "test_project",
            "taskid": start_url,
            "url": start_url,
        }
        fetched = dict(
            orig_url=start_url,
            content="on_start",
            headers={},
            status_code=200,
            url=start_url,
            time=0,
        )
        self.in_queue.put((start_task, fetched))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        # Discard the status messages; only their presence matters here.
        while not self.status_queue.empty():
            self.status_queue.get()
        self.assertFalse(self.newtask_queue.empty())

    def test_40_index_page(self):
        """Process an index page and check the status and followed links.

        The task is taken from the batches already waiting on
        ``newtask_queue`` (the first task of the last batch).
        """
        task = None
        while not self.newtask_queue.empty():
            task = self.newtask_queue.get()[0]
        self.assertIsNotNone(task)

        fetch_result = {
            "orig_url": task['url'],
            "content": (
                "<html><body>"
                "<a href='http://binux.me'>binux</a>"
                "<a href='http://binux.me/中文'>binux</a>"
                "<a href='http://binux.me/1'>1</a>"
                "<a href='http://binux.me/1'>2</a>"
                "</body></html>"
            ),
            "headers": {'a': 'b', 'etag': 'tag'},
            "status_code": 200,
            "url": task['url'],
            "time": 0,
        }
        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        self.assertFalse(self.newtask_queue.empty())

        status = self.status_queue.get()
        self.assertEqual(status['track']['fetch']['ok'], True)
        self.assertEqual(status['track']['fetch']['time'], 0)
        self.assertEqual(status['track']['fetch']['status_code'], 200)
        self.assertEqual('tag', status['track']['fetch']['headers']['etag'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertEqual(status['track']['process']['ok'], True)
        self.assertGreater(status['track']['process']['time'], 0)
        # Four anchors, but two of them share the same href.
        self.assertEqual(status['track']['process']['follows'], 3)
        self.assertIsNone(status['track']['process']['result'])
        self.assertEqual(status['track']['process']['logs'], '')
        self.assertIsNone(status['track']['process']['exception'])

        tasks = self.newtask_queue.get()
        self.assertEqual(len(tasks), 3)
        self.assertEqual(tasks[0]['url'], 'http://binux.me/')
        # On failure, show the URL that was actually checked rather than
        # the URL of the task that was submitted.
        self.assertTrue(tasks[1]['url'].startswith('http://binux.me/%'), tasks[1]['url'])

    def test_50_fetch_error(self):
        """A fetch result carrying an error is reported as a failure."""
        # Start from empty queues: new tasks first, then statuses.
        for stale_queue in (self.newtask_queue, self.status_queue):
            while not stale_queue.empty():
                stale_queue.get()

        error_url = "data:,test_fetch_error"
        task = {
            "process": {"callback": "index_page"},
            "project": "test_project",
            "taskid": error_url,
            "url": error_url,
        }
        failed_fetch = dict(
            orig_url=task['url'],
            content="test_fetch_error",
            error="test_fetch_error",
            headers={'a': 'b', 'last-modified': '123'},
            status_code=598,
            url=task['url'],
            time=0,
        )

        self.in_queue.put((task, failed_fetch))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        self.assertTrue(self.newtask_queue.empty())

        track = self.status_queue.get()['track']
        fetch_track = track['fetch']
        self.assertEqual(fetch_track['ok'], False)
        self.assertEqual(fetch_track['time'], 0)
        self.assertEqual(fetch_track['status_code'], 598)
        self.assertEqual('123', fetch_track['headers']['last-modified'])
        self.assertIsNotNone(fetch_track['content'])
        process_track = track['process']
        self.assertEqual(process_track['ok'], False)
        self.assertGreater(process_track['time'], 0)
        self.assertEqual(process_track['follows'], 0)
        self.assertIsNone(process_track['result'])
        self.assertGreater(len(process_track['logs']), 0)
        self.assertIsNotNone(process_track['exception'])

    def test_60_call_broken_project(self):
        """A task for ``test_broken_project`` fetches fine but fails while processing."""
        # Drain leftovers from earlier tests (new-task queue first, then the
        # status queue).
        for pending in (self.newtask_queue, self.status_queue):
            while not pending.empty():
                pending.get()

        url = "data:,on_start"
        task = {
            "process": {"callback": "on_start"},
            "project": "test_broken_project",
            "taskid": url,
            "url": url,
        }
        fetch_result = {
            "orig_url": url,
            "content": "on_start",
            "headers": {},
            "status_code": 200,
            "url": url,
            "time": 0,
        }

        self.in_queue.put((task, fetch_result))
        time.sleep(1)
        self.assertFalse(self.status_queue.empty())
        # Several reports may have been queued; only the newest one is checked.
        while not self.status_queue.empty():
            status = self.status_queue.get()

        self.assertEqual(status['track']['fetch']['ok'], True)
        process_track = status['track']['process']
        self.assertEqual(process_track['ok'], False)
        self.assertGreater(len(process_track['logs']), 0)
        self.assertIsNotNone(process_track['exception'])
        self.assertTrue(self.newtask_queue.empty())

    def test_70_update_project(self):
        """Check which task hints make the processor pick up a changed script.

        ``CHECK_PROJECTS_INTERVAL`` is raised to a very large value for the
        duration of the test (presumably so that no periodic re-check reloads
        the project behind the test's back -- confirm against the project
        manager).  The script of ``test_broken_project`` is then rewritten in
        the project database and the same task is processed three times:

        * without any hint: processing still fails;
        * with ``project_updatetime``: processing succeeds with the new script;
        * with ``project_md5sum`` (after the script was truncated again):
          processing fails.

        The interval is always reset to ``0.1`` on exit, even when an
        assertion fails, so the override cannot leak into later tests.
        """
        manager = self.processor.project_manager
        manager.CHECK_PROJECTS_INTERVAL = 1000000
        try:
            manager._check_projects()
            self.assertIsNotNone(manager.get('test_broken_project'))
            # clear new task queue
            while not self.newtask_queue.empty():
                self.newtask_queue.get()
            # clear status queue
            while not self.status_queue.empty():
                self.status_queue.get()

            task = {
                "process": {
                    "callback": "on_start"
                },
                "project": "test_broken_project",
                "taskid": "data:,on_start",
                "url": "data:,on_start"
            }
            fetch_result = {
                "orig_url": "data:,on_start",
                "content": "on_start",
                "headers": {},
                "status_code": 200,
                "url": "data:,on_start",
                "time": 0,
            }

            def process_and_get_last_status():
                # Queue the task, wait for it to be handled, and return the
                # newest status report (older ones are discarded).
                self.in_queue.put((task, fetch_result))
                time.sleep(1)
                self.assertFalse(self.status_queue.empty())
                while not self.status_queue.empty():
                    status = self.status_queue.get()
                return status

            self.projectdb.update('test_broken_project', {
                'script': inspect.getsource(sample_handler),
            })

            # not update
            status = process_and_get_last_status()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)

            # updated
            task['project_updatetime'] = time.time()
            status = process_and_get_last_status()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], True)

            # Truncate the script to its first 10 characters.
            self.projectdb.update('test_broken_project', {
                'script': inspect.getsource(sample_handler)[:10],
            })

            # update with md5
            task['project_md5sum'] = 'testmd5'
            del task['project_updatetime']
            status = process_and_get_last_status()
            self.assertEqual(status['track']['fetch']['ok'], True)
            self.assertEqual(status['track']['process']['ok'], False)
        finally:
            # Same value the original test restored on its last line.
            manager.CHECK_PROJECTS_INTERVAL = 0.1

    @unittest.skipIf(six.PY3, "deprecated feature, not work for PY3")
    def test_80_import_project(self):
        """Stored projects can be imported as modules of the ``projects`` package."""
        # Register two more projects under distinct ids; both reuse the sample
        # handler's source as their script.
        for project_id in ('test_project2', 'test_project3'):
            self.projectdb.insert(project_id, {
                'name': 'test_project',
                'group': 'group',
                'status': 'TODO',
                'script': inspect.getsource(sample_handler),
                'comments': 'test project',
                'rate': 1.0,
                'burst': 10,
            })

        # Form 1: import the project module from the package.
        from projects import test_project
        self.assertIsNotNone(test_project)
        self.assertIsNotNone(test_project.Handler)

        # Form 2: import a name out of the project module.
        from projects.test_project2 import Handler
        self.assertIsNotNone(Handler)

        # Form 3: plain dotted import.
        import projects.test_project3
        self.assertIsNotNone(projects.test_project3.Handler)
class TestProcessor(unittest.TestCase):
    """Tests for a ``ResultWorker`` storing results in a ``ResultDB``.

    A worker thread consumes ``(task, result)`` pairs from ``inqueue`` using
    its own ``ResultDB`` handle; the tests inspect the outcome through a
    second handle (``resultdb``) opened on the same path.  Test names carry
    numeric prefixes because later tests rely on rows written by earlier ones.
    """

    # Database file shared by the worker's handle and the test's handle.
    resultdb_path = './data/tests/result.db'

    @classmethod
    def setUpClass(self):
        """Recreate the scratch directory and start the result worker thread."""
        shutil.rmtree('./data/tests/', ignore_errors=True)
        os.makedirs('./data/tests/')

        def get_resultdb():
            return resultdb.ResultDB(self.resultdb_path)
        self.resultdb = get_resultdb()
        self.inqueue = Queue(10)

        def run_result_worker():
            # The worker is created inside the thread and gets its own handle.
            self.result_worker = ResultWorker(get_resultdb(), self.inqueue)
            self.result_worker.run()
        self.process = run_in_thread(run_result_worker)
        # Give the thread time to create ``result_worker`` and start running.
        time.sleep(1)

    @classmethod
    def tearDownClass(self):
        """Stop the worker thread and remove the scratch directory."""
        if self.process.is_alive():
            self.result_worker.quit()
            self.process.join(2)
        assert not self.process.is_alive()
        shutil.rmtree('./data/tests/', ignore_errors=True)

    def test_10_bad_result(self):
        """An empty result for a task without taskid/url must not be stored."""
        self.inqueue.put(({'project': 'test_project'}, {}))
        # Let the worker consume the item first; otherwise the assertions
        # below could pass before the item has been looked at.
        time.sleep(0.1)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_10_bad_result_2(self):
        """A non-empty result for a task without taskid/url must not be stored."""
        self.inqueue.put(({'project': 'test_project'}, {'a': 'b'}))
        # Same reasoning as in test_10_bad_result: wait for the worker.
        time.sleep(0.1)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 0)
        self.assertEqual(self.resultdb.count('test_project'), 0)

    def test_20_insert_result(self):
        """A complete task with a dict result is stored and can be read back."""
        data = {
            'a': 'b'
        }
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, data))
        time.sleep(0.5)
        self.resultdb._list_project()
        self.assertEqual(len(self.resultdb.projects), 1)
        self.assertEqual(self.resultdb.count('test_project'), 1)

        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], data)

    def test_30_overwrite(self):
        """A second result for taskid ``id1`` replaces the one stored before."""
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id1',
            'url': 'url1'
        }, "abc"))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id1')
        self.assertEqual(result['result'], "abc")

    def test_40_insert_list(self):
        """A list result is stored and read back unchanged."""
        self.inqueue.put(({
            'project': 'test_project',
            'taskid': 'id2',
            'url': 'url1'
        }, ['a', 'b']))
        time.sleep(0.1)
        result = self.resultdb.get('test_project', 'id2')
        self.assertEqual(result['result'], ['a', 'b'])
Example #20
0
class TestFetcher(unittest.TestCase):
    """Integration tests for ``Fetcher`` against locally started services.

    ``setUpClass`` launches an httpbin server (port 14887), a ``pyproxy``
    proxy started with a username/password (port 14830) and, when the binary
    can be started, a phantomjs fetcher (port 25555).  The fetcher under test
    is exercised through ``sync_fetch``, through its in/out queues and
    through its XML-RPC interface (port 24444).

    Tests that temporarily reconfigure the shared fetcher restore the
    previous setting in a ``finally`` clause so a failing test cannot change
    the behaviour of the tests that run after it.
    """

    # Template task; each test deep-copies it and fills in ``url``.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start httpbin, the fetcher threads, the proxy and phantomjs."""
        # Imported for its side effects only; presumably registers extra
        # pages (e.g. /pyspider/ajax.html) on the httpbin app -- confirm.
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                      port=14887,
                                                      passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run,
                                                 port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        self.proxy_thread = subprocess.Popen([
            'pyproxy', '--username=binux', '--password=123456', '--port=14830',
            '--debug'
        ],
                                             close_fds=True)
        self.proxy = '127.0.0.1:14830'
        try:
            self.phantomjs = subprocess.Popen([
                'phantomjs',
                os.path.join(os.path.dirname(__file__),
                             '../pyspider/fetcher/phantomjs_fetcher.js'),
                '25555'
            ])
        except OSError:
            # phantomjs could not be started: the phantomjs tests skip themselves.
            self.phantomjs = None
        # Short pause before the first test talks to the helper services.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Stop every helper started in ``setUpClass`` and check ports are closed."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        self.rpc._quit()
        self.thread.join()

        # NOTE(review): 5000 and 23333 are not opened anywhere in this class;
        # presumably default ports of the helpers -- confirm.
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_10_http_get(self):
        """Synchronous GET of ``/get`` echoes the custom header and both cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'),
                      response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'),
                      response.json)

    def test_15_http_post(self):
        """Synchronous POST of ``/post`` sends the body, header and cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        self.assertIsNotNone(response.json, response.content)

        # The bare body "binux" is echoed back as a form key with empty value.
        self.assertEqual(response.json['form'].get('binux'), '')
        self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
        self.assertIn('c=d', response.json['headers'].get('Cookie'),
                      response.json)
        self.assertIn('a=b', response.json['headers'].get('Cookie'),
                      response.json)

    def test_20_dataurl_get(self):
        """A ``data:`` URL is answered with status 200 and its payload as text."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_30_with_queue(self):
        """A task put on ``inqueue`` comes back as ``(task, result)`` on ``outqueue``."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_40_with_rpc(self):
        """The XML-RPC ``fetch`` call returns a msgpack-encoded result."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'data:,hello'
        result = umsgpack.unpackb(self.rpc.fetch(request).data)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.text, 'hello')

    def test_50_base64_data(self):
        """A ``[BASE64-DATA]`` body holding UTF-8 bytes arrives decoded at the server."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        # base64 of the UTF-8 encoding of the Chinese text 中文
        request['fetch']['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)
        self.assertIn(u'中文', response.json['form'], response.json)

    def test_55_base64_data(self):
        """A ``[BASE64-DATA]`` body holding GBK bytes is still posted successfully."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/post'
        request['fetch']['method'] = 'POST'
        # base64 of the GBK encoding of the Chinese text 中文
        request['fetch']['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, response.error)
        self.assertIsNotNone(response.json, response.content)

    def test_60_timeout(self):
        """A 3 s timeout cuts off a 5 s response; the result keeps url and save."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        end_time = time.time()
        self.assertGreater(end_time - start_time, 1.5)
        self.assertLess(end_time - start_time, 4.5)

        response = rebuild_response(result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])

    def test_65_418(self):
        """An unusual status code (418) and its body are passed through."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/status/418'
        self.inqueue.put(request)
        task, result = self.outqueue.get()
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 418)
        self.assertIn('teapot', response.text)

    def test_69_no_phantomjs(self):
        """A phantomjs task yields 501 when the fetcher has no phantomjs proxy."""
        # Decide about skipping before touching shared fetcher state.
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None
        try:
            request = copy.deepcopy(self.sample_task_http)
            request['url'] = self.httpbin + '/get'
            request['fetch']['fetch_type'] = 'phantomjs'
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 501, result)
        finally:
            # Restore even on failure so the following phantomjs tests still
            # reach the real phantomjs fetcher.
            self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        """Fetching ``/get`` through phantomjs forwards the header and cookie."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.save, request['fetch']['save'])
        # The JSON is read from the <pre> element of the returned page.
        data = json.loads(response.doc('pre').text())
        self.assertIsNotNone(data, response.content)
        # Report the parsed ``data`` on failure: it is the object under test.
        self.assertEqual(data['headers'].get('A'), 'b', data)
        self.assertEqual(data['headers'].get('Cookie'), 'c=d', data)

    def test_75_phantomjs_robots(self):
        """With ``robots_txt`` enabled, phantomjs fetching ``/deny`` yields 403."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/deny'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_80_phantomjs_timeout(self):
        """A phantomjs fetch with a 3 s timeout ends with status 599 in 2-5 s."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/delay/5'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['timeout'] = 3
        start_time = time.time()
        result = self.fetcher.sync_fetch(request)
        end_time = time.time()
        self.assertGreater(end_time - start_time, 2)
        self.assertLess(end_time - start_time, 5)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('js_script_result', result)

    def test_90_phantomjs_js_script(self):
        """A ``js_script`` supplied with the task runs in the page."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch'][
            'js_script'] = 'function() { document.write("binux") }'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('binux', result['content'])

    def test_a100_phantomjs_sharp_url(self):
        """The ajax test page is returned in its final ("done") state."""
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/pyspider/ajax.html'
        request['fetch']['fetch_type'] = 'phantomjs'
        request['fetch']['headers']['User-Agent'] = 'pyspider-test'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertNotIn('loading', result['content'])
        self.assertIn('done', result['content'])
        self.assertIn('pyspider-test', result['content'])

    def test_a110_dns_error(self):
        """An unresolvable host gives status 599 with a "resolve" error, sync and queued."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = 'http://www.not-exists-site.com/'
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

        self.inqueue.put(request)
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 599)
        self.assertIn('error', result)
        self.assertIn('resolve', result['error'])

    def test_a120_http_get_with_proxy_fail(self):
        """Going through the proxy without credentials in the URL yields 403."""
        self.fetcher.proxy = self.proxy
        try:
            request = copy.deepcopy(self.sample_task_http)
            request['url'] = self.httpbin + '/get'
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 403, result)
        finally:
            # Never leave the shared fetcher pointing at the proxy.
            self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Going through the proxy with username/password in the URL succeeds."""
        self.fetcher.proxy = self.proxy
        try:
            request = copy.deepcopy(self.sample_task_http)
            request['url'] = (self.httpbin +
                              '/get?username=binux&password=123456')
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 200, result)
            self.assertEqual(response.orig_url, request['url'])
            self.assertEqual(response.save, request['fetch']['save'])
            self.assertIsNotNone(response.json, response.content)
            self.assertEqual(response.json['headers'].get('A'), 'b',
                             response.json)
            self.assertIn('c=d', response.json['headers'].get('Cookie'),
                          response.json)
            self.assertIn('a=b', response.json['headers'].get('Cookie'),
                          response.json)
        finally:
            # Never leave the shared fetcher pointing at the proxy.
            self.fetcher.proxy = None

    def test_a140_redirect(self):
        """A redirect is followed; ``orig_url`` keeps the request URL."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/redirect-to?url=/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request['url'])
        self.assertEqual(response.url, self.httpbin + '/get')

    def test_a150_too_much_redirect(self):
        """Ten chained redirects with default settings end in a 599 error."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 599, result)
        self.assertIn('redirects followed', response.error)

    def test_a160_cookie(self):
        """Cookies set by the server are merged with header and task cookies."""
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/cookies/set?k1=v1&k2=v2'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.cookies, {
            'a': 'b',
            'k1': 'v1',
            'k2': 'v2',
            'c': 'd'
        }, result)

    def test_a170_validate_cert(self):
        """The ``validate_cert`` fetch option is accepted (plain http request)."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['validate_cert'] = False
        request['url'] = self.httpbin + '/get'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a180_max_redirects(self):
        """With ``max_redirects`` set to 10, ten chained redirects succeed."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['max_redirects'] = 10
        request['url'] = self.httpbin + '/redirect/10'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

    def test_a200_robots_txt(self):
        """``/deny`` is fetched when ``robots_txt`` is off and refused (403) when on."""
        request = copy.deepcopy(self.sample_task_http)
        request['fetch']['robots_txt'] = False
        request['url'] = self.httpbin + '/deny'
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)

        request['fetch']['robots_txt'] = True
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 403, result)

    def test_zzzz_issue375(self):
        """A phantomjs proxy address where nothing was started yields status 599."""
        # Decide about skipping before touching shared fetcher state.
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        phantomjs_proxy = self.fetcher.phantomjs_proxy
        # Port 20000 is not opened by setUpClass.
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'
        try:
            request = copy.deepcopy(self.sample_task_http)
            request['url'] = self.httpbin + '/get'
            request['fetch']['fetch_type'] = 'phantomjs'
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 599, result)
        finally:
            self.fetcher.phantomjs_proxy = phantomjs_proxy
Example #21
0
class TestFetcher(unittest.TestCase):
    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "",
        "fetch": {
            "method": "GET",
            "headers": {"Cookie": "a=b", "a": "b"},
            "cookies": {"c": "d"},
            "timeout": 60,
            "save": "abc",
        },
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    @classmethod
    def setUpClass(self):
        """Start httpbin, the fetcher threads, the proxy and phantomjs.

        The statements below start processes and threads on fixed ports; the
        matching shutdown lives in ``tearDownClass``.
        """
        # Imported for its side effects only; presumably registers extra pages
        # (e.g. /pyspider/ajax.html) on the httpbin app -- confirm.
        import tests.data_test_webpage
        import httpbin

        # httpbin serves the HTTP endpoints the tests fetch from.
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = "http://127.0.0.1:14887"

        # Fetcher under test, reachable via queues, sync_fetch and XML-RPC.
        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = Fetcher(self.inqueue, self.outqueue)
        self.fetcher.phantomjs_proxy = "127.0.0.1:25555"
        self.rpc = xmlrpc_client.ServerProxy("http://localhost:%d" % 24444)
        self.xmlrpc_thread = utils.run_in_thread(self.fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(self.fetcher.run)
        # Proxy started with a username/password, used by the proxy tests.
        self.proxy_thread = subprocess.Popen(
            ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"], close_fds=True
        )
        self.proxy = "127.0.0.1:14830"
        try:
            self.phantomjs = subprocess.Popen(
                [
                    "phantomjs",
                    os.path.join(os.path.dirname(__file__), "../pyspider/fetcher/phantomjs_fetcher.js"),
                    "25555",
                ]
            )
        except OSError:
            # phantomjs could not be started: the phantomjs tests skip themselves.
            self.phantomjs = None
        # Short pause before the first test talks to the helper services.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Stop every helper started in ``setUpClass`` and check ports are closed."""
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

        if self.phantomjs:
            self.phantomjs.kill()
            self.phantomjs.wait()
        # Ask the fetcher to quit over XML-RPC, then wait for its run thread.
        self.rpc._quit()
        self.thread.join()

        # NOTE(review): 5000 and 23333 are not opened anywhere in this class;
        # presumably default ports of the helpers -- confirm.
        assert not utils.check_port_open(5000)
        assert not utils.check_port_open(23333)
        assert not utils.check_port_open(24444)
        assert not utils.check_port_open(25555)
        assert not utils.check_port_open(14887)

        time.sleep(1)

    def test_10_http_get(self):
        """Synchronous GET of ``/get`` echoes the custom header and both cookies."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/get"
        fetched = self.fetcher.sync_fetch(req)
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 200, fetched)
        self.assertEqual(resp.orig_url, req["url"])
        self.assertEqual(resp.save, req["fetch"]["save"])
        self.assertIsNotNone(resp.json, resp.content)

        echoed = resp.json["headers"]
        self.assertEqual(echoed.get("A"), "b", resp.json)
        # The Cookie header must carry the task's cookie (c=d) as well as the
        # one given in the explicit header (a=b).
        for pair in ("c=d", "a=b"):
            self.assertIn(pair, echoed.get("Cookie"), resp.json)

    def test_15_http_post(self):
        """Synchronous POST of ``/post`` sends the body, header and cookies."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/post"
        fetch_opts = req["fetch"]
        fetch_opts["method"] = "POST"
        fetch_opts["data"] = "binux"
        fetch_opts["cookies"] = {"c": "d"}
        fetched = self.fetcher.sync_fetch(req)
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.orig_url, req["url"])
        self.assertEqual(resp.save, req["fetch"]["save"])
        self.assertIsNotNone(resp.json, resp.content)

        # The bare body "binux" is echoed back as a form key with empty value.
        self.assertEqual(resp.json["form"].get("binux"), "")
        echoed = resp.json["headers"]
        self.assertEqual(echoed.get("A"), "b", resp.json)
        for pair in ("c=d", "a=b"):
            self.assertIn(pair, echoed.get("Cookie"), resp.json)

    def test_20_dataurl_get(self):
        """A ``data:`` URL is answered with status 200 and its payload as text."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = "data:,hello"
        resp = rebuild_response(self.fetcher.sync_fetch(req))

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.text, "hello")

    def test_30_with_queue(self):
        """A task put on ``inqueue`` comes back as ``(task, result)`` on ``outqueue``."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = "data:,hello"
        self.inqueue.put(req)
        # Blocks until the fetcher thread has produced the result pair.
        _task, fetched = self.outqueue.get()
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.text, "hello")

    def test_40_with_rpc(self):
        """The XML-RPC ``fetch`` call returns a msgpack-encoded result."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = "data:,hello"
        packed = self.rpc.fetch(req).data
        fetched = umsgpack.unpackb(packed)
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.text, "hello")

    def test_50_base64_data(self):
        """A ``[BASE64-DATA]`` body holding UTF-8 bytes arrives decoded at the server."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/post"
        fetch_opts = req["fetch"]
        fetch_opts["method"] = "POST"
        # base64 of the UTF-8 encoding of the Chinese text 中文
        fetch_opts["data"] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"
        self.inqueue.put(req)
        _task, fetched = self.outqueue.get()
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 200, resp.error)
        self.assertIsNotNone(resp.json, resp.content)
        self.assertIn(u"中文", resp.json["form"], resp.json)

    def test_55_base64_data(self):
        """A ``[BASE64-DATA]`` body holding GBK bytes is still posted successfully."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/post"
        fetch_opts = req["fetch"]
        fetch_opts["method"] = "POST"
        # base64 of the GBK encoding of the Chinese text 中文
        fetch_opts["data"] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"
        self.inqueue.put(req)
        _task, fetched = self.outqueue.get()
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 200, resp.error)
        self.assertIsNotNone(resp.json, resp.content)

    def test_60_timeout(self):
        """A 3 s timeout cuts off a 5 s response: the result arrives within 1.5-4.5 s."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/delay/5"
        req["fetch"]["timeout"] = 3

        started = time.time()
        self.inqueue.put(req)
        _task, _fetched = self.outqueue.get()
        elapsed = time.time() - started

        self.assertGreater(elapsed, 1.5)
        self.assertLess(elapsed, 4.5)

    def test_65_418(self):
        """An unusual status code (418) and its body are passed through."""
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/status/418"
        self.inqueue.put(req)
        _task, fetched = self.outqueue.get()
        resp = rebuild_response(fetched)

        self.assertEqual(resp.status_code, 418)
        self.assertIn("teapot", resp.text)

    def test_70_phantomjs_url(self):
        """Fetching ``/get`` with fetch type ``js`` forwards the header and cookie.

        Skipped when phantomjs could not be started in ``setUpClass``.
        """
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        request["fetch"]["fetch_type"] = "js"
        result = self.fetcher.sync_fetch(request)
        response = rebuild_response(result)

        self.assertEqual(response.status_code, 200, result)
        self.assertEqual(response.orig_url, request["url"])
        self.assertEqual(response.save, request["fetch"]["save"])
        # The JSON is read from the <pre> element of the returned page.
        data = json.loads(response.doc("pre").text())
        self.assertIsNotNone(data, response.content)
        # Report the parsed ``data`` on failure: it is the object under test,
        # whereas ``response.json`` is not what these assertions inspect.
        self.assertEqual(data["headers"].get("A"), "b", data)
        self.assertEqual(data["headers"].get("Cookie"), "c=d", data)

    def test_80_phantomjs_timeout(self):
        """A ``js`` fetch of a 5 s page with a 3 s timeout returns within 2-5 s."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/delay/5"
        fetch_opts = req["fetch"]
        fetch_opts["fetch_type"] = "js"
        fetch_opts["timeout"] = 3

        started = time.time()
        # Only the duration is checked; the result itself is not inspected.
        self.fetcher.sync_fetch(req)
        elapsed = time.time() - started

        self.assertGreater(elapsed, 2)
        self.assertLess(elapsed, 5)

    def test_90_phantomjs_js_script(self):
        """A ``js_script`` supplied with the task runs in the page."""
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")
        req = copy.deepcopy(self.sample_task_http)
        req["url"] = self.httpbin + "/html"
        fetch_opts = req["fetch"]
        fetch_opts["fetch_type"] = "js"
        fetch_opts["js_script"] = 'function() { document.write("binux") }'

        fetched = self.fetcher.sync_fetch(req)
        self.assertEqual(fetched["status_code"], 200)
        # The text written by the script must show up in the returned content.
        self.assertIn("binux", fetched["content"])

    def test_a100_phantomjs_sharp_url(self):
        """Fetch ``/pyspider/ajax.html`` with ``fetch_type`` "js".

        Skipped when ``self.phantomjs`` is falsy.  The fetched content must
        contain "done" and the custom User-Agent, and must not contain
        "loading".
        """
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")

        ajax_task = copy.deepcopy(self.sample_task_http)
        ajax_task["url"] = self.httpbin + "/pyspider/ajax.html"
        fetch_options = ajax_task["fetch"]
        fetch_options["fetch_type"] = "js"
        fetch_options["headers"]["User-Agent"] = "pyspider-test"

        fetched = self.fetcher.sync_fetch(ajax_task)

        self.assertEqual(fetched["status_code"], 200)
        self.assertNotIn("loading", fetched["content"])
        self.assertIn("done", fetched["content"])
        self.assertIn("pyspider-test", fetched["content"])

    def test_a110_dns_error(self):
        """Fetch an unresolvable host via ``sync_fetch`` and via the queues.

        Both paths must report status 599 with an ``error`` entry that
        mentions "resolve".
        """

        def expect_resolve_failure(outcome):
            # Shared expectations for the synchronous and the queued result.
            self.assertEqual(outcome["status_code"], 599)
            self.assertIn("error", outcome)
            self.assertIn("resolve", outcome["error"])

        bad_host_task = copy.deepcopy(self.sample_task_http)
        bad_host_task["url"] = "http://www.not-exists-site.com/"

        expect_resolve_failure(self.fetcher.sync_fetch(bad_host_task))

        self.inqueue.put(bad_host_task)
        _task, queued_outcome = self.outqueue.get()
        expect_resolve_failure(queued_outcome)

    def test_a120_http_get_with_proxy_fail(self):
        """Fetch ``/get`` through ``self.proxy`` and expect a 403.

        The request URL carries no ``username``/``password`` query
        parameters (compare ``test_a130_http_get_with_proxy_ok``).

        ``self.fetcher`` is shared by every test in the class, so its
        ``proxy`` attribute is cleared in a ``finally`` block; otherwise a
        failing assertion here would leave the proxy set for later tests.
        """
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"

        self.fetcher.proxy = self.proxy
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 403, result)
        finally:
            # Always undo the shared-state change, even when the test fails.
            self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Fetch ``/get`` through ``self.proxy`` with credentials in the URL.

        The URL carries ``username=binux&password=123456`` as query
        parameters; the fetch must return 200 and echo back the task's
        ``A`` header and both cookie pairs.

        ``self.fetcher`` is shared by every test in the class, so its
        ``proxy`` attribute is cleared in a ``finally`` block; otherwise a
        failing assertion here would leave the proxy set for later tests.
        """
        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get?username=binux&password=123456"

        self.fetcher.proxy = self.proxy
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 200, result)
            self.assertEqual(response.orig_url, request["url"])
            self.assertEqual(response.save, request["fetch"]["save"])
            self.assertIsNotNone(response.json, response.content)
            self.assertEqual(response.json["headers"].get("A"), "b", response.json)
            self.assertIn("c=d", response.json["headers"].get("Cookie"), response.json)
            self.assertIn("a=b", response.json["headers"].get("Cookie"), response.json)
        finally:
            # Always undo the shared-state change, even when the test fails.
            self.fetcher.proxy = None

    def test_a140_redirect(self):
        """Follow ``/redirect-to?url=/get`` and check the final URL.

        ``orig_url`` must stay the requested URL while ``url`` must be the
        redirect target ``/get``.
        """
        redirect_task = copy.deepcopy(self.sample_task_http)
        redirect_task["url"] = self.httpbin + "/redirect-to?url=/get"
        final_url = self.httpbin + "/get"

        raw = self.fetcher.sync_fetch(redirect_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)
        self.assertEqual(reply.orig_url, redirect_task["url"])
        self.assertEqual(reply.url, final_url)

    def test_a150_too_much_redirect(self):
        """Fetch ``/redirect/10`` without raising ``max_redirects``.

        The result must be a 599 whose error text contains
        "redirects followed".
        """
        chain_task = copy.deepcopy(self.sample_task_http)
        chain_task["url"] = self.httpbin + "/redirect/10"

        raw = self.fetcher.sync_fetch(chain_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 599, raw)
        self.assertIn("redirects followed", reply.error)

    def test_a160_cookie(self):
        """Fetch ``/cookies/set?k1=v1&k2=v2`` and check the merged cookies.

        The response cookies must hold the two cookies set by the URL plus
        the ``a=b`` and ``c=d`` pairs from the sample task.
        """
        expected_cookies = {"a": "b", "k1": "v1", "k2": "v2", "c": "d"}

        cookie_task = copy.deepcopy(self.sample_task_http)
        cookie_task["url"] = self.httpbin + "/cookies/set?k1=v1&k2=v2"

        raw = self.fetcher.sync_fetch(cookie_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)
        self.assertEqual(reply.cookies, expected_cookies, raw)

    def test_a170_validate_cert(self):
        """Fetch ``/get`` with ``validate_cert`` set to False and expect 200."""
        cert_task = copy.deepcopy(self.sample_task_http)
        cert_task["url"] = self.httpbin + "/get"
        cert_task["fetch"]["validate_cert"] = False

        raw = self.fetcher.sync_fetch(cert_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)

    def test_a180_max_redirects(self):
        """Fetch ``/redirect/10`` with ``max_redirects`` of 10 and expect 200."""
        chain_task = copy.deepcopy(self.sample_task_http)
        chain_task["url"] = self.httpbin + "/redirect/10"
        chain_task["fetch"]["max_redirects"] = 10

        raw = self.fetcher.sync_fetch(chain_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)

    def test_a200_robots_txt(self):
        """Fetch ``/deny`` with ``robots_txt`` off, then on.

        With the option False the fetch must return 200; with it True the
        same request must return 403.
        """
        deny_task = copy.deepcopy(self.sample_task_http)
        deny_task["url"] = self.httpbin + "/deny"

        # Same request object is reused; only the robots_txt flag flips.
        for obey_robots, expected_status in ((False, 200), (True, 403)):
            deny_task["fetch"]["robots_txt"] = obey_robots
            raw = self.fetcher.sync_fetch(deny_task)
            reply = rebuild_response(raw)

            self.assertEqual(reply.status_code, expected_status, raw)

    def test_zzzz_issue375(self):
        """A "js" fetch must yield 599 when ``phantomjs_proxy`` is 127.0.0.1:20000.

        Skipped when ``self.phantomjs`` is falsy.  The skip check runs
        *before* the shared fetcher is modified, and the original
        ``phantomjs_proxy`` is restored in a ``finally`` block, so neither
        a skip nor a failing assertion leaves the fetcher pointing at the
        substitute endpoint.
        """
        if not self.phantomjs:
            raise unittest.SkipTest("no phantomjs")

        request = copy.deepcopy(self.sample_task_http)
        request["url"] = self.httpbin + "/get"
        request["fetch"]["fetch_type"] = "js"

        phantomjs_proxy = self.fetcher.phantomjs_proxy
        # NOTE(review): presumably nothing listens on port 20000 - confirm.
        self.fetcher.phantomjs_proxy = "127.0.0.1:20000"
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 599, result)
        finally:
            # Put the real endpoint back regardless of the outcome.
            self.fetcher.phantomjs_proxy = phantomjs_proxy
Example #22
0
class TestFetcher(unittest.TestCase):
    """Tests that drive a ``Fetcher`` via ``sync_fetch``, its queues and XML-RPC.

    The test methods build their requests from ``sample_task_http`` and
    send them to the local services started in ``setUpClass``.
    """

    # Template task: each test deep-copies it and fills in 'url' (and any
    # extra 'fetch' options) before handing it to the fetcher.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': '',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b'
            },
            'cookies': {
                'c': 'd',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(self):
        """Start the helper services and create the shared ``Fetcher``.

        Launches httpbin on port 14887, builds the fetcher with two
        10-slot queues, starts its XML-RPC server on port 24444 and its
        run loop in threads, then spawns ``pyproxy`` on port 14830 and
        phantomjs on port 25555.  ``self.phantomjs`` is None when the
        phantomjs executable cannot be started, which the phantomjs
        tests use as their skip condition.
        """
        # NOTE(review): imported only for its side effects - presumably it
        # registers extra test pages on the httpbin app; confirm.
        import tests.data_test_webpage
        import httpbin

        self.httpbin_thread = utils.run_in_subprocess(
            httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'

        self.inqueue = Queue(10)
        self.outqueue = Queue(10)
        self.fetcher = fetcher = Fetcher(self.inqueue, self.outqueue)
        fetcher.phantomjs_proxy = '127.0.0.1:25555'
        self.rpc = xmlrpc_client.ServerProxy('http://localhost:%d' % 24444)
        self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=24444)
        self.thread = utils.run_in_thread(fetcher.run)

        proxy_command = [
            'pyproxy',
            '--username=binux',
            '--password=123456',
            '--port=14830',
            '--debug',
        ]
        self.proxy_thread = subprocess.Popen(proxy_command, close_fds=True)
        self.proxy = '127.0.0.1:14830'

        fetcher_script = os.path.join(
            os.path.dirname(__file__),
            '../pyspider/fetcher/phantomjs_fetcher.js')
        try:
            self.phantomjs = subprocess.Popen(['phantomjs', fetcher_script, '25555'])
        except OSError:
            # Could not launch phantomjs: dependent tests skip themselves.
            self.phantomjs = None
        # Short pause after launching the helpers before any test runs.
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        """Stop the helpers started in ``setUpClass`` and check the ports.

        Terminates the proxy and httpbin processes, kills phantomjs if it
        was started, calls ``_quit`` on the XML-RPC proxy and joins the
        fetcher thread.  Afterwards each of the listed ports must no
        longer be open.
        """
        proxy_process = self.proxy_thread
        proxy_process.terminate()
        proxy_process.wait()

        httpbin_process = self.httpbin_thread
        httpbin_process.terminate()
        httpbin_process.join()

        phantomjs_process = self.phantomjs
        if phantomjs_process:
            phantomjs_process.kill()
            phantomjs_process.wait()

        self.rpc._quit()
        self.thread.join()

        for port in (5000, 23333, 24444, 25555, 14887):
            assert not utils.check_port_open(port)

        time.sleep(1)

    def test_10_http_get(self):
        """Plain GET of ``/get`` through ``sync_fetch``.

        Checks the status code, the original URL, the ``save``
        passthrough, and that the echoed headers hold the task's ``A``
        header and both cookie pairs.
        """
        get_task = copy.deepcopy(self.sample_task_http)
        get_task['url'] = self.httpbin+'/get'

        raw = self.fetcher.sync_fetch(get_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)
        self.assertEqual(reply.orig_url, get_task['url'])
        self.assertEqual(reply.save, get_task['fetch']['save'])
        self.assertIsNotNone(reply.json, reply.content)

        self.assertEqual(reply.json['headers'].get('A'), 'b', reply.json)
        for cookie_pair in ('c=d', 'a=b'):
            self.assertIn(cookie_pair, reply.json['headers'].get('Cookie'), reply.json)

    def test_15_http_post(self):
        """POST the body ``binux`` to ``/post`` through ``sync_fetch``.

        The echoed form must contain the key ``binux`` with an empty
        value, and the echoed headers must hold the task's ``A`` header
        and both cookie pairs.
        """
        post_task = copy.deepcopy(self.sample_task_http)
        post_task['url'] = self.httpbin+'/post'
        fetch_options = post_task['fetch']
        fetch_options['method'] = 'POST'
        fetch_options['data'] = 'binux'
        fetch_options['cookies'] = {'c': 'd'}

        raw = self.fetcher.sync_fetch(post_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply.orig_url, post_task['url'])
        self.assertEqual(reply.save, post_task['fetch']['save'])
        self.assertIsNotNone(reply.json, reply.content)

        self.assertEqual(reply.json['form'].get('binux'), '')
        self.assertEqual(reply.json['headers'].get('A'), 'b', reply.json)
        for cookie_pair in ('c=d', 'a=b'):
            self.assertIn(cookie_pair, reply.json['headers'].get('Cookie'), reply.json)

    def test_20_dataurl_get(self):
        """Fetch the data URL ``data:,hello`` through ``sync_fetch``.

        The response must have status 200 and the text ``hello``.
        """
        data_task = copy.deepcopy(self.sample_task_http)
        data_task['url'] = 'data:,hello'

        reply = rebuild_response(self.fetcher.sync_fetch(data_task))

        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply.text, 'hello')

    def test_30_with_queue(self):
        """Fetch the data URL ``data:,hello`` through the in/out queues.

        The result popped from ``outqueue`` must have status 200 and the
        text ``hello``.
        """
        data_task = copy.deepcopy(self.sample_task_http)
        data_task['url'] = 'data:,hello'

        self.inqueue.put(data_task)
        _task, fetched = self.outqueue.get()
        reply = rebuild_response(fetched)

        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply.text, 'hello')

    def test_40_with_rpc(self):
        """Fetch the data URL ``data:,hello`` through the XML-RPC ``fetch`` call.

        The ``data`` attribute of the RPC return value is unpacked with
        umsgpack; the rebuilt response must have status 200 and the text
        ``hello``.
        """
        data_task = copy.deepcopy(self.sample_task_http)
        data_task['url'] = 'data:,hello'

        rpc_reply = self.rpc.fetch(data_task)
        fetched = umsgpack.unpackb(rpc_reply.data)
        reply = rebuild_response(fetched)

        self.assertEqual(reply.status_code, 200)
        self.assertEqual(reply.text, 'hello')

    def test_50_base64_data(self):
        """POST a ``[BASE64-DATA]`` wrapped UTF-8 body to ``/post`` via the queues.

        The echoed form must contain the decoded text as a key.
        """
        post_task = copy.deepcopy(self.sample_task_http)
        post_task['url'] = self.httpbin+'/post'
        fetch_options = post_task['fetch']
        fetch_options['method'] = 'POST'
        # base64 of the UTF-8 bytes for 中文
        fetch_options['data'] = "[BASE64-DATA]5Lit5paH[/BASE64-DATA]"

        self.inqueue.put(post_task)
        _task, fetched = self.outqueue.get()
        reply = rebuild_response(fetched)

        self.assertEqual(reply.status_code, 200, reply.error)
        self.assertIsNotNone(reply.json, reply.content)
        self.assertIn(u'中文', reply.json['form'], reply.json)

    def test_55_base64_data(self):
        """POST a ``[BASE64-DATA]`` wrapped GBK body to ``/post`` via the queues.

        Only the status code and the presence of a JSON body are
        asserted; the echoed form is not inspected.
        """
        post_task = copy.deepcopy(self.sample_task_http)
        post_task['url'] = self.httpbin+'/post'
        fetch_options = post_task['fetch']
        fetch_options['method'] = 'POST'
        # base64 of the GBK bytes for 中文
        fetch_options['data'] = "[BASE64-DATA]1tDOxA==[/BASE64-DATA]"

        self.inqueue.put(post_task)
        _task, fetched = self.outqueue.get()
        reply = rebuild_response(fetched)

        self.assertEqual(reply.status_code, 200, reply.error)
        self.assertIsNotNone(reply.json, reply.content)

    def test_60_timeout(self):
        """Queue a fetch of ``/delay/5`` with a 3 second timeout.

        The round trip through the queues must take more than 1.5 and
        less than 4.5 seconds, and the result must still carry the
        original URL and the ``save`` value.
        """
        slow_task = copy.deepcopy(self.sample_task_http)
        slow_task['url'] = self.httpbin+'/delay/5'
        slow_task['fetch']['timeout'] = 3

        started = time.time()
        self.inqueue.put(slow_task)
        _task, fetched = self.outqueue.get()
        elapsed = time.time() - started

        self.assertGreater(elapsed, 1.5)
        self.assertLess(elapsed, 4.5)

        reply = rebuild_response(fetched)
        self.assertEqual(reply.orig_url, slow_task['url'])
        self.assertEqual(reply.save, slow_task['fetch']['save'])

    def test_65_418(self):
        """Queue a request for ``/status/418`` and check the reply.

        The result popped from ``outqueue`` must carry status 418 and a
        body mentioning "teapot".
        """
        teapot_task = copy.deepcopy(self.sample_task_http)
        teapot_task['url'] = self.httpbin+'/status/418'

        self.inqueue.put(teapot_task)
        _task, fetched = self.outqueue.get()
        reply = rebuild_response(fetched)

        self.assertEqual(reply.status_code, 418)
        self.assertIn('teapot', reply.text)

    def test_69_no_phantomjs(self):
        """A "phantomjs" fetch must yield 501 when ``phantomjs_proxy`` is None.

        Skipped when ``self.phantomjs`` is falsy.  The skip check runs
        *before* the shared fetcher is modified, and the original
        ``phantomjs_proxy`` is restored in a ``finally`` block, so neither
        a skip nor a failing assertion leaves the fetcher without its
        phantomjs endpoint for the tests that follow.
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'

        phantomjs_proxy = self.fetcher.phantomjs_proxy
        self.fetcher.phantomjs_proxy = None
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 501, result)
        finally:
            # Put the real endpoint back regardless of the outcome.
            self.fetcher.phantomjs_proxy = phantomjs_proxy

    def test_70_phantomjs_url(self):
        """Fetch ``/get`` with ``fetch_type`` "phantomjs" and verify the echo.

        Skipped when ``self.phantomjs`` is falsy.  Asserts the status code,
        the original URL, the ``save`` passthrough, and the ``A`` / ``Cookie``
        headers found in the JSON echoed back by the page.
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        js_task = copy.deepcopy(self.sample_task_http)
        js_task['url'] = self.httpbin + '/get'
        js_task['fetch']['fetch_type'] = 'phantomjs'

        raw = self.fetcher.sync_fetch(js_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)
        self.assertEqual(reply.orig_url, js_task['url'])
        self.assertEqual(reply.save, js_task['fetch']['save'])

        # The JSON payload is read out of the <pre> element of the page.
        echoed = json.loads(reply.doc('pre').text())
        self.assertIsNotNone(echoed, reply.content)
        echoed_headers = echoed['headers']
        self.assertEqual(echoed_headers.get('A'), 'b', reply.json)
        self.assertEqual(echoed_headers.get('Cookie'), 'c=d', reply.json)

    def test_75_phantomjs_robots(self):
        """Fetch ``/deny`` via phantomjs with ``robots_txt`` enabled; expect 403.

        Skipped when ``self.phantomjs`` is falsy.
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        deny_task = copy.deepcopy(self.sample_task_http)
        deny_task['url'] = self.httpbin + '/deny'
        fetch_options = deny_task['fetch']
        fetch_options['fetch_type'] = 'phantomjs'
        fetch_options['robots_txt'] = True

        raw = self.fetcher.sync_fetch(deny_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 403, raw)

    def test_80_phantomjs_timeout(self):
        """Check that a phantomjs fetch of ``/delay/5`` honours a 3 s timeout.

        Skipped when ``self.phantomjs`` is falsy.  ``sync_fetch`` must take
        more than 2 and less than 5 seconds, and the result must have
        status 599 and contain a ``js_script_result`` entry.
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        slow_task = copy.deepcopy(self.sample_task_http)
        slow_task['url'] = self.httpbin+'/delay/5'
        slow_task['fetch']['fetch_type'] = 'phantomjs'
        slow_task['fetch']['timeout'] = 3

        started = time.time()
        fetched = self.fetcher.sync_fetch(slow_task)
        elapsed = time.time() - started

        self.assertGreater(elapsed, 2)
        self.assertLess(elapsed, 5)
        self.assertEqual(fetched['status_code'], 599)
        self.assertIn('js_script_result', fetched)

    def test_90_phantomjs_js_script(self):
        """Run a ``js_script`` on ``/html`` via phantomjs and look for its output.

        Skipped when ``self.phantomjs`` is falsy.  The script writes
        "binux" into the document; the fetched content must contain it
        and the status code must be 200.
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        script_task = copy.deepcopy(self.sample_task_http)
        script_task['url'] = self.httpbin + '/html'
        fetch_options = script_task['fetch']
        fetch_options['fetch_type'] = 'phantomjs'
        fetch_options['js_script'] = 'function() { document.write("binux") }'

        fetched = self.fetcher.sync_fetch(script_task)

        self.assertEqual(fetched['status_code'], 200)
        self.assertIn('binux', fetched['content'])

    def test_a100_phantomjs_sharp_url(self):
        """Fetch ``/pyspider/ajax.html`` with ``fetch_type`` "phantomjs".

        Skipped when ``self.phantomjs`` is falsy.  The fetched content must
        contain "done" and the custom User-Agent, and must not contain
        "loading".
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        ajax_task = copy.deepcopy(self.sample_task_http)
        ajax_task['url'] = self.httpbin+'/pyspider/ajax.html'
        fetch_options = ajax_task['fetch']
        fetch_options['fetch_type'] = 'phantomjs'
        fetch_options['headers']['User-Agent'] = 'pyspider-test'

        fetched = self.fetcher.sync_fetch(ajax_task)

        self.assertEqual(fetched['status_code'], 200)
        self.assertNotIn('loading', fetched['content'])
        self.assertIn('done', fetched['content'])
        self.assertIn('pyspider-test', fetched['content'])

    def test_a110_dns_error(self):
        """Fetch an unresolvable host via ``sync_fetch`` and via the queues.

        Both paths must report status 599 with an ``error`` entry that
        mentions "resolve".
        """

        def expect_resolve_failure(outcome):
            # Shared expectations for the synchronous and the queued result.
            self.assertEqual(outcome['status_code'], 599)
            self.assertIn('error', outcome)
            self.assertIn('resolve', outcome['error'])

        bad_host_task = copy.deepcopy(self.sample_task_http)
        bad_host_task['url'] = 'http://www.not-exists-site.com/'

        expect_resolve_failure(self.fetcher.sync_fetch(bad_host_task))

        self.inqueue.put(bad_host_task)
        _task, queued_outcome = self.outqueue.get()
        expect_resolve_failure(queued_outcome)

    def test_a120_http_get_with_proxy_fail(self):
        """Fetch ``/get`` through ``self.proxy`` and expect a 403.

        The request URL carries no ``username``/``password`` query
        parameters (compare ``test_a130_http_get_with_proxy_ok``).

        ``self.fetcher`` is shared by every test in the class, so its
        ``proxy`` attribute is cleared in a ``finally`` block; otherwise a
        failing assertion here would leave the proxy set for later tests.
        """
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get'

        self.fetcher.proxy = self.proxy
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 403, result)
        finally:
            # Always undo the shared-state change, even when the test fails.
            self.fetcher.proxy = None

    def test_a130_http_get_with_proxy_ok(self):
        """Fetch ``/get`` through ``self.proxy`` with credentials in the URL.

        The URL carries ``username=binux&password=123456`` as query
        parameters, the same values ``pyproxy`` is launched with in
        ``setUpClass``; the fetch must return 200 and echo back the task's
        ``A`` header and both cookie pairs.

        ``self.fetcher`` is shared by every test in the class, so its
        ``proxy`` attribute is cleared in a ``finally`` block; otherwise a
        failing assertion here would leave the proxy set for later tests.
        """
        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin+'/get?username=binux&password=123456'

        self.fetcher.proxy = self.proxy
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 200, result)
            self.assertEqual(response.orig_url, request['url'])
            self.assertEqual(response.save, request['fetch']['save'])
            self.assertIsNotNone(response.json, response.content)
            self.assertEqual(response.json['headers'].get('A'), 'b', response.json)
            self.assertIn('c=d', response.json['headers'].get('Cookie'), response.json)
            self.assertIn('a=b', response.json['headers'].get('Cookie'), response.json)
        finally:
            # Always undo the shared-state change, even when the test fails.
            self.fetcher.proxy = None

    def test_a140_redirect(self):
        """Follow ``/redirect-to?url=/get`` and check the final URL.

        ``orig_url`` must stay the requested URL while ``url`` must be the
        redirect target ``/get``.
        """
        redirect_task = copy.deepcopy(self.sample_task_http)
        redirect_task['url'] = self.httpbin+'/redirect-to?url=/get'
        final_url = self.httpbin+'/get'

        raw = self.fetcher.sync_fetch(redirect_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)
        self.assertEqual(reply.orig_url, redirect_task['url'])
        self.assertEqual(reply.url, final_url)

    def test_a150_too_much_redirect(self):
        """Fetch ``/redirect/10`` without raising ``max_redirects``.

        The result must be a 599 whose error text contains
        "redirects followed".
        """
        chain_task = copy.deepcopy(self.sample_task_http)
        chain_task['url'] = self.httpbin+'/redirect/10'

        raw = self.fetcher.sync_fetch(chain_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 599, raw)
        self.assertIn('redirects followed', reply.error)

    def test_a160_cookie(self):
        """Fetch ``/cookies/set?k1=v1&k2=v2`` and check the merged cookies.

        The response cookies must hold the two cookies set by the URL plus
        the ``a=b`` and ``c=d`` pairs from the sample task.
        """
        expected_cookies = {'a': 'b', 'k1': 'v1', 'k2': 'v2', 'c': 'd'}

        cookie_task = copy.deepcopy(self.sample_task_http)
        cookie_task['url'] = self.httpbin+'/cookies/set?k1=v1&k2=v2'

        raw = self.fetcher.sync_fetch(cookie_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)
        self.assertEqual(reply.cookies, expected_cookies, raw)

    def test_a170_validate_cert(self):
        """Fetch ``/get`` with ``validate_cert`` set to False and expect 200."""
        cert_task = copy.deepcopy(self.sample_task_http)
        cert_task['url'] = self.httpbin+'/get'
        cert_task['fetch']['validate_cert'] = False

        raw = self.fetcher.sync_fetch(cert_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)

    def test_a180_max_redirects(self):
        """Fetch ``/redirect/10`` with ``max_redirects`` of 10 and expect 200."""
        chain_task = copy.deepcopy(self.sample_task_http)
        chain_task['url'] = self.httpbin+'/redirect/10'
        chain_task['fetch']['max_redirects'] = 10

        raw = self.fetcher.sync_fetch(chain_task)
        reply = rebuild_response(raw)

        self.assertEqual(reply.status_code, 200, raw)

    def test_a200_robots_txt(self):
        """Fetch ``/deny`` with ``robots_txt`` off, then on.

        With the option False the fetch must return 200; with it True the
        same request must return 403.
        """
        deny_task = copy.deepcopy(self.sample_task_http)
        deny_task['url'] = self.httpbin+'/deny'

        # Same request object is reused; only the robots_txt flag flips.
        for obey_robots, expected_status in ((False, 200), (True, 403)):
            deny_task['fetch']['robots_txt'] = obey_robots
            raw = self.fetcher.sync_fetch(deny_task)
            reply = rebuild_response(raw)

            self.assertEqual(reply.status_code, expected_status, raw)

    def test_zzzz_issue375(self):
        """A phantomjs fetch must yield 599 when ``phantomjs_proxy`` is 127.0.0.1:20000.

        Skipped when ``self.phantomjs`` is falsy.  The skip check runs
        *before* the shared fetcher is modified, and the original
        ``phantomjs_proxy`` is restored in a ``finally`` block, so neither
        a skip nor a failing assertion leaves the fetcher pointing at the
        substitute endpoint.
        """
        if not self.phantomjs:
            raise unittest.SkipTest('no phantomjs')

        request = copy.deepcopy(self.sample_task_http)
        request['url'] = self.httpbin + '/get'
        request['fetch']['fetch_type'] = 'phantomjs'

        phantomjs_proxy = self.fetcher.phantomjs_proxy
        # NOTE(review): presumably nothing listens on port 20000 - confirm.
        self.fetcher.phantomjs_proxy = '127.0.0.1:20000'
        try:
            result = self.fetcher.sync_fetch(request)
            response = rebuild_response(result)

            self.assertEqual(response.status_code, 599, result)
        finally:
            # Put the real endpoint back regardless of the outcome.
            self.fetcher.phantomjs_proxy = phantomjs_proxy