Example #1
0
def run_processor(g=g):
    from pyspider.processor import Processor
    processor = Processor(projectdb=g.projectdb,
            inqueue=g.fetcher2processor, status_queue=g.status_queue,
            newtask_queue=g.newtask_queue, result_queue=g.processor2result)
    g.processor = processor
    processor.run()
Example #2
0
 def setUpClass(self):
     self.projectdb = ProjectDB([
         os.path.join(os.path.dirname(__file__),
                      'data_fetcher_processor_handler.py')
     ])
     self.fetcher = Fetcher(None, None, async=False)
     self.status_queue = Queue()
     self.newtask_queue = Queue()
     self.result_queue = Queue()
     self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                   port=14887,
                                                   passthrough_errors=False)
     self.httpbin = 'http://127.0.0.1:14887'
     self.proxy_thread = subprocess.Popen([
         'pyproxy', '--username=binux', '--password=123456', '--port=14830',
         '--debug'
     ],
                                          close_fds=True)
     self.proxy = '127.0.0.1:14830'
     self.processor = Processor(projectdb=self.projectdb,
                                inqueue=None,
                                status_queue=self.status_queue,
                                newtask_queue=self.newtask_queue,
                                result_queue=self.result_queue)
     self.project_name = 'data_fetcher_processor_handler'
     time.sleep(0.5)
Example #3
0
def run_processor(g=g):
    from pyspider.processor import Processor
    processor = Processor(projectdb=g.projectdb,
                          inqueue=g.fetcher2processor,
                          status_queue=g.status_queue,
                          newtask_queue=g.newtask_queue,
                          result_queue=g.processor2result)
    g.processor = processor
    processor.run()
Example #4
0
def processor(ctx):
    g = ctx.obj
    from pyspider.processor import Processor
    processor = Processor(projectdb=g.projectdb,
            inqueue=g.fetcher2processor, status_queue=g.status_queue,
            newtask_queue=g.newtask_queue, result_queue=g.processor2result)
    g.instances.append(processor)
    
    processor.run()
Example #5
0
def processor(ctx, Processor=Processor):
    g = ctx.obj
    processor = Processor(projectdb=g.projectdb,
                          inqueue=g.fetcher2processor, status_queue=g.status_queue,
                          newtask_queue=g.newtask_queue, result_queue=g.processor2result)

    g.instances.append(processor)
    if g.get('testing_mode'):
        return processor

    processor.run()
Example #6
0
 def setUpClass(self):
     self.projectdb = ProjectDB([
         os.path.join(os.path.dirname(__file__),
                      'data_fetcher_processor_handler.py')
     ])
     self.fetcher = Fetcher(None, None, async=False)
     self.status_queue = Queue()
     self.newtask_queue = Queue()
     self.result_queue = Queue()
     self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                   port=14887)
     self.httpbin = 'http://127.0.0.1:14887'
     self.processor = Processor(projectdb=self.projectdb,
                                inqueue=None,
                                status_queue=self.status_queue,
                                newtask_queue=self.newtask_queue,
                                result_queue=self.result_queue)
     self.project_name = 'data_fetcher_processor_handler'
     time.sleep(0.5)
 def setUpClass(self):
     self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')])
     self.fetcher = Fetcher(None, None, async=False)
     self.status_queue = Queue()
     self.newtask_queue = Queue()
     self.result_queue = Queue()
     self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
     self.httpbin = 'http://127.0.0.1:14887'
     self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                           '--password=123456', '--port=14830',
                                           '--debug'], close_fds=True)
     self.proxy = '127.0.0.1:14830'
     self.processor = Processor(projectdb=self.projectdb,
                                inqueue=None,
                                status_queue=self.status_queue,
                                newtask_queue=self.newtask_queue,
                                result_queue=self.result_queue)
     self.project_name = 'data_fetcher_processor_handler'
     time.sleep(0.5)
class TestFetcherProcessor(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')])
        self.fetcher = Fetcher(None, None, async=False)
        self.status_queue = Queue()
        self.newtask_queue = Queue()
        self.result_queue = Queue()
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False)
        self.httpbin = 'http://127.0.0.1:14887'
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        self.processor = Processor(projectdb=self.projectdb,
                                   inqueue=None,
                                   status_queue=self.status_queue,
                                   newtask_queue=self.newtask_queue,
                                   result_queue=self.result_queue)
        self.project_name = 'data_fetcher_processor_handler'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

    def crawl(self, url=None, track=None, **kwargs):
        if url is None and kwargs.get('callback'):
            url = dataurl.encode(utils.text(kwargs.get('callback')))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data['instance']
        instance._reset()
        task = instance.crawl(url, **kwargs)
        if isinstance(task, list):
            task = task[0]
        task['track'] = track
        result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result

    def status_ok(self, status, type):
        if not status:
            return False
        return status.get('track', {}).get(type, {}).get('ok', False)

    def assertStatusOk(self, status):
        self.assertTrue(self.status_ok(status, 'fetch'), status.get('track', {}).get('fetch'))
        self.assertTrue(self.status_ok(status, 'process'), status.get('track', {}).get('process'))

    def __getattr__(self, name):
        return name

    def test_10_not_status(self):
        status, newtasks, result = self.crawl(callback=self.not_send_status)

        self.assertIsNone(status)
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 'not_send_status')

    def test_20_url_deduplicated(self):
        status, newtasks, result = self.crawl(callback=self.url_deduplicated)

        self.assertStatusOk(status)
        self.assertIsNone(status['track']['fetch']['error'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertFalse(status['track']['fetch']['headers'])
        self.assertFalse(status['track']['process']['logs'])
        self.assertEqual(len(newtasks), 2, newtasks)
        self.assertIsNone(result)

    def test_30_catch_status_code_error(self):
        status, newtasks, result = self.crawl(self.httpbin+'/status/418', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertIn('HTTP 418', status['track']['fetch']['error'])
        self.assertTrue(status['track']['fetch']['content'], '')
        self.assertTrue(status['track']['fetch']['headers'])
        self.assertTrue(status['track']['process']['logs'])
        self.assertIn('HTTPError: HTTP 418', status['track']['process']['logs'])
        self.assertFalse(newtasks)


        status, newtasks, result = self.crawl(self.httpbin+'/status/400', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 400)

        status, newtasks, result = self.crawl(self.httpbin+'/status/500', callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        status, newtasks, result = self.crawl(self.httpbin+'/status/302',
                                              allow_redirects=False,
                                              callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        status, newtasks, result = self.crawl(self.httpbin+'/delete', method='DELETE', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin+'/get', method='DELETE', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get', params={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})

    def test_60_data(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post', data={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})

    def test_70_redirect(self):
        status, newtasks, result = self.crawl(self.httpbin+'/redirect-to?url=/get', callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status['track']['fetch']['redirect_url'], self.httpbin+'/get')
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        status, newtasks, result = self.crawl(self.httpbin+'/redirect/10', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(status['track']['fetch']['status_code'], 599)
        self.assertIn('redirects followed', status['track']['fetch']['error'])

    def test_90_files(self):
        status, newtasks, result = self.crawl(self.httpbin+'/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a100_files_with_data(self):
        status, newtasks, result = self.crawl(self.httpbin+'/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              data={
                                                  'roy': 'binux',
                                                  #'中文': '.', # FIXME: not work
                                              },
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux'})
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a110_headers(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              headers={
                                                  'a': 'b',
                                                  'C-d': 'e-F',
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('A'), 'b')
        self.assertEqual(result['headers'].get('C-D'), 'e-F')

    def test_a115_user_agent(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              user_agent='binux', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('User-Agent'), 'binux')


    def test_a120_cookies(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a130_cookies_with_headers(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              headers={
                                                  'Cookie': 'g=h; I=j',
                                              },
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('g=h', result['headers'].get('Cookie'))
        self.assertIn('I=j', result['headers'].get('Cookie'))
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a140_response_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cookies/set?k1=v1&k2=v2',
                                              callback=self.cookies)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})

    def test_a145_redirect_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cookies/set?k1=v1&k2=v2',
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['cookies'], {'k1': 'v1', 'k2': 'v2'})

    def test_a150_timeout(self):
        status, newtasks, result = self.crawl(self.httpbin+'/delay/2', timeout=1, callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(int(status['track']['fetch']['time']), 1)

    def test_a160_etag(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', etag='abc', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modified(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', last_modified='0', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              save={'roy': 'binux', u'中文': 'value'})

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})

    def test_a190_taskid(self):
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              taskid='binux-taskid')

        self.assertStatusOk(status)
        self.assertEqual(status['taskid'], 'binux-taskid')
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a200_no_proxy(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a200'
                                              }, proxy=False, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.fetcher.proxy = old_proxy

    def test_a210_proxy_failed(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a210'
                                              }, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 403)
        self.fetcher.proxy = old_proxy

    def test_a220_proxy_ok(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a220',
                                                  'username': '******',
                                                  'password': '******',
                                              }, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)
        self.fetcher.proxy = old_proxy

    def test_a230_proxy_parameter_fail(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a230',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(result, 403)

    def test_a240_proxy_parameter_ok(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post',
                                              method='POST',
                                              data={
                                                  'test': 'a240',
                                                  'username': '******',
                                                  'password': '******',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a250_proxy_userpass(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post',
                                              method='POST',
                                              data={
                                                  'test': 'a250',
                                              }, proxy='binux:123456@'+self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a260_process_save(self):
        status, newtasks, result = self.crawl(callback=self.set_process_save)

        self.assertStatusOk(status)
        self.assertIn('roy', status['track']['save'])
        self.assertEqual(status['track']['save']['roy'], 'binux')

        status, newtasks, result = self.crawl(callback=self.get_process_save,
                                              track=status['track'])

        self.assertStatusOk(status)
        self.assertIn('roy', result)
        self.assertEqual(result['roy'], 'binux')


    def test_zzz_links(self):
        status, newtasks, result = self.crawl(self.httpbin+'/links/10/0', callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_zzz_html(self):
        status, newtasks, result = self.crawl(self.httpbin+'/html', callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, 'Herman Melville - Moby-Dick')

    def test_zzz_etag_enabled(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status, newtasks, result = self.crawl(self.httpbin+'/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_zzz_etag_not_working(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status['track']['process']['ok'] = False
        status, newtasks, result = self.crawl(self.httpbin+'/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

    def test_zzz_unexpected_crawl_argument(self):
        with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"):
            self.crawl(self.httpbin+'/cache', cookie={}, callback=self.json)

    def test_zzz_curl_get(self):
        status, newtasks, result = self.crawl("curl '"+self.httpbin+'''/get' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Binux-Header: Binux-Value' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' --compressed''', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result['headers'].get('Binux-Header'), 'Binux-Value')

    def test_zzz_curl_post(self):
        status, newtasks, result = self.crawl("curl '"+self.httpbin+'''/post' -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'Binux-Key=%E4%B8%AD%E6%96%87+value' --compressed''', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result['form'].get('Binux-Key'), '中文 value')

    def test_zzz_curl_put(self):
        status, newtasks, result = self.crawl("curl '"+self.httpbin+'''/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryYlkgyaA7SRGOQYUG' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary $'------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="Binux-Key"\r\n\r\n%E4%B8%AD%E6%96%87+value\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="fileUpload1"; filename="1"\r\nContent-Type: application/octet-stream\r\n\r\n\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG--\r\n' --compressed''', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertIn('fileUpload1', result['files'], result)

    def test_zzz_curl_no_url(self):
        with self.assertRaisesRegexp(TypeError, 'no URL'):
            status, newtasks, result = self.crawl(
                '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''',
                callback=self.json)

    def test_zzz_curl_bad_option(self):
        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin,
                callback=self.json)

        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin,
                callback=self.json)


    def test_zzz_robots_txt(self):
        status, newtasks, result = self.crawl(self.httpbin+'/deny', robots_txt=True, callback=self.catch_http_error)

        self.assertEqual(result, 403)


    def test_zzz_connect_timeout(self):
        start_time = time.time()
        status, newtasks, result = self.crawl('http://1.1.1.1/', connect_timeout=5, callback=self.catch_http_error)
        end_time = time.time()
        self.assertTrue(5 <= end_time - start_time <= 6)
class TestFetcherProcessor(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')])
        self.fetcher = Fetcher(None, None, async=False)
        self.status_queue = Queue()
        self.newtask_queue = Queue()
        self.result_queue = Queue()
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        self.processor = Processor(projectdb=self.projectdb,
                                   inqueue=None,
                                   status_queue=self.status_queue,
                                   newtask_queue=self.newtask_queue,
                                   result_queue=self.result_queue)
        self.project_name = 'data_fetcher_processor_handler'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

    def crawl(self, url=None, track=None, **kwargs):
        if url is None and kwargs.get('callback'):
            url = dataurl.encode(utils.text(kwargs.get('callback')))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data['instance']
        instance._reset()
        task = instance.crawl(url, **kwargs)
        if isinstance(task, list):
            task = task[0]
        task['track'] = track
        result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result

    def status_ok(self, status, type):
        if not status:
            return False
        return status.get('track', {}).get(type, {}).get('ok', False)

    def assertStatusOk(self, status):
        self.assertTrue(self.status_ok(status, 'fetch'), status.get('track', {}).get('fetch'))
        self.assertTrue(self.status_ok(status, 'process'), status.get('track', {}).get('process'))

    def __getattr__(self, name):
        return name

    def test_10_not_status(self):
        status, newtasks, result = self.crawl(callback=self.not_send_status)

        self.assertIsNone(status)
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 'not_send_status')

    def test_20_url_deduplicated(self):
        status, newtasks, result = self.crawl(callback=self.url_deduplicated)

        self.assertStatusOk(status)
        self.assertIsNone(status['track']['fetch']['error'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertFalse(status['track']['fetch']['headers'])
        self.assertFalse(status['track']['process']['logs'])
        self.assertEqual(len(newtasks), 2, newtasks)
        self.assertIsNone(result)

    def test_30_catch_status_code_error(self):
        status, newtasks, result = self.crawl(self.httpbin+'/status/418', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertIn('HTTP 418', status['track']['fetch']['error'])
        self.assertTrue(status['track']['fetch']['content'], '')
        self.assertTrue(status['track']['fetch']['headers'])
        self.assertTrue(status['track']['process']['logs'])
        self.assertIn('HTTPError: HTTP 418', status['track']['process']['logs'])
        self.assertFalse(newtasks)


        status, newtasks, result = self.crawl(self.httpbin+'/status/400', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 400)

        status, newtasks, result = self.crawl(self.httpbin+'/status/500', callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        status, newtasks, result = self.crawl(self.httpbin+'/status/302',
                                              allow_redirects=False,
                                              callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        status, newtasks, result = self.crawl(self.httpbin+'/delete', method='DELETE', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin+'/get', method='DELETE', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get', params={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})

    def test_60_data(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post', data={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})

    def test_70_redirect(self):
        status, newtasks, result = self.crawl(self.httpbin+'/redirect-to?url=/get', callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status['track']['fetch']['redirect_url'], self.httpbin+'/get')
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        status, newtasks, result = self.crawl(self.httpbin+'/redirect/10', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(status['track']['fetch']['status_code'], 599)
        self.assertIn('redirects followed', status['track']['fetch']['error'])

    def test_90_files(self):
        status, newtasks, result = self.crawl(self.httpbin+'/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a100_files_with_data(self):
        status, newtasks, result = self.crawl(self.httpbin+'/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              data={
                                                  'roy': 'binux',
                                                  #'中文': '.', # FIXME: not work
                                              },
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux'})
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a110_headers(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              headers={
                                                  'a': 'b',
                                                  'C-d': 'e-F',
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('A'), 'b')
        self.assertEqual(result['headers'].get('C-D'), 'e-F')

    def test_a120_cookies(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a130_cookies_with_headers(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              headers={
                                                  'Cookie': 'g=h; I=j',
                                              },
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('g=h', result['headers'].get('Cookie'))
        self.assertIn('I=j', result['headers'].get('Cookie'))
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a140_response_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cookies/set?k1=v1&k2=v2',
                                              callback=self.cookies)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})

    def test_a145_redirect_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cookies/set?k1=v1&k2=v2',
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['cookies'], {'k1': 'v1', 'k2': 'v2'})

    def test_a150_timeout(self):
        status, newtasks, result = self.crawl(self.httpbin+'/delay/2', timeout=1, callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(int(status['track']['fetch']['time']), 1)

    def test_a160_etag(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', etag='abc', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modifed(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', last_modifed='0', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              save={'roy': 'binux', u'中文': 'value'})

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})

    def test_a190_taskid(self):
        status, newtasks, result = self.crawl(callback=self.get_save,
                                              taskid='binux-taskid')

        self.assertStatusOk(status)
        self.assertEqual(status['taskid'], 'binux-taskid')
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a200_no_proxy(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a200'
                                              }, proxy=False, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.fetcher.proxy = old_proxy

    def test_a210_proxy_failed(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a210'
                                              }, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 403)
        self.fetcher.proxy = old_proxy

    def test_a220_proxy_ok(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a220',
                                                  'username': '******',
                                                  'password': '******',
                                              }, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)
        self.fetcher.proxy = old_proxy

    def test_a230_proxy_parameter_fail(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a230',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(result, 403)

    def test_a240_proxy_parameter_ok(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post',
                                              method='POST',
                                              data={
                                                  'test': 'a240',
                                                  'username': '******',
                                                  'password': '******',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a250_proxy_userpass(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post',
                                              method='POST',
                                              data={
                                                  'test': 'a250',
                                              }, proxy='binux:123456@'+self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a260_process_save(self):
        status, newtasks, result = self.crawl(callback=self.set_process_save)

        self.assertStatusOk(status)
        self.assertIn('roy', status['track']['save'])
        self.assertEqual(status['track']['save']['roy'], 'binux')

        status, newtasks, result = self.crawl(callback=self.get_process_save,
                                              track=status['track'])

        self.assertStatusOk(status)
        self.assertIn('roy', result)
        self.assertEqual(result['roy'], 'binux')


    def test_zzz_links(self):
        status, newtasks, result = self.crawl(self.httpbin+'/links/10/0', callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_zzz_html(self):
        status, newtasks, result = self.crawl(self.httpbin+'/html', callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, 'Herman Melville - Moby-Dick')

    def test_zzz_etag_enabled(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status, newtasks, result = self.crawl(self.httpbin+'/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_zzz_etag_not_working(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status['track']['process']['ok'] = False
        status, newtasks, result = self.crawl(self.httpbin+'/cache',
                                              track=status['track'], callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

    def test_zzz_unexpected_crawl_argument(self):
        with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"):
            self.crawl(self.httpbin+'/cache', cookie={}, callback=self.json)

    def test_zzz_curl_get(self):
        status, newtasks, result = self.crawl("curl '"+self.httpbin+'''/get' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Binux-Header: Binux-Value' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' --compressed''', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result['headers'].get('Binux-Header'), 'Binux-Value')

    def test_zzz_curl_post(self):
        status, newtasks, result = self.crawl("curl '"+self.httpbin+'''/post' -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'Binux-Key=%E4%B8%AD%E6%96%87+value' --compressed''', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result['form'].get('Binux-Key'), '中文 value')

    def test_zzz_curl_put(self):
        status, newtasks, result = self.crawl("curl '"+self.httpbin+'''/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryYlkgyaA7SRGOQYUG' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary $'------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="Binux-Key"\r\n\r\n%E4%B8%AD%E6%96%87+value\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="fileUpload1"; filename="1"\r\nContent-Type: application/octet-stream\r\n\r\n\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG--\r\n' --compressed''', callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertIn('fileUpload1', result['files'], result)

    def test_zzz_curl_no_url(self):
        with self.assertRaisesRegexp(TypeError, 'no URL'):
            status, newtasks, result = self.crawl(
                '''curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed''',
                callback=self.json)

    def test_zzz_curl_bad_option(self):
        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v''' % self.httpbin,
                callback=self.json)

        with self.assertRaisesRegexp(TypeError, 'Unknow curl option'):
            status, newtasks, result = self.crawl(
                '''curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' ''' % self.httpbin,
                callback=self.json)


    def test_zzz_robots_txt(self):
        status, newtasks, result = self.crawl(self.httpbin+'/deny', robots_txt=True, callback=self.catch_http_error)

        self.assertEqual(result, 403)
Example #10
0
class TestFetcherProcessor(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.projectdb = ProjectDB([
            os.path.join(os.path.dirname(__file__),
                         'data_fetcher_processor_handler.py')
        ])
        self.fetcher = Fetcher(None, None, async=False)
        self.status_queue = Queue()
        self.newtask_queue = Queue()
        self.result_queue = Queue()
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run,
                                                      port=14887)
        self.httpbin = 'http://127.0.0.1:14887'
        self.processor = Processor(projectdb=self.projectdb,
                                   inqueue=None,
                                   status_queue=self.status_queue,
                                   newtask_queue=self.newtask_queue,
                                   result_queue=self.result_queue)
        self.project_name = 'data_fetcher_processor_handler'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

    def crawl(self, url=None, **kwargs):
        if url is None and kwargs.get('callback'):
            url = dataurl.encode(utils.text(kwargs.get('callback')))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data['instance']
        instance._reset()
        task = instance.crawl(url, **kwargs)
        assert not isinstance(task, list), 'url list is not allowed'
        task, result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result

    def status_ok(self, status, type):
        if not status:
            return False
        return status.get('track', {}).get(type, {}).get('ok', False)

    def assertStatusOk(self, status):
        self.assertTrue(self.status_ok(status, 'fetch'),
                        status.get('track', {}).get('fetch'))
        self.assertTrue(self.status_ok(status, 'process'),
                        status.get('track', {}).get('process'))

    def __getattr__(self, name):
        return name

    def test_10_not_status(self):
        status, newtasks, result = self.crawl(callback=self.not_send_status)

        self.assertIsNone(status)
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 'not_send_status')

    def test_20_url_deduplicated(self):
        status, newtasks, result = self.crawl(callback=self.url_deduplicated)

        self.assertStatusOk(status)
        self.assertIsNone(status['track']['fetch']['error'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertFalse(status['track']['fetch']['headers'])
        self.assertFalse(status['track']['process']['logs'])
        self.assertEqual(len(newtasks), 2, newtasks)
        self.assertIsNone(result)

    def test_30_catch_status_code_error(self):
        status, newtasks, result = self.crawl(self.httpbin + '/status/418',
                                              callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertEqual(status['track']['fetch']['error'],
                         'HTTP 418: Unknown')
        self.assertTrue(status['track']['fetch']['content'], '')
        self.assertTrue(status['track']['fetch']['headers'])
        self.assertTrue(status['track']['process']['logs'])
        self.assertIn('HTTPError: HTTP 418: Unknown',
                      status['track']['process']['logs'])
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + '/status/400',
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 400)

        status, newtasks, result = self.crawl(self.httpbin + '/status/500',
                                              callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        status, newtasks, result = self.crawl(self.httpbin + '/status/302',
                                              allow_redirects=False,
                                              callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        status, newtasks, result = self.crawl(self.httpbin + '/delete',
                                              method='DELETE',
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              method='PATCH',
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              params={
                                                  'roy': 'binux',
                                                  u'中文': '.',
                                              },
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})

    def test_60_data(self):
        status, newtasks, result = self.crawl(self.httpbin + '/post',
                                              data={
                                                  'roy': 'binux',
                                                  u'中文': '.',
                                              },
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})

    def test_70_redirect(self):
        status, newtasks, result = self.crawl(self.httpbin +
                                              '/redirect-to?url=/get',
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status['track']['fetch']['redirect_url'],
                         self.httpbin + '/get')
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        status, newtasks, result = self.crawl(self.httpbin + '/redirect/10',
                                              callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(status['track']['fetch']['status_code'], 599)
        self.assertIn('redirects followed', status['track']['fetch']['error'])

    def test_90_files(self):
        status, newtasks, result = self.crawl(
            self.httpbin + '/put',
            method='PUT',
            files={os.path.basename(__file__): open(__file__).read()},
            callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a100_files_with_data(self):
        status, newtasks, result = self.crawl(
            self.httpbin + '/put',
            method='PUT',
            files={os.path.basename(__file__): open(__file__).read()},
            data={
                'roy': 'binux',
                #'中文': '.', # FIXME: not work
            },
            callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux'})
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a110_headers(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              headers={
                                                  'a': 'b',
                                                  'C-d': 'e-F',
                                              },
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('A'), 'b')
        self.assertEqual(result['headers'].get('C-D'), 'e-F')

    def test_a120_cookies(self):
        status, newtasks, result = self.crawl(self.httpbin + '/get',
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              },
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    # FIXME: cookies can't use with Cookie header
    #def test_a130_cookies_with_headers(self):
    #status, newtasks, result = self.crawl(self.httpbin+'/get',
    #headers={
    #'Cookie': 'g=h; I=j',
    #},
    #cookies={
    #'a': 'b',
    #'C-d': 'e-F'
    #}, callback=self.json)
    #self.assertStatusOk(status)
    #self.assertFalse(newtasks)
    #self.assertEqual(result['headers'].get('Cookie'), 'g=h; I=j; a=b; C-d=e-F')

    def test_a140_response_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin +
                                              '/cookies/set?k1=v1&k2=v2',
                                              callback=self.cookies)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})

    def test_a150_timeout(self):
        status, newtasks, result = self.crawl(self.httpbin + '/delay/2',
                                              timeout=1,
                                              callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(int(status['track']['fetch']['time']), 1)

    def test_a160_etag(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache',
                                              etag='abc',
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modifed(self):
        status, newtasks, result = self.crawl(self.httpbin + '/cache',
                                              last_modifed='0',
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        status, newtasks, result = self.crawl(callback=self.save,
                                              save={
                                                  'roy': 'binux',
                                                  u'中文': 'value'
                                              })

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})

    def test_a190_taskid(self):
        status, newtasks, result = self.crawl(callback=self.save,
                                              taskid='binux-taskid')

        self.assertStatusOk(status)
        self.assertEqual(status['taskid'], 'binux-taskid')
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_links(self):
        status, newtasks, result = self.crawl(self.httpbin + '/links/10/0',
                                              callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_html(self):
        status, newtasks, result = self.crawl(self.httpbin + '/html',
                                              callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, 'Herman Melville - Moby-Dick')
class TestFetcherProcessor(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), "data_fetcher_processor_handler.py")])
        self.fetcher = Fetcher(None, None, async=False)
        self.status_queue = Queue()
        self.newtask_queue = Queue()
        self.result_queue = Queue()
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = "http://127.0.0.1:14887"
        self.proxy_thread = subprocess.Popen(
            ["pyproxy", "--username=binux", "--password=123456", "--port=14830", "--debug"], close_fds=True
        )
        self.proxy = "127.0.0.1:14830"
        self.processor = Processor(
            projectdb=self.projectdb,
            inqueue=None,
            status_queue=self.status_queue,
            newtask_queue=self.newtask_queue,
            result_queue=self.result_queue,
        )
        self.project_name = "data_fetcher_processor_handler"
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

    def crawl(self, url=None, track=None, **kwargs):
        if url is None and kwargs.get("callback"):
            url = dataurl.encode(utils.text(kwargs.get("callback")))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data["instance"]
        instance._reset()
        task = instance.crawl(url, **kwargs)
        if isinstance(task, list):
            task = task[0]
        task["track"] = track
        task, result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result

    def status_ok(self, status, type):
        if not status:
            return False
        return status.get("track", {}).get(type, {}).get("ok", False)

    def assertStatusOk(self, status):
        self.assertTrue(self.status_ok(status, "fetch"), status.get("track", {}).get("fetch"))
        self.assertTrue(self.status_ok(status, "process"), status.get("track", {}).get("process"))

    def __getattr__(self, name):
        return name

    def test_10_not_status(self):
        status, newtasks, result = self.crawl(callback=self.not_send_status)

        self.assertIsNone(status)
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, "not_send_status")

    def test_20_url_deduplicated(self):
        status, newtasks, result = self.crawl(callback=self.url_deduplicated)

        self.assertStatusOk(status)
        self.assertIsNone(status["track"]["fetch"]["error"])
        self.assertIsNone(status["track"]["fetch"]["content"])
        self.assertFalse(status["track"]["fetch"]["headers"])
        self.assertFalse(status["track"]["process"]["logs"])
        self.assertEqual(len(newtasks), 2, newtasks)
        self.assertIsNone(result)

    def test_30_catch_status_code_error(self):
        status, newtasks, result = self.crawl(self.httpbin + "/status/418", callback=self.json)

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertFalse(self.status_ok(status, "process"))
        self.assertIn("HTTP 418", status["track"]["fetch"]["error"])
        self.assertTrue(status["track"]["fetch"]["content"], "")
        self.assertTrue(status["track"]["fetch"]["headers"])
        self.assertTrue(status["track"]["process"]["logs"])
        self.assertIn("HTTPError: HTTP 418", status["track"]["process"]["logs"])
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + "/status/400", callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertTrue(self.status_ok(status, "process"))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 400)

        status, newtasks, result = self.crawl(self.httpbin + "/status/500", callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertTrue(self.status_ok(status, "process"))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        status, newtasks, result = self.crawl(
            self.httpbin + "/status/302", allow_redirects=False, callback=self.catch_http_error
        )
        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertTrue(self.status_ok(status, "process"))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        status, newtasks, result = self.crawl(self.httpbin + "/delete", method="DELETE", callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin + "/get", method="PATCH", callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertTrue(self.status_ok(status, "process"))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", params={"roy": "binux", u"中文": "."}, callback=self.json
        )

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result["args"], {"roy": "binux", u"中文": "."})

    def test_60_data(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/post", data={"roy": "binux", u"中文": "."}, callback=self.json
        )

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result["form"], {"roy": "binux", u"中文": "."})

    def test_70_redirect(self):
        status, newtasks, result = self.crawl(self.httpbin + "/redirect-to?url=/get", callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status["track"]["fetch"]["redirect_url"], self.httpbin + "/get")
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        status, newtasks, result = self.crawl(self.httpbin + "/redirect/10", callback=self.json)

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertFalse(self.status_ok(status, "process"))
        self.assertFalse(newtasks)
        self.assertEqual(status["track"]["fetch"]["status_code"], 599)
        self.assertIn("redirects followed", status["track"]["fetch"]["error"])

    def test_90_files(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/put",
            method="PUT",
            files={os.path.basename(__file__): open(__file__).read()},
            callback=self.json,
        )

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn(os.path.basename(__file__), result["files"])

    def test_a100_files_with_data(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/put",
            method="PUT",
            files={os.path.basename(__file__): open(__file__).read()},
            data={
                "roy": "binux",
                #'中文': '.', # FIXME: not work
            },
            callback=self.json,
        )
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result["form"], {"roy": "binux"})
        self.assertIn(os.path.basename(__file__), result["files"])

    def test_a110_headers(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", headers={"a": "b", "C-d": "e-F"}, callback=self.json
        )
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result["headers"].get("A"), "b")
        self.assertEqual(result["headers"].get("C-D"), "e-F")

    def test_a120_cookies(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", cookies={"a": "b", "C-d": "e-F"}, callback=self.json
        )
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn("a=b", result["headers"].get("Cookie"))
        self.assertIn("C-d=e-F", result["headers"].get("Cookie"))

    def test_a130_cookies_with_headers(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", headers={"Cookie": "g=h; I=j"}, cookies={"a": "b", "C-d": "e-F"}, callback=self.json
        )
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn("g=h", result["headers"].get("Cookie"))
        self.assertIn("I=j", result["headers"].get("Cookie"))
        self.assertIn("a=b", result["headers"].get("Cookie"))
        self.assertIn("C-d=e-F", result["headers"].get("Cookie"))

    def test_a140_response_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin + "/cookies/set?k1=v1&k2=v2", callback=self.cookies)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {"k1": "v1", "k2": "v2"})

    def test_a145_redirect_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin + "/cookies/set?k1=v1&k2=v2", callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result["cookies"], {"k1": "v1", "k2": "v2"})

    def test_a150_timeout(self):
        status, newtasks, result = self.crawl(self.httpbin + "/delay/2", timeout=1, callback=self.json)

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertFalse(self.status_ok(status, "process"))
        self.assertFalse(newtasks)
        self.assertEqual(int(status["track"]["fetch"]["time"]), 1)

    def test_a160_etag(self):
        status, newtasks, result = self.crawl(self.httpbin + "/cache", etag="abc", callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modifed(self):
        status, newtasks, result = self.crawl(self.httpbin + "/cache", last_modifed="0", callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        status, newtasks, result = self.crawl(callback=self.save, save={"roy": "binux", u"中文": "value"})

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {"roy": "binux", u"中文": "value"})

    def test_a190_taskid(self):
        status, newtasks, result = self.crawl(callback=self.save, taskid="binux-taskid")

        self.assertStatusOk(status)
        self.assertEqual(status["taskid"], "binux-taskid")
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a200_no_proxy(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", params={"test": "a200"}, proxy=False, callback=self.json
        )

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.fetcher.proxy = old_proxy

    def test_a210_proxy_failed(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", params={"test": "a210"}, callback=self.catch_http_error
        )

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertTrue(self.status_ok(status, "process"))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 403)
        self.fetcher.proxy = old_proxy

    def test_a220_proxy_ok(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(
            self.httpbin + "/get",
            params={"test": "a220", "username": "******", "password": "******"},
            callback=self.catch_http_error,
        )

        self.assertStatusOk(status)
        self.assertEqual(result, 200)
        self.fetcher.proxy = old_proxy

    def test_a230_proxy_parameter_fail(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/get", params={"test": "a230"}, proxy=self.proxy, callback=self.catch_http_error
        )

        self.assertFalse(self.status_ok(status, "fetch"))
        self.assertTrue(self.status_ok(status, "process"))
        self.assertEqual(result, 403)

    def test_a240_proxy_parameter_ok(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/post",
            method="POST",
            data={"test": "a240", "username": "******", "password": "******"},
            proxy=self.proxy,
            callback=self.catch_http_error,
        )

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_a250_proxy_userpass(self):
        status, newtasks, result = self.crawl(
            self.httpbin + "/post",
            method="POST",
            data={"test": "a250"},
            proxy="binux:123456@" + self.proxy,
            callback=self.catch_http_error,
        )

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_zzz_links(self):
        status, newtasks, result = self.crawl(self.httpbin + "/links/10/0", callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_zzz_html(self):
        status, newtasks, result = self.crawl(self.httpbin + "/html", callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, "Herman Melville - Moby-Dick")

    def test_zzz_etag_enabled(self):
        status, newtasks, result = self.crawl(self.httpbin + "/cache", callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status, newtasks, result = self.crawl(self.httpbin + "/cache", track=status["track"], callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_zzz_etag_not_working(self):
        status, newtasks, result = self.crawl(self.httpbin + "/cache", callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

        status["track"]["process"]["ok"] = False
        status, newtasks, result = self.crawl(self.httpbin + "/cache", track=status["track"], callback=self.json)
        self.assertStatusOk(status)
        self.assertTrue(result)

    def test_zzz_unexpected_crawl_argument(self):
        with self.assertRaisesRegexp(TypeError, "unexpected keyword argument"):
            self.crawl(self.httpbin + "/cache", cookie={}, callback=self.json)

    def test_zzz_curl_get(self):
        status, newtasks, result = self.crawl(
            "curl '"
            + self.httpbin
            + """/get' -H 'DNT: 1' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Binux-Header: Binux-Value' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' --compressed""",
            callback=self.json,
        )
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result["headers"].get("Binux-Header"), "Binux-Value")

    def test_zzz_curl_post(self):
        status, newtasks, result = self.crawl(
            "curl '"
            + self.httpbin
            + """/post' -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data 'Binux-Key=%E4%B8%AD%E6%96%87+value' --compressed""",
            callback=self.json,
        )
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertTrue(result["form"].get("Binux-Key"), "中文 value")

    def test_zzz_curl_put(self):
        status, newtasks, result = self.crawl(
            "curl '"
            + self.httpbin
            + """/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en,zh-CN;q=0.8,zh;q=0.6' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.17 Safari/537.36' -H 'Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryYlkgyaA7SRGOQYUG' -H 'Accept: */*' -H 'Cookie: _gauges_unique_year=1; _gauges_unique=1; _ga=GA1.2.415471573.1419316591' -H 'Connection: keep-alive' -H 'DNT: 1' --data-binary $'------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="Binux-Key"\r\n\r\n%E4%B8%AD%E6%96%87+value\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG\r\nContent-Disposition: form-data; name="fileUpload1"; filename="1"\r\nContent-Type: application/octet-stream\r\n\r\n\r\n------WebKitFormBoundaryYlkgyaA7SRGOQYUG--\r\n' --compressed""",
            callback=self.json,
        )
        self.assertStatusOk(status)
        self.assertTrue(result)

        self.assertIn("fileUpload1", result["files"], result)

    def test_zzz_curl_no_url(self):
        with self.assertRaisesRegexp(TypeError, "no URL"):
            status, newtasks, result = self.crawl(
                """curl -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' --compressed""",
                callback=self.json,
            )

    def test_zzz_curl_bad_option(self):
        with self.assertRaisesRegexp(TypeError, "Unknow curl option"):
            status, newtasks, result = self.crawl(
                """curl '%s/put' -X PUT -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' -v"""
                % self.httpbin,
                callback=self.json,
            )

        with self.assertRaisesRegexp(TypeError, "Unknow curl option"):
            status, newtasks, result = self.crawl(
                """curl '%s/put' -X PUT -v -H 'Origin: chrome-extension://hgmloofddffdnphfgcellkdfbfbjeloo' """
                % self.httpbin,
                callback=self.json,
            )
class TestFetcherProcessor(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.projectdb = ProjectDB([os.path.join(os.path.dirname(__file__), 'data_fetcher_processor_handler.py')])
        self.fetcher = Fetcher(None, None, async=False)
        self.status_queue = Queue()
        self.newtask_queue = Queue()
        self.result_queue = Queue()
        self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887)
        self.httpbin = 'http://127.0.0.1:14887'
        self.proxy_thread = subprocess.Popen(['pyproxy', '--username=binux',
                                              '--password=123456', '--port=14830',
                                              '--debug'], close_fds=True)
        self.proxy = '127.0.0.1:14830'
        self.processor = Processor(projectdb=self.projectdb,
                                   inqueue=None,
                                   status_queue=self.status_queue,
                                   newtask_queue=self.newtask_queue,
                                   result_queue=self.result_queue)
        self.project_name = 'data_fetcher_processor_handler'
        time.sleep(0.5)

    @classmethod
    def tearDownClass(self):
        self.proxy_thread.terminate()
        self.proxy_thread.wait()
        self.httpbin_thread.terminate()
        self.httpbin_thread.join()

    def crawl(self, url=None, **kwargs):
        if url is None and kwargs.get('callback'):
            url = dataurl.encode(utils.text(kwargs.get('callback')))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data['instance']
        instance._reset()
        task = instance.crawl(url, **kwargs)
        assert not isinstance(task, list), 'url list is not allowed'
        task, result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result

    def status_ok(self, status, type):
        if not status:
            return False
        return status.get('track', {}).get(type, {}).get('ok', False)

    def assertStatusOk(self, status):
        self.assertTrue(self.status_ok(status, 'fetch'), status.get('track', {}).get('fetch'))
        self.assertTrue(self.status_ok(status, 'process'), status.get('track', {}).get('process'))

    def __getattr__(self, name):
        return name

    def test_10_not_status(self):
        status, newtasks, result = self.crawl(callback=self.not_send_status)

        self.assertIsNone(status)
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 'not_send_status')

    def test_20_url_deduplicated(self):
        status, newtasks, result = self.crawl(callback=self.url_deduplicated)

        self.assertStatusOk(status)
        self.assertIsNone(status['track']['fetch']['error'])
        self.assertIsNone(status['track']['fetch']['content'])
        self.assertFalse(status['track']['fetch']['headers'])
        self.assertFalse(status['track']['process']['logs'])
        self.assertEqual(len(newtasks), 2, newtasks)
        self.assertIsNone(result)

    def test_30_catch_status_code_error(self):
        status, newtasks, result = self.crawl(self.httpbin+'/status/418', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertIn('HTTP 418', status['track']['fetch']['error'])
        self.assertTrue(status['track']['fetch']['content'], '')
        self.assertTrue(status['track']['fetch']['headers'])
        self.assertTrue(status['track']['process']['logs'])
        self.assertIn('HTTPError: HTTP 418', status['track']['process']['logs'])
        self.assertFalse(newtasks)


        status, newtasks, result = self.crawl(self.httpbin+'/status/400', callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 400)

        status, newtasks, result = self.crawl(self.httpbin+'/status/500', callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 500)

        status, newtasks, result = self.crawl(self.httpbin+'/status/302',
                                              allow_redirects=False,
                                              callback=self.catch_http_error)
        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 302)

    def test_40_method(self):
        status, newtasks, result = self.crawl(self.httpbin+'/delete', method='DELETE', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)

        status, newtasks, result = self.crawl(self.httpbin+'/get', method='PATCH',
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertTrue(newtasks)
        self.assertEqual(result, 405)

    def test_50_params(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get', params={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['args'], {'roy': 'binux', u'中文': '.'})

    def test_60_data(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post', data={
            'roy': 'binux',
            u'中文': '.',
        }, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux', u'中文': '.'})

    def test_70_redirect(self):
        status, newtasks, result = self.crawl(self.httpbin+'/redirect-to?url=/get', callback=self.json)

        self.assertStatusOk(status)
        self.assertEqual(status['track']['fetch']['redirect_url'], self.httpbin+'/get')
        self.assertFalse(newtasks)

    def test_80_redirect_too_many(self):
        status, newtasks, result = self.crawl(self.httpbin+'/redirect/10', callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(status['track']['fetch']['status_code'], 599)
        self.assertIn('redirects followed', status['track']['fetch']['error'])

    def test_90_files(self):
        status, newtasks, result = self.crawl(self.httpbin+'/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a100_files_with_data(self):
        status, newtasks, result = self.crawl(self.httpbin+'/put', method='PUT',
                                              files={os.path.basename(__file__): open(__file__).read()},
                                              data={
                                                  'roy': 'binux',
                                                  #'中文': '.', # FIXME: not work
                                              },
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['form'], {'roy': 'binux'})
        self.assertIn(os.path.basename(__file__), result['files'])

    def test_a110_headers(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              headers={
                                                  'a': 'b',
                                                  'C-d': 'e-F',
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['headers'].get('A'), 'b')
        self.assertEqual(result['headers'].get('C-D'), 'e-F')

    def test_a120_cookies(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a130_cookies_with_headers(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              headers={
                                                  'Cookie': 'g=h; I=j',
                                              },
                                              cookies={
                                                  'a': 'b',
                                                  'C-d': 'e-F'
                                              }, callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertIn('g=h', result['headers'].get('Cookie'))
        self.assertIn('I=j', result['headers'].get('Cookie'))
        self.assertIn('a=b', result['headers'].get('Cookie'))
        self.assertIn('C-d=e-F', result['headers'].get('Cookie'))

    def test_a140_response_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cookies/set?k1=v1&k2=v2',
                                              callback=self.cookies)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'k1': 'v1', 'k2': 'v2'})

    def test_a145_redirect_cookie(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cookies/set?k1=v1&k2=v2',
                                              callback=self.json)
        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result['cookies'], {'k1': 'v1', 'k2': 'v2'})

    def test_a150_timeout(self):
        status, newtasks, result = self.crawl(self.httpbin+'/delay/2', timeout=1, callback=self.json)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertFalse(self.status_ok(status, 'process'))
        self.assertFalse(newtasks)
        self.assertEqual(int(status['track']['fetch']['time']), 1)

    def test_a160_etag(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', etag='abc', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a170_last_modifed(self):
        status, newtasks, result = self.crawl(self.httpbin+'/cache', last_modifed='0', callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a180_save(self):
        status, newtasks, result = self.crawl(callback=self.save, save={'roy': 'binux', u'中文': 'value'})

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, {'roy': 'binux', u'中文': 'value'})

    def test_a190_taskid(self):
        status, newtasks, result = self.crawl(callback=self.save, taskid='binux-taskid')

        self.assertStatusOk(status)
        self.assertEqual(status['taskid'], 'binux-taskid')
        self.assertFalse(newtasks)
        self.assertFalse(result)

    def test_a200_no_proxy(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a200'
                                              }, proxy=False, callback=self.json)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.fetcher.proxy = old_proxy

    def test_a210_proxy_failed(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a210'
                                              }, callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(len(newtasks), 1, newtasks)
        self.assertEqual(result, 403)
        self.fetcher.proxy = old_proxy

    def test_a220_proxy_ok(self):
        old_proxy = self.fetcher.proxy
        self.fetcher.proxy = self.proxy
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a220',
                                                  'username': '******',
                                                  'password': '******',
                                              }, callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)
        self.fetcher.proxy = old_proxy

    def test_a230_proxy_parameter_fail(self):
        status, newtasks, result = self.crawl(self.httpbin+'/get',
                                              params={
                                                  'test': 'a230',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertFalse(self.status_ok(status, 'fetch'))
        self.assertTrue(self.status_ok(status, 'process'))
        self.assertEqual(result, 403)

    def test_a240_proxy_parameter_ok(self):
        status, newtasks, result = self.crawl(self.httpbin+'/post',
                                              method='POST',
                                              data={
                                                  'test': 'a240',
                                                  'username': '******',
                                                  'password': '******',
                                              }, proxy=self.proxy,
                                              callback=self.catch_http_error)

        self.assertStatusOk(status)
        self.assertEqual(result, 200)

    def test_links(self):
        status, newtasks, result = self.crawl(self.httpbin+'/links/10/0', callback=self.links)

        self.assertStatusOk(status)
        self.assertEqual(len(newtasks), 9, newtasks)
        self.assertFalse(result)

    def test_html(self):
        status, newtasks, result = self.crawl(self.httpbin+'/html', callback=self.html)

        self.assertStatusOk(status)
        self.assertFalse(newtasks)
        self.assertEqual(result, 'Herman Melville - Moby-Dick')