Example #1
    def test_a10_counter(self):
        for i in range(30):
            time.sleep(1)
            if self.rpc.counter('5m', 'sum')\
                    .get('test_project', {}).get('success', 0) > 5:
                break

        rv = self.app.get('/counter?time=5m&type=sum')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data['test_project']['success'], 3)

        rv = self.app.get('/counter?time=1h&type=sum')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data['test_project']['success'], 3)

        rv = self.app.get('/counter?time=1d&type=sum')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data['test_project']['success'], 3)

        rv = self.app.get('/counter?time=all&type=sum')
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data['test_project']['success'], 3)
Example #2
    def test_a10_counter(self):
        for i in range(30):
            time.sleep(1)
            if self.rpc.counter("5m", "sum").get("test_project", {}).get("success", 0) > 5:
                break

        rv = self.app.get("/counter?time=5m&type=sum")
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data["test_project"]["success"], 3)

        rv = self.app.get("/counter?time=1h&type=sum")
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data["test_project"]["success"], 3)

        rv = self.app.get("/counter?time=1d&type=sum")
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data["test_project"]["success"], 3)

        rv = self.app.get("/counter?time=all&type=sum")
        self.assertEqual(rv.status_code, 200)
        data = json.loads(utils.text(rv.data))
        self.assertGreater(len(data), 0)
        self.assertGreater(data["test_project"]["success"], 3)
Example #3
 def _parse(data):
     if six.PY3:
         for key, value in list(six.iteritems(data)):
             if isinstance(value, six.binary_type):
                 data[utils.text(key)] = utils.text(value)
             else:
                 data[utils.text(key)] = value
     return data
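
Nearly every example on this page runs response or database bytes through utils.text before comparing or JSON-decoding them. A minimal sketch of what that helper is assumed to do (return str unchanged, decode bytes with the given encoding):

import six

def text(string, encoding='utf8'):
    # assumed behaviour of pyspider's utils.text: str passes through,
    # bytes are decoded with the given encoding
    if isinstance(string, six.text_type):
        return string
    return string.decode(encoding)

assert text(b'success') == 'success'
assert text('success') == 'success'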
Example #4
    def test_50_get(self):
        io = BytesIO()
        self.webdav.download('handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))
        io.close()

        io = BytesIO()
        self.webdav.download('sample_handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_sample_handler)), utils.text(io.getvalue()))
        io.close()
Example #5
    def test_50_get(self):
        import easywebdav
        with self.assertRaises(easywebdav.OperationFailed):
            io = BytesIO()
            self.webdav.download('handler.py', io)
            io.close()

        io = BytesIO()
        self.webdav_up.download('handler.py', io)
        self.assertEqual(utils.text(inspect.getsource(data_handler)), utils.text(io.getvalue()))
        io.close()
Example #6
    def test_20_debug(self):
        rv = self.app.get("/debug/test_project")
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b"debugger", rv.data)
        self.assertIn(b"var task_content = ", rv.data)
        self.assertIn(b"var script_content = ", rv.data)

        m = re.search(r"var task_content = (.*);\n", utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.task_content = json.loads(m.group(1))
        m = re.search(r"var script_content = (.*);\n", utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.script_content = json.loads(m.group(1))
Example #7
    def test_20_debug(self):
        rv = self.app.get('/debug/test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'debugger', rv.data)
        self.assertIn(b'var task_content = ', rv.data)
        self.assertIn(b'var script_content = ', rv.data)

        m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('test_project', json.loads(m.group(1)))

        m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('__START_URL__', json.loads(m.group(1)))
Example #8
    def test_20_debug(self):
        rv = self.app.get('/debug/test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'debugger', rv.data)
        self.assertIn(b'var task_content = ', rv.data)
        self.assertIn(b'var script_content = ', rv.data)

        m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.task_content = json.loads(m.group(1))
        m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.script_content = (json.loads(m.group(1))
                                         .replace('http://scrapy.org/',
                                                  'http://127.0.0.1:14887/pyspider/test.html'))
Example #9
    def test_20_debug(self):
        rv = self.app.get("/debug/test_project")
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b"debugger", rv.data)
        self.assertIn(b"var task_content = ", rv.data)
        self.assertIn(b"var script_content = ", rv.data)

        m = re.search(r"var task_content = (.*);\n", utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.task_content = json.loads(m.group(1))
        m = re.search(r"var script_content = (.*);\n", utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.script_content = json.loads(m.group(1)).replace(
            "http://scrapy.org/", "http://127.0.0.1:14887/pyspider/test.html"
        )
Example #10
    def load_tasks(self, status, project=None, fields=None):
        if project is None:
            project = self.projects
        elif not isinstance(project, list):
            project = [project, ]

        if self.scan_available:
            scan_method = self.redis.sscan_iter
        else:
            scan_method = self.redis.smembers

        if fields:
            def get_method(key):
                obj = self.redis.hmget(key, fields)
                if all(x is None for x in obj):
                    return None
                return dict(zip(fields, obj))
        else:
            get_method = self.redis.hgetall

        for p in project:
            status_key = self._gen_status_key(p, status)
            for taskid in scan_method(status_key):
                obj = get_method(self._gen_key(p, utils.text(taskid)))
                if not obj:
                    #self.redis.srem(status_key, taskid)
                    continue
                else:
                    yield self._parse(obj)
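
A hypothetical call site for the generator above; the instance name taskdb, the project name, and the field list are illustrative, and ACTIVE is assumed to be the task database's status constant:

# stream ACTIVE tasks of one project, reading only two fields per redis hash
for task in taskdb.load_tasks(taskdb.ACTIVE, project='test_project',
                              fields=['taskid', 'url']):
    print(task['taskid'], task['url'])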
Example #11
 def test_90_run(self):
     time.sleep(0.5)
     rv = self.app.post('/run', data={
         'project': 'test_project',
     })
     self.assertEqual(rv.status_code, 200)
     self.assertEqual(json.loads(utils.text(rv.data))['result'], True)
Example #12
    def _select2dic(self, tablename=None, what="*", where="", where_values=[],
                    order=None, offset=0, limit=None):
        tablename = self.escape(tablename or self.__tablename__)
        if isinstance(what, list) or isinstance(what, tuple) or what is None:
            what = ','.join(self.escape(f) for f in what) if what else '*'

        sql_query = "SELECT %s FROM %s" % (what, tablename)
        if where:
            sql_query += " WHERE %s" % where
        if order:
            sql_query += ' ORDER BY %s' % order
        if limit:
            sql_query += " LIMIT %d, %d" % (offset, limit)
        elif offset:
            sql_query += " LIMIT %d, %d" % (offset, self.maxlimit)
        logger.debug("<sql: %s>", sql_query)

        dbcur = self._execute(sql_query, where_values)

        # f[0] may return bytes type
        # https://github.com/mysql/mysql-connector-python/pull/37
        fields = [utils.text(f[0]) for f in dbcur.description]

        for row in dbcur:
            yield dict(zip(fields, row))
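
A hypothetical call site for _select2dic, assuming db is an instance of the class above; the table name, columns, and the %s placeholder style of the underlying driver are illustrative assumptions:

# page through the ten newest matching rows as dicts keyed by column name
for row in db._select2dic('taskdb', what=['taskid', 'status'],
                          where='status = %s', where_values=(2, ),
                          order='updatetime DESC', limit=10):
    print(row['taskid'], row['status'])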
Example #13
    def readonly(self):
        projectdb = self.app.config["projectdb"]
        if not projectdb:
            return True
        if (
            "lock" in projectdb.split_group(self.project.get("group"))
            and self.app.config.get("webui_username")
            and self.app.config.get("webui_password")
        ):

            authheader = self.environ.get("HTTP_AUTHORIZATION")
            if not authheader:
                return True
            authheader = authheader[len("Basic ") :]
            try:
                username, password = text(base64.b64decode(authheader)).split(":", 1)
            except Exception as e:
                self.app.logger.error("wrong api key: %r, %r", authheader, e)
                return True

            if username == self.app.config["webui_username"] and password == self.app.config["webui_password"]:
                return False
            else:
                return True
        return False
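
The Basic-auth handling above reduces to the following round trip; the credentials are made up, and text() is the bytes-to-str helper used throughout these examples:

import base64

# build a header the way a browser would, then parse it the way readonly() does
authheader = 'Basic ' + base64.b64encode(b'admin:secret').decode('ascii')
encoded = authheader[len('Basic '):]
username, password = base64.b64decode(encoded).decode('utf8').split(':', 1)
assert (username, password) == ('admin', 'secret')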
Example #14
 def _parse(self, data):
     for key, value in list(six.iteritems(data)):
         if isinstance(value, (bytearray, six.binary_type)):
             data[key] = utils.text(value)
     if 'result' in data:
         data['result'] = json.loads(data['result'])
     return data
Example #15
    def crawl(self, url=None, track=None, **kwargs):
        if url is None and kwargs.get('callback'):
            url = dataurl.encode(utils.text(kwargs.get('callback')))

        project_data = self.processor.project_manager.get(self.project_name)
        assert project_data, "can't find project: %s" % self.project_name
        instance = project_data['instance']
        instance._reset()
        task = instance.crawl(url, **kwargs)
        if isinstance(task, list):
            task = task[0]
        task['track'] = track
        result = self.fetcher.fetch(task)
        self.processor.on_task(task, result)

        status = None
        while not self.status_queue.empty():
            status = self.status_queue.get()
        newtasks = []
        while not self.newtask_queue.empty():
            newtasks = self.newtask_queue.get()
        result = None
        while not self.result_queue.empty():
            _, result = self.result_queue.get()
        return status, newtasks, result
Example #16
 def doc(self):
     """Returns a PyQuery object of the response's content"""
     if hasattr(self, '_doc'):
         return self._doc
     elements = self.etree
     doc = self._doc = PyQuery(elements)
     doc.make_links_absolute(utils.text(self.url))
     return doc
Example #17
 def test_35_run_http_task(self):
     rv = self.app.post('/debug/test_project/run', data={
         'script': self.script_content,
         'task': json.dumps(self.task_content2)
     })
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertIn(b'follows', rv.data)
Example #18
 def test_33_run_bad_script(self):
     rv = self.app.post('/debug/test_project/run', data={
         'script': self.script_content+'adfasfasdf',
         'task': self.task_content
     })
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertGreater(len(data['logs']), 0)
     self.assertEqual(len(data['follows']), 0)
Example #19
    def test_a20_tasks(self):
        rv = self.app.get('/tasks')
        self.assertEqual(rv.status_code, 200, rv.data)
        self.assertIn(b'SUCCESS</span>', rv.data)
        self.assertNotIn(b'>ERROR</span>', rv.data)
        m = re.search(r'/task/test_project:[^"]+', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.task_url = m.group(0)
        self.assertIsNotNone(self.task_url)
        m = re.search(r'/debug/test_project[^"]+', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.__class__.debug_task_url = m.group(0)
        self.assertIsNotNone(self.debug_task_url)

        rv = self.app.get('/tasks?project=test_project')
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'SUCCESS</span>', rv.data)
        self.assertNotIn(b'>ERROR</span>', rv.data)
Example #20
 def _parse(data):
     for key, value in list(six.iteritems(data)):
         if isinstance(value, six.binary_type):
             data[key] = utils.text(value)
     if "result" in data:
         if isinstance(data["result"], bytearray):
             data["result"] = str(data["result"])
         data["result"] = json.loads(data["result"])
     return data
Example #21
 def test_a15_queues(self):
     rv = self.app.get('/queues')
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertGreater(len(data), 0)
     self.assertIn('scheduler2fetcher', data)
     self.assertIn('fetcher2processor', data)
     self.assertIn('processor2result', data)
     self.assertIn('newtask_queue', data)
     self.assertIn('status_queue', data)
Example #22
 def test_30_run(self):
     rv = self.app.post('/debug/test_project/run', data={
         'script': self.script_content,
         'task': self.task_content
     })
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertIn(b'follows', rv.data)
     self.assertGreater(len(data['follows']), 0)
     self.__class__.task_content2 = data['follows'][0]
Example #23
    def __init__(self, path, environ, app, project=None):
        super(ScriptResource, self).__init__(path, environ)

        self.app = app
        self.new_project = False
        self._project = project
        self.project_name = text(self.name)
        self.writebuffer = None
        if self.project_name.endswith('.py'):
            self.project_name = self.project_name[:-len('.py')]
Example #24
 def _parse(self, data):
     for key, value in list(six.iteritems(data)):
         if isinstance(value, (bytearray, six.binary_type)):
             data[key] = utils.text(value)
     for each in ('schedule', 'fetch', 'process', 'track'):
         if each in data:
             if data[each]:
                 data[each] = json.loads(data[each])
             else:
                 data[each] = {}
     return data
Example #25
    def test_25_debug_post(self):
        rv = self.app.post('/debug/test_project', data={
            'project-name': 'other_project',
            'start-urls': 'http://127.0.0.1:14887/pyspider/test.html',
            'script-mode': 'script',
        })
        self.assertEqual(rv.status_code, 200)
        self.assertIn(b'debugger', rv.data)
        self.assertIn(b'var task_content = ', rv.data)
        self.assertIn(b'var script_content = ', rv.data)

        m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('test_project', m.group(1))
        self.__class__.task_content = json.loads(m.group(1))

        m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
        self.assertIsNotNone(m)
        self.assertIn('127.0.0.1:14887', m.group(1))
        self.__class__.script_content = json.loads(m.group(1))
Example #26
 def test_45_run_with_saved_script(self):
     rv = self.app.post('/debug/test_project/run', data={
         'webdav_mode': 'true',
         'script': '',
         'task': self.task_content
     })
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertIn(b'follows', rv.data)
     self.assertGreater(len(data['follows']), 0)
     self.__class__.task_content2 = data['follows'][0]
Example #27
    def _parse(self, data):
        if six.PY3:
            result = {}
            for key, value in data.items():
                if isinstance(value, bytes):
                    value = utils.text(value)
                result[utils.text(key)] = value
            data = result

        for each in ('schedule', 'fetch', 'process', 'track'):
            if each in data:
                if data[each]:
                    data[each] = json.loads(data[each])
                else:
                    data[each] = {}
        if 'status' in data:
            data['status'] = int(data['status'])
        if 'lastcrawltime' in data:
            data['lastcrawltime'] = float(data['lastcrawltime'] or 0)
        if 'updatetime' in data:
            data['updatetime'] = float(data['updatetime'] or 0)
        return data
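
A round-trip sketch for the parser above, assuming the hash comes back from redis with bytes keys and values; taskdb stands for an instance of the surrounding class and the field values are made up:

# bytes in, text keys plus decoded JSON and numeric fields out
raw = {b'taskid': b'abc', b'status': b'2',
       b'track': b'{"fetch": {"ok": true}}', b'updatetime': b'1500000000.5'}
parsed = taskdb._parse(raw)
assert parsed['status'] == 2
assert parsed['track']['fetch']['ok'] is True
assert parsed['updatetime'] == 1500000000.5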
Example #28
 def _parse(data):
     for key, value in list(six.iteritems(data)):
         if isinstance(value, six.binary_type):
             data[key] = utils.text(value)
     for each in ("schedule", "fetch", "process", "track"):
         if each in data:
             if data[each]:
                 if isinstance(data[each], bytearray):
                     data[each] = str(data[each])
                 data[each] = json.loads(data[each])
             else:
                 data[each] = {}
     return data
Example #29
 def test_a60_fetch_via_cannot_connect_fetcher(self):
     ctx = run.webui.make_context('webui', [
         '--fetcher-rpc', 'http://localhost:20000/',
     ], self.ctx)
     app = run.webui.invoke(ctx)
     app = app.test_client()
     rv = app.post('/debug/test_project/run', data={
         'script': self.script_content,
         'task': self.task_content
     })
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertGreater(len(data['logs']), 0)
     self.assertEqual(len(data['follows']), 0)
Example #30
 def getMemberList(self):
     members = []
     for project in self.projectdb.get_all():
         project_name = project['name']
         if not project_name.endswith('.py'):
             project_name += '.py'
         native_path = os.path.join(self.path, project_name)
         native_path = text(native_path) if six.PY3 else utf8(native_path)
         members.append(ScriptResource(
             native_path,
             self.environ,
             self.app,
             project
         ))
     return members
Example #31
 def test_a25_task_json(self):
     rv = self.app.get(self.task_url + '.json')
     self.assertEqual(rv.status_code, 200)
     self.assertIn('status_string', json.loads(utils.text(rv.data)))
Example #32
def search_blob_demo():
    # connection settings
    mysql_config = {
        'host': '192.168.1.244',
        'port': 3306,
        'user': '******',
        'password': '******',
        'db': 'resultdb',
        'charset': 'utf8',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    # create the connection
    connection = pymysql.connect(**mysql_config)
    # execute SQL statements
    try:
        with connection.cursor() as cursor:
            # run the query
            sql = 'select * from boohee6'
            # fetch all query results
            cursor.execute(sql)
            data = cursor.fetchall()
            wb = Workbook()
            # create the worksheet
            sheet = wb.create_sheet('薄荷网', 0)
            # rows can be appended, starting from the first column
            sheet.append(title_row_all)
            num = 1
            for j in range(len(SEARCH_LIST2)):
                search_key = SEARCH_LIST2[j]
                for i in range(len(data)):
                    for key, value in list(six.iteritems(data[i])):
                        if isinstance(value, (bytearray, six.binary_type)):
                            # same effect as value.decode('utf8')
                            data[i][key] = utils.text(value)
                    if 'result' in data[i]:
                        result = data[i]['result']
                        # if result is still a str, decode the JSON
                        if isinstance(result, str):
                            data[i]['result'] = json.loads(data[i]['result'])
                    # keep only rows whose name exactly matches search_key
                    if search_key != data[i]['result']['name']:
                        continue
                    else:
                        # skip entries whose type is listed in type_row
                        if data[i]['result']['type'] in type_row:
                            continue
                        else:
                            search_key_title = []
                            search_key_title.append(search_key)
                            sheet.append(search_key_title)
                            num += 1
                            # get the title-row cell
                            title_cell = sheet.cell(None, num, 1)
                            title_cell.font = font1
                            # merge cells from num row
                            sheet.merge_cells(None, num, 1, num, 3)

                            contents = data[i]['result']['contents'].split(
                                '>>')[0].strip().split(' ')[2:]
                            content_all = []
                            for s in range(len(title_row)):
                                if title_row[s] in contents:
                                    for z in range(len(contents)):
                                        if title_row[s] == contents[z]:
                                            content_all.append(
                                                '' if contents[z + 1] == '一' else contents[z + 1])
                                else:
                                    content_all.append('')
                            content_all.insert(0, data[i]['result']['name'])
                            content_all.insert(1, data[i]['result']['type'])
                            sheet.append(content_all)
                            num += 1
            # save the workbook
            wb.save("薄荷网食物data-precise1.xlsx")

        # autocommit is not enabled by default, so commit explicitly to persist the executed statements
        connection.commit()
    finally:
        connection.close()
Example #33
 def test_70_get(self):
     io = BytesIO()
     self.webdav.download('sample_handler.py', io)
     self.assertEqual(utils.text(inspect.getsource(data_handler)),
                      utils.text(io.getvalue()))
     io.close()
Example #34
    def on_task(self, task, response):
        start_time = time.time()
        try:
            response = rebuild_response(response)
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('updatetime', None)
            project_data = self.project_manager.get(project, updatetime)
            if not project_data:
                logger.error("no such project: %s", project)
                return False
            ret = project_data['instance'].run(project_data['module'], task,
                                               response)
        except Exception as e:
            logger.exception(e)
            return False
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'redirect_url': response.url if response.url != response.orig_url else None,
                        'time': response.time,
                        'error': response.error,
                        'status_code': response.status_code,
                        'encoding': response.encoding,
                        'headers': track_headers,
                        'content': response.content[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result': (None if ret.result is None
                                   else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]),
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                },
            }

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            self.newtask_queue.put(
                [utils.unicode_obj(newtask) for newtask in ret.follows])

        for project, msg, url in ret.messages:
            self.inqueue.put(({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            }))

        if response.error or ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func(
            'process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r'
            % (task['project'], task['taskid'], task.get('url'),
               response.status_code, len(response.content), ret.result,
               len(ret.follows), len(ret.messages), ret.exception))
        return True
Example #35
 def test_x30_run(self):
     rv = self.app.post('/run', data={
         'project': 'test_project',
     })
     self.assertEqual(rv.status_code, 200)
     self.assertEqual(json.loads(utils.text(rv.data))['result'], False)
Example #36
    def on_task(self, task, response):
        '''Deal one task'''
        start_time = time.time()
        response = rebuild_response(response)

        try:
            assert 'taskid' in task, 'need taskid in task'
            project = task['project']
            updatetime = task.get('project_updatetime', None)
            md5sum = task.get('project_md5sum', None)
            project_data = self.project_manager.get(project, updatetime, md5sum)
            assert project_data, "no such project!"
            if project_data.get('exception'):
                ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                      exception=project_data['exception'])
            else:
                ret = project_data['instance'].run_task(
                    project_data['module'], task, response)
        except Exception as e:
            logstr = traceback.format_exc()
            ret = ProcessorResult(logs=(logstr, ), exception=e)
        process_time = time.time() - start_time

        if not ret.extinfo.get('not_send_status', False):
            if ret.exception:
                track_headers = dict(response.headers)
            else:
                track_headers = {}
                for name in ('etag', 'last-modified'):
                    if name not in response.headers:
                        continue
                    track_headers[name] = response.headers[name]

            status_pack = {
                'taskid': task['taskid'],
                'project': task['project'],
                'url': task.get('url'),
                'track': {
                    'fetch': {
                        'ok': response.isok(),
                        'redirect_url': response.url if response.url != response.orig_url else None,
                        'time': response.time,
                        'error': response.error,
                        'status_code': response.status_code,
                        'encoding': getattr(response, '_encoding', None),
                        'headers': track_headers,
                        'content': response.text[:500] if ret.exception else None,
                    },
                    'process': {
                        'ok': not ret.exception,
                        'time': process_time,
                        'follows': len(ret.follows),
                        'result': (
                            None if ret.result is None
                            else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                        ),
                        'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                        'exception': ret.exception,
                    },
                    'save': ret.save,
                },
            }
            if 'schedule' in task:
                status_pack['schedule'] = task['schedule']

            # FIXME: unicode_obj should used in scheduler before store to database
            # it's used here for performance.
            self.status_queue.put(utils.unicode_obj(status_pack))

        # FIXME: unicode_obj should used in scheduler before store to database
        # it's used here for performance.
        if ret.follows:
            for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
                self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])

        for project, msg, url in ret.messages:
            try:
                self.on_task({
                    'taskid': utils.md5string(url),
                    'project': project,
                    'url': url,
                    'process': {
                        'callback': '_on_message',
                    }
                }, {
                    'status_code': 200,
                    'url': url,
                    'save': (task['project'], msg),
                })
            except Exception as e:
                logger.exception('Sending message error.')
                continue

        if ret.exception:
            logger_func = logger.error
        else:
            logger_func = logger.info
        logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
            task['project'], task['taskid'],
            task.get('url'), response.status_code, len(response.content),
            ret.result, len(ret.follows), len(ret.messages), ret.exception))
        return True
Example #37
    def http_fetch(self, url, task):
        '''HTTP fetcher'''
        start_time = time.time()
        self.on_fetch('http', task)
        handle_error = lambda x: self.handle_error('http', url, task,
                                                   start_time, x)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})

        session = cookies.RequestsCookieJar()
        # fix for tornado request obj
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        max_redirects = task_fetch.get('max_redirects', 5)
        # we will handle redirects by hand to capture cookies
        fetch['follow_redirects'] = False

        # making requests
        while True:
            # robots.txt
            if task_fetch.get('robots_txt', False):
                can_fetch = yield self.can_fetch(
                    fetch['headers']['User-Agent'], fetch['url'])
                if not can_fetch:
                    error = tornado.httpclient.HTTPError(
                        403, 'Disallowed by robots.txt')
                    raise gen.Return(handle_error(error))

            try:
                request = tornado.httpclient.HTTPRequest(**fetch)
                # if cookie already in header, get_cookie_header wouldn't work
                old_cookie_header = request.headers.get('Cookie')
                if old_cookie_header:
                    del request.headers['Cookie']
                cookie_header = cookies.get_cookie_header(session, request)
                if cookie_header:
                    request.headers['Cookie'] = cookie_header
                elif old_cookie_header:
                    request.headers['Cookie'] = old_cookie_header
            except Exception as e:
                logger.exception(fetch)
                raise gen.Return(handle_error(e))

            try:
                response = yield gen.maybe_future(
                    self.http_client.fetch(request))
            except tornado.httpclient.HTTPError as e:
                if e.response:
                    response = e.response
                else:
                    raise gen.Return(handle_error(e))

            extract_cookies_to_jar(session, response.request, response.headers)
            if (response.code in (301, 302, 303, 307)
                    and response.headers.get('Location')
                    and task_fetch.get('allow_redirects', True)):
                if max_redirects <= 0:
                    error = tornado.httpclient.HTTPError(
                        599, 'Maximum (%d) redirects followed' %
                        task_fetch.get('max_redirects', 5), response)
                    raise gen.Return(handle_error(error))
                if response.code in (302, 303):
                    fetch['method'] = 'GET'
                    if 'body' in fetch:
                        del fetch['body']
                fetch['url'] = quote_chinese(
                    urljoin(fetch['url'], response.headers['Location']))
                fetch['request_timeout'] -= time.time() - start_time
                if fetch['request_timeout'] < 0:
                    fetch['request_timeout'] = 0.1
                max_redirects -= 1
                continue

            result = {}
            result['orig_url'] = url
            result['content'] = response.body or ''
            result['headers'] = dict(response.headers)
            result['status_code'] = response.code
            result['url'] = response.effective_url or url
            result['time'] = time.time() - start_time
            result['cookies'] = session.get_dict()
            result['save'] = task_fetch.get('save')
            if response.error:
                result['error'] = utils.text(response.error)
            if 200 <= response.code < 300:
                logger.info("[%d] %s:%s %s %.2fs", response.code,
                            task.get('project'), task.get('taskid'), url,
                            result['time'])
            else:
                logger.warning("[%d] %s:%s %s %.2fs", response.code,
                               task.get('project'), task.get('taskid'), url,
                               result['time'])

            raise gen.Return(result)
Example #38
    def puppeteer_fetch(self, url, task):
        '''Fetch with puppeteer proxy'''
        start_time = time.time()
        self.on_fetch('puppeteer', task)
        handle_error = lambda x: self.handle_error('puppeteer', url, task,
                                                   start_time, x)

        # check puppeteer proxy is enabled
        if not self.puppeteer_proxy:
            result = {
                "orig_url": url,
                "content": "puppeteer is not enabled.",
                "headers": {},
                "status_code": 501,
                "url": url,
                "time": time.time() - start_time,
                "cookies": {},
                "save": task.get('fetch', {}).get('save')
            }
            logger.warning("[501] %s:%s %s 0s", task.get('project'),
                           task.get('taskid'), url)
            raise gen.Return(result)

        # setup request parameters
        fetch = self.pack_tornado_request_parameters(url, task)
        task_fetch = task.get('fetch', {})
        for each in task_fetch:
            if each not in fetch:
                fetch[each] = task_fetch[each]
        fetch['headless'] = "false" if "headless" not in fetch else fetch['headless']
        # robots.txt
        if task_fetch.get('robots_txt', False):
            user_agent = fetch['headers']['User-Agent']
            can_fetch = yield self.can_fetch(user_agent, url)
            if not can_fetch:
                error = tornado.httpclient.HTTPError(
                    403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        request_conf = {'follow_redirects': False}
        request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
        request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

        session = cookies.RequestsCookieJar()
        if 'Cookie' in fetch['headers']:
            c = http_cookies.SimpleCookie()
            try:
                c.load(fetch['headers']['Cookie'])
            except AttributeError:
                c.load(utils.utf8(fetch['headers']['Cookie']))
            for key in c:
                session.set(key, c[key])
            del fetch['headers']['Cookie']
        if 'cookies' in fetch:
            session.update(fetch['cookies'])
            del fetch['cookies']

        request = tornado.httpclient.HTTPRequest(url=fetch['url'])
        cookie_header = cookies.get_cookie_header(session, request)
        if cookie_header:
            fetch['headers']['Cookie'] = cookie_header

        logger.info("%s", self.puppeteer_proxy)
        # making requests
        fetch['headers'] = dict(fetch['headers'])
        headers = {}
        headers['Content-Type'] = 'application/json; charset=UTF-8'
        try:
            request = tornado.httpclient.HTTPRequest(url=self.puppeteer_proxy,
                                                     method="POST",
                                                     headers=headers,
                                                     body=json.dumps(fetch),
                                                     **request_conf)
        except Exception as e:
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        if not response.body:
            raise gen.Return(
                handle_error(
                    Exception('no response from puppeteer: %r' % response)))

        result = {}
        try:
            result = json.loads(utils.text(response.body))
            assert 'status_code' in result, result
        except Exception as e:
            if response.error:
                result['error'] = utils.text(response.error)
            raise gen.Return(handle_error(e))

        if result.get('status_code', 200):
            logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                        task.get('project'), task.get('taskid'), url,
                        result['time'])
        else:
            logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                         task.get('project'), task.get('taskid'), url,
                         result['content'], result['time'])

        raise gen.Return(result)
Example #39
 def test_42_get(self):
     rv = self.app.get('/debug/test_project/get')
     self.assertEqual(rv.status_code, 200)
     data = json.loads(utils.text(rv.data))
     self.assertIn('script', data)
     self.assertEqual(data['script'], self.script_content)
Example #40
 def test_x20_counter(self):
     rv = self.app.get('/counter?time=5m&type=sum')
     self.assertEqual(rv.status_code, 200)
     self.assertEqual(json.loads(utils.text(rv.data)), {})