def test_a10_counter(self):
    for i in range(30):
        time.sleep(1)
        if self.rpc.counter('5m', 'sum')\
                .get('test_project', {}).get('success', 0) > 5:
            break

    rv = self.app.get('/counter?time=5m&type=sum')
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data), 0)
    self.assertGreater(data['test_project']['success'], 3)

    rv = self.app.get('/counter?time=1h&type=sum')
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data), 0)
    self.assertGreater(data['test_project']['success'], 3)

    rv = self.app.get('/counter?time=1d&type=sum')
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data), 0)
    self.assertGreater(data['test_project']['success'], 3)

    rv = self.app.get('/counter?time=all&type=sum')
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data), 0)
    self.assertGreater(data['test_project']['success'], 3)
def _parse(data):
    if six.PY3:
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.binary_type):
                data[utils.text(key)] = utils.text(value)
            else:
                data[utils.text(key)] = value
    return data
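# For reference (hedged, paraphrasing the pyspider project): utils.text()
# returns unicode text unchanged and decodes bytes with the given encoding
# (utf8 by default). That is why the _parse() helpers in this collection push
# redis/mysql byte payloads through it before json.loads() or dict lookups:
#
#   utils.text(b'hello')   # -> 'hello'
#   utils.text('hello')    # -> 'hello' (unchanged)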
def test_50_get(self):
    io = BytesIO()
    self.webdav.download('handler.py', io)
    self.assertEqual(utils.text(inspect.getsource(data_handler)),
                     utils.text(io.getvalue()))
    io.close()

    io = BytesIO()
    self.webdav.download('sample_handler.py', io)
    self.assertEqual(utils.text(inspect.getsource(data_sample_handler)),
                     utils.text(io.getvalue()))
    io.close()
def test_50_get(self):
    import easywebdav
    with self.assertRaises(easywebdav.OperationFailed):
        io = BytesIO()
        self.webdav.download('handler.py', io)
        io.close()

    io = BytesIO()
    self.webdav_up.download('handler.py', io)
    self.assertEqual(utils.text(inspect.getsource(data_handler)),
                     utils.text(io.getvalue()))
    io.close()
def test_20_debug(self):
    rv = self.app.get("/debug/test_project")
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b"debugger", rv.data)
    self.assertIn(b"var task_content = ", rv.data)
    self.assertIn(b"var script_content = ", rv.data)

    m = re.search(r"var task_content = (.*);\n", utils.text(rv.data))
    self.assertIsNotNone(m)
    self.__class__.task_content = json.loads(m.group(1))

    m = re.search(r"var script_content = (.*);\n", utils.text(rv.data))
    self.assertIsNotNone(m)
    self.__class__.script_content = json.loads(m.group(1))
def test_20_debug(self):
    rv = self.app.get('/debug/test_project')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'debugger', rv.data)
    self.assertIn(b'var task_content = ', rv.data)
    self.assertIn(b'var script_content = ', rv.data)

    m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.assertIn('test_project', json.loads(m.group(1)))

    m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.assertIn('__START_URL__', json.loads(m.group(1)))
def test_20_debug(self):
    rv = self.app.get('/debug/test_project')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'debugger', rv.data)
    self.assertIn(b'var task_content = ', rv.data)
    self.assertIn(b'var script_content = ', rv.data)

    m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.__class__.task_content = json.loads(m.group(1))

    m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.__class__.script_content = (json.loads(m.group(1))
                                     .replace('http://scrapy.org/',
                                              'http://127.0.0.1:14887/pyspider/test.html'))
def load_tasks(self, status, project=None, fields=None):
    if project is None:
        project = self.projects
    elif not isinstance(project, list):
        project = [project, ]

    if self.scan_available:
        scan_method = self.redis.sscan_iter
    else:
        scan_method = self.redis.smembers

    if fields:
        def get_method(key):
            obj = self.redis.hmget(key, fields)
            if all(x is None for x in obj):
                return None
            return dict(zip(fields, obj))
    else:
        get_method = self.redis.hgetall

    for p in project:
        status_key = self._gen_status_key(p, status)
        for taskid in scan_method(status_key):
            obj = get_method(self._gen_key(p, utils.text(taskid)))
            if not obj:
                #self.redis.srem(status_key, taskid)
                continue
            else:
                yield self._parse(obj)
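# Hedged usage sketch (the instance and the status constant are assumptions,
# based on pyspider's TaskDB constants such as SUCCESS == 2): passing `fields`
# makes load_tasks() fetch only those hash fields via HMGET instead of pulling
# each whole task hash with HGETALL:
#
#   for task in taskdb.load_tasks(taskdb.SUCCESS, project='test_project',
#                                 fields=['taskid', 'url', 'status']):
#       print(task['taskid'], task['url'])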
def test_90_run(self):
    time.sleep(0.5)
    rv = self.app.post('/run', data={
        'project': 'test_project',
    })
    self.assertEqual(rv.status_code, 200)
    self.assertEqual(json.loads(utils.text(rv.data))['result'], True)
def _select2dic(self, tablename=None, what="*", where="", where_values=[],
                order=None, offset=0, limit=None):
    tablename = self.escape(tablename or self.__tablename__)
    if isinstance(what, (list, tuple)) or what is None:
        what = ','.join(self.escape(f) for f in what) if what else '*'

    sql_query = "SELECT %s FROM %s" % (what, tablename)
    if where:
        sql_query += " WHERE %s" % where
    if order:
        sql_query += ' ORDER BY %s' % order
    if limit:
        sql_query += " LIMIT %d, %d" % (offset, limit)
    elif offset:
        sql_query += " LIMIT %d, %d" % (offset, self.maxlimit)
    logger.debug("<sql: %s>", sql_query)

    dbcur = self._execute(sql_query, where_values)
    # f[0] may return bytes type
    # https://github.com/mysql/mysql-connector-python/pull/37
    fields = [utils.text(f[0]) for f in dbcur.description]
    for row in dbcur:
        yield dict(zip(fields, row))
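# A minimal, self-contained sketch of the query assembly above (assumption:
# escape() backtick-quotes identifiers, as MySQL-flavored code usually does):
def _demo_select_sql(tablename, what=None, where='', offset=0, limit=None):
    escape = lambda name: '`%s`' % name
    what = ','.join(escape(f) for f in what) if what else '*'
    sql_query = "SELECT %s FROM %s" % (what, escape(tablename))
    if where:
        sql_query += " WHERE %s" % where
    if limit:
        sql_query += " LIMIT %d, %d" % (offset, limit)
    return sql_query

# _demo_select_sql('taskdb', ['taskid', 'url'], where='status = %s', limit=10)
# -> "SELECT `taskid`,`url` FROM `taskdb` WHERE status = %s LIMIT 0, 10"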
def readonly(self):
    projectdb = self.app.config["projectdb"]
    if not projectdb:
        return True
    if (
        "lock" in projectdb.split_group(self.project.get("group"))
        and self.app.config.get("webui_username")
        and self.app.config.get("webui_password")
    ):
        authheader = self.environ.get("HTTP_AUTHORIZATION")
        if not authheader:
            return True
        authheader = authheader[len("Basic "):]
        try:
            username, password = text(base64.b64decode(authheader)).split(":", 1)
        except Exception as e:
            self.app.logger.error("wrong api key: %r, %r", authheader, e)
            return True
        if (
            username == self.app.config["webui_username"]
            and password == self.app.config["webui_password"]
        ):
            return False
        else:
            return True
    return False
def _parse(self, data):
    for key, value in list(six.iteritems(data)):
        if isinstance(value, (bytearray, six.binary_type)):
            data[key] = utils.text(value)
    if 'result' in data:
        data['result'] = json.loads(data['result'])
    return data
def crawl(self, url=None, track=None, **kwargs):
    if url is None and kwargs.get('callback'):
        url = dataurl.encode(utils.text(kwargs.get('callback')))

    project_data = self.processor.project_manager.get(self.project_name)
    assert project_data, "can't find project: %s" % self.project_name
    instance = project_data['instance']
    instance._reset()
    task = instance.crawl(url, **kwargs)
    if isinstance(task, list):
        task = task[0]
    task['track'] = track
    result = self.fetcher.fetch(task)
    self.processor.on_task(task, result)

    status = None
    while not self.status_queue.empty():
        status = self.status_queue.get()
    newtasks = []
    while not self.newtask_queue.empty():
        newtasks = self.newtask_queue.get()
    result = None
    while not self.result_queue.empty():
        _, result = self.result_queue.get()
    return status, newtasks, result
def doc(self):
    """Returns a PyQuery object of the response's content"""
    if hasattr(self, '_doc'):
        return self._doc
    elements = self.etree
    doc = self._doc = PyQuery(elements)
    doc.make_links_absolute(utils.text(self.url))
    return doc
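# Usage note: this is the property behind pyspider's documented response.doc
# idiom. Inside a handler callback you can query the page with CSS selectors,
# and because of make_links_absolute() above, extracted hrefs come back as
# absolute URLs:
#
#   for each in response.doc('a[href^="http"]').items():
#       self.crawl(each.attr.href, callback=self.detail_page)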
def test_35_run_http_task(self):
    rv = self.app.post('/debug/test_project/run', data={
        'script': self.script_content,
        'task': json.dumps(self.task_content2)
    })
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertIn(b'follows', rv.data)
def test_33_run_bad_script(self):
    rv = self.app.post('/debug/test_project/run', data={
        'script': self.script_content + 'adfasfasdf',
        'task': self.task_content
    })
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data['logs']), 0)
    self.assertEqual(len(data['follows']), 0)
def test_a20_tasks(self):
    rv = self.app.get('/tasks')
    self.assertEqual(rv.status_code, 200, rv.data)
    self.assertIn(b'SUCCESS</span>', rv.data)
    self.assertNotIn(b'>ERROR</span>', rv.data)

    m = re.search(r'/task/test_project:[^"]+', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.__class__.task_url = m.group(0)
    self.assertIsNotNone(self.task_url)

    m = re.search(r'/debug/test_project[^"]+', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.__class__.debug_task_url = m.group(0)
    self.assertIsNotNone(self.debug_task_url)

    rv = self.app.get('/tasks?project=test_project')
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'SUCCESS</span>', rv.data)
    self.assertNotIn(b'>ERROR</span>', rv.data)
def _parse(data):
    for key, value in list(six.iteritems(data)):
        if isinstance(value, six.binary_type):
            data[key] = utils.text(value)
    if "result" in data:
        if isinstance(data["result"], bytearray):
            # decode explicitly: str(bytearray) returns the repr on Python 3
            data["result"] = data["result"].decode('utf8')
        data["result"] = json.loads(data["result"])
    return data
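# Why the bytearray branch above must not use plain str(): on Python 3,
# str(bytearray(b'{}')) returns the repr "bytearray(b'{}')", not '{}', so the
# subsequent json.loads() would fail. bytearray.decode() (used above) yields
# the actual text on both Python 2 and 3:
#
#   bytearray(b'{"ok": 1}').decode('utf8')   # -> '{"ok": 1}'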
def test_a15_queues(self):
    rv = self.app.get('/queues')
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data), 0)
    self.assertIn('scheduler2fetcher', data)
    self.assertIn('fetcher2processor', data)
    self.assertIn('processor2result', data)
    self.assertIn('newtask_queue', data)
    self.assertIn('status_queue', data)
def test_30_run(self):
    rv = self.app.post('/debug/test_project/run', data={
        'script': self.script_content,
        'task': self.task_content
    })
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertIn(b'follows', rv.data)
    self.assertGreater(len(data['follows']), 0)
    self.__class__.task_content2 = data['follows'][0]
def __init__(self, path, environ, app, project=None):
    super(ScriptResource, self).__init__(path, environ)

    self.app = app
    self.new_project = False
    self._project = project
    self.project_name = text(self.name)
    self.writebuffer = None
    if self.project_name.endswith('.py'):
        self.project_name = self.project_name[:-len('.py')]
def _parse(self, data):
    for key, value in list(six.iteritems(data)):
        if isinstance(value, (bytearray, six.binary_type)):
            data[key] = utils.text(value)
    for each in ('schedule', 'fetch', 'process', 'track'):
        if each in data:
            if data[each]:
                data[each] = json.loads(data[each])
            else:
                data[each] = {}
    return data
def test_25_debug_post(self):
    rv = self.app.post('/debug/test_project', data={
        'project-name': 'other_project',
        'start-urls': 'http://127.0.0.1:14887/pyspider/test.html',
        'script-mode': 'script',
    })
    self.assertEqual(rv.status_code, 200)
    self.assertIn(b'debugger', rv.data)
    self.assertIn(b'var task_content = ', rv.data)
    self.assertIn(b'var script_content = ', rv.data)

    m = re.search(r'var task_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.assertIn('test_project', m.group(1))
    self.__class__.task_content = json.loads(m.group(1))

    m = re.search(r'var script_content = (.*);\n', utils.text(rv.data))
    self.assertIsNotNone(m)
    self.assertIn('127.0.0.1:14887', m.group(1))
    self.__class__.script_content = json.loads(m.group(1))
def test_45_run_with_saved_script(self):
    rv = self.app.post('/debug/test_project/run', data={
        'webdav_mode': 'true',
        'script': '',
        'task': self.task_content
    })
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertIn(b'follows', rv.data)
    self.assertGreater(len(data['follows']), 0)
    self.__class__.task_content2 = data['follows'][0]
def _parse(self, data):
    if six.PY3:
        result = {}
        for key, value in data.items():
            if isinstance(value, bytes):
                value = utils.text(value)
            result[utils.text(key)] = value
        data = result
    for each in ('schedule', 'fetch', 'process', 'track'):
        if each in data:
            if data[each]:
                data[each] = json.loads(data[each])
            else:
                data[each] = {}
    if 'status' in data:
        data['status'] = int(data['status'])
    if 'lastcrawltime' in data:
        data['lastcrawltime'] = float(data['lastcrawltime'] or 0)
    if 'updatetime' in data:
        data['updatetime'] = float(data['updatetime'] or 0)
    return data
def _parse(data):
    for key, value in list(six.iteritems(data)):
        if isinstance(value, six.binary_type):
            data[key] = utils.text(value)
    for each in ("schedule", "fetch", "process", "track"):
        if each in data:
            if data[each]:
                if isinstance(data[each], bytearray):
                    # decode explicitly: str(bytearray) returns the repr on Python 3
                    data[each] = data[each].decode('utf8')
                data[each] = json.loads(data[each])
            else:
                data[each] = {}
    return data
def test_a60_fetch_via_cannot_connect_fetcher(self):
    ctx = run.webui.make_context('webui', [
        '--fetcher-rpc', 'http://localhost:20000/',
    ], self.ctx)
    app = run.webui.invoke(ctx)
    app = app.test_client()

    rv = app.post('/debug/test_project/run', data={
        'script': self.script_content,
        'task': self.task_content
    })
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertGreater(len(data['logs']), 0)
    self.assertEqual(len(data['follows']), 0)
def getMemberList(self):
    members = []
    for project in self.projectdb.get_all():
        project_name = project['name']
        if not project_name.endswith('.py'):
            project_name += '.py'
        native_path = os.path.join(self.path, project_name)
        native_path = text(native_path) if six.PY3 else utf8(native_path)
        members.append(ScriptResource(
            native_path,
            self.environ,
            self.app,
            project
        ))
    return members
def test_a25_task_json(self):
    rv = self.app.get(self.task_url + '.json')
    self.assertEqual(rv.status_code, 200)
    self.assertIn('status_string', json.loads(utils.text(rv.data)))
def search_blob_demo():
    # connection settings
    mysql_config = {
        'host': '192.168.1.244',
        'port': 3306,
        'user': '******',
        'password': '******',
        'db': 'resultdb',
        'charset': 'utf8',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    # create the connection
    connection = pymysql.connect(**mysql_config)
    # execute the SQL statement
    try:
        with connection.cursor() as cursor:
            # run the query
            sql = 'select * from boohee6'
            # fetch the result set
            cursor.execute(sql)
            data = cursor.fetchall()

            wb = Workbook()
            # create and activate the worksheet
            sheet = wb.create_sheet('薄荷网', 0)
            # append rows, starting from the first column
            sheet.append(title_row_all)
            num = 1
            for j in range(len(SEARCH_LIST2)):
                search_key = SEARCH_LIST2[j]
                for i in range(len(data)):
                    for key, value in list(six.iteritems(data[i])):
                        if isinstance(value, (bytearray, six.binary_type)):
                            # data[i][key] = value.decode('utf8')  # same effect as the line below
                            data[i][key] = utils.text(value)
                    if 'result' in data[i]:
                        result = data[i]['result']
                        # result may still be a JSON string
                        if isinstance(result, str):
                            data[i]['result'] = json.loads(data[i]['result'])
                        # require an exact match between search_key and name
                        if search_key != data[i]['result']['name']:
                            continue
                        else:
                            # skip rows whose type is listed in type_row
                            if data[i]['result']['type'] in type_row:
                                continue
                            else:
                                search_key_title = []
                                search_key_title.append(search_key)
                                sheet.append(search_key_title)
                                num += 1
                                # style the title row cell
                                title_cell = sheet.cell(None, num, 1)
                                title_cell.font = font1
                                # merge cells on row `num`
                                sheet.merge_cells(None, num, 1, num, 3)
                                contents = data[i]['result']['contents'].split(
                                    '>>')[0].strip().split(' ')[2:]
                                content_all = []
                                for s in range(len(title_row)):
                                    if title_row[s] in contents:
                                        for z in range(len(contents)):
                                            if title_row[s] == contents[z]:
                                                content_all.append('' if contents[
                                                    z + 1] == '一' else contents[z + 1])
                                    else:
                                        content_all.append('')
                                content_all.insert(0, data[i]['result']['name'])
                                content_all.insert(1, data[i]['result']['type'])
                                sheet.append(content_all)
                                num += 1
            # save the workbook
            wb.save("薄荷网食物data-precise1.xlsx")
        # autocommit is off by default; commit explicitly so the executed
        # statements are persisted
        connection.commit()
    finally:
        connection.close()
def test_70_get(self):
    io = BytesIO()
    self.webdav.download('sample_handler.py', io)
    self.assertEqual(utils.text(inspect.getsource(data_handler)),
                     utils.text(io.getvalue()))
    io.close()
def on_task(self, task, response):
    start_time = time.time()
    try:
        response = rebuild_response(response)
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('updatetime', None)
        project_data = self.project_manager.get(project, updatetime)
        if not project_data:
            logger.error("no such project: %s", project)
            return False
        ret = project_data['instance'].run(
            project_data['module'], task, response)
    except Exception as e:
        logger.exception(e)
        return False
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        if ret.exception:
            track_headers = dict(response.headers)
        else:
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': response.encoding,
                    'headers': track_headers,
                    'content': response.content[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (
                        None if ret.result is None
                        else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                    ),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
            },
        }

        # FIXME: unicode_obj should be used in the scheduler before storing to
        # the database; it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should be used in the scheduler before storing to
    # the database; it's used here for performance.
    if ret.follows:
        self.newtask_queue.put(
            [utils.unicode_obj(newtask) for newtask in ret.follows])

    for project, msg, url in ret.messages:
        self.inqueue.put(({
            'taskid': utils.md5string(url),
            'project': project,
            'url': url,
            'process': {
                'callback': '_on_message',
            }
        }, {
            'status_code': 200,
            'url': url,
            'save': (task['project'], msg),
        }))

    if response.error or ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
def test_x30_run(self):
    rv = self.app.post('/run', data={
        'project': 'test_project',
    })
    self.assertEqual(rv.status_code, 200)
    self.assertEqual(json.loads(utils.text(rv.data))['result'], False)
def on_task(self, task, response):
    '''Deal with one task'''
    start_time = time.time()
    response = rebuild_response(response)

    try:
        assert 'taskid' in task, 'need taskid in task'
        project = task['project']
        updatetime = task.get('project_updatetime', None)
        md5sum = task.get('project_md5sum', None)
        project_data = self.project_manager.get(project, updatetime, md5sum)
        assert project_data, "no such project!"
        if project_data.get('exception'):
            ret = ProcessorResult(logs=(project_data.get('exception_log'), ),
                                  exception=project_data['exception'])
        else:
            ret = project_data['instance'].run_task(
                project_data['module'], task, response)
    except Exception as e:
        logstr = traceback.format_exc()
        ret = ProcessorResult(logs=(logstr, ), exception=e)
    process_time = time.time() - start_time

    if not ret.extinfo.get('not_send_status', False):
        if ret.exception:
            track_headers = dict(response.headers)
        else:
            track_headers = {}
            for name in ('etag', 'last-modified'):
                if name not in response.headers:
                    continue
                track_headers[name] = response.headers[name]

        status_pack = {
            'taskid': task['taskid'],
            'project': task['project'],
            'url': task.get('url'),
            'track': {
                'fetch': {
                    'ok': response.isok(),
                    'redirect_url': response.url if response.url != response.orig_url else None,
                    'time': response.time,
                    'error': response.error,
                    'status_code': response.status_code,
                    'encoding': getattr(response, '_encoding', None),
                    'headers': track_headers,
                    'content': response.text[:500] if ret.exception else None,
                },
                'process': {
                    'ok': not ret.exception,
                    'time': process_time,
                    'follows': len(ret.follows),
                    'result': (
                        None if ret.result is None
                        else utils.text(ret.result)[:self.RESULT_RESULT_LIMIT]
                    ),
                    'logs': ret.logstr()[-self.RESULT_LOGS_LIMIT:],
                    'exception': ret.exception,
                },
                'save': ret.save,
            },
        }
        if 'schedule' in task:
            status_pack['schedule'] = task['schedule']

        # FIXME: unicode_obj should be used in the scheduler before storing to
        # the database; it's used here for performance.
        self.status_queue.put(utils.unicode_obj(status_pack))

    # FIXME: unicode_obj should be used in the scheduler before storing to
    # the database; it's used here for performance.
    if ret.follows:
        for each in (ret.follows[x:x + 1000] for x in range(0, len(ret.follows), 1000)):
            self.newtask_queue.put([utils.unicode_obj(newtask) for newtask in each])

    for project, msg, url in ret.messages:
        try:
            self.on_task({
                'taskid': utils.md5string(url),
                'project': project,
                'url': url,
                'process': {
                    'callback': '_on_message',
                }
            }, {
                'status_code': 200,
                'url': url,
                'save': (task['project'], msg),
            })
        except Exception as e:
            logger.exception('Sending message error.')
            continue

    if ret.exception:
        logger_func = logger.error
    else:
        logger_func = logger.info
    logger_func('process %s:%s %s -> [%d] len:%d -> result:%.10r fol:%d msg:%d err:%r' % (
        task['project'], task['taskid'], task.get('url'),
        response.status_code, len(response.content),
        ret.result, len(ret.follows), len(ret.messages), ret.exception))

    return True
def http_fetch(self, url, task):
    '''HTTP fetcher'''
    start_time = time.time()
    self.on_fetch('http', task)
    handle_error = lambda x: self.handle_error('http', url, task, start_time, x)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})

    session = cookies.RequestsCookieJar()
    # fix for tornado request obj
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    max_redirects = task_fetch.get('max_redirects', 5)
    # we will handle redirects by hand to capture cookies
    fetch['follow_redirects'] = False

    # making requests
    while True:
        # robots.txt
        if task_fetch.get('robots_txt', False):
            can_fetch = yield self.can_fetch(fetch['headers']['User-Agent'], fetch['url'])
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        try:
            request = tornado.httpclient.HTTPRequest(**fetch)
            # if cookie already in header, get_cookie_header wouldn't work
            old_cookie_header = request.headers.get('Cookie')
            if old_cookie_header:
                del request.headers['Cookie']
            cookie_header = cookies.get_cookie_header(session, request)
            if cookie_header:
                request.headers['Cookie'] = cookie_header
            elif old_cookie_header:
                request.headers['Cookie'] = old_cookie_header
        except Exception as e:
            logger.exception(fetch)
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        extract_cookies_to_jar(session, response.request, response.headers)
        if (response.code in (301, 302, 303, 307)
                and response.headers.get('Location')
                and task_fetch.get('allow_redirects', True)):
            if max_redirects <= 0:
                error = tornado.httpclient.HTTPError(
                    599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5),
                    response)
                raise gen.Return(handle_error(error))
            if response.code in (302, 303):
                fetch['method'] = 'GET'
                if 'body' in fetch:
                    del fetch['body']
            fetch['url'] = quote_chinese(urljoin(fetch['url'], response.headers['Location']))
            fetch['request_timeout'] -= time.time() - start_time
            if fetch['request_timeout'] < 0:
                fetch['request_timeout'] = 0.1
            max_redirects -= 1
            continue

        result = {}
        result['orig_url'] = url
        result['content'] = response.body or ''
        result['headers'] = dict(response.headers)
        result['status_code'] = response.code
        result['url'] = response.effective_url or url
        result['time'] = time.time() - start_time
        result['cookies'] = session.get_dict()
        result['save'] = task_fetch.get('save')
        if response.error:
            result['error'] = utils.text(response.error)
        if 200 <= response.code < 300:
            logger.info("[%d] %s:%s %s %.2fs", response.code,
                        task.get('project'), task.get('taskid'),
                        url, result['time'])
        else:
            logger.warning("[%d] %s:%s %s %.2fs", response.code,
                           task.get('project'), task.get('taskid'),
                           url, result['time'])

        raise gen.Return(result)
def puppeteer_fetch(self, url, task):
    '''Fetch with puppeteer proxy'''
    start_time = time.time()
    self.on_fetch('puppeteer', task)
    handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)

    # check if the puppeteer proxy is enabled
    if not self.puppeteer_proxy:
        result = {
            "orig_url": url,
            "content": "puppeteer is not enabled.",
            "headers": {},
            "status_code": 501,
            "url": url,
            "time": time.time() - start_time,
            "cookies": {},
            "save": task.get('fetch', {}).get('save')
        }
        logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
        raise gen.Return(result)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})
    for each in task_fetch:
        if each not in fetch:
            fetch[each] = task_fetch[each]
    fetch['headless'] = "false" if "headless" not in fetch else fetch['headless']

    # robots.txt
    if task_fetch.get('robots_txt', False):
        user_agent = fetch['headers']['User-Agent']
        can_fetch = yield self.can_fetch(user_agent, url)
        if not can_fetch:
            error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
            raise gen.Return(handle_error(error))

    request_conf = {'follow_redirects': False}
    request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
    request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

    session = cookies.RequestsCookieJar()
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    request = tornado.httpclient.HTTPRequest(url=fetch['url'])
    cookie_header = cookies.get_cookie_header(session, request)
    if cookie_header:
        fetch['headers']['Cookie'] = cookie_header

    logger.info("%s", self.puppeteer_proxy)
    # making requests
    fetch['headers'] = dict(fetch['headers'])
    headers = {}
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    try:
        request = tornado.httpclient.HTTPRequest(
            url=self.puppeteer_proxy, method="POST",
            headers=headers, body=json.dumps(fetch), **request_conf)
    except Exception as e:
        raise gen.Return(handle_error(e))

    try:
        response = yield gen.maybe_future(self.http_client.fetch(request))
    except tornado.httpclient.HTTPError as e:
        if e.response:
            response = e.response
        else:
            raise gen.Return(handle_error(e))

    if not response.body:
        raise gen.Return(handle_error(Exception('no response from puppeteer: %r' % response)))

    result = {}
    try:
        result = json.loads(utils.text(response.body))
        assert 'status_code' in result, result
    except Exception as e:
        if response.error:
            result['error'] = utils.text(response.error)
        raise gen.Return(handle_error(e))

    if result.get('status_code', 200):
        logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                    task.get('project'), task.get('taskid'), url, result['time'])
    else:
        logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                     task.get('project'), task.get('taskid'),
                     url, result['content'], result['time'])

    raise gen.Return(result)
def test_42_get(self):
    rv = self.app.get('/debug/test_project/get')
    self.assertEqual(rv.status_code, 200)
    data = json.loads(utils.text(rv.data))
    self.assertIn('script', data)
    self.assertEqual(data['script'], self.script_content)
def test_x20_counter(self):
    rv = self.app.get('/counter?time=5m&type=sum')
    self.assertEqual(rv.status_code, 200)
    self.assertEqual(json.loads(utils.text(rv.data)), {})