def test_a110_one(self):
    pid, fd = os.forkpty()
    #cmd = [sys.executable]
    cmd = ['coverage', 'run']
    cmd += [
        inspect.getsourcefile(run),
        'one',
        '-i',
        inspect.getsourcefile(data_sample_handler)
    ]

    if pid == 0:
        # child
        os.execvp(cmd[0], cmd)
    else:
        # parent
        def wait_text(timeout=1):
            import select
            text = []
            while True:
                rl, wl, xl = select.select([fd], [], [], timeout)
                if not rl:
                    break
                try:
                    t = os.read(fd, 1024)
                except OSError:
                    break
                if not t:
                    break
                t = utils.text(t)
                text.append(t)
                print(t, end='')
            return ''.join(text)

        text = wait_text()
        self.assertIn('new task data_sample_handler:on_start', text)
        self.assertIn('pyspider shell', text)

        os.write(fd, utils.utf8('run()\n'))
        text = wait_text()
        self.assertIn('task done data_sample_handler:on_start', text)

        os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('/robots.txt', text)

        os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin))
        text = wait_text(2)
        self.assertIn('"title": "Links"', text)

        os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('task retry', text)

        os.write(fd, b'quit_pyspider()\n')
        text = wait_text()
        self.assertIn('scheduler exiting...', text)

        os.close(fd)
        os.kill(pid, signal.SIGINT)

def test_a110_one(self):
    pid, fd = os.forkpty()
    #cmd = [sys.executable]
    cmd = ['coverage', 'run']
    cmd += [
        inspect.getsourcefile(run),
        'one',
        '-i',
        inspect.getsourcefile(data_sample_handler)
    ]

    if pid == 0:
        # child
        os.execvp(cmd[0], cmd)
    else:
        # parent
        def wait_text(timeout=1):
            import select
            text = []
            while True:
                rl, wl, xl = select.select([fd], [], [], timeout)
                if not rl:
                    break
                try:
                    t = os.read(fd, 1024)
                except OSError:
                    break
                if not t:
                    break
                t = utils.text(t)
                text.append(t)
                print(t, end='')
            return ''.join(text)

        text = wait_text()
        self.assertIn('new task data_sample_handler:on_start', text)
        self.assertIn('pyspider shell', text)

        os.write(fd, utils.utf8('run()\n'))
        text = wait_text()
        self.assertIn('task done data_sample_handler:on_start', text)

        os.write(fd, utils.utf8('crawl("%s/pyspider/test.html")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('/robots.txt', text)

        os.write(fd, utils.utf8('crawl("%s/links/10/0")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('"title": "Links"', text)

        os.write(fd, utils.utf8('crawl("%s/404")\n' % self.httpbin))
        text = wait_text()
        self.assertIn('task retry', text)

        os.write(fd, b'quit_pyspider()\n')
        text = wait_text()
        self.assertIn('scheduler exiting...', text)

        os.close(fd)
        os.kill(pid, signal.SIGINT)

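# The two tests above drive an interactive pyspider shell through a pty.
# Below is a minimal, self-contained sketch of the same forkpty/select
# pattern, independent of pyspider; the child command ('python3 -i') and
# the helper name read_until_idle are illustrative assumptions, not part
# of the original code.
import os
import select

def read_until_idle(fd, timeout=1.0, chunk=1024):
    # Drain the pty until no new data arrives within `timeout` seconds.
    parts = []
    while True:
        readable, _, _ = select.select([fd], [], [], timeout)
        if not readable:
            break
        try:
            data = os.read(fd, chunk)
        except OSError:  # the child closed its end of the pty
            break
        if not data:
            break
        parts.append(data)
    return b''.join(parts)

pid, fd = os.forkpty()
if pid == 0:
    # child: replace this process with an interactive interpreter
    os.execvp('python3', ['python3', '-i'])
else:
    # parent: consume the banner, send a command, read back the result
    read_until_idle(fd)
    os.write(fd, b'print(6 * 7)\n')
    print(read_until_idle(fd).decode('utf-8', 'replace'))
    os.write(fd, b'exit()\n')
    os.close(fd)
    os.waitpid(pid, 0)
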
def generator():
    stringio = StringIO.StringIO()
    csv_writer = csv.writer(stringio)

    it = iter(resultdb.select(project))
    first_30 = []
    for result in it:
        first_30.append(result)
        if len(first_30) >= 30:
            break
    common_fields, _ = result_formater(first_30)
    common_fields_l = sorted(common_fields)

    csv_writer.writerow(["url"] + [utf8(x) for x in common_fields_l] + ["..."])
    for result in itertools.chain(first_30, it):
        other = {}
        for k, v in result["result"].iteritems():
            if k not in common_fields:
                other[k] = v
        csv_writer.writerow(
            [toString(result["url"])]
            + [toString(result["result"].get(k, "")) for k in common_fields_l]
            + [toString(other)]
        )
        yield stringio.getvalue()
        stringio.truncate(0)

def generator():
    stringio = StringIO.StringIO()
    csv_writer = csv.writer(stringio)

    it = iter(resultdb.select(project))
    first_30 = []
    for result in it:
        first_30.append(result)
        if len(first_30) >= 30:
            break
    common_fields, _ = result_formater(first_30)
    common_fields_l = sorted(common_fields)

    csv_writer.writerow(['url'] + [utf8(x) for x in common_fields_l] + ['...'])
    for result in itertools.chain(first_30, it):
        other = {}
        for k, v in result['result'].iteritems():
            if k not in common_fields:
                other[k] = v
        csv_writer.writerow(
            [toString(result['url'])]
            + [toString(result['result'].get(k, '')) for k in common_fields_l]
            + [toString(other)]
        )
        yield stringio.getvalue()
        stringio.truncate(0)

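# Both generator variants above stream CSV output one row at a time by
# reusing a single in-memory buffer (Python 2 StringIO, whose truncate(0)
# also rewinds the cursor). A minimal Python 3 sketch of the same
# buffer-reuse pattern; the record and field names are invented for
# illustration. Note io.StringIO.truncate(0) does NOT rewind, so an
# explicit seek(0) is needed here.
import csv
import io

def csv_rows(records, fields):
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(['url'] + fields)
    for rec in records:
        writer.writerow([rec['url']] + [rec.get(k, '') for k in fields])
        yield buf.getvalue()   # hand the accumulated chunk to the caller
        buf.seek(0)
        buf.truncate(0)        # reset the buffer for the next row

for chunk in csv_rows([{'url': 'http://example.com', 'title': 'Example'}],
                      ['title']):
    print(chunk, end='')
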
def _stringify(data):
    if 'result' in data:
        data['result'] = json.dumps(data['result'])
    if six.PY3:
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.string_types):
                data[key] = utils.utf8(value)
    return data

def getMemberNames(self):
    members = []
    for project in self.projectdb.get_all(fields=["name"]):
        project_name = utf8(project["name"])
        if not project_name.endswith(".py"):
            project_name += ".py"
        members.append(project_name)
    return members

def getMemberList(self):
    members = []
    for project in self.projectdb.get_all():
        project_name = utf8(project["name"])
        if not project_name.endswith(".py"):
            project_name += ".py"
        members.append(ScriptResource(os.path.join(self.path, project_name),
                                      self.environ, self.app, project))
    return members

def getMemberNames(self):
    members = []
    for project in self.projectdb.get_all(fields=['name', ]):
        project_name = project['name']
        if not project_name.endswith('.py'):
            project_name += '.py'
        members.append(utf8(project_name))
    return members

def getMemberNames(self):
    members = []
    for project in self.projectdb.get_all(fields=['name', ]):
        project_name = utf8(project['name'])
        if not project_name.endswith('.py'):
            project_name += '.py'
        members.append(project_name)
    return members

def _stringify(data):
    for each in ('schedule', 'fetch', 'process', 'track'):
        if each in data:
            data[each] = json.dumps(data[each])
    if six.PY3:
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.string_types):
                data[key] = utils.utf8(value)
    return data

def getMemberList(self):
    members = []
    for project in self.projectdb.get_all():
        project_name = utf8(project['name'])
        if not project_name.endswith('.py'):
            project_name += '.py'
        members.append(
            ScriptResource(os.path.join(self.path, project_name),
                           self.environ, self.app, project))
    return members

def getMemberList(self):
    members = []
    for project in self.projectdb.get_all():
        project_name = project['name']
        if not project_name.endswith('.py'):
            project_name += '.py'
        native_path = os.path.join(self.path, project_name)
        native_path = text(native_path) if six.PY3 else utf8(native_path)
        members.append(
            ScriptResource(native_path, self.environ, self.app, project))
    return members

def getMemberList(self):
    members = []
    for project in self.projectdb.get_all():
        project_name = project['name']
        if not project_name.endswith('.py'):
            project_name += '.py'
        native_path = os.path.join(self.path, project_name)
        native_path = text(native_path) if six.PY3 else utf8(native_path)
        members.append(ScriptResource(
            native_path, self.environ, self.app, project
        ))
    return members

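# The native_path juggling in the last two getMemberList variants reflects
# the Python 2/3 split over what counts as a "native" string path: str
# (text) on Python 3, bytes on Python 2. A minimal sketch of that helper
# pattern, assuming six-style text/utf8 converters like pyspider's; the
# helper name native_path is illustrative, not from the original.
import six

def utf8(s):
    return s.encode('utf-8') if isinstance(s, six.text_type) else s

def text(s):
    return s.decode('utf-8') if isinstance(s, six.binary_type) else s

def native_path(path):
    # WSGI/DAV layers expect a native str path on either interpreter.
    return text(path) if six.PY3 else utf8(path)

print(repr(native_path(u'projects/spider.py')))
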
def _stringify(data):
    if six.PY3:
        for key, value in list(six.iteritems(data)):
            if isinstance(value, six.string_types):
                data[key] = utils.utf8(value)
    return data

def puppeteer_fetch(self, url, task):
    '''Fetch with puppeteer proxy'''
    start_time = time.time()
    self.on_fetch('puppeteer', task)
    handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)

    # check puppeteer proxy is enabled
    if not self.puppeteer_proxy:
        result = {
            "orig_url": url,
            "content": "puppeteer is not enabled.",
            "headers": {},
            "status_code": 501,
            "url": url,
            "time": time.time() - start_time,
            "cookies": {},
            "save": task.get('fetch', {}).get('save')
        }
        logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
        raise gen.Return(result)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})
    for each in task_fetch:
        if each not in fetch:
            fetch[each] = task_fetch[each]

    # robots.txt
    if task_fetch.get('robots_txt', False):
        user_agent = fetch['headers']['User-Agent']
        can_fetch = yield self.can_fetch(user_agent, url)
        if not can_fetch:
            error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
            raise gen.Return(handle_error(error))

    request_conf = {
        'follow_redirects': False
    }
    request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
    request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

    session = cookies.RequestsCookieJar()
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    request = tornado.httpclient.HTTPRequest(url=fetch['url'])
    cookie_header = cookies.get_cookie_header(session, request)
    if cookie_header:
        fetch['headers']['Cookie'] = cookie_header
    logger.info("%s", self.puppeteer_proxy)

    # making requests
    fetch['headers'] = dict(fetch['headers'])
    headers = {}
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    try:
        request = tornado.httpclient.HTTPRequest(
            url=self.puppeteer_proxy, method="POST",
            headers=headers, body=json.dumps(fetch), **request_conf)
    except Exception as e:
        raise gen.Return(handle_error(e))

    try:
        response = yield gen.maybe_future(self.http_client.fetch(request))
    except tornado.httpclient.HTTPError as e:
        if e.response:
            response = e.response
        else:
            raise gen.Return(handle_error(e))

    if not response.body:
        raise gen.Return(handle_error(Exception('no response from puppeteer: %r' % response)))

    result = {}
    try:
        result = json.loads(utils.text(response.body))
        assert 'status_code' in result, result
    except Exception as e:
        if response.error:
            result['error'] = utils.text(response.error)
        raise gen.Return(handle_error(e))

    if result.get('status_code', 200):
        logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                    task.get('project'), task.get('taskid'), url, result['time'])
    else:
        logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                     task.get('project'), task.get('taskid'),
                     url, result['content'], result['time'])

    raise gen.Return(result)

def _stringify(data):
    for each in ('schedule', 'fetch', 'process', 'track'):
        if each in data:
            data[each] = utils.utf8(json.dumps(data[each]))
    return data

def _stringify(data):
    if 'result' in data:
        data['result'] = utils.utf8(json.dumps(data['result']))
    return data

def http_fetch(self, url, task):
    '''HTTP fetcher'''
    start_time = time.time()
    self.on_fetch('http', task)
    handle_error = lambda x: self.handle_error('http', url, task, start_time, x)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})

    session = cookies.RequestsCookieJar()
    # fix for tornado request obj
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    max_redirects = task_fetch.get('max_redirects', 5)
    # we will handle redirects by hand to capture cookies
    fetch['follow_redirects'] = False

    # making requests
    while True:
        # robots.txt
        if task_fetch.get('robots_txt', False):
            can_fetch = yield self.can_fetch(
                fetch['headers']['User-Agent'], fetch['url'])
            if not can_fetch:
                error = tornado.httpclient.HTTPError(
                    403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        try:
            request = tornado.httpclient.HTTPRequest(**fetch)
            # if cookie already in header, get_cookie_header wouldn't work
            old_cookie_header = request.headers.get('Cookie')
            if old_cookie_header:
                del request.headers['Cookie']
            cookie_header = cookies.get_cookie_header(session, request)
            if cookie_header:
                request.headers['Cookie'] = cookie_header
            elif old_cookie_header:
                request.headers['Cookie'] = old_cookie_header
        except Exception as e:
            logger.exception(fetch)
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(
                self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        extract_cookies_to_jar(session, response.request, response.headers)
        if (response.code in (301, 302, 303, 307)
                and response.headers.get('Location')
                and task_fetch.get('allow_redirects', True)):
            if max_redirects <= 0:
                error = tornado.httpclient.HTTPError(
                    599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5),
                    response)
                raise gen.Return(handle_error(error))
            if response.code in (302, 303):
                fetch['method'] = 'GET'
                if 'body' in fetch:
                    del fetch['body']
            fetch['url'] = quote_chinese(
                urljoin(fetch['url'], response.headers['Location']))
            fetch['request_timeout'] -= time.time() - start_time
            if fetch['request_timeout'] < 0:
                fetch['request_timeout'] = 0.1
            max_redirects -= 1
            continue

        result = {}
        result['orig_url'] = url
        result['content'] = response.body or ''
        result['headers'] = dict(response.headers)
        result['status_code'] = response.code
        result['url'] = response.effective_url or url
        result['time'] = time.time() - start_time
        result['cookies'] = session.get_dict()
        result['save'] = task_fetch.get('save')
        if response.error:
            result['error'] = utils.text(response.error)
        if 200 <= response.code < 300:
            logger.info("[%d] %s:%s %s %.2fs", response.code,
                        task.get('project'), task.get('taskid'),
                        url, result['time'])
        else:
            logger.warning("[%d] %s:%s %s %.2fs", response.code,
                           task.get('project'), task.get('taskid'),
                           url, result['time'])
        raise gen.Return(result)

def puppeteer_fetch(self, url, task):
    '''Fetch with puppeteer proxy'''
    start_time = time.time()
    self.on_fetch('puppeteer', task)
    handle_error = lambda x: self.handle_error('puppeteer', url, task, start_time, x)

    # check puppeteer proxy is enabled
    if not self.puppeteer_proxy:
        result = {
            "orig_url": url,
            "content": "puppeteer is not enabled.",
            "headers": {},
            "status_code": 501,
            "url": url,
            "time": time.time() - start_time,
            "cookies": {},
            "save": task.get('fetch', {}).get('save')
        }
        logger.warning("[501] %s:%s %s 0s", task.get('project'), task.get('taskid'), url)
        raise gen.Return(result)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})
    for each in task_fetch:
        if each not in fetch:
            fetch[each] = task_fetch[each]

    # robots.txt
    if task_fetch.get('robots_txt', False):
        user_agent = fetch['headers']['User-Agent']
        can_fetch = yield self.can_fetch(user_agent, url)
        if not can_fetch:
            error = tornado.httpclient.HTTPError(
                403, 'Disallowed by robots.txt')
            raise gen.Return(handle_error(error))

    request_conf = {'follow_redirects': False}
    request_conf['connect_timeout'] = fetch.get('connect_timeout', 20)
    request_conf['request_timeout'] = fetch.get('request_timeout', 120) + 1

    session = cookies.RequestsCookieJar()
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    request = tornado.httpclient.HTTPRequest(url=fetch['url'])
    cookie_header = cookies.get_cookie_header(session, request)
    if cookie_header:
        fetch['headers']['Cookie'] = cookie_header
    logger.info("%s", self.puppeteer_proxy)

    # making requests
    fetch['headers'] = dict(fetch['headers'])
    headers = {}
    headers['Content-Type'] = 'application/json; charset=UTF-8'
    try:
        request = tornado.httpclient.HTTPRequest(url=self.puppeteer_proxy,
                                                 method="POST",
                                                 headers=headers,
                                                 body=json.dumps(fetch),
                                                 **request_conf)
    except Exception as e:
        raise gen.Return(handle_error(e))

    try:
        response = yield gen.maybe_future(self.http_client.fetch(request))
    except tornado.httpclient.HTTPError as e:
        if e.response:
            response = e.response
        else:
            raise gen.Return(handle_error(e))

    if not response.body:
        raise gen.Return(
            handle_error(
                Exception('no response from puppeteer: %r' % response)))

    result = {}
    try:
        result = json.loads(utils.text(response.body))
        assert 'status_code' in result, result
    except Exception as e:
        if response.error:
            result['error'] = utils.text(response.error)
        raise gen.Return(handle_error(e))

    if result.get('status_code', 200):
        logger.info("[%d] %s:%s %s %.2fs", result['status_code'],
                    task.get('project'), task.get('taskid'), url, result['time'])
    else:
        logger.error("[%d] %s:%s %s, %r %.2fs", result['status_code'],
                     task.get('project'), task.get('taskid'),
                     url, result['content'], result['time'])

    raise gen.Return(result)

def _stringify(data): for each in ("schedule", "fetch", "process", "track"): if each in data: data[each] = utils.utf8(json.dumps(data[each])) return data
def getContentLength(self): return len(utf8(self.project["script"]))
def getContent(self):
    return BytesIO(utf8(self.project['script']))

def getContentLength(self):
    return len(utf8(self.project['script']))

def http_fetch(self, url, task):
    '''HTTP fetcher'''
    start_time = time.time()
    self.on_fetch('http', task)
    handle_error = lambda x: self.handle_error('http', url, task, start_time, x)

    # setup request parameters
    fetch = self.pack_tornado_request_parameters(url, task)
    task_fetch = task.get('fetch', {})

    session = cookies.RequestsCookieJar()
    # fix for tornado request obj
    if 'Cookie' in fetch['headers']:
        c = http_cookies.SimpleCookie()
        try:
            c.load(fetch['headers']['Cookie'])
        except AttributeError:
            c.load(utils.utf8(fetch['headers']['Cookie']))
        for key in c:
            session.set(key, c[key])
        del fetch['headers']['Cookie']
    if 'cookies' in fetch:
        session.update(fetch['cookies'])
        del fetch['cookies']

    max_redirects = task_fetch.get('max_redirects', 5)
    # we will handle redirects by hand to capture cookies
    fetch['follow_redirects'] = False

    # making requests
    while True:
        # robots.txt
        if task_fetch.get('robots_txt', False):
            can_fetch = yield self.can_fetch(fetch['headers']['User-Agent'],
                                             fetch['url'])
            if not can_fetch:
                error = tornado.httpclient.HTTPError(403, 'Disallowed by robots.txt')
                raise gen.Return(handle_error(error))

        try:
            request = tornado.httpclient.HTTPRequest(**fetch)
            # if cookie already in header, get_cookie_header wouldn't work
            old_cookie_header = request.headers.get('Cookie')
            if old_cookie_header:
                del request.headers['Cookie']
            cookie_header = cookies.get_cookie_header(session, request)
            if cookie_header:
                request.headers['Cookie'] = cookie_header
            elif old_cookie_header:
                request.headers['Cookie'] = old_cookie_header
        except Exception as e:
            logger.exception(fetch)
            raise gen.Return(handle_error(e))

        try:
            response = yield gen.maybe_future(self.http_client.fetch(request))
        except tornado.httpclient.HTTPError as e:
            if e.response:
                response = e.response
            else:
                raise gen.Return(handle_error(e))

        extract_cookies_to_jar(session, response.request, response.headers)
        if (response.code in (301, 302, 303, 307)
                and response.headers.get('Location')
                and task_fetch.get('allow_redirects', True)):
            if max_redirects <= 0:
                error = tornado.httpclient.HTTPError(
                    599, 'Maximum (%d) redirects followed' % task_fetch.get('max_redirects', 5),
                    response)
                raise gen.Return(handle_error(error))
            if response.code in (302, 303):
                fetch['method'] = 'GET'
                if 'body' in fetch:
                    del fetch['body']
            fetch['url'] = quote_chinese(urljoin(fetch['url'], response.headers['Location']))
            fetch['request_timeout'] -= time.time() - start_time
            if fetch['request_timeout'] < 0:
                fetch['request_timeout'] = 0.1
            max_redirects -= 1
            continue

        result = {}
        result['orig_url'] = url
        result['content'] = response.body or ''
        result['headers'] = dict(response.headers)
        result['status_code'] = response.code
        result['url'] = response.effective_url or url
        result['time'] = time.time() - start_time
        result['cookies'] = session.get_dict()
        result['save'] = task_fetch.get('save')
        if response.error:
            result['error'] = utils.text(response.error)
        if 200 <= response.code < 300:
            logger.info("[%d] %s:%s %s %.2fs", response.code,
                        task.get('project'), task.get('taskid'),
                        url, result['time'])
        else:
            logger.warning("[%d] %s:%s %s %.2fs", response.code,
                           task.get('project'), task.get('taskid'),
                           url, result['time'])
        raise gen.Return(result)

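# Both http_fetch variants disable tornado's automatic redirect handling
# so that cookies set along a redirect chain can be captured into a
# requests cookie jar and replayed on the next hop. A minimal sketch of
# that jar/header round trip using requests' public cookie helpers; the
# cookie values and URL are invented for illustration.
import requests
from requests.cookies import RequestsCookieJar, get_cookie_header

jar = RequestsCookieJar()
jar.set('session', 'abc123', domain='example.com', path='/')

# Build the Cookie header the jar would attach to a request for this URL.
prepared = requests.Request('GET', 'http://example.com/page').prepare()
print(get_cookie_header(jar, prepared))  # -> session=abc123
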
def getContent(self): return BytesIO(utf8(self.project["script"]))