def crawl_info(self, data, payload, begin_time):
    end_time = datetime.datetime.now()
    elapsed = (end_time - begin_time).total_seconds()
    tid = payload.get('id')
    url = payload.get('url')
    if isinstance(data, dict):
        data['time'] = elapsed
        if tid:
            data['id'] = tid
        data['spider'] = 'chrome'
        data['url'] = url
        data['payload'] = payload
        # data['chrome_id'] = payload.get('chrome_id')
    else:
        data = {
            'time': elapsed,
            'spider': 'chrome',
            'url': url,
        }
    post_func = payload.get('post_func')
    if isinstance(post_func, str):
        post_func = func.load(post_func)
    if post_func:
        data2 = post_func(payload, data)
        # Only keep the post-processed result if it did not drop fields.
        if isinstance(data2, dict) and len(data2) >= len(data):
            data = data2
    return data
async def get_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)
        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        # headers.remove({})
        # Drop the empty header blocks left behind by the blank line that ends
        # each response's headers, then read the charset declared by the final
        # response, if any.
        headers['content'] = [h for h in headers['content'] if len(h) > 0]
        charset = None
        if headers['content'] and 'content-type' in headers['content'][-1]:
            content_type = headers['content'][-1]['content-type'][0].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        body = data_buf.getvalue()
        if len(body) == 0:
            data = ''
            charset = 'utf-8'
        else:
            if charset is None:
                # No declared charset: let UnicodeDammit guess among common encodings.
                dammit = UnicodeDammit(
                    body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                    smart_quotes_to="html")
                data = dammit.unicode_markup
                charset = dammit.original_encoding
            else:
                data = body.decode(charset, 'ignore')
        soup_lxml = BeautifulSoup(data, 'lxml')
        soup_html = BeautifulSoup(data, 'html.parser')
        resp.update({
            'url': payload.get('url'),
            # 'soup': soup,
            'title': get_title(soup_lxml),
            'links': get_links(soup_lxml),
            'links2': get_links2(soup_lxml),
            'metas': get_metas(soup_lxml),
            'images': get_images(soup_lxml),
            'scripts': get_scripts(soup_lxml),
            'text': get_text(soup_html),
            'data': data,
            'headers': headers,
            'charset': charset,
            'spider': 'pycurl',
            'payload': payload,
        })
        post_func = payload.get('post_func')
        if post_func:
            post_func = load(post_func)
            resp = post_func(payload, resp)
        return resp
    finally:
        c.close()
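# A usage sketch (assumption, not part of the original module): get_request is
# a coroutine, so it has to be driven by an asyncio event loop, with CurlLoop
# assumed to be servicing pycurl transfers on that loop. Only keys read
# directly above ('url', 'aiohttp_timeout') are shown.
async def _example_fetch():
    payload = {
        'url': 'http://example.com/',
        'aiohttp_timeout': 30,  # outer timeout around the whole transfer
    }
    resp = await get_request(payload)
    print(resp['title'], len(resp['links']), resp['charset'])

# if __name__ == '__main__':
#     import asyncio
#     asyncio.get_event_loop().run_until_complete(_example_fetch())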
def __exit__(self, e_type, e_value, e_trace):
    if e_type is None or e_value is None:
        return
    to = self.__dict__.get('to')
    if to:
        exceptions = self.__dict__.get('exceptions', ())
        if issubclass(e_type, exceptions):
            propagate = func.load(to)(e_type, e_value, e_trace)
            if propagate in [True, False]:
                return propagate
def __exit__(self, e_type, e_value, e_trace):
    on_error = self.__dict__.get('on_error')
    if on_error:
        propagate = func.load(on_error)(e_type, e_value, e_trace)
        if propagate in [True, False]:
            return propagate
    # Swallow the exception if its type is in the suppress list.
    for supp in self.__dict__.get('suppress_list', []):
        if isinstance(e_value, supp):
            return True
    return False
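# A hedged usage sketch for the second __exit__ above. The class name and
# constructor below are assumptions; only the attributes it stores
# ('on_error', 'suppress_list') are taken from the code, since that is what
# __exit__ reads from self.__dict__.
#
#     with Suppress(on_error='handlers.log_error', suppress_list=[KeyError]):
#         {}['missing']   # handler is called, then the KeyError is swallowed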
async def get_request(payload):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers)  # header_buf)
        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        # headers.remove({})
        # Drop the empty header blocks left behind by the blank line that ends
        # each response's headers, then read the charset declared by the final
        # response, if any.
        headers['content'] = [h for h in headers['content'] if len(h) > 0]
        encoding = None
        if headers['content'] and 'content-type' in headers['content'][-1]:
            content_type = headers['content'][-1]['content-type'][0].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
                print('Decoding using %s' % encoding)
        body = data_buf.getvalue()
        if len(body) == 0:
            data = ''
            encoding = 'utf-8'
        else:
            if encoding is None:
                # No declared charset: let UnicodeDammit guess among common encodings.
                dammit = UnicodeDammit(
                    body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                    smart_quotes_to="html")
                data = dammit.unicode_markup
                encoding = dammit.original_encoding
            else:
                data = body.decode(encoding, 'ignore')
        soup = BeautifulSoup(data, 'lxml')
        resp.update({
            'url': payload.get('url'),
            # 'soup': soup,
            'title': str(soup.title.get_text()) if soup.title else '',
            'links': [str(link) for link in soup.find_all('a', href=True)],
            'data': data,
            'headers': headers,
            'encoding': encoding,
        })
        post_func = payload.get('post_func')
        if post_func:
            post_func = load(post_func)
            resp = post_func(payload, resp)
        return resp
    finally:
        c.close()
def load(id, args, error_handler=None):
    if args and error_handler:
        return func.load(id).s(args).on_error(func.load(error_handler).s())
    if args and not error_handler:
        return func.load(id).s(args)
    if not args and error_handler:
        return func.load(id).s().on_error(func.load(error_handler).s())
    return func.load(id).s()
async def post_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)
        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        # Unlike get_request, no charset sniffing here: the body is decoded as UTF-8.
        body = data_buf.getvalue()
        encoding = 'utf-8'
        data = body.decode(encoding, 'ignore') if len(body) > 0 else ''
        # headers.remove({})
        headers['content'] = [h for h in headers['content'] if len(h) > 0]
        resp.update({
            # 'url': payload.get('url'),
            'data': data,
            'headers': headers,
            'encoding': encoding,
        })
        post_func = payload.get('post_func')
        if isinstance(post_func, str):
            post_func = load(post_func)
        if post_func:
            resp = post_func(payload, resp)
        return resp
    finally:
        c.close()
def loads(payload):
    if payload.get('type') != 'normal':
        raise Exception('celery task loader only supports normal mode')
    tasks = payload.get('tasks', [])
    cts = []
    for task in tasks:
        # Only the first task in a chain receives the args; the rest get the
        # previous task's result through the chain.
        ops = [
            load(id, task.get('args'), task.get('on_error')) if i == 0
            else load(id, None, task.get('on_error'))
            for i, id in enumerate(task['ids'])
        ]
        cts.append(chain(ops))
    callback = payload.get('callback')
    if callback:
        return chord(header=group(cts), body=func.load(callback).s())
    return group(cts)
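# A minimal sketch (assumption, not from the original project) of the payload
# shape that loads() accepts, reconstructed from the keys it reads above. The
# task ids and the callback name are hypothetical placeholders for registered
# celery tasks.
EXAMPLE_LOADS_PAYLOAD = {
    'type': 'normal',
    'tasks': [
        {
            'ids': ['tasks.fetch', 'tasks.parse'],   # chained in order
            'args': {'url': 'http://example.com/'},  # passed only to the first task
            'on_error': 'tasks.report_failure',      # optional per-task error handler
        },
    ],
    'callback': 'tasks.collect',  # optional: wraps the group in a chord
}
# sig = loads(EXAMPLE_LOADS_PAYLOAD)   # requires the tasks above to be registered
# result = sig.apply_async()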
def load_handler(self, name):
    if name is None:
        return None
    return load(name)
def setup_curl_basic(c, p, data_buf, headers=None):
    def header_function(header_line):
        count = headers['count']
        header_line = header_line.decode('iso-8859-1')
        if ':' not in header_line and not header_line.startswith('HTTP'):
            # print(header_line)
            if '\r\n' in header_line:
                # A blank line ends one response's headers; start a new block
                # for the next response (e.g. after a redirect).
                headers['count'] += 1
                headers['content'].append({})
            return
        # Break the header line into header name and value.
        if ':' in header_line:
            name, value = header_line.rstrip('\r\n').split(':', 1)
        else:
            name, value = header_line.rstrip('\r\n').split(' ', 1)
        # Remove whitespace that may be present.
        # Header lines include the trailing newline, and there may be whitespace
        # around the colon.
        name = name.strip()
        value = value.strip()
        # Header names are case insensitive; lowercase the name here.
        name = name.lower()
        # Now we can actually record the header name and value.
        if name in headers['content'][count]:
            headers['content'][count][name].append(value)
        else:
            headers['content'][count][name] = [value]

    def write_function(buf):
        # Cap the buffered body at ~4 MB; returning 0 tells pycurl to abort the transfer.
        size = data_buf.getbuffer().nbytes
        if size < 4096000:
            data_buf.write(buf)
            return len(buf)
        return 0

    url = p.get('url')
    c._raw_url = url
    c._raw_id = p.get('id', str(uuid.uuid1()))
    c._raw_post_func = p.get('post_func')
    c._raw_payload = p
    c.setopt(pycurl.URL, url.encode('utf-8'))
    c.setopt(pycurl.FOLLOWLOCATION, p.get('followlocation', 1))
    c.setopt(pycurl.MAXREDIRS, p.get('maxredirs', 5))
    # c.setopt(pycurl.WRITEHEADER, header_buf)
    headerfunction = p.get('headerfunction')
    if headerfunction is None:
        c.setopt(pycurl.HEADERFUNCTION, header_function)
    else:
        c.setopt(pycurl.HEADERFUNCTION, load(headerfunction))
    writefunction = p.get('writefunction')
    if writefunction is None:
        c.setopt(pycurl.WRITEFUNCTION, write_function)
    else:
        c.setopt(pycurl.WRITEFUNCTION, load(writefunction))
    c.setopt(pycurl.USERAGENT, p.get('useragent', DEFAULT_USER_AGENT))
    c.setopt(pycurl.SSL_VERIFYPEER, p.get('ssl_verifypeer', 0))
    c.setopt(pycurl.SSL_VERIFYHOST, p.get('ssl_verifyhost', 0))
    c.setopt(pycurl.NOSIGNAL, p.get('nosignal', 1))
    c.setopt(pycurl.CONNECTTIMEOUT, p.get('connecttimeout', 7))
    c.setopt(pycurl.TIMEOUT, p.get('timeout', 15))
    c.setopt(pycurl.DNS_CACHE_TIMEOUT, p.get('dns_cache_timeout', 360))
    c.setopt(pycurl.DNS_USE_GLOBAL_CACHE, p.get('dns_use_global_cache', 1))
    c.setopt(pycurl.TCP_NODELAY, p.get('tcp_nodelay', 1))
    c.setopt(pycurl.IPRESOLVE, p.get('ipresolve', pycurl.IPRESOLVE_V4))
    c.setopt(pycurl.ENCODING, p.get('encoding', 'gzip, deflate'))
    c.setopt(pycurl.HTTP_VERSION, p.get('http_version', pycurl.CURL_HTTP_VERSION_1_0))
    c.setopt(pycurl.FORBID_REUSE, p.get('forbid_reuse', 1))
    c.setopt(pycurl.FRESH_CONNECT, p.get('fresh_connect', 1))
    c.setopt(c.AUTOREFERER, p.get('autoreferer', 1))
    referer = p.get('referer')
    if referer:
        c.setopt(c.REFERER, referer)
    cookiejar = p.get('cookiejar')
    if cookiejar:
        print('cookiejar', cookiejar)
        c.setopt(c.COOKIEJAR, cookiejar)
    cookiefile = p.get('cookiefile')
    if cookiefile:
        print('cookiefile', cookiefile)
        c.setopt(c.COOKIEFILE, cookiefile)
    dns_servers = p.get('dns_servers')
    if dns_servers:
        c.setopt(c.DNS_SERVERS, dns_servers)
    debug = p.get('debugfunction')
    if debug:
        c.setopt(pycurl.DEBUGFUNCTION, load(debug))
    c.setopt(pycurl.VERBOSE, p.get('verbose', 0))
    proxy = p.get('proxy')
    proxyport = p.get('proxyport')
    proxytype = p.get('proxytype')
    proxyuserpwd = p.get('proxyuserpwd')
    if proxy and proxyport and proxytype:
        c.setopt(pycurl.PROXY, proxy)
        c.setopt(pycurl.PROXYPORT, proxyport)
        c.setopt(pycurl.PROXYTYPE, proxytype)
        if proxyuserpwd:
            c.setopt(pycurl.PROXYUSERPWD, proxyuserpwd)
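# A minimal sketch (assumption) of a payload for setup_curl_basic, limited to
# keys the function actually reads; anything not listed falls back to the
# defaults set above. The proxy values are placeholders.
EXAMPLE_CURL_PAYLOAD = {
    'url': 'http://example.com/',
    'followlocation': 1,
    'maxredirs': 3,
    'connecttimeout': 5,
    'timeout': 20,
    'useragent': 'Mozilla/5.0 (compatible; example-crawler)',
    # Optional proxy settings; proxy, proxyport and proxytype must all be set.
    # 'proxy': '127.0.0.1',
    # 'proxyport': 8888,
    # 'proxytype': pycurl.PROXYTYPE_HTTP,
}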