def anaylse_it(self, responses):
    """Run every analyser configured in ``self.info['analysers']`` over *responses*.

    In celery mode (the default) each analyser is dispatched as a
    ``netboy.celery.tasks.analyser_task``; otherwise it is loaded and
    invoked inline, and a non-None return value replaces *responses*.
    Failures are logged and do not abort the remaining analysers.
    (NB: the public name keeps the historical 'anaylse' spelling —
    sibling methods call it by this name.)
    """
    info = self.info
    analyser_specs = info.get('analysers')
    if not analyser_specs:
        return responses
    for spec in analyser_specs:
        # Record the analyser currently being applied on the shared info dict.
        info['analyser'] = spec
        # here the payload is the info
        task_kwargs = {
            'payload': info,
            'response': responses,
        }
        try:
            if info.get('mode', 'celery') == 'celery':
                queue = self.info.get('queue', 'worker')
                App().app.send_task(
                    'netboy.celery.tasks.analyser_task',
                    kwargs=task_kwargs,
                    countdown=1,
                    queue=queue,
                    routing_key=queue)
            else:
                if isinstance(spec, str):
                    analyser_func = load(spec)
                else:
                    analyser_func = load(spec.get('analyser'))
                outcome = analyser_func(responses)
                if outcome is not None:
                    responses = outcome
        except Exception as exc:
            self.log.critical('analyser failed: ' + str(exc))
    return responses
def analyser_task(payload, response):
    """Celery task body: run the analyser named in ``payload['analyser']``.

    The analyser may be a dotted-path string, or a dict whose 'analyser'
    key holds such a path.  Returns a dict echoing *payload* and
    *response*, plus the analyser's result under 'analyser'.
    """
    result = {}
    analyser = payload.get('analyser')
    if isinstance(analyser, str):
        analyse_it = load(analyser)
        analyse_result = analyse_it(payload, response)
        # Fixed: this branch stored the result under 'analyse' while the
        # dict branch used 'analyser' (and trigger_task consistently uses
        # 'trigger') — unified on 'analyser'.
        result['analyser'] = analyse_result
    else:
        if 'analyser' in analyser:
            analyse_it = load(analyser.get('analyser'))
            analyse_result = analyse_it(payload, response)
            result['analyser'] = analyse_result
    result['payload'] = payload
    result['response'] = response
    return result
def trigger_it(self, payload, response):
    """Fire every trigger configured in ``payload['triggers']``.

    Consumes (pops) 'triggers' from *payload*.  A trigger runs via celery
    only when both the trigger and the payload ask for celery mode;
    otherwise it runs inline, and its result replaces *response* when it
    carries a truthy 'update' flag.  The flag is reset to False before
    returning so later stages do not re-apply the update.
    """
    hooks = payload.pop('triggers', None)
    for hook in hooks or ():
        # if url:
        # pay['job_id'] = payload.get('job_id')
        # pay['job_name'] = payload.get('job_name')
        # pay['task_id'] = payload.get('task_id')
        # pay['task_name'] = payload.get('task_name')
        # pay['url'] = payload.get('url')
        if isinstance(hook, str):
            hook = {'trigger': hook}
        payload['trigger'] = hook
        task_kwargs = {
            'payload': payload,
            'response': response,
        }
        try:
            use_celery = (hook.get('mode', 'sync') == 'celery'
                          and payload.get('mode') == 'celery')
            if use_celery:
                queue = payload.get('queue')
                App().app.send_task('netboy.celery.tasks.trigger_task',
                                    kwargs=task_kwargs, countdown=1,
                                    queue=queue, routing_key=queue)
            else:
                # hook is always a dict here (strings were wrapped above).
                hook_func = load(hook) if isinstance(hook, str) else load(hook.get('trigger'))
                outcome = hook_func(payload, response)
                if outcome and outcome.get('update'):
                    response = outcome
        except Exception as exc:
            self.log.critical('trigger failed: ' + str(exc))
    if isinstance(response, dict) and response.get('update'):
        response['update'] = False
    return response
def trigger_task(payload, response):
    """Celery task body: run the trigger named in ``payload['trigger']``.

    The trigger may be a dotted-path string, or a dict whose 'trigger'
    key holds such a path.  'data' is dropped from *response* to keep the
    returned celery result small.  Returns a dict echoing *payload* and
    *response*, plus the trigger's result under 'trigger'.
    """
    outcome = {}
    spec = payload.get('trigger')
    if isinstance(spec, str):
        fired = load(spec)(payload, response)
        outcome['trigger'] = fired
    elif 'trigger' in spec:
        fired = load(spec.get('trigger'))(payload, response)
        outcome['trigger'] = fired
    # Trim the bulky page body before the result is serialized.
    response.pop('data', None)
    outcome['payload'] = payload
    outcome['response'] = response
    return outcome
def final_callback(data, info):
    """Invoke the optional 'final' hook named in *info* on (data, info).

    Returns the hook's result, or None when no hook is configured or it
    fails to load.
    """
    final_name = info.get('final')
    if not final_name:
        return None
    final_func = load(final_name)
    if not final_func:
        return None
    return final_func(data, info)
async def run_core_async(self, spider, func, extra=None):
    """Crawl every item in ``self.updated`` with the async crawler *func*.

    Each item is passed through ``prepare_it`` (which may skip it or
    'cover' it with a ready-made response), then crawled, then through
    ``trigger_it``.  A None result or an exception yields a synthetic
    error response (code -2 / -1 respectively).  All responses are fed
    to ``anaylse_it`` before being returned.

    *func* may be a dotted-path string (resolved via ``load``) or a
    coroutine function taking ``(item, extra)``.
    """
    func = load(func) if isinstance(func, str) else func
    responses = []
    # Fixed: leftover debug print(self.updated, '.....') dumped the whole
    # work list to stdout; log it at debug level instead.
    self.log.debug('run_core_async items: %s' % (self.updated,))
    for d in self.updated:
        prepare_resp = self.prepare_it(d)
        url = d.get('url') if isinstance(d, dict) else d
        if isinstance(prepare_resp, dict):
            if prepare_resp.get('skip'):
                continue
            if prepare_resp.get('cover'):
                # Prepared response stands in for the crawl entirely.
                response = self.trigger_it(d, prepare_resp)
                responses.append(response)
                continue
        start = time.time()
        try:
            response = await func(d, extra)
            if response is None:
                end = time.time()
                response_time = '%s' % (end - start)
                msg = "failed! url: " + str(url)
                self.log.warning(msg)
                response = {
                    'url': url, 'effect': url, 'data': '', 'title': '',
                    'spider': spider, 'state': 'error', "code": -2,
                    "time": response_time
                }
            else:
                end = time.time()
                d['time'] = '%s' % (end - start)
                msg = "success! url: " + str(url)
                self.log.info(msg)
        except Exception as e:
            end = time.time()
            response_time = '%s' % (end - start)
            msg = "failed! url: " + str(url) + ' errtype: ' + str(type(e)) + ' errmsg: ' + str(e)
            self.log.warning(msg)
            response = {
                'url': url, 'effect': url, 'data': '', 'title': '',
                'spider': spider, 'state': 'error', "code": -1,
                "time": response_time
            }
        response = self.trigger_it(d, response)
        responses.append(response)
    self.anaylse_it(responses)
    return responses
def prepare_it(self, data):
    """Run every 'prepares' hook over *data* before the crawl.

    Consumes (pops) 'prepares' from *data*.  A hook's non-None result
    replaces *data* when it carries a truthy 'update' flag; the flag is
    then reset to False before returning.  Hook failures are logged and
    skipped.
    """
    hooks = data.pop('prepares', None)
    for hook in hooks or ():
        data['prepare'] = hook
        try:
            if isinstance(hook, str):
                hook_func = load(hook)
            else:
                hook_func = load(hook.get('prepare'))
            outcome = hook_func(data)
            if outcome is not None and outcome.get('update'):
                data = outcome
        except Exception as exc:
            self.log.critical('prepare failed: ' + str(exc) + ' type: ' + str(type(exc)))
    if isinstance(data, dict) and data.get('update'):
        data['update'] = False
    return data
def setup_curl(c):
    """Configure a pycurl handle *c* from its attached job dict ``c.data``.

    Side effects: attaches ``c.databuf`` (response-body buffer) and
    ``c.headers`` (parsed headers, one dict per redirect hop) to the
    handle, then sets all transfer options (URL, method, proxy, cookies,
    HTTP version, ...).
    """
    d = c.data
    databuf = BytesIO()
    # One header dict per response in the redirect chain; 'count' indexes
    # the hop currently being filled.
    headers = {'count': 0, 'content': [{}]}
    set_cookies = []
    c.databuf = databuf
    c.headers = headers

    def header_function(header_line):
        # Collect Set-Cookie values separately from the generic headers.
        match = re.match("^Set-Cookie: (.*)$", header_line.decode('utf8', 'ignore'))
        if match:
            set_cookies.append(match.group(1))
        count = headers['count']
        # HTTP headers are nominally latin-1 encoded.
        header_line = header_line.decode('iso-8859-1')
        if ':' not in header_line and not header_line.startswith('HTTP'):
            # print(header_line)
            # A bare CRLF terminates one response's header block: start a
            # fresh dict for the next hop (redirects, 100-continue).
            if '\r\n' in header_line:
                headers['count'] += 1
                headers['content'].append({})
            return
        # Break the header line into header name and value.
        if ':' in header_line:
            name, value = header_line.rstrip('\r\n').split(':', 1)
        else:
            # Status line, e.g. "HTTP/1.1 200 OK" (no colon).
            name, value = header_line.rstrip('\r\n').split(' ', 1)
        # Remove whitespace that may be present.
        # Header lines include the trailing newline, and there may be whitespace
        # around the colon.
        name = name.strip()
        value = value.strip()
        # Header names are case insensitive.
        # Lowercase name here.
        name = name.lower()
        # Now we can actually record the header name and value.
        if name in headers['content'][count]:
            headers['content'][count][name].append(value)
        else:
            headers['content'][count][name] = [value]

    def write_function(buf):
        # Cap the body buffer at ~4 MB; returning 0 makes libcurl abort
        # the transfer once the cap would be exceeded.
        size = databuf.getbuffer().nbytes
        if size < 4096000:
            databuf.write(buf)
            return len(buf)
        return 0

    c.setopt(pycurl.FOLLOWLOCATION, d.get('followlocation', 1))
    c.setopt(pycurl.MAXREDIRS, d.get('maxredirs', 5))
    c.setopt(pycurl.CONNECTTIMEOUT, d.get('connecttimeout', 10))
    c.setopt(pycurl.TIMEOUT, d.get('timeout', 20))
    c.setopt(pycurl.NOSIGNAL, d.get('nosignal', 1))
    c.setopt(pycurl.USERAGENT, d.get('useragent', DEFAULT_USER_AGENT))
    # TLS verification disabled by default — presumably a deliberate
    # crawl-anything choice; NOTE(review): confirm this is intended.
    c.setopt(pycurl.SSL_VERIFYPEER, d.get('ssl_verifypeer', 0))
    c.setopt(pycurl.SSL_VERIFYHOST, d.get('ssl_verifyhost', 0))
    # 'effect' (a previously-resolved effective URL) wins over 'url'.
    crawl_url = d.get('effect') or d.get('url')
    if not crawl_url.startswith('http'):
        crawl_url = 'http://' + crawl_url
    c.setopt(pycurl.URL, crawl_url.encode('utf-8'))
    # c.setopt(pycurl.URL, url)
    # Caller-supplied header/write callbacks (dotted paths) override the
    # closures defined above.
    headerfunction = d.get('headerfunction')
    if headerfunction is None:
        c.setopt(pycurl.HEADERFUNCTION, header_function)
    else:
        c.setopt(pycurl.HEADERFUNCTION, load(headerfunction))
    writefunction = d.get('writefunction')
    if writefunction is None:
        c.setopt(pycurl.WRITEFUNCTION, write_function)
    else:
        c.setopt(pycurl.WRITEFUNCTION, load(writefunction))
    method = d.get('method', 'get')
    if method == 'get':
        httpheader = d.get('httpheader')
        if httpheader:
            c.setopt(c.HTTPHEADER, httpheader)
    elif method == 'post':
        # JSON POST: body is json-dumped 'postfields'.
        httpheader = d.get('httpheader', ['Accept: application/json', "Content-type: application/json"])
        if httpheader:
            # curl.setopt(pycurl.HEADER, p.get('header', 1))
            c.setopt(pycurl.HTTPHEADER, httpheader)
        post301 = getattr(pycurl, 'POST301', None)
        if post301 is not None:
            # Added in libcurl 7.17.1.
            c.setopt(post301, True)
        c.setopt(pycurl.POST, 1)
        postfields = d.get('postfields')
        if postfields:
            postfields = json.dumps(postfields)
            c.setopt(pycurl.POSTFIELDS, postfields)
    elif method == 'postform':
        # Form POST: 'postform' is passed through as the urlencoded body.
        httpheader = d.get('httpheader', ["Content-Type: application/x-www-form-urlencoded"])
        if httpheader:
            c.setopt(pycurl.HTTPHEADER, httpheader)
        post301 = getattr(pycurl, 'POST301', None)
        if post301 is not None:
            # Added in libcurl 7.17.1.
            c.setopt(post301, True)
        c.setopt(pycurl.POST, 1)
        httppost = d.get('postform')
        if httppost:
            c.setopt(pycurl.POSTFIELDS, httppost)
    proxy_type = d.get('proxytype')
    proxy = d.get('proxy')
    proxy_port = d.get('proxyport')
    proxy_userpwd = d.get('proxyuserpwd')
    if proxy:
        c.setopt(pycurl.PROXY, proxy)
        if proxy_port:
            c.setopt(pycurl.PROXYPORT, proxy_port)
        if proxy_userpwd:
            c.setopt(pycurl.PROXYUSERPWD, proxy_userpwd)
        if proxy_type:
            # Map loose type strings to pycurl constants; anything without
            # a '4' or '5' falls back to a plain HTTP proxy.
            if '4' in proxy_type:
                proxy_type = pycurl.PROXYTYPE_SOCKS4A
            elif '5' in proxy_type:
                proxy_type = pycurl.PROXYTYPE_SOCKS5_HOSTNAME
            else:
                proxy_type = pycurl.PROXYTYPE_HTTP
            c.setopt(pycurl.PROXYTYPE, proxy_type)
    verbose = d.get('verbose')
    if verbose:
        c.setopt(pycurl.VERBOSE, True)
    # with the line below, redirect cookie can update
    c.setopt(pycurl.COOKIEFILE, "")
    cookie = d.get('cookie')
    if cookie:
        c.setopt(pycurl.COOKIE, cookie)
    c.setopt(pycurl.FAILONERROR, True)
    http_version = d.get('http_version')
    if http_version == '1.1' or http_version == 1.1:
        c.setopt(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_1)
    else:
        # Default to HTTP/1.0 unless 1.1 is requested explicitly.
        c.setopt(pycurl.HTTP_VERSION, pycurl.CURL_HTTP_VERSION_1_0)
def run(self):
    """Crawl every item in ``self.updated`` with a (lazily created) headless Chrome.

    Creates the webdriver on first use, then for each item: prepare,
    crawl, optional 'interactive' hook, trigger.  A None crawl result or
    an exception yields a synthetic error response (code -2 / -1).  In
    celery mode bulky 'data'/'screen' fields are stripped before the
    responses are collected, analysed and returned.  The driver is always
    quit in the ``finally`` block.
    """
    load_timeout = self.info.get('timeout', 15)
    script_timeout = self.info.get('script_timeout', 15)
    implicit_wait = self.info.get('wait', 5)
    if not self.driver:
        chrome_bin = self.info.get('chrome', '/opt/google/chrome-beta/chrome')
        # window_size = self.info.get('window_size', '2048x4096')
        proxy_type = self.info.get('proxytype')
        user_agent = self.info.get('useragent', DEFAULT_USER_AGENT)
        lang = self.info.get('lang', 'zh,zh-CN')
        # Map loose proxy-type strings to Chrome scheme names.
        if proxy_type == '5':
            proxy_type = 'socks5h'
        if proxy_type == '4':
            proxy_type = 'socks4'
        proxy = self.info.get('proxy')
        proxy_port = self.info.get('proxyport')
        options = webdriver.ChromeOptions()
        options.binary_location = chrome_bin
        # options.add_argument('headless')
        options.set_headless(headless=True)
        options.add_argument("--dns-prefetch-disable")
        options.add_argument('--no-referrers')
        # options.add_argument('window-size=' + window_size)
        # options.add_argument('--proxy-server=http://127.0.0.1:8123')
        # options.add_argument('--proxy-server=https://127.0.0.1:8123')
        # options.add_argument('--proxy-server=socks5://127.0.0.1:1082')
        # NOTE: '%d' requires proxy_port to be an int — TODO confirm callers.
        if proxy and proxy_type and proxy_port:
            options.add_argument('--proxy-server=%s://%s:%d' % (proxy_type, proxy, proxy_port))
        if user_agent:
            options.add_argument("--user-agent=" + user_agent)
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-audio')
        options.add_argument('--no-sandbox')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--allow-insecure-localhost')
        if lang:
            options.add_experimental_option(
                'prefs', {'intl.accept_languages': lang})
            # options.add_argument('lang='+lang)
        capabilities = DesiredCapabilities.CHROME.copy()
        capabilities['acceptSslCerts'] = True
        capabilities['acceptInsecureCerts'] = True
        try:
            self.driver = webdriver.Chrome(
                chrome_options=options, desired_capabilities=capabilities)
        except Exception as e:
            self.log.warning('error: ' + str(e) + ' error_type: ' + str(type(e)))
            raise e
        self.driver.implicitly_wait(implicit_wait)
    self.driver.set_page_load_timeout(load_timeout)
    self.driver.set_script_timeout(script_timeout)
    responses = []
    try:
        # crawl_func = exit_after(load_timeout)(crawl)
        for d in self.updated:
            prepare_resp = self.prepare_it(d)
            if isinstance(prepare_resp, dict):
                if prepare_resp.get('skip'):
                    continue
                if prepare_resp.get('cover'):
                    # Prepared response stands in for the crawl entirely.
                    response = self.trigger_it(d, prepare_resp)
                    if self.info.get('mode') == 'celery':
                        # Strip bulky fields before serializing for celery.
                        response.pop('data', None)
                        response.pop('screen', None)
                    responses.append(response)
                    continue
            start = time.time()
            url = d.get('url') if isinstance(d, dict) else d
            try:
                response = crawl(self.driver, d)
                if response is None:
                    end = time.time()
                    response_time = '%s' % (end - start)
                    msg = "failed! url: " + str(url)
                    self.log.warning(msg)
                    response = {
                        'url': url, 'effect': url, 'data': '', 'title': '',
                        'spider': 'chrome', 'state': 'error', "code": -2,
                        "time": response_time
                    }
                else:
                    # Optional post-crawl interaction hook (dotted path)
                    # gets the item and the live driver.
                    interact = d.get('interactive')
                    if interact:
                        inter_func = load(interact)
                        inter_func(d, self.driver)
                    end = time.time()
                    d['time'] = '%s' % (end - start)
                    msg = "success! url: " + str(url) + ' effect: ' + str(
                        self.driver.current_url)
                    self.log.info(msg)
            except Exception as e:
                end = time.time()
                response_time = '%s' % (end - start)
                msg = "failed! url: " + str(url) + ' errtype: ' + str(
                    type(e)) + ' errmsg: ' + str(e)
                self.log.warning(msg)
                response = {
                    'url': url, 'effect': url, 'data': '', 'title': '',
                    'spider': 'chrome', 'state': 'error', "code": -1,
                    "time": response_time
                }
            response = self.trigger_it(d, response)
            if self.info.get('mode') == 'celery':
                response.pop('data', None)
                response.pop('screen', None)
            responses.append(response)
        self.anaylse_it(responses)
    finally:
        if self.driver:
            self.driver.quit()
            self.driver = None
    return responses
async def fetch(self, data, session, json_response=False):
    """GET *data*['url'] with an aiohttp *session* and build a result dict.

    *data* may be a bare URL string.  ``data['filter']`` selects which
    result fields to populate (default: data, title, code).  When
    ``data['stream']`` is a dict, the body is consumed chunk-by-chunk
    through the hook it names (optionally into ``stream['file']``)
    instead of being decoded.
    """
    if isinstance(data, str):
        data = {
            'url': data,
        }
    url = data.get('url')
    # Local renamed from 'filter' to avoid shadowing the builtin.
    fields = data.get('filter')
    if not fields:
        fields = ['data', 'title', 'code']
    results = {'url': url}
    async with session.get(url) as response:
        # delay = response.headers.get("DELAY")
        # date = response.headers.get("DATE")
        # print("{}:{} with delay {}".format(date, response.url, delay))
        # content = await response.read()
        stream = data.get('stream')
        if isinstance(stream, dict):
            stream_func = load(stream.get('func'))
            stream_chunk = stream.get('chunk', 512)
            stream_file = stream.get('file')
            count = 0
            if stream_file:
                with open(stream_file, 'wb') as fd:
                    while True:
                        chunk = await response.content.read(stream_chunk)
                        count += 1
                        if not chunk:
                            break
                        stream_func(chunk, data, fd)
            else:
                while True:
                    # Fixed: was `resp.content.read(...)` — `resp` is
                    # undefined here (NameError); the response object is
                    # `response`.
                    chunk = await response.content.read(stream_chunk)
                    count += 1
                    if not chunk:
                        break
                    stream_func(chunk, data)
            results['stream'] = {'chunk': stream_chunk, 'count': count}
        elif 'data' in fields or 'charset' in fields or 'title' in fields:
            charset = data.get('charset')
            if json_response is False:
                if charset:
                    # NOTE(review): decodes as utf8 even when a charset is
                    # given — looks like it should use `charset`; confirm
                    # before changing.
                    raw = await response.read()
                    content = raw.decode('utf8', errors='ignore')
                else:
                    content = await response.text()
            else:
                content = await response.json()
            # Fixed: guard the regex — with json_response=True `content`
            # is a dict and re.search would raise TypeError.
            if isinstance(content, str):
                match = re.search('<title[^>]*>([^<]+)</title>', content, re.IGNORECASE)
            else:
                match = None
            title = match.group(1) if match else ''
            if 'data' in fields:
                results['data'] = content
            if 'charset' in fields:
                results['charset'] = charset
            if 'title' in fields:
                results['title'] = title
        if 'headers' in fields:
            results['headers'] = {
                k: v for k, v in response.headers.items()
            }
        if 'cookies' in fields:
            results['cookies'] = {
                k: v for k, v in response.cookies.items()
            }
        if 'effect' in fields:
            results['effect'] = response.real_url
        if 'code' in fields:
            results['code'] = response.status
        if 'method' in fields:
            results['method'] = response.method
    return results