def _formatResponse(requestInfo, execute, urlInfo, fileInfo=('', '')):
    '''Format a UrlItem record from the raw request result.'''
    try:
        response = {}
        response['nettime'] = requestInfo['nettime']
        if requestInfo['error'] or requestInfo['http_code'] != 200:
            # Request failed or returned a non-200 status code.
            response['status'] = 3
            response['http_code'] = requestInfo['http_code']
            response['error'] = repr(requestInfo['error'])
            response['end_at'] = now_format()
            response['depth'] = urlInfo['depth']
        else:
            # Successful request: record digests, redirects, headers and file info.
            response['status'] = 2
            response['end_at'] = now_format()
            response['depth'] = urlInfo['depth']
            response['md5_url'] = md5(urlInfo['url'])
            response['md5_body'] = md5(requestInfo['body'])
            response['redirects'] = json.dumps(requestInfo['redirects'], ensure_ascii=False)
            response['http_code'] = requestInfo['http_code']
            response['request_headers'] = json.dumps(requestInfo['request_headers'], ensure_ascii=False)
            response['response_headers'] = json.dumps(requestInfo['response_headers'], ensure_ascii=False)
            response['file_name'] = fileInfo[0]
            response['file_path'] = fileInfo[1]
        return response
    except Exception as e:
        logger.exception(e)
        return False

def crawljs(taskInfo):
    try:
        # Already crawled: skip.
        if taskInfo['status'] not in (0, 1):
            return True
        # Fetch the page source.
        requestInfo = spiderRequest(taskInfo['url'])
        parseResults = []
        results = _parseForJs(taskInfo['url'])
        for record in results:
            urlRow = _parseForUrl(record)
            if not urlRow:
                continue
            parseResults.append(urlRow)
        updateRow = {}
        updateRow['id'] = taskInfo['id']
        updateRow['http_code'] = requestInfo['http_code']
        updateRow['response_headers'] = json.dumps(requestInfo['response_headers'], ensure_ascii=False)
        updateRow['body'] = requestInfo['body']
        updateRow['md5_body'] = md5(requestInfo['body'])
        updateRow['parse_result'] = json.dumps(parseResults, ensure_ascii=False)
        updateRow['status'] = 2
        updateRow['end_at'] = getTime('%Y-%m-%d %H:%M:%S')
        # Persist the result.
        mg_spiderjsurl_save(updateRow)
        return True
    except Exception as e:
        logger.exception(e)
        return False

def parse_browser(requestInfo):
    '''Parse links via the browser-based (CasperJS) parsers.'''
    try:
        # Resolve the final URL after redirects (same logic as parse_reg).
        currentUrl = requestInfo['redirects'][-1]['url'] if requestInfo['redirects'] else requestInfo['url']
        urls = []
        # Collect links from <a>/<link> tags and from form submissions.
        for results in (parseHref2(currentUrl, ['a', 'link']), parseForms2(currentUrl)):
            for key in ('links', 'ajaxs', 'results'):
                if key in results and results[key]:
                    urls.extend(_formatUrls(results[key]))
        # Mouse-event parsing is currently disabled:
        #results = parseMouseEvent(currentUrl)
        #if 'links' in results.keys() and results['links']:
        #    urls.extend(_formatUrls(results['links']))
        #if 'ajaxs' in results.keys() and results['ajaxs']:
        #    urls.extend(_formatUrls(results['ajaxs']))
        #if 'results' in results.keys() and results['results']:
        #    urls.extend(_formatUrls(results['results']))
        return urls
    except Exception as e:
        logger.exception(e)
        return False

def download(requestInfo, urlInfo, execute, fileType='html'):
    '''Save the response body to a local temp file and upload it to file storage.'''
    try:
        # On any error below, fall through to the except block and return ('', '').
        # If an identical body was already stored for this domain, reuse it.
        md5Body = md5(requestInfo['body'])
        result = mgdb.static_get(execute['domain'], md5Body)
        if result:
            return (result['file_name'], result['file_key'])
        localfile = '%s/%s/%s.tmp' % (PATH_TMP_UPLOAD, execute['domain'], md5Body)
        if not exists(dirname(localfile)):
            mkdirs(dirname(localfile))
        if fileType == 'html':
            filename = "%s_%s.html" % (execute['id'], urlInfo['id'])
            filekey = 'html/%s/%s/%s_%s.html.%s' % (execute['domain'], execute['task_id'],
                                                    execute['id'], urlInfo['id'], md5Body)
            fwriteBin(localfile, requestInfo['body'])
            fileType = 'html'
        else:
            filename = basename(requestInfo['url'])
            # requestInfo['url'][7:] drops the first 7 characters ('http://').
            filekey = "static/%s/%s.%s" % (execute['domain'], requestInfo['url'][7:], md5Body)
            fwriteBin(localfile, requestInfo['body'])
            fileType = 'img'
        filepath = ydfs_upload(filekey, localfile)
        mgdb.c_insert('static', _formatStatic(execute['domain'], requestInfo['url'],
                                              filename, filekey, fileType, md5Body))
        return (filename, filekey)
    except Exception as e:
        logger.exception(e)
        return ('', '')

def codeJavascript(func):
    try:
        content = read(TEMPLATE_JAVASCRIPT)
        pattern = re.compile(r'//--' + func + '--\n(.*?)\n//--' + func + '--', re.I | re.M | re.S)
        return pattern.findall(content)[0]
    except Exception as e:
        logger.exception(e)
        return False

def codeCasper(func, params=None):
    try:
        content = read(TEMPLATE_CASPER)
        pattern = re.compile(r'//--' + func + '--\n(.*?)\n//--' + func + '--', re.I | re.M | re.S)
        content = pattern.findall(content)[0]
        if params:
            for (k, v) in params.items():
                content = content.replace('###' + k + '###', v)
        return content
    except Exception as e:
        logger.exception(e)
        return False

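# A minimal sketch of the template layout that codeJavascript/codeCasper expect,
# based on the marker regex and the '###key###' substitution above (illustrative
# only, not copied from the real TEMPLATE_JAVASCRIPT / TEMPLATE_CASPER files):
#
#   //--casper_start--
#   casper.start('###startUrl###');
#   //--casper_start--
#
# With that template, codeCasper('casper_start', {'startUrl': 'http://example.com/'})
# would return "casper.start('http://example.com/');" -- the text between the two
# marker lines, with each '###key###' placeholder replaced by the supplied value.
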
def parse_darklink(url=None):
    '''Detect hidden (dark) links by rendering the page with PhantomJS.'''
    try:
        cmd = 'phantomjs %s/safe_darklink.js %s' % (PATH_NODEJS, url)
        child = Popen(cmd, shell=True, close_fds=True, bufsize=-1, stdout=PIPE, stderr=STDOUT)
        output = child.stdout.read().decode().strip()
        # logger.info('parse_darklink::::%s::::%s' % (url, output))
        if output != 'fail':
            return json.loads(output)
        # logger.info('parse_darklink fail::::%s' % cmd)
        return False
    except Exception as e:
        logger.exception(e)
        return False

def parseForms(url=None):
    try:
        js_parseForms = codeJavascript('parseForms')
        js_formatRelativeUrl = codeJavascript('formatRelativeUrl')
        casper_create = codeCasper('casper_create')
        casper_parseform = codeCasper('casper_parseform')
        casper_run = codeCasper('casper_run')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        content = "%s\n%s\n%s\n%s\n%s\n%s" % (
            js_parseForms, js_formatRelativeUrl, casper_create, casper_start,
            casper_parseform, casper_run)
        output = execCasper(content)
        return json.loads(output)
    except Exception as e:
        logger.exception(e)
        return False

def execCasper(content=None):
    try:
        # Write the assembled script to a temp file and run it with casperjs.
        filename = "%s/%s_%s" % (PATH_TMP_NODEJS, getTime('%Y%m%d'), md5(content))
        write(filename, content)
        cmd = 'casperjs ' + filename
        child = Popen(cmd, shell=True, close_fds=True, bufsize=-1, stdout=PIPE, stderr=STDOUT)
        output = child.stdout.read().decode()
        #remove(filename)
        return output
    except Exception as e:
        logger.exception(e)
        return False

def parseHref(url=None, tags=None):
    try:
        jsFunc = 'parseHrefByTags'
        js_parseHrefByTags = codeJavascript('parseHrefByTags')
        casper_create = codeCasper('casper_create')
        casper_then = codeCasper('casper_then', {
            'jsfunc': jsFunc,
            'params': "['a', 'link']"
        })
        casper_run = codeCasper('casper_run')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        content = "%s\n%s\n%s\n%s\n%s" % (js_parseHrefByTags, casper_create,
                                          casper_start, casper_then, casper_run)
        output = execCasper(content)
        return json.loads(output)
    except Exception as e:
        logger.exception(e)
        return False

def parseMouseEvent(url=None):
    '''Parse mouse events.

    Covered events: 'onclick', 'ondblclick', 'onmousedown', 'onmousemove',
    'onmouseout', 'onmouseover', 'onmouseup'.
    '''
    try:
        tags = ['a', 'div', 'span', 'table', 'tr', 'td', 'th', 'button', 'input']
        events = ['onclick', 'ondblclick', 'onmousedown', 'onmousemove',
                  'onmouseout', 'onmouseover', 'onmouseup']
        rows = captureEvent(url, tags, events)
        eventCodes = []
        for index, row in enumerate(rows):
            # Example row: {'tag': 'div', 'eventName': 'onclick', 'index': 1}
            casper_mouse_event = codeCasper('casper_event', {
                'tag': row['tag'],
                'index': row['index'],
                'eventName': row['event']
            })
            eventCodes.append(casper_mouse_event)
        js_execEvent = codeJavascript('execEvent')
        casper_create = codeCasper('casper_create')
        casper_wait = codeCasper('casper_wait')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        casper_output = codeCasper('casper_output')
        casper_run = codeCasper('casper_run')
        content = "%s\n%s\n%s\n%s\n%s\n%s\n%s" % (
            js_execEvent, casper_create, casper_start, casper_wait,
            "\n".join(eventCodes), casper_output, casper_run)
        output = execCasper(content)
        return json.loads(output)
    except Exception as e:
        logger.exception(e)
        return False

def parse_reg(requestInfo):
    try:
        # The body is HTML: detect the charset and decode it.
        try:
            charset = chardet.detect(requestInfo['body'])['encoding']
            charset = formatCharset(charset)
            body = requestInfo['body'].decode(charset)
        except Exception as e:
            # Fall back to GBK (a superset of GB2312) when detection or decoding fails.
            charset = 'GBK'
            body = requestInfo['body'].decode(charset)
            logger.exception(e)
        # Regex-based parsing.
        urls = []
        currentUrl = requestInfo['redirects'][-1]['url'] if requestInfo['redirects'] else requestInfo['url']
        urls.extend(_formatUrls(parseUrlByMatchQuotes(currentUrl, body)))  # URLs between double or single quotes
        urls.extend(_formatUrls(parseHref(currentUrl, body)))              # href attributes
        urls.extend(_formatUrls(parseSrc(currentUrl, body)))               # src attributes: img, script, frame, iframe
        urls.extend(_formatUrls(parseUrlFromJs(currentUrl, body)))         # URLs embedded in JavaScript
        return urls
    except Exception as e:
        logger.exception(e)
        return []

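# Note on the return value: crawl() reads the keys 'url', 'method', 'invisible',
# 'post', 'query', 'pattern_path' and 'pattern_query' from each row produced by
# _formatUrls(), so each parsed record is assumed to look roughly like the sketch
# below (inferred from that usage, not from _formatUrls itself; values are
# placeholders):
#
#   {
#       'url': 'http://example.com/a?b=1',
#       'method': 'GET',
#       'invisible': 0,          # 1 for links found by dark-link detection
#       'post': None,            # POST payload, if any
#       'query': 'b=1',
#       'pattern_path': '...',   # normalized path pattern
#       'pattern_query': '...',  # normalized query pattern
#   }
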
def captureEvent(url=None, tags=None, events=['onclick']):
    try:
        jsFunc = 'parseEventByTags'
        js_parseEventByTags = codeJavascript(jsFunc)
        casper_create = codeCasper('casper_create')
        casper_then = codeCasper('casper_then', {
            'jsfunc': jsFunc,
            'params': "%s, %s" % (json.dumps(tags, ensure_ascii=False),
                                  json.dumps(events, ensure_ascii=False))
        })
        casper_run = codeCasper('casper_run')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        content = "%s\n%s\n%s\n%s\n%s" % (js_parseEventByTags, casper_create,
                                          casper_start, casper_then, casper_run)
        output = execCasper(content)
        jsonData = json.loads(output)
        return jsonData['results']
    except Exception as e:
        logger.exception(e)
        return False

def spiderRequest(url=None, method="GET", data={}, headers={}, timeout=10, auth={}, proxy={}):
    headers = dict(headers) if headers else {}  # avoid mutating the caller's / default dict
    headers['Cache-Control'] = 'no-cache'
    method = method.upper()
    start = time()
    try:
        # Redirect recording.
        redirect_handler = RedirectHandler()
        # HTTP basic auth.
        auth_handler = HTTPBasicAuthHandler()
        if auth and 'user' in auth.keys() and 'passwd' in auth.keys():
            passwdHandler = HTTPPasswordMgrWithDefaultRealm()
            passwdHandler.add_password(realm=None, uri=url, user=auth['user'], passwd=auth['passwd'])
            auth_handler = HTTPBasicAuthHandler(passwdHandler)
        # Proxy.
        proxy_handler = ProxyHandler()
        if proxy and 'url' in proxy.keys():
            proxy_handler = ProxyHandler({'http': proxy['url']})
        # Proxy auth.
        proxy_auth_handler = ProxyBasicAuthHandler()
        if proxy and 'url' in proxy.keys() and 'user' in proxy.keys() and 'passwd' in proxy.keys():
            proxyPasswdHandler = HTTPPasswordMgrWithDefaultRealm()
            proxyPasswdHandler.add_password(realm=None, uri=proxy['url'],
                                            user=proxy['user'], passwd=proxy['passwd'])
            proxy_auth_handler = ProxyBasicAuthHandler(proxyPasswdHandler)
        opener = build_opener(redirect_handler, auth_handler, proxy_handler, proxy_auth_handler)
        request_handler = Request(quote(url, safe=string.printable), method=method)
        for key, value in headers.items():
            request_handler.add_header(key, value)
        response = opener.open(request_handler, timeout=timeout)
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': request_handler.headers,
            'response_headers': formatHeaders(response.getheaders()),
            'http_code': response.status,
            'redirects': redirect_handler.redirects,
            'body': response.read(),
            'nettime': end - start,
            'error': ''
        }
    except HTTPError as e:
        # 400 401 402 403 500 501 502 503 504
        logger.error(url + "::::::::" + repr(e))
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': dict(e.headers),
            'http_code': e.code,
            'redirects': [],
            'body': b'',
            'nettime': end - start,
            'error': repr(e)
        }
    except URLError as e:
        logger.error(url + "::::::::" + repr(e))
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': {},
            'http_code': 0,
            'redirects': [],
            'body': b'',
            'nettime': end - start,
            'error': repr(e)
        }
    except timeoutError as e:
        logger.error(url + "::::::::" + repr(e))
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': {},
            'http_code': 0,
            'redirects': [],
            'body': b'',
            'nettime': end - start,
            'error': repr(e)
        }
    except Exception as e:
        logger.exception(e)
        logger.error(url + "::::::::" + repr(e))
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': {},
            'http_code': 0,
            'redirects': [],
            'body': b'',
            'nettime': 0,
            'error': repr(e)
        }

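# Usage sketch for spiderRequest (illustrative only; 'http://example.com/' is a
# placeholder, not a URL from this project):
#
#   info = spiderRequest('http://example.com/', method='GET', timeout=10)
#   if not info['error'] and info['http_code'] == 200:
#       html_bytes = info['body']        # raw response body (bytes)
#       hops = info['redirects']         # redirect chain recorded by RedirectHandler
#       elapsed = info['nettime']        # seconds spent on the request
#   else:
#       # Failed requests return the same dict shape, with 'error' set and 'body' == b''.
#       pass
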
def crawl(urlInfo):
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute:
        return False
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})
    try:
        ## If the task has already finished, return.
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True
        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']))
        # Fetch the page and parse the response.
        response = {}
        urlItems = []
        #proxy = {'url': 'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)
        # Request error: save and return.
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # 304 or any other non-200 status: save and return.
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # Normal response.
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders['Content-Type'] if 'Content-Type' in responseHeaders.keys() else None
        contentType = parseContentType(contentTypeRaw, default='text/html')
        fileType = mime2file(contentType)
        #logger.debug("Content-Type::::::::" + contentTypeRaw + "::::" + contentType)
        # Persist the response.
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])
        # Non-HTML page: nothing further to parse.
        if fileType != 'html':
            return True
        # External link: do not analyse further.
        if urlInfo['url_type'] != 'self':
            return True
        # Single-page mirror: do not analyse the page.
        if execute['task_type'] == 'mirror_one':
            return True
        # Regex-based page parsing.
        urlItems = parse_reg(requestInfo)
        # Dark-link detection.
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            # logger.info('parse_darklink::::%s::::' % (result))
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks
        # Browser-based parsing (currently disabled).
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results:
        #        urlItems = urlItems + results
        # Deduplicate URLs.
        urlItems = _urls_uniq(urlItems)
        # Queue newly discovered URLs.
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url):
                continue
            fileExtension = extension(url)
            urlType = _getDomainType(url, execute['domain'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists:
                continue
            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''
            # Off-site links (unless dark-link analysis is enabled) get status 5: no crawl needed.
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark:
                        flagOutlink = 1
                        item['status'] = 0
            if urlType == 'other':
                outlinks.append(_formatOutlink(execute, urlInfo['url'], url, row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)
        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks:
            mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Status other than 0: do not crawl.
                if item['status'] != 0:
                    continue
                # Depth limit exceeded: do not crawl.
                if item['depth'] > execute['limit_depth']:
                    continue
                # Total limit exceeded: do not crawl.
                if stats['total'] > execute['limit_total']:
                    continue
                # Mirror task: do not crawl static files.
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts:
                    continue
                # Single-page monitoring/mirroring: do not crawl sub-pages.
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts:
                    continue
                # Images disabled: do not crawl static files.
                if not execute['limit_image'] and item['file_extension'] in staticExts:
                    continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                # Queue for crawling.
                undos.append(item)
                # Queue for mirroring.
                if execute['task_type'] == 'mirror':
                    mirrors.append(item)
        if queueOut:
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results:
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos:
            Mq.produce(undos, 'spider')
        if mirrors:
            Mq.produce(mirrors, 'mirror')
        return True
    except Exception as e:
        logger.exception(e)
        return False