def _parseForUrl(row={}):
    '''Parse a raw URL record into a normalized dict (url/method/post).'''
    url = row['url'].strip()
    if not isUrl(url):
        return {}
    method = row['method'].upper()
    post = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''
    return {
        'url': url,
        'method': method,
        'post': post,
    }
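# Illustrative call (a sketch; assumes isUrl() accepts plain http URLs):
#   _parseForUrl({'url': ' http://example.com/a ', 'method': 'get', 'post': {'q': 1}})
#   -> {'url': 'http://example.com/a', 'method': 'GET', 'post': '{"q": 1}'}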
def parseUrlByMatchQuotes(url, content):
    urls = []
    match = re.findall(r"('|\")(http|https)://(.+?)\1", content, re.I)
    for row in match:
        urls.append("%s://%s" % (row[1], row[2]))
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
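# Illustrative call (a sketch; assumes isUrl()/formatRelativeUrl() behave as their names suggest):
#   parseUrlByMatchQuotes('http://example.com/page',
#                         'var api = "https://example.com/api/v1"; var img = \'http://cdn.example.com/a.png\';')
#   -> a de-duplicated list such as ['https://example.com/api/v1', 'http://cdn.example.com/a.png']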
def parseUrlFromJs(url, content):
    urls = []
    if content.find('url') >= 0:
        match = re.findall(r"[^_]url(\s*)=(\s*)(\"|')(.+?)\3", content, re.I)
        for row in match:
            urls.append(row[3])
    if content.find('.href') >= 0:
        match = re.findall(r"\.href(\s*)=(\s*)(\"|')(.+?)\3", content, re.I)
        for row in match:
            urls.append(row[3])
    if content.find('window.open') >= 0:
        match = re.findall(r"window\.open(\s*)\((\s*)('|\")(.+?)\3(,?)", content, re.I)
        for row in match:
            urls.append(row[3])
    if content.find('window.navigate') >= 0:
        match = re.findall(r"window\.navigate(\s*)\((\s*)('|\")(.+?)\3", content, re.I)
        for row in match:
            urls.append(row[3])
    if content.find('.location') >= 0:
        match = re.findall(r"\.location(\s*)=(\s*)('|\")(.+?)\3", content, re.I)
        for row in match:
            urls.append(row[3])
    if content.find('location.replace') >= 0 or content.find('location.assign') >= 0:
        match = re.findall(r"location\.(replace|assign)(\s*)\((\s*)('|\")(.+?)\4", content, re.I)
        for row in match:
            urls.append(row[4])
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
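# Illustrative call (a sketch; assumes formatRelativeUrl() resolves relative paths against the base URL):
#   parseUrlFromJs('http://example.com/a/index.html',
#                  "window.open('/help/faq.html'); location.href = 'detail.html';")
#   -> e.g. ['http://example.com/help/faq.html', 'http://example.com/a/detail.html']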
def parseSrc(url, content):
    urls = []
    if content != '' and (content.find('src') >= 0 or content.find('SRC') >= 0):
        match = re.findall(r"src(\s*)=(\s*)('|\")(.*?)\3", content, re.I)
        for row in match:
            if row[3] != '':
                urls.append(row[3])
        match = re.findall(r"src(\s*)=(\s*)([\d\w#].*?)(/>|>| )", content, re.I)
        if len(match) > 0:
            for row in match:
                urls.append(row[2])
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
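# Illustrative call (a sketch; assumes formatRelativeUrl() resolves paths against the base URL):
#   parseSrc('http://example.com/index.html',
#            '<img src="/static/logo.png"><script src=js/app.js></script>')
#   -> e.g. ['http://example.com/static/logo.png', 'http://example.com/js/app.js']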
def parseHref(url, content):
    urls = []
    if content != '' and (content.find('href') > 0 or content.find('HREF') > 0):
        match = re.findall(r"(\s+)href(\s*)=(\s*)('|\")(.*?)\4(.*?)>(.*?)<", content, re.I | re.DOTALL)
        if len(match) > 0:
            for row in match:
                if row[4] != '':
                    urls.append(row[4])
        match = re.findall(r"(\s+)href(\s*)=(\s*)([\d\w#].*?)(/>|>| )", content, re.I | re.DOTALL)
        if len(match) > 0:
            for row in match:
                urls.append(row[3])
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
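# Illustrative call (a sketch; assumes formatRelativeUrl() resolves paths against the base URL):
#   parseHref('http://example.com/docs/',
#             '<a href="/about">About</a> <a href=contact.html>Contact</a>')
#   -> e.g. ['http://example.com/about', 'http://example.com/docs/contact.html']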
def crawl(urlInfo):
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute:
        return False
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})
    try:
        ## If the task has already finished, return
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True
        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']
        ))
        # Fetch the page and parse its data
        response = {}
        urlItems = []
        #proxy = {'url': 'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)
        # Request error: save the result and return
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # Non-200 status code (e.g. 304): save the result and return
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # Normal response
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders.get('Content-Type')
        contentType = parseContentType(contentTypeRaw, default='text/html')
        fileType = mime2file(contentType)
        # Save the response (and downloaded file) info
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])
        # Not an HTML page: nothing to parse
        if fileType != 'html':
            return True
        # External link: no further analysis
        if urlInfo['url_type'] != 'self':
            return True
        # Single-page mirror: do not parse the page
        if execute['task_type'] == 'mirror_one':
            return True
        # Parse the page with regular expressions
        urlItems = parse_reg(requestInfo)
        # Detect dark (hidden) links
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks
        # Browser-based parsing (currently disabled)
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results:
        #        urlItems = urlItems + results
        # Deduplicate URLs
        urlItems = _urls_uniq(urlItems)
        # Queue up the newly discovered URLs
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url):
                continue
            fileExtension = extension(url)
            urlType = _getDomainType(url, execute['domain'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists:
                continue
            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''
            # Non-local links (or when dark-link detection is off) get status 5, i.e. they do not need crawling
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark:
                        flagOutlink = 1
                        item['status'] = 0
            if urlType == 'other':
                outlinks.append(_formatOutlink(execute, urlInfo['url'], url, row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)
        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks:
            mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Status is non-zero: do not crawl
                if item['status'] != 0:
                    continue
                # Depth exceeds the limit: do not crawl
                if item['depth'] > execute['limit_depth']:
                    continue
                # Total count exceeds the limit: do not crawl
                if stats['total'] > execute['limit_total']:
                    continue
                # Mirror task: do not crawl images/static resources here
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts:
                    continue
                # Single-page monitoring/mirroring: only crawl static resources, not sub-pages
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts:
                    continue
                # Image fetching disabled: do not crawl static resources
                if not execute['limit_image'] and item['file_extension'] in staticExts:
                    continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                # Put the record into the pending crawl queue
                undos.append(item)
                # Put the record into the mirror queue
                if execute['task_type'] == 'mirror':
                    mirrors.append(item)
        if queueOut:
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results:
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos:
            Mq.produce(undos, 'spider')
        if mirrors:
            Mq.produce(mirrors, 'mirror')
        # Assumed explicit success return, mirroring the early "return True" exits above
        return True
    except Exception as e:
        logger.exception(e)
        return False