Example #1
def _formatResponse(requestInfo, execute, urlInfo, fileInfo=('','')):
    '''Format the UrlItem response'''
    try:
        response = {}
        response['nettime'] = requestInfo['nettime']
        if requestInfo['error'] or requestInfo['http_code'] != 200:
            response['status'] = 3
            response['http_code'] = requestInfo['http_code']
            response['error'] = repr(requestInfo['error'])
            response['end_at'] = now_format()
            response['depth'] = urlInfo['depth']
        else:
            response['status'] = 2
            response['end_at'] = now_format()
            response['depth'] = urlInfo['depth']
            response['md5_url'] = md5(urlInfo['url'])
            response['md5_body'] = md5(requestInfo['body'])
            response['redirects'] = json.dumps(requestInfo['redirects'], ensure_ascii=False)
            response['http_code'] = requestInfo['http_code']
            response['request_headers'] = json.dumps(requestInfo['request_headers'], ensure_ascii=False)
            response['response_headers'] = json.dumps(requestInfo['response_headers'], ensure_ascii=False)
            response['file_name'] = fileInfo[0]
            response['file_path'] = fileInfo[1]
        return response
    except Exception as e:
        logger.exception(e)
        return False
Example #2
def crawljs(taskInfo):
    try:
        # Already crawled; do not crawl again
        if taskInfo['status'] not in (0, 1):
            return True
        # Fetch the page source
        requestInfo = spiderRequest(taskInfo['url'])

        parseResults = []
        results = _parseForJs(taskInfo['url'])
        for record in results:
            urlRow = _parseForUrl(record)
            if not urlRow: continue
            parseResults.append(urlRow)
        updateRow = {}
        updateRow['id'] = taskInfo['id']
        updateRow['http_code'] = requestInfo['http_code']
        updateRow['response_headers'] = json.dumps(
            requestInfo['response_headers'], ensure_ascii=False)
        updateRow['body'] = requestInfo['body']
        updateRow['md5_body'] = md5(requestInfo['body'])
        updateRow['parse_result'] = json.dumps(parseResults,
                                               ensure_ascii=False)
        updateRow['status'] = 2
        updateRow['end_at'] = getTime('%Y-%m-%d %H:%M:%S')
        # Save the result
        mg_spiderjsurl_save(updateRow)
        return True
    except Exception as e:
        logger.exception(e)
        return False
Example #3
def parse_browser(requestInfo):
    '''Parse links via a headless browser'''
    try:
        urls = []
        # Effective URL after redirects (same convention as parse_reg)
        currentUrl = requestInfo['redirects'][-1]['url'] if requestInfo['redirects'] else requestInfo['url']
        results = parseHref2(currentUrl, ['a', 'link'])
        if results.get('links'):
            urls.extend(_formatUrls(results['links']))
        if results.get('ajaxs'):
            urls.extend(_formatUrls(results['ajaxs']))
        if results.get('results'):
            urls.extend(_formatUrls(results['results']))

        results = parseForms2(currentUrl)
        if results.get('links'):
            urls.extend(_formatUrls(results['links']))
        if results.get('ajaxs'):
            urls.extend(_formatUrls(results['ajaxs']))
        if results.get('results'):
            urls.extend(_formatUrls(results['results']))

        #results = parseMouseEvent(currentUrl)
        #if 'links' in results.keys() and results['links']:
        #    urls.extend(_formatUrls(results['links']))
        #if 'ajaxs' in results.keys() and results['ajaxs']:
        #    urls.extend(_formatUrls(results['ajaxs']))
        #if 'results' in results.keys() and results['results']:
        #    urls.extend(_formatUrls(results['results']))

        return urls
    except Exception as e:
        logger.exception(e)
        return False
Example #4
def download(requestInfo, urlInfo, execute, fileType = 'html'):
    '''Download a file'''
    try:
        # If this body was already saved, return the existing record directly
        md5Body = md5(requestInfo['body'])
        result = mgdb.static_get(execute['domain'], md5Body)
        if result: return (result['file_name'], result['file_key'])

        localfile = '%s/%s/%s.tmp' %  (PATH_TMP_UPLOAD, execute['domain'], md5Body)
        if not exists(dirname(localfile)):  mkdirs(dirname(localfile))
        if fileType == 'html':
            filename = "%s_%s.html" % (execute['id'], urlInfo['id'])
            filekey = 'html/%s/%s/%s_%s.html.%s' % (execute['domain'], execute['task_id'], execute['id'], urlInfo['id'], md5Body)
            fwriteBin(localfile, requestInfo['body'])
            fileType = 'html'
        else:
            filename = basename(requestInfo['url'])
            filekey = "static/%s/%s.%s" % (execute['domain'], requestInfo['url'][7:], md5Body)
            fwriteBin(localfile, requestInfo['body'])
            fileType = 'img'
        filepath = ydfs_upload(filekey, localfile)
        mgdb.c_insert('static', _formatStatic(execute['domain'], requestInfo['url'], filename, filekey, fileType, md5Body))
        return (filename, filekey)
    except Exception as e:
        logger.exception(e)
        return ('', '')
Example #5
def codeJavascript(func):
    try:
        content = read(TEMPLATE_JAVASCRIPT)
        pattern = re.compile(r'//--' + func + '--\n(.*?)\n//--' + func + '--',
                             re.I | re.M | re.S)
        return pattern.findall(content)[0]
    except Exception as e:
        logger.exception(e)
        return False
Example #6
def codeCasper(func, params=None):
    try:
        content = read(TEMPLATE_CASPER)
        pattern = re.compile(r'//--' + func + '--\n(.*?)\n//--' + func + '--',
                             re.I | re.M | re.S)
        content = pattern.findall(content)[0]
        if params:
            for (k, v) in params.items():
                content = content.replace('###' + k + '###', v)
        return content
    except Exception as e:
        logger.exception(e)
        return False
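
A note on the templates: both codeJavascript and codeCasper assume that TEMPLATE_JAVASCRIPT and TEMPLATE_CASPER wrap every reusable snippet between two identical //--name-- marker lines, and that parameters appear as ###key### placeholders which codeCasper substitutes. The template files themselves are not shown in these examples, so the following is only a minimal, self-contained sketch of that assumed convention (the casper.start snippet is illustrative, not the project's real template):

import re

# Hypothetical one-snippet template, laid out the way codeCasper() expects
_template = """\
//--casper_start--
casper.start('###startUrl###');
//--casper_start--
"""

def _extract(func, content, params=None):
    # Same marker regex as codeJavascript()/codeCasper(), plus placeholder substitution
    pattern = re.compile(r'//--' + func + '--\n(.*?)\n//--' + func + '--', re.I | re.M | re.S)
    snippet = pattern.findall(content)[0]
    for k, v in (params or {}).items():
        snippet = snippet.replace('###' + k + '###', v)
    return snippet

print(_extract('casper_start', _template, {'startUrl': 'http://example.com'}))
# prints: casper.start('http://example.com');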
Example #7
def parse_darklink(url = None):
    '''Parse hidden (dark) links'''
    try:
        cmd = 'phantomjs %s/safe_darklink.js %s' % (PATH_NODEJS, url)
        child = Popen(cmd, shell=True, close_fds=True, bufsize=-1, stdout=PIPE, stderr=STDOUT)
        output = child.stdout.read().decode().strip()
        # logger.info('parse_darklink::::%s::::%s' % (url, output))
        if output != 'fail': return json.loads(output)
        # logger.info('parse_darklink fail::::%s' % cmd)
        return False
    except Exception as e:
        logger.exception(e)
        return False
Example #8
def parseForms(url=None):
    try:
        js_parseForms = codeJavascript('parseForms')
        js_formatRelativeUrl = codeJavascript('formatRelativeUrl')
        casper_create = codeCasper('casper_create')
        casper_parseform = codeCasper('casper_parseform')
        casper_run = codeCasper('casper_run')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        content = "%s\n%s\n%s\n%s\n%s\n%s" % (
            js_parseForms, js_formatRelativeUrl, casper_create, casper_start,
            casper_parseform, casper_run)
        output = execCasper(content)
        return json.loads(output)
    except Exception as e:
        logger.exception(e)
        return False
Example #9
def execCasper(content=None):
    try:
        filename = "%s/%s_%s" % (PATH_TMP_NODEJS, getTime('%Y%m%d'),
                                 md5(content))
        write(filename, content)
        cmd = 'casperjs ' + filename
        child = Popen(cmd,
                      shell=True,
                      close_fds=True,
                      bufsize=-1,
                      stdout=PIPE,
                      stderr=STDOUT)
        output = child.stdout.read().decode()
        #remove(filename)
        return output
    except Exception as e:
        logger.exception(e)
        return False
Example #10
def parseHref(url=None, tags=None):
    try:
        jsFunc = 'parseHrefByTags'
        js_parseHrefByTags = codeJavascript('parseHrefByTags')
        casper_create = codeCasper('casper_create')
        casper_then = codeCasper('casper_then', {
            'jsfunc': jsFunc,
            'params': "['a', 'link']"
        })
        casper_run = codeCasper('casper_run')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        content = "%s\n%s\n%s\n%s\n%s" % (js_parseHrefByTags, casper_create,
                                          casper_start, casper_then,
                                          casper_run)
        output = execCasper(content)
        return json.loads(output)
    except Exception as e:
        logger.exception(e)
        return False
Example #11
def parseMouseEvent(url=None):
    '''Parse mouse events.
    Mouse events include: 'onclick', 'ondblclick', 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup'
    '''
    try:
        tags = [
            'a', 'div', 'span', 'table', 'tr', 'td', 'th', 'button', 'input'
        ]
        events = [
            'onclick', 'ondblclick', 'onmousedown', 'onmousemove', 'onmouseout',
            'onmouseover', 'onmouseup'
        ]
        rows = captureEvent(url, tags, events)
        eventCodes = []
        for index, row in enumerate(rows):
            # Example eventRow:
            # {'tag': 'div', 'event': 'onclick', 'index': 1}
            casper_mouse_event = codeCasper(
                'casper_event', {
                    'tag': row['tag'],
                    'index': row['index'],
                    'eventName': row['event']
                })
            eventCodes.append(casper_mouse_event)
        js_execEvent = codeJavascript('execEvent')
        casper_create = codeCasper('casper_create')
        casper_wait = codeCasper('casper_wait')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        casper_output = codeCasper('casper_output')
        casper_run = codeCasper('casper_run')
        content = "%s\n%s\n%s\n%s\n%s\n%s\n%s" % (
            js_execEvent, casper_create, casper_start, casper_wait,
            "\n".join(eventCodes), casper_output, casper_run)
        output = execCasper(content)
        return json.loads(output)
    except Exception as e:
        logger.exception(e)
        return False
Example #12
def parse_reg(requestInfo):
    try:
        # The content is HTML: detect the charset and decode
        try:
            charset = chardet.detect(requestInfo['body'])['encoding']
            charset = formatCharset(charset)
            body = requestInfo['body'].decode(charset)
        except Exception as e:
            # Fall back to GBK (a superset of GB2312) when detection or decoding fails
            body = requestInfo['body'].decode('GBK')
            logger.exception(e)

        # Parse with regular expressions
        urls = []
        currentUrl = requestInfo['redirects'][-1]['url'] if requestInfo['redirects'] else requestInfo['url']
        urls.extend(_formatUrls(parseUrlByMatchQuotes(currentUrl, body)))   # links inside double/single quotes
        urls.extend(_formatUrls(parseHref(currentUrl, body)))               # parse href attributes
        urls.extend(_formatUrls(parseSrc(currentUrl, body)))                # parse src: img, script, frame, iframe
        urls.extend(_formatUrls(parseUrlFromJs(currentUrl, body)))          # parse URLs embedded in JS
        return urls
    except Exception as e:
        logger.exception(e)
        return []
Example #13
def captureEvent(url=None, tags=None, events=['onclick']):
    try:
        jsFunc = 'parseEventByTags'
        js_parseEventByTags = codeJavascript(jsFunc)
        casper_create = codeCasper('casper_create')
        casper_then = codeCasper('casper_then', {
            'jsfunc': jsFunc,
            'params': "%s, %s" % (json.dumps(tags, ensure_ascii=False),
                                  json.dumps(events, ensure_ascii=False))
        })
        casper_run = codeCasper('casper_run')
        casper_start = codeCasper('casper_start', {'startUrl': '%s' % url})
        content = "%s\n%s\n%s\n%s\n%s" % (js_parseEventByTags, casper_create,
                                          casper_start, casper_then,
                                          casper_run)
        output = execCasper(content)
        jsonData = json.loads(output)
        return jsonData['results']
    except Exception as e:
        logger.exception(e)
        return False
Example #14
def spiderRequest(url=None,
                  method="GET",
                  data={},
                  headers={},
                  timeout=10,
                  auth={},
                  proxy={}):
    # Copy so neither the shared default dict nor the caller's dict is mutated
    headers = dict(headers)
    headers['Cache-Control'] = 'no-cache'
    method = method.upper()
    start = time()
    try:
        # Track redirects
        redirect_handler = RedirectHandler()

        # HTTP Basic authentication
        auth_handler = HTTPBasicAuthHandler()
        if auth and 'user' in auth.keys() and 'passwd' in auth.keys():
            passwdHandler = HTTPPasswordMgrWithDefaultRealm()
            passwdHandler.add_password(realm=None,
                                       uri=url,
                                       user=auth['user'],
                                       passwd=auth['passwd'])
            auth_handler = HTTPBasicAuthHandler(passwdHandler)

        # Proxy
        proxy_handler = ProxyHandler()
        if proxy and 'url' in proxy.keys():
            proxy_handler = ProxyHandler({'http': proxy['url']})

        # Proxy authentication
        proxy_auth_handler = ProxyBasicAuthHandler()
        if proxy and all(k in proxy for k in ('url', 'user', 'passwd')):
            proxyPasswdHandler = HTTPPasswordMgrWithDefaultRealm()
            proxyPasswdHandler.add_password(realm=None,
                                            uri=proxy['url'],
                                            user=proxy['user'],
                                            passwd=proxy['passwd'])
            proxy_auth_handler = ProxyBasicAuthHandler(proxyPasswdHandler)

        opener = build_opener(redirect_handler, auth_handler, proxy_handler,
                              proxy_auth_handler)
        request_handler = Request(quote(url, safe=string.printable),
                                  method=method)
        for key, value in headers.items():
            request_handler.add_header(key, value)
        response = opener.open(request_handler, timeout=timeout)
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': request_handler.headers,
            'response_headers': formatHeaders(response.getheaders()),
            'http_code': response.status,
            'redirects': redirect_handler.redirects,
            'body': response.read(),
            'nettime': end - start,
            'error': ''
        }
    except HTTPError as e:  # 400 401 402 403 500 501 502 503 504
        logger.error(url + "::::::::" + repr(e))
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': dict(e.headers),
            'http_code': e.code,
            'redirects': [],
            'body': b'',
            'nettime': end - start,
            'error': repr(e)
        }
    except URLError as e:
        logger.error(url + "::::::::" + repr(e))
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': {},
            'http_code': 0,
            'redirects': [],
            'body': b'',
            'nettime': end - start,
            'error': repr(e)
        }
    except timeoutError as e:
        logger.error(url + "::::::::" + repr(e))
        end = time()
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': {},
            'http_code': 0,
            'redirects': [],
            'body': b'',
            'nettime': end - start,
            'error': repr(e)
        }
    except Exception as e:
        logger.exception(e)
        logger.error(url + "::::::::" + repr(e))
        return {
            'url': url,
            'method': method,
            'request_headers': headers,
            'response_headers': {},
            'http_code': 0,
            'redirects': [],
            'body': b'',
            'nettime': 0,
            'error': repr(e)
        }
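
spiderRequest returns the same dict shape on success and on every failure path, so callers only need to branch on 'error' and 'http_code' (the same check _formatResponse and crawl perform). A minimal usage sketch, assuming logger and the imports above are in scope; the URL is illustrative:

requestInfo = spiderRequest('http://example.com/', method='GET', timeout=10)
if requestInfo['error'] or requestInfo['http_code'] != 200:
    # Failure path: 'body' is b'', 'redirects' is empty, 'error' holds repr(exception)
    logger.error('fetch failed: %s' % requestInfo['error'])
else:
    # Success path: raw bytes in 'body', redirect history in 'redirects', elapsed seconds in 'nettime'
    print(requestInfo['http_code'], len(requestInfo['body']), requestInfo['nettime'])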
Example #15
def crawl(urlInfo):
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute: return False
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})

    try:
        ## If the task has already finished, return
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True

        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']
        ))

        # Fetch the page and parse the data
        response = {}
        urlItems = []
        #proxy = {'url':'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)

        # Request error: return immediately
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # 304 or another non-200 status code: return immediately
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # Normal response
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders.get('Content-Type')
        contentType = parseContentType(contentTypeRaw, default='text/html')
        fileType = mime2file(contentType)
        #logger.debug("Content-Type::::::::" + contentTypeRaw + "::::" + contentType)

        # Save the response info
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])

        # Not an HTML page: return
        if fileType != 'html': return True

        # External link: no further analysis
        if urlInfo['url_type'] != 'self': return True

        # Single-page mirror: do not analyze the page
        if execute['task_type'] == 'mirror_one': return True

        # Parse the page with regular expressions
        urlItems = parse_reg(requestInfo)
        # Detect hidden (dark) links
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            # logger.info('parse_darklink::::%s::::' % (result))
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks

        '''
        Browser-based parsing
        '''
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results: urlItems = urlItems + results

        # logger.info('parse_darklink::::%s::::%s' % ('urls_uniq', json.dumps(urlItems)))
        # Deduplicate URLs
        urlItems = _urls_uniq(urlItems)
        # Append newly discovered URLs
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        # logger.info('parse_darklink::::%s::::' % (urlItems))
        # logger.info('parse_darklink::::%s::::%s' % ('urlItems', json.dumps(urlItems)))
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url): continue

            fileExtension = extension(url)

            urlType = _getDomainType(url, execute['domain'])
            # isExists = _checkUrlExists(execute['id'], url, row['method'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists: continue

            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''

            # Non-self links (unless dark-link analysis applies) are marked status 5, i.e. not crawled
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark: 
                        flagOutlink = 1
                        item['status'] = 0
            if urlType == 'other': 
                outlinks.append(_formatOutlink(execute, urlInfo['url'], url, row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)

        # logger.info('22parse_darklink::::%s::::%s' % ('queueSite', json.dumps(queueSite)))
        # logger.info('22parse_darklink::::%s::::%s' % ('queueOut', json.dumps(queueOut)))
        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks: mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            # logger.info('parse_darklink::::::::%s' % (queueSite))
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Status is not 0: skip
                if item['status'] != 0: continue
                # Depth exceeds the limit: skip
                if item['depth'] > execute['limit_depth']: continue
                # Total count exceeds the limit: skip
                if stats['total'] > execute['limit_total']: continue
                # Mirror task: do not crawl images
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts: continue
                # Single-page monitoring: do not crawl sub-pages
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts: continue
                # Image crawling disabled: skip images
                if not execute['limit_image'] and item['file_extension'] in staticExts: continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']

                # Queue the item for crawling
                undos.append(item)

                # Queue the item for mirroring
                if execute['task_type'] == 'mirror': mirrors.append(item)
        if queueOut:
            # logger.info('parse_darklink::::::::%s' % (queueOut))
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results: 
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos: Mq.produce(undos, 'spider')
        if mirrors: Mq.produce(mirrors, 'mirror')
        return True

    except Exception as e:
        logger.exception(e)
        return False
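
For reference, the status values used across these examples, collected from the code above. The numbers come from the snippets themselves; the labels are descriptive assumptions, since the project's own constant names are not shown:

# Descriptive labels only (assumption); the numeric values are taken from the examples above
SPIDERURL_STATUS = {
    0: 'pending, to be crawled',
    1: 'in progress (assumed)',            # crawljs re-crawls rows with status 0 or 1
    2: 'finished successfully',            # set by _formatResponse and crawljs
    3: 'request failed or non-200',        # set by _formatResponse
    5: 'recorded but not crawled',         # e.g. external links in crawl()
}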