Example #1
import json

def _parseForUrl(row=None):
    '''Parse a URL record into a normalized dict of url, method and post body.'''
    row = row or {}
    url = row['url'].strip()
    if not isUrl(url):  # isUrl is a project helper, not shown here
        return {}
    method = row['method'].upper()
    # Serialize the POST payload (if any), keeping non-ASCII characters readable
    post = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''

    return {
        'url': url,
        'method': method,
        'post': post,
    }
Example #2
import re

def parseUrlByMatchQuotes(url, content):
    '''Extract quoted absolute http(s) URLs from a page body.'''
    urls = []
    match = re.findall(r"('|\")(http|https)://(.+?)\1", content, re.I)
    for row in match:
        urls.append("%s://%s" % (row[1], row[2]))
    # Resolve relative URLs against the page URL and de-duplicate
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
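
The snippet below is a minimal usage sketch for parseUrlByMatchQuotes, not part of the original project: isUrl and formatRelativeUrl are hypothetical stand-ins for the crawler's own helpers (which are not shown on this page), and the sample content string is invented for illustration.

from urllib.parse import urljoin, urlparse

# Hypothetical stand-ins for the project helpers used by the example above
def isUrl(value):
    parts = urlparse(value)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)

def formatRelativeUrl(base, relative):
    return urljoin(base, relative)

content = '<script>var api = "https://example.com/api"; window.open("http://foo.test/x");</script>'
print(sorted(parseUrlByMatchQuotes('https://example.com/index.html', content)))
# ['http://foo.test/x', 'https://example.com/api']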
Example #3
import re

def parseUrlFromJs(url, content):
    '''Extract URLs assigned or passed in common JavaScript navigation idioms.'''
    urls = []
    # foo.url = '...'
    if content.find('url') >= 0:
        match = re.findall(r"[^_]url(\s*)=(\s*)(\"|')(.+?)\3", content, re.I)
        for row in match:
            urls.append(row[3])

    # something.href = '...'
    if content.find('.href') >= 0:
        match = re.findall(r"\.href(\s*)=(\s*)(\"|')(.+?)\3", content, re.I)
        for row in match:
            urls.append(row[3])

    # window.open('...')
    if content.find('window.open') >= 0:
        match = re.findall(r"window\.open(\s*)\((\s*)('|\")(.+?)\3(,?)",
                           content, re.I)
        for row in match:
            urls.append(row[3])

    # window.navigate('...')
    if content.find('window.navigate') >= 0:
        match = re.findall(r"window\.navigate(\s*)\((\s*)('|\")(.+?)\3",
                           content, re.I)
        for row in match:
            urls.append(row[3])

    # something.location = '...'
    if content.find('.location') >= 0:
        match = re.findall(r"\.location(\s*)=(\s*)('|\")(.+?)\3", content,
                           re.I)
        for row in match:
            urls.append(row[3])

    # location.replace('...') / location.assign('...')
    if content.find('location.replace') >= 0 or content.find(
            'location.assign') >= 0:
        match = re.findall(
            r"location\.(replace|assign)(\s*)\((\s*)('|\")(.+?)\4", content,
            re.I)
        for row in match:
            urls.append(row[4])

    # Resolve relative URLs against the page URL and de-duplicate
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
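
A similar sketch for parseUrlFromJs, showing how the .href and window.open branches pick up a relative and an absolute URL from inline JavaScript; isUrl and formatRelativeUrl are again hypothetical stand-ins rather than the project's real helpers.

from urllib.parse import urljoin, urlparse

isUrl = lambda u: urlparse(u).scheme in ('http', 'https')   # hypothetical stand-in
formatRelativeUrl = urljoin                                  # hypothetical stand-in

js = "window.location.href = '/login'; window.open('https://cdn.example.com/app.js');"
print(sorted(parseUrlFromJs('https://example.com/index.html', js)))
# ['https://cdn.example.com/app.js', 'https://example.com/login']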
Example #4
import re

def parseSrc(url, content):
    '''Extract src attribute values (quoted or unquoted) from a page body.'''
    urls = []
    if content != '' and (content.find('src') >= 0
                          or content.find('SRC') >= 0):
        # Quoted form: src="..." or src='...'
        match = re.findall(r"src(\s*)=(\s*)('|\")(.*?)\3", content, re.I)
        for row in match:
            if row[3] != '':
                urls.append(row[3])
        # Unquoted form: src=path terminated by '/>', '>' or a space
        match = re.findall(r"src(\s*)=(\s*)([\d\w#].*?)(/>|>| )", content,
                           re.I)
        if len(match) > 0:
            for row in match:
                urls.append(row[2])
    # Resolve relative URLs against the page URL and de-duplicate
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
Example #5
import re

def parseHref(url, content):
    '''Extract href attribute values (quoted or unquoted) from a page body.'''
    urls = []
    if content != '' and (content.find('href') >= 0
                          or content.find('HREF') >= 0):
        # Quoted form: href="..." or href='...'
        match = re.findall(r"(\s+)href(\s*)=(\s*)('|\")(.*?)\4(.*?)>(.*?)<",
                           content, re.I | re.DOTALL)
        if len(match) > 0:
            for row in match:
                if row[4] != '':
                    urls.append(row[4])
        # Unquoted form: href=path terminated by '/>', '>' or a space
        match = re.findall(r"(\s+)href(\s*)=(\s*)([\d\w#].*?)(/>|>| )",
                           content, re.I | re.DOTALL)
        if len(match) > 0:
            for row in match:
                urls.append(row[3])
    # Resolve relative URLs against the page URL and de-duplicate
    rows = []
    for row in urls:
        if not row:
            continue
        if not isUrl(row):
            row = formatRelativeUrl(url, row)
        rows.append(row)
    return list(set(rows))
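
And a sketch for parseHref: the first regex captures the quoted href="/about", the second picks up the unquoted href=contact.html, and both values are then resolved against the page URL. As before, isUrl and formatRelativeUrl are hypothetical stand-ins for the project helpers, and the sample markup is invented.

from urllib.parse import urljoin, urlparse

isUrl = lambda u: urlparse(u).scheme in ('http', 'https')   # hypothetical stand-in
formatRelativeUrl = urljoin                                  # hypothetical stand-in

html = '<a href="/about">About</a> <a href=contact.html>Contact</a>'
print(sorted(parseHref('https://example.com/docs/index.html', html)))
# ['https://example.com/about', 'https://example.com/docs/contact.html']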
Example #6
def crawl(urlInfo):
    '''Crawl one URL record: fetch it, store the response, extract new URLs and queue them.'''
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute: return False
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})

    try:
        ## If the task has already finished, return (currently disabled)
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True

        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']
        ))

        # Fetch the page and parse its data
        response = {}
        urlItems = []
        #proxy = {'url':'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)

        # Request failed: save the result and return
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # 304 or any other non-200 status code: save the result and return
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # Successful request: work out the content type
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders.get('Content-Type')
        contentType = parseContentType(contentTypeRaw, default='text/html')
        fileType = mime2file(contentType)

        # Persist the response and the downloaded file info
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])

        # Not an HTML page: nothing more to parse
        if fileType != 'html': return True

        # External link: do not analyze it any further
        if urlInfo['url_type'] != 'self': return True

        # Single-page mirror task: do not parse the page
        if execute['task_type'] == 'mirror_one': return True

        # Parse the page with regular expressions
        urlItems = parse_reg(requestInfo)
        # Dark-link (hidden link) detection
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks

        # Browser-based parsing (currently disabled)
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results: urlItems = urlItems + results

        # De-duplicate the URLs
        urlItems = _urls_uniq(urlItems)
        # Collect the newly discovered URLs
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url): continue

            fileExtension = extension(url)

            urlType = _getDomainType(url, execute['domain'])
            # isExists = _checkUrlExists(execute['id'], url, row['method'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists: continue

            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''

            # Default status 5 = no need to crawl; own-site links, static assets,
            # and (when dark-link analysis is on) off-site links get status 0 = crawl
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark:
                        flagOutlink = 1
                        item['status'] = 0
            if urlType == 'other':
                outlinks.append(_formatOutlink(execute, urlInfo['url'], url, row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)

        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks: mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Status is non-zero: do not crawl
                if item['status'] != 0: continue
                # Depth limit exceeded: do not crawl
                if item['depth'] > execute['limit_depth']: continue
                # Total URL limit exceeded: do not crawl
                if stats['total'] > execute['limit_total']: continue
                # Mirror task: do not crawl static assets
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts: continue
                # Single-page monitoring/mirroring: do not crawl sub-pages
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts: continue
                # Image crawling disabled: do not crawl static assets
                if not execute['limit_image'] and item['file_extension'] in staticExts: continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']

                # Put the item into the crawl queue
                undos.append(item)

                # Put the item into the mirror queue as well
                if execute['task_type'] == 'mirror': mirrors.append(item)
        if queueOut:
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results:
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos: Mq.produce(undos, 'spider')
        if mirrors: Mq.produce(mirrors, 'mirror')
        return True

    except Exception as e:
        logger.exception(e)
        return False