Example #1
0
def check():
    """Schedule GitHub search tasks for every enabled query.

    Stores the current process id on the ``task`` settings document, then,
    for each page index below the configured ``page`` value and for each
    enabled query (ordered by ascending ``last``), picks a random GitHub
    account and schedules ``search`` on the huey queue.

    Returns early, after logging, when there is no enabled query, no usable
    GitHub account, or no ``page`` value on the ``task`` settings document.
    """
    setting_col.update_one({'key': 'task'}, {'$set': {'key': 'task', 'pid': os.getpid()}}, upsert=True)
    query_count = query_col.count({'enabled': True})
    logger.info('需要处理的关键词总数: {}'.format(query_count))
    if not query_count:
        logger.warning('请添加关键词')
        return
    if not github_col.count({'rate_remaining': {'$gt': 5}}):
        logger.error('请配置github账号')
        return
    if not setting_col.count({'key': 'task', 'page': {'$exists': True}}):
        logger.error('请在页面上配置任务参数')
        return

    setting_col.update_one({'key': 'task'}, {'$set': {'pid': os.getpid()}})
    page = int(setting_col.find_one({'key': 'task'}).get('page'))
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36')
    for p in range(0, page):
        for query in query_col.find({'enabled': True}).sort('last', ASCENDING):
            # Accounts are re-read for every query so that quota changes
            # written by already-running tasks are taken into account.
            accounts = list(
                github_col.find({"rate_limit": {"$gt": 5}}).sort('rate_remaining', DESCENDING))
            if not accounts:
                # The pre-check above filters on ``rate_remaining`` while this
                # query filters on ``rate_limit``; without this guard an empty
                # result would make random.choice raise IndexError.
                logger.error('请配置github账号')
                return
            github_account = random.choice(accounts)
            github_username = github_account.get('username')
            github_token = github_account.get('token')
            rate_remaining = github_account.get('rate_remaining')
            logger.info(github_username)
            logger.info(rate_remaining)
            g = Github(github_username, github_token, user_agent=user_agent)
            # The delay grows with the current queue length so that newly
            # scheduled tasks are spread out behind those already queued.
            search.schedule(args=(query, p, g, github_username),
                            delay=huey.pending_count() + huey.scheduled_count())
Example #2
0
    def post(self):
        """Validate, optionally test, and store a webhook notification setting.

        Request arguments:
            webhook: required HTTPS URL whose host must be
                ``oapi.dingtalk.com`` or ``qyapi.weixin.qq.com``.
            domain: optional system URL embedded in the test message link.
            enabled: whether notifications are enabled (default ``False``).
            test: when true, only send a test message and do not store
                anything (default ``False``).

        Returns:
            A JSON response of the form ``{'status', 'msg', 'result'}``;
            ``status`` is 201 on success and 400 on any failure.
        """
        parser = reqparse.RequestParser()
        parser.add_argument('webhook', type=str, required=True, help='WebHook URL')
        parser.add_argument('domain', type=str, help='System URL Host')
        parser.add_argument('enabled', type=inputs.boolean, default=False, help='Enabled Notice')
        parser.add_argument('test', type=inputs.boolean, default=False, help='Test Notice')
        args = parser.parse_args()
        webhook = args.get('webhook')
        target = urlparse(webhook)
        # Only the two known chat-bot hosts over HTTPS are accepted, so the
        # server can never be made to post to an arbitrary URL.
        if target.netloc not in ['oapi.dingtalk.com', 'qyapi.weixin.qq.com'] or target.scheme != 'https':
            data = {'status': 400, 'msg': '错误的 webhook 地址', 'result': []}
            return jsonify(data)
        if args.get('test'):
            test_text = '### 规则名称: [WebHook告警测试]({})'.format(args.get('domain'))
            if target.netloc == 'oapi.dingtalk.com':
                test_content = {
                    "msgtype": "markdown",
                    "markdown": {"title": "GitHub泄露",
                                 "text": test_text
                                 },
                    "at": {
                        "atMobiles": [

                        ],
                        "isAtAll": False
                    }
                }
            else:
                test_content = {
                    "msgtype": "markdown",
                    "markdown": {
                        "content": test_text
                    }
                }

            network_failure = {'status': 400, 'msg': '发送失败,请检查服务器网络', 'result': []}
            try:
                # A timeout keeps an unreachable webhook host from blocking
                # this request handler indefinitely.
                response = requests.post(webhook, json=test_content, timeout=10)
            except requests.RequestException:
                return jsonify(network_failure)
            if not response.ok:
                return jsonify(network_failure)
            try:
                errmsg = response.json().get('errmsg')
            except (ValueError, AttributeError):
                # Body is not a JSON object; report the raw text instead.
                errmsg = response.text
            if errmsg == 'ok':
                data = {'status': 201, 'msg': '已发送,请前往钉钉/企业微信群查看', 'result': []}
            else:
                data = {'status': 400, 'msg': '发送失败,WebHook 响应: {}'.format(errmsg),
                        'result': []}
            return jsonify(data)
        # ``test`` is a one-off flag and must not be persisted with the setting.
        del args['test']
        setting_col.update_one({'webhook': webhook}, {'$set': args}, upsert=True)
        result = setting_col.count({'webhook': webhook})
        if result > 0:
            data = {'status': 201, 'msg': '设置成功', 'result': result}
        else:
            data = {'status': 400, 'msg': '设置失败', 'result': result}
        return jsonify(data)
Example #3
0
def search(query, page, g, github_username):
    """Fetch one page of GitHub code-search hits for ``query`` and store them.

    Args:
        query: query document; its ``tag`` and ``keyword`` values are read.
        page: zero-based page index passed to ``get_page``.
        g: GitHub client used for the search.
        github_username: account name whose ``rate_remaining`` is refreshed
            in ``github_col``.

    Every hit whose ``sha`` is not yet in ``result_col`` is turned into a
    leakage document and inserted, unless its link contains a
    ``blacklist_col`` entry or its project already has an ignored result.
    Hits for a project/filepath pair with no open (``security == 0``) result
    are collected for mail and webhook notification.

    Returns None in all cases. When the page loop fails with an error that
    carries a ``data`` attribute not mentioning "Not Found", the same page is
    rescheduled with a client obtained from ``new_github``.
    """
    mail_notice_list = []
    webhook_notice_list = []
    logger.info('开始抓取: tag is {} keyword is {}, page is {}'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    try:
        repos = g.search_code(query=query.get('keyword'),
                              sort="indexed",
                              order="desc")
        github_col.update_one({'username': github_username}, {
            '$set': {
                'rate_remaining': int(g.get_rate_limit().search.remaining)
            }
        })

    except Exception as error:
        logger.critical(error)
        logger.critical("触发限制啦")
        return
    try:
        for repo in repos.get_page(page):
            # Record the worker pid and a timestamp on the task settings
            # document for every processed hit.
            setting_col.update_one({'key': 'task'}, {
                '$set': {
                    'key': 'task',
                    'pid': os.getpid(),
                    'last': timestamp()
                }
            },
                                   upsert=True)
            if result_col.count({'_id': repo.sha}):
                continue
            try:
                code = str(repo.content).replace('\n', '')
            except Exception:
                # Best effort: a hit whose content cannot be read is still
                # recorded, just without its code.
                code = ''
            leakage = {
                'link': repo.html_url,
                'project': repo.repository.full_name,
                'project_url': repo.repository.html_url,
                '_id': repo.sha,
                'language': repo.repository.language,
                'username': repo.repository.owner.login,
                'avatar_url': repo.repository.owner.avatar_url,
                'filepath': repo.path,
                'filename': repo.name,
                'security': 0,
                'ignore': 0,
                'tag': query.get('tag'),
                'code': code,
            }
            try:
                leakage['affect'] = get_affect_assets(repo.decoded_content)
            except Exception as error:
                logger.critical('{} {}'.format(error, leakage.get('link')))
                leakage['affect'] = []
            if int(repo.raw_headers.get('x-ratelimit-remaining')) == 0:
                logger.critical('剩余使用次数: {}'.format(
                    repo.raw_headers.get('x-ratelimit-remaining')))
                return
            last_modified = datetime.datetime.strptime(
                repo.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
            leakage['datetime'] = last_modified
            leakage['timestamp'] = last_modified.timestamp()
            in_blacklist = False
            for blacklist in blacklist_col.find({}):
                if blacklist.get('text').lower() in leakage.get(
                        'link').lower():
                    logger.warning('{} 包含白名单中的 {}'.format(
                        leakage.get('link'), blacklist.get('text')))
                    in_blacklist = True
            if in_blacklist:
                continue
            if result_col.count({
                    "project": leakage.get('project'),
                    "ignore": 1
            }):
                continue
            if not result_col.count({
                    "project": leakage.get('project'),
                    "filepath": leakage.get("filepath"),
                    "security": 0
            }):
                mail_notice_list.append(
                    '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                        leakage.get('datetime'), leakage.get('link'),
                        leakage.get('project'), leakage.get('filename')))
                # ``project`` is "owner/repo"; keep only the repository part.
                # Splitting on '.' would truncate repository names that
                # contain a dot.
                webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                    leakage.get('project').split('/')[-1],
                    leakage.get('filename'), leakage.get('link'),
                    leakage.get('datetime')))
            try:
                result_col.insert_one(leakage)
                logger.info(leakage.get('project'))
            except errors.DuplicateKeyError:
                logger.info('已存在')

            logger.info('抓取关键字:{} {}'.format(query.get('tag'),
                                             leakage.get('link')))
    except Exception as error:
        # Only some exceptions carry a ``data`` payload; reading it
        # unconditionally would raise AttributeError inside this handler and
        # hide the real error. ``data`` may be a mapping, so it is compared
        # as text rather than by key membership.
        error_data = getattr(error, 'data', None)
        if error_data is not None and 'Not Found' not in str(error_data):
            g, github_username = new_github()
            search.schedule(args=(query, page, g, github_username),
                            delay=huey.pending_count() +
                            huey.scheduled_count())
        logger.critical(error)
        logger.error('抓取: tag is {} keyword is {}, page is {} 失败'.format(
            query.get('tag'), query.get('keyword'), page + 1))

        return
    logger.info('抓取: tag is {} keyword is {}, page is {} 成功'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    query_col.update_one({'tag': query.get('tag')}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page),
            'api_total': repos.totalCount,
            'found_total': result_col.count({'tag': query.get('tag')})
        }
    })
    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and mail_notice_list:
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            query.get('tag'), '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(query.get('tag'), webhook_notice_list)
Example #4
0
from pymongo import errors, DESCENDING, ASCENDING
from config.database import result_col, query_col, blacklist_col, notice_col, github_col, setting_col, REDIS_HOST, \
    REDIS_PORT
from utils.date import timestamp
from utils.log import logger
from utils.notice import mail_notice

# Redis-backed huey task queue named 'hawkeye', connected to the configured
# Redis host and port (the port is converted to int before use).
huey = RedisHuey('hawkeye', host=REDIS_HOST, port=int(REDIS_PORT))
# Directory that contains this module, with symlinks resolved.
base_path = os.path.split(os.path.realpath(__file__))[0]
# tldextract instance whose cache file is '.tld_set' inside base_path.
extract = tldextract.TLDExtract(cache_file='{}/.tld_set'.format(base_path))

if setting_col.count({
        'key': 'task',
        'minute': {
            '$exists': True
        },
        'page': {
            '$exists': True
        }
}):
    minute = int(setting_col.find_one({'key': 'task'}).get('minute'))
    setting_col.update_one(
        {'key': 'task'},
        {'$set': {
            'key': 'task',
            'pid': os.getpid(),
            'last': timestamp()
        }},
        upsert=True)

else:
Example #5
0
 def get(self):
     """Return result counters and crawler engine status as JSON.

     Request arguments:
         tag: optional; when given, every counter is restricted to results
             carrying that tag.

     Returns:
         A JSON response whose ``result`` holds ``all`` and ``today``
         counters (``total`` / ``ignore`` / ``risk``) and an ``engine`` dict
         with ``status`` (whether the stored task pid is a live process)
         and ``last`` (the stored ``last`` value, or 0 when there is no
         task document).
     """
     parser = reqparse.RequestParser()
     parser.add_argument('tag', type=str, help='')
     tag = parser.parse_args().get('tag')
     scope = {'tag': tag} if tag else {}

     def summarize(base):
         # One counter set for the given base filter: every match, matches
         # with security == 1, and matches with security == 0 that also
         # have a ``desc`` field.
         return {
             'total': result_col.count(base),
             'ignore': result_col.count(dict(base, security=1)),
             'risk': result_col.count(
                 dict(base, security=0, desc={"$exists": True})),
         }

     total = summarize(scope)
     today = summarize(dict(scope, timestamp={'$gte': today_start()}))

     # Read the task document once: separate count/find_one calls could see
     # it disappear in between, and a document without ``pid`` must not
     # crash on int(None).
     task = setting_col.find_one({'key': 'task'})
     if task:
         pid = task.get('pid')
         status = pid is not None and psutil.pid_exists(int(pid))
         last = task.get('last')
     else:
         status = False
         last = 0
     engine = {
         'status': status,
         'last': last,
     }
     result = {'all': total, 'today': today, 'engine': engine}
     data = {'status': 200, 'msg': '获取信息成功', 'result': result}
     return jsonify(data)
Example #6
0
def crawl(query, page):
    """Scrape one page of Gitee code-search results for ``query``.

    Args:
        query: query document; its ``tag`` and ``keyword`` values are read.
        page: zero-based page index; the request uses ``page + 1`` as
            ``pageno``.

    Each result node is turned into a leakage document and inserted into
    ``result_col`` unless it is already stored, its link contains a
    ``blacklist_col`` entry, or its project already has an ignored result.
    Hits for a project/filepath pair with no open (``security == 0``) result
    are collected for mail and webhook notification. On success the query
    document is updated with the crawl outcome.

    Raises:
        Exception: any error raised while fetching or parsing the page is
            logged and then re-raised unchanged.
    """
    mail_notice_list = []
    webhook_notice_list = []
    search_url = 'https://search.gitee.com/?skin=rec&type=code&q={1}&sort=last_indexed' \
                 '&pageno={0}'
    session = gitee_login()

    logger.info('Gitee开始抓取: tag is {} keyword is {}, page is {}'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    # Keep the caller's ``page`` untouched so the start and success log lines
    # report the same page number.
    page_no = page + 1
    page_url = search_url.format(page_no, query.get('keyword'))
    total_count = 0
    try:
        logger.info("Gitee ------ 启动抓取: {}".format(page_url))
        resp = session.get(page_url)
        logger.info("Gitee 启动抓取: {}".format(page_url))
        tree = etree.HTML(resp.text)
        nodes = tree.xpath('//*[@id="hits-list"]/div[@class="item"]')
        for node in nodes:
            logger.info("Gitee 开始抓取节点")
            total_count += 1
            leakage = {}
            leakage['affect'] = []
            datetime_ = node.xpath(Gitee.DATETIME)[0].text
            # Skip any non-digit prefix and keep the text from the first
            # digit onwards.
            datetime_match = re.match(r"[^\d]*(?P<Date>\d+.*)", datetime_)
            if not datetime_match:
                leakage['datetime'] = _format_time(
                    datetime.datetime.now().date())
            else:
                leakage['datetime'] = _format_time(
                    datetime_match.group("Date"))
            leakage['timestamp'] = leakage.get('datetime').timestamp()
            leakage['link'] = cut_tail(
                node.xpath(Gitee.LINK)[0].attrib['href'])
            leakage['filepath'] = node.xpath(Gitee.LINK)[0].text
            leakage['filename'] = leakage.get("filepath").split("/")[-1]
            # The document id is derived from the link.
            leakage['_id'] = _md5(leakage['link'])
            logger.info("Gitee ****** 开始抓取节点 {}".format(
                leakage['datetime']))
            project_username = node.xpath(Gitee.USERNAME)[0].text
            leakage["vendor"] = "GITEE"
            leakage['username'] = project_username.split("/")[0]
            leakage['project'] = project_username
            leakage['project_url'] = cut_tail(
                node.xpath(Gitee.USERNAME)[0].attrib['href'])
            logger.info("Gitee 抓取到 {}".format(leakage.get("project_url")))

            if result_col.find_one({"link": leakage['link'], "datetime": leakage['datetime']}) or \
                    result_col.find_one({'_id': leakage['_id']}):
                continue

            in_blacklist = False
            for blacklist in blacklist_col.find({}):
                if blacklist.get('text').lower() in leakage.get(
                        'link').lower():
                    logger.warning('{} 包含白名单中的 {}'.format(
                        leakage.get('link'), blacklist.get('text')))
                    in_blacklist = True
            if in_blacklist:
                continue

            if result_col.count({
                    "project": leakage.get('project'),
                    "ignore": 1
            }):
                continue

            # On Gitee a hit may be a project without any code.
            leakage['avatar_url'] = 'https://gitee.com/logo-black.svg'
            raw_code = gitee_raw_code(leakage['link'])
            leakage['code'] = base64.b64encode(
                raw_code.encode("utf-8")).decode("utf-8")
            try:
                leakage['affect'] = get_affect_assets(raw_code)
            except Exception as error:
                logger.critical('{} {}'.format(error, leakage.get('link')))
                leakage['affect'] = []
            leakage['tag'] = query['tag']
            language_node = node.xpath(Gitee.LANGUAGE)
            if language_node:
                leakage['language'] = language_node[0].text.strip()
            else:
                leakage['language'] = 'Unknow'
            leakage['security'] = 0
            leakage['ignore'] = 0

            if not result_col.count({
                    "project": leakage.get('project'),
                    "filepath": leakage.get("filepath"),
                    "security": 0
            }):
                mail_notice_list.append(
                    '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                        leakage.get('datetime'), leakage.get('link'),
                        leakage.get('project'), leakage.get('filename')))
                webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                    leakage.get('project').split('/')[-1],
                    leakage.get('filename'), leakage.get('link'),
                    leakage.get('datetime')))

            result_col.insert_one(leakage)
            logger.info("Gitee 抓取到的结果: {}".format(
                leakage.get("project_url")))
    except Exception as e:
        # Log before propagating so the failure is recorded; callers still
        # receive the original exception.
        logger.error("Gitee error is {}".format(e))
        raise
    logger.info('Gitee抓取: tag is {} keyword is {}, page is {} 成功'.format(
        query.get('tag'), query.get('keyword'), page_no))
    query_col.update_one({'tag': query.get('tag')}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page_no),
            'api_total': total_count,
            'found_total': result_col.count({'tag': query.get('tag')})
        }
    })
    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and mail_notice_list:
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            query.get('tag'), '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(query.get('tag'), webhook_notice_list)
Example #7
0
def run():
    """Run one GitHub search per enabled query, for each configured round.

    For each round below the ``page`` value on the ``task`` settings
    document and for each enabled query (ordered by ascending ``last``),
    picks a random GitHub account, works out which result page comes after
    the query's stored ``page_pre`` and calls ``search`` for that page
    directly.

    Returns early, after logging, when there is no enabled query, no usable
    GitHub account, or no ``page`` value on the ``task`` settings document.
    """
    query_count = query_col.count({'enabled': True})
    logger.info('需要处理的关键词总数: {}'.format(query_count))
    if not query_count:
        logger.warning('请添加关键词')
        return
    if not github_col.count({'rate_remaining': {'$gt': 5}}):
        logger.error('请配置github账号')
        return
    if not setting_col.count({'key': 'task', 'page': {'$exists': True}}):
        logger.error('请在页面上配置任务参数')
        return

    setting_col.update_one({'key': 'task'}, {'$set': {'pid': os.getpid()}})
    page = int(setting_col.find_one({'key': 'task'}).get('page'))
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36')

    for _ in range(0, page):
        for query in query_col.find({'enabled': True}).sort('last', 1):
            accounts = list(
                github_col.find({
                    "rate_limit": {
                        "$gt": 5
                    }
                }).sort('rate_remaining', -1))
            if not accounts:
                # The pre-check above filters on ``rate_remaining`` while this
                # query filters on ``rate_limit``; without this guard an empty
                # result would make random.choice raise IndexError.
                logger.error('请配置github账号')
                return
            github_account = random.choice(accounts)
            github_username = github_account.get('username')
            github_password = github_account.get('password')
            rate_remaining = github_account.get('rate_remaining')
            logger.info(github_username)
            logger.info(rate_remaining)
            g = Github(
                github_username,
                github_password,
                per_page=PER_PAGE,
                user_agent=user_agent)

            # Reuse the total stored on the query when there is one;
            # otherwise ask the API.
            api_total = query.get('api_total')
            if api_total:
                total = api_total
            else:
                repos = g.search_code(query=query.get('keyword'),
                                      sort="indexed",
                                      order="desc")
                total = repos.totalCount
            # GitHub search only serves the first 1000 results.
            total = min(total, 1000)
            page_pre = int(query.get('page_pre')) if query.get(
                'page_pre') is not None else -1
            # The page count must use the same page size the client was
            # created with, not a hard-coded 30.
            page_all = math.ceil(total / PER_PAGE)
            if page_all == 0:
                continue
            # Wrap around to the first page after the last one.
            if page_pre + 1 >= page_all:
                page_pre = -1
            page_now = page_pre + 1

            search(query, page_now, g, github_username)