Beispiel #1
0
 def post(self):
     """Create or update a query rule identified by its ``tag``.

     Parses ``keyword``, ``tag`` and ``enabled`` from the request. If a
     document with the same tag already exists in ``query_col`` it is
     updated with the parsed values; otherwise a new document is inserted
     whose ``_id`` is ``md5`` of the concatenated stringified values.

     Returns:
         The ``jsonify`` response holding ``status``, ``msg`` and the full
         rule list sorted by ``enabled`` descending.
     """
     request_parser = reqparse.RequestParser()
     # Registration order matters: it fixes the order of payload.values()
     # that feeds the _id digest below.
     for text_field in ('keyword', 'tag'):
         request_parser.add_argument(text_field, type=str, help='')
     request_parser.add_argument(
         'enabled', type=inputs.boolean, default=True, help='')
     payload = request_parser.parse_args()

     tag_filter = {'tag': payload.get('tag')}
     if not query_col.count(tag_filter):
         # New rule: derive the _id before it is added to the payload.
         digest_source = ''.join(str(value) for value in payload.values())
         payload['_id'] = md5(digest_source)
         query_col.insert_one(payload)
         msg = '添加成功'
     else:
         query_col.update_one(tag_filter, {'$set': payload})
         msg = '更新成功'

     all_rules = list(query_col.find({}).sort('enabled', -1))
     return jsonify({'status': 200, 'msg': msg, 'result': all_rules})
Beispiel #2
0
def search(query, page, g, github_username):
    """Fetch one page of code-search results for a rule and store new hits.

    Args:
        query: Mapping describing the rule; ``tag`` and ``keyword`` are read.
        page: Page index passed to ``get_page`` (logged as ``page + 1``).
        g: Client exposing ``search_code`` and ``get_rate_limit``
            (presumably a PyGithub ``Github`` instance - confirm with caller).
        github_username: Account name whose ``rate_remaining`` is refreshed
            in ``github_col``.

    Returns:
        None. Results are written to ``result_col``; rule status is written
        to ``query_col``; mail/webhook notifications are sent for new files.
    """
    mail_notice_list = []
    webhook_notice_list = []
    logger.info('开始抓取: tag is {} keyword is {}, page is {}'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    try:
        repos = g.search_code(query=query.get('keyword'),
                              sort="indexed",
                              order="desc")
        github_col.update_one({'username': github_username}, {
            '$set': {
                'rate_remaining': int(g.get_rate_limit().search.remaining)
            }
        })

    except Exception as error:
        logger.critical(error)
        logger.critical("触发限制啦")
        return
    try:
        for repo in repos.get_page(page):
            # Heartbeat: record the worker pid and the time of last activity.
            setting_col.update_one({'key': 'task'}, {
                '$set': {
                    'key': 'task',
                    'pid': os.getpid(),
                    'last': timestamp()
                }
            },
                                   upsert=True)
            if not result_col.count({'_id': repo.sha}):
                try:
                    code = str(repo.content).replace('\n', '')
                except Exception:
                    # Best effort: a hit whose content cannot be read is
                    # still recorded, just without the code body.
                    code = ''
                leakage = {
                    'link': repo.html_url,
                    'project': repo.repository.full_name,
                    'project_url': repo.repository.html_url,
                    '_id': repo.sha,
                    'language': repo.repository.language,
                    'username': repo.repository.owner.login,
                    'avatar_url': repo.repository.owner.avatar_url,
                    'filepath': repo.path,
                    'filename': repo.name,
                    'security': 0,
                    'ignore': 0,
                    'tag': query.get('tag'),
                    'code': code,
                }
                try:
                    leakage['affect'] = get_affect_assets(repo.decoded_content)
                except Exception as error:
                    logger.critical('{} {}'.format(error, leakage.get('link')))
                    leakage['affect'] = []
                # Stop as soon as the API quota reported by the response
                # headers is exhausted.
                if int(repo.raw_headers.get('x-ratelimit-remaining')) == 0:
                    logger.critical('剩余使用次数: {}'.format(
                        repo.raw_headers.get('x-ratelimit-remaining')))
                    return
                last_modified = datetime.datetime.strptime(
                    repo.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
                leakage['datetime'] = last_modified
                leakage['timestamp'] = last_modified.timestamp()
                # Skip hits whose link contains any blacklisted text
                # (case-insensitive substring match).
                in_blacklist = False
                for blacklist in blacklist_col.find({}):
                    if blacklist.get('text').lower() in leakage.get(
                            'link').lower():
                        logger.warning('{} 包含白名单中的 {}'.format(
                            leakage.get('link'), blacklist.get('text')))
                        in_blacklist = True
                if in_blacklist:
                    continue
                # Skip projects that already have a result marked as ignored.
                if result_col.count({
                        "project": leakage.get('project'),
                        "ignore": 1
                }):
                    continue
                # Notify only when no unprocessed result exists yet for the
                # same project and file path.
                if not result_col.count({
                        "project": leakage.get('project'),
                        "filepath": leakage.get("filepath"),
                        "security": 0
                }):
                    mail_notice_list.append(
                        '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                            leakage.get('datetime'), leakage.get('link'),
                            leakage.get('project'), leakage.get('filename')))
                    # NOTE(review): splitting the project name on '.' looks
                    # unintended for an "owner/repo" style name - confirm.
                    webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                        leakage.get('project').split('.')[-1],
                        leakage.get('filename'), leakage.get('link'),
                        leakage.get('datetime')))
                try:
                    result_col.insert_one(leakage)
                    logger.info(leakage.get('project'))
                except errors.DuplicateKeyError:
                    logger.info('已存在')

                logger.info('抓取关键字:{} {}'.format(query.get('tag'),
                                                 leakage.get('link')))
    except Exception as error:
        # Only API errors carry a ``data`` payload. Reading it blindly raised
        # AttributeError for every other exception type, which hid the real
        # failure and skipped the logging below. The payload may be a mapping
        # or a plain string, so it is stringified before the substring test.
        missing = object()
        payload = getattr(error, 'data', missing)
        if payload is not missing and 'Not Found' not in str(payload):
            # Retry the same page later with a freshly chosen account.
            g, github_username = new_github()
            search.schedule(args=(query, page, g, github_username),
                            delay=huey.pending_count() +
                            huey.scheduled_count())
        logger.critical(error)
        logger.error('抓取: tag is {} keyword is {}, page is {} 失败'.format(
            query.get('tag'), query.get('keyword'), page + 1))

        return
    logger.info('抓取: tag is {} keyword is {}, page is {} 成功'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    query_col.update_one({'tag': query.get('tag')}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page),
            'api_total': repos.totalCount,
            'found_total': result_col.count({'tag': query.get('tag')})
        }
    })
    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and len(mail_notice_list):
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            query.get('tag'), '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(query.get('tag'), webhook_notice_list)
Beispiel #3
0
def crawl(query, page):
    """Scrape one Gitee code-search result page for a rule and store new hits.

    Args:
        query: Mapping describing the rule; ``tag`` and ``keyword`` are read.
        page: Page index; the page actually requested is ``page + 1``.

    Returns:
        None. New results are inserted into ``result_col``, the rule status
        is written to ``query_col`` and mail/webhook notifications are sent.

    Raises:
        Exception: Any error raised while fetching or parsing the page is
            logged and then propagated to the caller.
    """
    mail_notice_list = []
    webhook_notice_list = []
    search_url = 'https://search.gitee.com/?skin=rec&type=code&q={1}&sort=last_indexed' \
                 '&pageno={0}'
    session = gitee_login()

    logger.info('Gitee开始抓取: tag is {} keyword is {}, page is {}'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    totalCount = 0
    # Single-iteration loop: rebinds ``page`` to the page number requested.
    for page in range(page + 1, page + 2):
        try:
            logger.info("Gitee ------ 启动抓取: {}".format(
                search_url.format(page, query.get('keyword'))))
            resp = session.get(search_url.format(page, query.get('keyword')))
            logger.info("Gitee 启动抓取: {}".format(
                search_url.format(page, query.get('keyword'))))
            tree = etree.HTML(resp.text)
            nodes = tree.xpath('//*[@id="hits-list"]/div[@class="item"]')
            for node in nodes:
                logger.info("Gitee 开始抓取节点")
                totalCount += 1
                leakage = {}
                leakage['affect'] = []
                datetime_ = node.xpath(Gitee.DATETIME)[0].text
                # Drop any leading non-digit text and keep everything from
                # the first digit on; fall back to today's date otherwise.
                datetime_match = re.match(r"[^\d]*(?P<Date>\d+.*)", datetime_)
                if not datetime_match:
                    leakage['datetime'] = _format_time(
                        datetime.datetime.now().date())
                else:
                    leakage['datetime'] = _format_time(
                        datetime_match.group("Date"))
                leakage['timestamp'] = leakage.get('datetime').timestamp()
                leakage['link'] = cut_tail(
                    node.xpath(Gitee.LINK)[0].attrib['href'])
                leakage['filepath'] = node.xpath(Gitee.LINK)[0].text
                leakage['filename'] = leakage.get("filepath").split("/")[-1]
                leakage['_id'] = _md5(leakage['link'])
                logger.info("Gitee ****** 开始抓取节点 {}".format(
                    leakage['datetime']))
                project_username = node.xpath(Gitee.USERNAME)[0].text
                leakage["vendor"] = "GITEE"
                leakage['username'] = project_username.split("/")[0]
                leakage['project'] = project_username
                leakage['project_url'] = cut_tail(
                    node.xpath(Gitee.USERNAME)[0].attrib['href'])
                logger.info("Gitee 抓取到 {}".format(leakage.get("project_url")))

                # Already stored: same link at the same time, or same _id.
                if result_col.find_one({"link": leakage['link'], "datetime": leakage['datetime']}) or \
                        result_col.find_one({'_id': leakage['_id']}):
                    continue

                # Skip hits whose link contains any blacklisted text
                # (case-insensitive substring match).
                in_blacklist = False
                for blacklist in blacklist_col.find({}):
                    if blacklist.get('text').lower() in leakage.get(
                            'link').lower():
                        logger.warning('{} 包含白名单中的 {}'.format(
                            leakage.get('link'), blacklist.get('text')))
                        in_blacklist = True
                if in_blacklist:
                    continue

                # Skip projects that already have a result marked as ignored.
                if result_col.count({
                        "project": leakage.get('project'),
                        "ignore": 1
                }):
                    continue

                # On Gitee a result can be a project with no code.
                leakage['avatar_url'] = 'https://gitee.com/logo-black.svg'
                raw_code = gitee_raw_code(leakage['link'])
                leakage['code'] = base64.b64encode(
                    raw_code.encode("utf-8")).decode("utf-8")
                try:
                    leakage['affect'] = get_affect_assets(raw_code)
                except Exception as error:
                    logger.critical('{} {}'.format(error, leakage.get('link')))
                    leakage['affect'] = []
                leakage['tag'] = query['tag']
                language_node = node.xpath(Gitee.LANGUAGE)
                if language_node:
                    leakage['language'] = language_node[0].text.strip()
                else:
                    leakage['language'] = 'Unknow'
                leakage['security'] = 0
                leakage['ignore'] = 0

                # Notify only when no unprocessed result exists yet for the
                # same project and file path.
                if not result_col.count({
                        "project": leakage.get('project'),
                        "filepath": leakage.get("filepath"),
                        "security": 0
                }):
                    mail_notice_list.append(
                        '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                            leakage.get('datetime'), leakage.get('link'),
                            leakage.get('project'), leakage.get('filename')))
                    webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                        leakage.get('project').split('/')[-1],
                        leakage.get('filename'), leakage.get('link'),
                        leakage.get('datetime')))

                result_col.insert_one(leakage)
                logger.info("Gitee 抓取到的结果: {}".format(
                    leakage.get("project_url")))
        except Exception as e:
            # The error still propagates as before; it is now logged first
            # (the log call used to sit unreachable after the raise).
            logger.error("Gitee error is {}".format(e))
            raise
        logger.info('Gitee抓取: tag is {} keyword is {}, page is {} 成功'.format(
            query.get('tag'), query.get('keyword'), page + 1))
        query_col.update_one({'tag': query.get('tag')}, {
            '$set': {
                'last': int(time.time()),
                'status': 1,
                'reason': '抓取第{}页成功'.format(page),
                'api_total': totalCount,
                'found_total': result_col.count({'tag': query.get('tag')})
            }
        })
        if setting_col.count({
                'key': 'mail',
                'enabled': True
        }) and len(mail_notice_list):
            main_content = '<h2>规则名称: {}</h2><br>{}'.format(
                query.get('tag'), '<br>'.join(mail_notice_list))
            send_mail(main_content)
        logger.info(len(webhook_notice_list))
        webhook_notice(query.get('tag'), webhook_notice_list)
Beispiel #4
0
def check():
    """Schedule the next ``search`` task for every enabled query rule.

    Records the current pid in the ``task`` setting, then bails out early
    (with a log message) when there are no enabled rules, no account with
    ``rate_remaining`` above 5, or no ``page`` value in the task setting.
    Otherwise, for each of ``page`` rounds and each enabled rule (oldest
    ``last`` first), it picks a random account, works out the next page
    index for the rule, stores it as ``page_pre`` and schedules ``search``.

    Returns:
        None.
    """
    setting_col.update_one({'key': 'task'},
                           {'$set': {
                               'key': 'task',
                               'pid': os.getpid()
                           }},
                           upsert=True)
    query_count = query_col.count({'enabled': True})
    logger.info('需要处理的关键词总数: {}'.format(query_count))
    if not query_count:
        logger.warning('请添加关键词')
        return
    if not github_col.count({'rate_remaining': {'$gt': 5}}):
        logger.error('请配置github账号')
        return
    if not setting_col.count({'key': 'task', 'page': {'$exists': True}}):
        logger.error('请在页面上配置任务参数')
        return

    setting_col.update_one({'key': 'task'}, {'$set': {'pid': os.getpid()}})
    page = int(setting_col.find_one({'key': 'task'}).get('page'))

    for _ in range(page):
        for query in query_col.find({
                'enabled': True
        }).sort('last', ASCENDING):
            candidates = list(
                github_col.find({
                    "rate_limit": {
                        "$gt": 5
                    }
                }).sort('rate_remaining', DESCENDING))
            if not candidates:
                # This filter uses a different field than the guard above,
                # so it can be empty even when that guard passed;
                # random.choice would raise IndexError on an empty list.
                logger.error('请配置github账号')
                return
            github_account = random.choice(candidates)
            github_username = github_account.get('username')
            github_password = github_account.get('password')
            rate_remaining = github_account.get('rate_remaining')
            logger.info(github_username)
            logger.info(rate_remaining)
            g = Github(
                github_username,
                github_password,
                per_page=PER_PAGE,
                user_agent=
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
            )

            # Reuse the total stored on the rule when available; otherwise
            # ask the API for it.
            api_total = query.get('api_total')
            if api_total:
                total = api_total
            else:
                repos = g.search_code(query=query.get('keyword'),
                                      sort="indexed",
                                      order="desc")
                total = repos.totalCount
            page_pre = int(query.get('page_pre')) if query.get(
                'page_pre') is not None else -1
            # Never page beyond the first 1000 results.
            total = min(total, 1000)
            page_all = math.ceil(total / PER_PAGE)
            if page_all == 0:
                continue
            # Wrap around to the first page after the last one was fetched.
            if page_pre + 1 >= page_all:
                page_pre = -1
            page_now = page_pre + 1

            query_col.update_one({'tag': query.get('tag')},
                                 {'$set': {
                                     'page_pre': page_now
                                 }})

            # Delay grows with the queue size so tasks are spread out.
            search.schedule(args=(query, page_now, g, github_username),
                            delay=huey.pending_count() +
                            huey.scheduled_count())