Exemple #1
0
def search(query, page, g, github_username):
    """Fetch one page of GitHub code-search hits for a rule and store new ones.

    Runs ``g.search_code`` for ``query['keyword']``, records the remaining
    search quota for ``github_username``, then walks the requested result
    page.  Every hit whose sha is not yet in ``result_col`` is turned into a
    ``leakage`` document and inserted, unless its link matches a blacklist
    entry or its project is already marked ``ignore``.  Hits for a
    project/filepath with no open (``security == 0``) record are also queued
    for mail and webhook notification.

    Args:
        query: Mapping read through ``.get('tag')`` and ``.get('keyword')``.
        page: Page index handed to ``repos.get_page``; logged as ``page + 1``.
        g: Client exposing ``search_code`` and ``get_rate_limit`` —
            presumably a PyGithub ``Github`` instance; confirm with caller.
        github_username: Key of the ``github_col`` document whose
            ``rate_remaining`` field is refreshed.

    Returns:
        None.  On a failed search, an exhausted rate limit, or a failed page
        walk the function returns early without updating ``query_col`` or
        sending notifications.
    """
    mail_notice_list = []
    webhook_notice_list = []
    logger.info('开始抓取: tag is {} keyword is {}, page is {}'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    try:
        repos = g.search_code(query=query.get('keyword'),
                              sort="indexed",
                              order="desc")
        github_col.update_one({'username': github_username}, {
            '$set': {
                'rate_remaining': int(g.get_rate_limit().search.remaining)
            }
        })

    except Exception as error:
        logger.critical(error)
        logger.critical("触发限制啦")
        return
    try:
        for repo in repos.get_page(page):
            # Heartbeat: record which process is crawling and when.
            setting_col.update_one({'key': 'task'}, {
                '$set': {
                    'key': 'task',
                    'pid': os.getpid(),
                    'last': timestamp()
                }
            },
                                   upsert=True)
            if not result_col.count({'_id': repo.sha}):
                try:
                    code = str(repo.content).replace('\n', '')
                except Exception:
                    # Content is best-effort; keep the hit even without it.
                    code = ''
                leakage = {
                    'link': repo.html_url,
                    'project': repo.repository.full_name,
                    'project_url': repo.repository.html_url,
                    '_id': repo.sha,
                    'language': repo.repository.language,
                    'username': repo.repository.owner.login,
                    'avatar_url': repo.repository.owner.avatar_url,
                    'filepath': repo.path,
                    'filename': repo.name,
                    'security': 0,
                    'ignore': 0,
                    'tag': query.get('tag'),
                    'code': code,
                }
                try:
                    leakage['affect'] = get_affect_assets(repo.decoded_content)
                except Exception as error:
                    logger.critical('{} {}'.format(error, leakage.get('link')))
                    leakage['affect'] = []
                # A missing header must not be fed to int(); only an explicit
                # zero means the quota is used up.
                remaining = repo.raw_headers.get('x-ratelimit-remaining')
                if remaining is not None and int(remaining) == 0:
                    logger.critical('剩余使用次数: {}'.format(remaining))
                    return
                last_modified = datetime.datetime.strptime(
                    repo.last_modified, '%a, %d %b %Y %H:%M:%S %Z')
                leakage['datetime'] = last_modified
                leakage['timestamp'] = last_modified.timestamp()
                in_blacklist = False
                for blacklist in blacklist_col.find({}):
                    if blacklist.get('text').lower() in leakage.get(
                            'link').lower():
                        logger.warning('{} 包含白名单中的 {}'.format(
                            leakage.get('link'), blacklist.get('text')))
                        in_blacklist = True
                if in_blacklist:
                    continue
                # Skip projects that already have a record marked ignore.
                if result_col.count({
                        "project": leakage.get('project'),
                        "ignore": 1
                }):
                    continue
                # Notify only when this file has no open record yet.
                if not result_col.count({
                        "project": leakage.get('project'),
                        "filepath": leakage.get("filepath"),
                        "security": 0
                }):
                    mail_notice_list.append(
                        '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                            leakage.get('datetime'), leakage.get('link'),
                            leakage.get('project'), leakage.get('filename')))
                    # NOTE(review): splits on '.', while the Gitee crawler
                    # splits the project name on '/' — confirm which is meant.
                    webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                        leakage.get('project').split('.')[-1],
                        leakage.get('filename'), leakage.get('link'),
                        leakage.get('datetime')))
                try:
                    result_col.insert_one(leakage)
                    logger.info(leakage.get('project'))
                except errors.DuplicateKeyError:
                    logger.info('已存在')

                logger.info('抓取关键字:{} {}'.format(query.get('tag'),
                                                 leakage.get('link')))
    except Exception as error:
        # Only API errors carry a ``data`` payload, and it may be a dict or a
        # string; stringify it so the "Not Found" test works for both.
        # Errors without a payload are logged but not retried.
        payload = getattr(error, 'data', None)
        if payload is not None and 'Not Found' not in str(payload):
            # Retry the same page with a fresh account, queued behind the
            # tasks that are already waiting.
            g, github_username = new_github()
            search.schedule(args=(query, page, g, github_username),
                            delay=huey.pending_count() +
                            huey.scheduled_count())
        logger.critical(error)
        logger.error('抓取: tag is {} keyword is {}, page is {} 失败'.format(
            query.get('tag'), query.get('keyword'), page + 1))

        return
    logger.info('抓取: tag is {} keyword is {}, page is {} 成功'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    query_col.update_one({'tag': query.get('tag')}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page),
            'api_total': repos.totalCount,
            'found_total': result_col.count({'tag': query.get('tag')})
        }
    })
    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and mail_notice_list:
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            query.get('tag'), '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(query.get('tag'), webhook_notice_list)
Exemple #2
0
def crawl(query, page):
    """Scrape one Gitee code-search result page for a rule and store new hits.

    Requests result page ``page + 1`` of the Gitee search for
    ``query['keyword']`` through the session returned by ``gitee_login()``,
    then parses every ``hits-list`` item into a ``leakage`` document.  Hits
    already stored (same link and datetime, or same ``_id``), hits whose link
    matches a blacklist entry, and hits for a project already marked
    ``ignore`` are skipped.  The rest are inserted into ``result_col``; hits
    for a project/filepath with no open (``security == 0``) record are also
    queued for mail and webhook notification.

    Args:
        query: Mapping providing ``'tag'`` and ``'keyword'``.
        page: Zero-based page index; the page actually requested is
            ``page + 1``.

    Raises:
        Exception: Any error raised while fetching or parsing the page is
            logged and re-raised unchanged.
    """
    mail_notice_list = []
    webhook_notice_list = []
    search_url = 'https://search.gitee.com/?skin=rec&type=code&q={1}&sort=last_indexed' \
                 '&pageno={0}'
    session = gitee_login()

    logger.info('Gitee开始抓取: tag is {} keyword is {}, page is {}'.format(
        query.get('tag'), query.get('keyword'), page + 1))
    total_count = 0
    page_no = page + 1
    # NOTE(review): the keyword is interpolated into the URL without
    # percent-encoding — confirm stored keywords are already URL-safe.
    page_url = search_url.format(page_no, query.get('keyword'))
    try:
        logger.info("Gitee ------ 启动抓取: {}".format(page_url))
        resp = session.get(page_url)
        logger.info("Gitee 启动抓取: {}".format(page_url))
        tree = etree.HTML(resp.text)
        nodes = tree.xpath('//*[@id="hits-list"]/div[@class="item"]')
        for node in nodes:
            logger.info("Gitee 开始抓取节点")
            total_count += 1
            leakage = {}
            leakage['affect'] = []
            datetime_ = node.xpath(Gitee.DATETIME)[0].text
            # Drop any non-digit prefix and keep the text from the first
            # digit onwards; fall back to today's date when nothing matches.
            datetime_match = re.match(r"[^\d]*(?P<Date>\d+.*)", datetime_)
            if not datetime_match:
                leakage['datetime'] = _format_time(
                    datetime.datetime.now().date())
            else:
                leakage['datetime'] = _format_time(
                    datetime_match.group("Date"))
            leakage['timestamp'] = leakage.get('datetime').timestamp()
            link_node = node.xpath(Gitee.LINK)[0]
            leakage['link'] = cut_tail(link_node.attrib['href'])
            leakage['filepath'] = link_node.text
            leakage['filename'] = leakage.get("filepath").split("/")[-1]
            leakage['_id'] = _md5(leakage['link'])
            logger.info("Gitee ****** 开始抓取节点 {}".format(
                leakage['datetime']))
            owner_node = node.xpath(Gitee.USERNAME)[0]
            project_username = owner_node.text
            leakage["vendor"] = "GITEE"
            leakage['username'] = project_username.split("/")[0]
            leakage['project'] = project_username
            leakage['project_url'] = cut_tail(owner_node.attrib['href'])
            logger.info("Gitee 抓取到 {}".format(leakage.get("project_url")))

            # Already stored, either as this exact revision or under this id.
            if result_col.find_one({"link": leakage['link'], "datetime": leakage['datetime']}) or \
                    result_col.find_one({'_id': leakage['_id']}):
                continue

            in_blacklist = False
            for blacklist in blacklist_col.find({}):
                if blacklist.get('text').lower() in leakage.get(
                        'link').lower():
                    logger.warning('{} 包含白名单中的 {}'.format(
                        leakage.get('link'), blacklist.get('text')))
                    in_blacklist = True
            if in_blacklist:
                continue

            # Skip projects that already have a record marked ignore.
            if result_col.count({
                    "project": leakage.get('project'),
                    "ignore": 1
            }):
                continue

            # On Gitee a hit can be a project with no code attached.
            leakage['avatar_url'] = 'https://gitee.com/logo-black.svg'
            raw_code = gitee_raw_code(leakage['link'])
            leakage['code'] = base64.b64encode(
                raw_code.encode("utf-8")).decode("utf-8")
            try:
                leakage['affect'] = get_affect_assets(raw_code)
            except Exception as error:
                logger.critical('{} {}'.format(error, leakage.get('link')))
                leakage['affect'] = []
            leakage['tag'] = query['tag']
            language_node = node.xpath(Gitee.LANGUAGE)
            # An element with no text yields None, which has no strip().
            language_text = language_node[0].text if language_node else None
            if language_text:
                leakage['language'] = language_text.strip()
            else:
                leakage['language'] = 'Unknow'
            leakage['security'] = 0
            leakage['ignore'] = 0

            # Notify only when this file has no open record yet.
            if not result_col.count({
                    "project": leakage.get('project'),
                    "filepath": leakage.get("filepath"),
                    "security": 0
            }):
                mail_notice_list.append(
                    '上传时间:{} 地址: <a href={}>{}/{}</a>'.format(
                        leakage.get('datetime'), leakage.get('link'),
                        leakage.get('project'), leakage.get('filename')))
                webhook_notice_list.append('[{}/{}]({}) 上传于 {}'.format(
                    leakage.get('project').split('/')[-1],
                    leakage.get('filename'), leakage.get('link'),
                    leakage.get('datetime')))

            result_col.insert_one(leakage)
            logger.info("Gitee 抓取到的结果: {}".format(
                leakage.get("project_url")))
    except Exception as e:
        # Log before propagating so the failure is visible in the crawler
        # log; a bare raise keeps the original traceback.
        logger.error("Gitee error is {}".format(e))
        raise
    logger.info('Gitee抓取: tag is {} keyword is {}, page is {} 成功'.format(
        query.get('tag'), query.get('keyword'), page_no))
    query_col.update_one({'tag': query.get('tag')}, {
        '$set': {
            'last': int(time.time()),
            'status': 1,
            'reason': '抓取第{}页成功'.format(page_no),
            'api_total': total_count,
            'found_total': result_col.count({'tag': query.get('tag')})
        }
    })
    if setting_col.count({
            'key': 'mail',
            'enabled': True
    }) and mail_notice_list:
        main_content = '<h2>规则名称: {}</h2><br>{}'.format(
            query.get('tag'), '<br>'.join(mail_notice_list))
        send_mail(main_content)
    logger.info(len(webhook_notice_list))
    webhook_notice(query.get('tag'), webhook_notice_list)