class CrawlBadips(Task):
    def __init__(self):
        super().__init__('badips 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.badips.com/get/list/any/2?age=7d'
        source = 'www.badips.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.strip()
            block = [ip, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("badips 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("badips 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
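
# A minimal sketch of driving one of these crawl tasks, assuming the Task base class
# shown further below (whose run() calls start(), run_crawl() and finish()) and the
# project imports these examples rely on (db, IP, ContextLogger).
if __name__ == '__main__':
    CrawlBadips().run()  # crawls badips.com, stores new IPs, and logs the timing summary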
class CrawlBambenekconsulting(Task):
    def __init__(self):
        super().__init__('bambenekconsulting 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []

        url = 'http://osint.bambenekconsulting.com/feeds/c2-dommasterlist-high.txt'
        source = 'osint.bambenekconsulting.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'): continue
            if _domain == '': continue
            domain = _domain.split(',')[0]
            description = re.search('Domain used by (.*)', _domain.split(',')[1]).group(1)
            updatetime = _domain.split(',')[2].split(' ')[0]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0

        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(Domain.domain == domain, Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("bambenekconsulting 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("bambenekconsulting 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class CrawlMaxmind(Task):
    def __init__(self):
        super().__init__('maxmind 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.maxmind.com/en/high-risk-ip-sample-list'
        source = 'www.maxmind.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _info)
        description = 'Higher Risk IP'
        for ip in _ips:
            block = [ip, description, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("maxmind 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("maxmind  抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))

# Example #6
class CrawlMalwaredomains(Task):
    def __init__(self):
        super().__init__('malwaredomains 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://mirror4.malwaredomains.com/files/justdomains'
        source = 'mirror4.malwaredomains.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for domain in info:
            if domain.startswith('#'): continue
            if domain == '': continue
            block = [domain, source]
            domains.append(block)

        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("malwaredomains 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("malwaredomains 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
class CrawlGithubusercontent(Task):
    def __init__(self):
        super().__init__('githubusercontent 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/botscout_1d.ipset',
                'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/cruzit_web_attacks.ipset'
                ]
        source = 'raw.githubusercontent.com'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'): continue
                if _ip == '': continue
                ip = _ip.strip()
                block = [ip, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("githubusercontent 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("githubusercontent 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class Crawlemergingthreats(Task):
    def __init__(self):
        super().__init__('emergingthreats 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://rules.emergingthreats.net/open/suricata/rules/botcc.rules'
        source = 'rules.emergingthreats.net'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        blocks = re.findall(r'(\[.*\))', _info)
        for block in blocks:
            _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', block)
            description = re.search('classtype:(.*?);', block).group(1)
            for ip in _ips:
                block = [ip, description, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("emergingthreats 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("emergingthreats 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class CrawlCybersweat(Task):
    def __init__(self):
        super().__init__('cybersweat 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://cybersweat.shop/iprep/iprep_ramnode.txt' # ERROR
        source = 'cybersweat.shop'
        proxies = {'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'}
        _info = self.get(url=url, proxies=proxies)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.split(';')[0].strip()
            updatetime = _ip.split(';')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("cybersweat 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("cybersweat 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))

# Example #10
class CrawlRansomwaretracker(Task):
    def __init__(self):
        super().__init__('ransomwaretracker 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt'
        source = 'ransomwaretracker.abuse.ch'
        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        description = 'Ransomware'
        for domain in info:
            if domain.startswith('#'): continue
            if domain == '': continue
            block = [domain, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, description, source in domains:
                    flag = db.session.query(Domain).filter(Domain.domain == domain, Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("ransomwaretracker 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("ransomwaretracker 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))

# Example #11
class BaseReq:
    def __init__(self, is_crawl=True):
        self.ses = db.session
        self.is_crawl = is_crawl
        self.logger = ContextLogger('crawl')

    def _request(self, url, method='post', timeout=20, retry=5, **kwargs):
        # Pop headers/cookies out of kwargs so they are not also forwarded via **kwargs
        # (passing them twice would raise a duplicate-keyword-argument error).
        headers = kwargs.pop('headers', {})
        cookies = kwargs.pop('cookies', {})

        try:
            resp = requests.request(method,
                                    '{}'.format(url),
                                    timeout=timeout,
                                    headers=headers,
                                    cookies=cookies,
                                    **kwargs)
        except Exception as e:
            self.logger.warning(e)
            if retry > 0:
                return self._request(url,
                                     method,
                                     timeout,
                                     retry=retry - 1,
                                     **kwargs)
            else:
                return None
        if resp.status_code != 200 and retry > 0:
            return self._request(url,
                                 method,
                                 timeout,
                                 retry=retry - 1,
                                 **kwargs)
        if self.is_crawl:
            return resp.text
        else:
            try:
                data = resp.json()
            except Exception as e:
                self.logger.warning(e)
                data = None
            return data

    def get(self, url, **kwargs):
        return self._request(url, method='get', **kwargs)

    def post(self, url, **kwargs):
        return self._request(url, method='post', **kwargs)
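
# A small usage sketch for BaseReq with hypothetical URLs, assuming the app's db and
# ContextLogger setup is importable: with is_crawl=True, get()/post() return the response
# body as text; with is_crawl=False they return parsed JSON (or None on failure).
req = BaseReq(is_crawl=True)
page_html = req.get('https://example.com/list', headers={'User-Agent': 'Mozilla/5.0'})
api = BaseReq(is_crawl=False)
api_data = api.post('https://example.com/api', data={'page': 1})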
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('sync_sqlite')
        self.name = name
        self.time = time

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def process(self):
        pass

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info('同步成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def run(self):
        self.start()
        self.process()
        self.finish()

# Example #16
class BaseReq:
    def __init__(self, is_crawl=True):
        self.ses = db.session
        self.is_crawl = is_crawl
        self.logger = ContextLogger('crawl')
        self.headers = {
            'Host': 'www.lagou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.lagou.com/jobs/list_Python?px=new&city=%E5%85%A8%E5%9B%BD',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Content-Type': 'multipart/form-data'
        }

    def _request(self, url, method='post', timeout=20, retry=5, **kwargs):

        # Pop cookies out of kwargs so they are not also forwarded via **kwargs.
        cookies = kwargs.pop('cookies', {})
        try:
            resp = requests.request(method, '{}'.format(url), timeout=timeout, headers=self.headers,
                                    cookies=cookies, **kwargs)
        except Exception as e:
            if retry > 0:
                return self._request(url, method, timeout, retry=retry-1, **kwargs)
            else:
                self.logger.warning('请求失败: ' + str(e))
                return None
        if resp.status_code != 200 and retry > 0:
            return self._request(url, method, timeout, retry=retry-1, **kwargs)
        if self.is_crawl:

            return resp.text
        else:
            try:
                data = resp.json()
                if (data is None or data.get('success') is False) and retry > 0:
                    time.sleep(3)
                    return self._request(url, method, timeout, retry=retry - 1, **kwargs)
            except Exception as e:
                if retry > 0:
                    return self._request(url, method, timeout, retry=retry - 1, **kwargs)
                else:
                    self.logger.warning('解析失败: ' + str(e))
                    self.logger.warning(url)
                    data = None
            return data

    def get(self, url, **kwargs):
        return self._request(url, method='get', **kwargs)

    def post(self, url, **kwargs):
        return self._request(url, method='post', **kwargs)

# Example #17
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.baidu.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + '提交成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

# Example #19
class CrawlRulez(Task):
    def __init__(self):
        super().__init__('rulez 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://danger.rulez.sk/projects/bruteforceblocker/blist.php'
        source = 'danger.rulez.sk'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.split('#')[0].strip()
            updatetime = _ip.split('#')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("rulez 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("rulez  抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
class CrawlCnnvd(Task):
    def __init__(self):
        super().__init__('cnnvd数据爬取')
        self.logger = ContextLogger('task_cnnvd')

    def get_api_id(self, cnnvd_str):
        str_1, str_2, str_3 = cnnvd_str.split('-')
        if len(str_3) == 3:
            return str_2 + '0' + str_3
        return str_2 + str_3
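
    # A quick check of get_api_id() above on hypothetical CNNVD ids:
    #   get_api_id('CNNVD-2017-123')  -> '2017' + '0' + '123' = '20170123'
    #   get_api_id('CNNVD-2017-1234') -> '20171234'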

    def api_affect_product(self, id, name_str):
        data = {'cvCveid': id, 'counts': 5}
        json_datas = self.post('http://www.cnnvd.org.cn/web/xxk/getEntity.tag',
                               data=data)
        if json_datas == '' or json_datas is None:
            return name_str
        else:
            json_datas = json.loads(json_datas)
            for json_data in json_datas:
                name_str = name_str + json_data['cpr_product_name'] + ';'
            return name_str

    def api_patchs(self, id, patch_dict):
        data = {'cvCveid': id, 'counts': 5}
        json_datas = self.post('http://www.cnnvd.org.cn/web/xxk/getEntity.tag',
                               data=data)
        if json_datas == '' or json_datas is None:
            return patch_dict
        else:
            json_datas = json.loads(json_datas)
            for json_data in json_datas:
                patch_name = json_data['cp_cname']
                patch_url = 'http://www.cnnvd.org.cn' + '/web/xxk/bdxqById.tag?id=' + str(
                    json_data['cp_id'])
                patch_dict[patch_name] = patch_url
            return patch_dict

    def run_crawl(self):
        begin_base_url = 'http://www.cnnvd.org.cn' + '/web/vulnerability/querylist.tag?pageno='
        end_base_url = '&repairLd='
        index = 1
        while index < 10597:
            next_page = begin_base_url + str(index) + end_base_url
            print(next_page)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break
            index += 1

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for url in soup.find(class_='list_list').ul.find_all('li'):
            vul_url = 'http://www.cnnvd.org.cn' + url.div.a['href']

            html = self.get_one_page(vul_url)
            if html:
                result = self.handle_info_html(html, vul_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        soup = BeautifulSoup(html, 'lxml')
        try:
            simple_list = []
            title = soup.find(
                class_="detail_xq w770").find('h2').get_text().strip()
            cnnvd_id = soup.find(class_="detail_xq w770").find('ul').find_all(
                'li')[0].span.get_text().strip().split(':')[1]
            for li in soup.find(
                    class_="detail_xq w770").find('ul').find_all('li')[1:]:
                if li.a:
                    simple_list.append(li.a.get_text().strip())

            # If the update time is more than a month old, return True to stop crawling
            # (end_time is the cutoff date defined outside this snippet).
            if simple_list[5] < end_time:
                return True

            vulner_source = soup.find(class_="detail_xq w770").find(
                'ul').find_all('li')[-1].span.get_text().strip().split(':')[1]
            list = []
            for div in soup.find(class_="fl w770").find_all(class_='d_ldjj'):
                str_value = ''
                for p in div.find_all('p'):
                    str_value += p.get_text().strip()
                list.append(str_value)
            if list[3] != '暂无':
                str_value = ''
                affect_products = soup.find(class_="fl w770").find_all(
                    class_='d_ldjj')[3].find_all(class_='a_title2')
                for i in affect_products:
                    str_value += i.get_text().strip() + ';'
                if len(affect_products) == 5:
                    str_value = self.api_affect_product(
                        self.get_api_id(cnnvd_id), str_value)
                list[3] = str_value
            if list[4] != '暂无':
                patch_dict = {}
                patchs = soup.find(class_="fl w770").find_all(
                    class_='d_ldjj')[4].find_all(class_='a_title2')
                for i in patchs:
                    name = i.get_text().strip()
                    patch_url = 'http://www.cnnvd.org.cn' + i['href']
                    patch_dict[name] = patch_url
                if len(patchs) == 5:
                    patch_dict = self.api_patchs(self.get_api_id(cnnvd_id),
                                                 patch_dict)
                list[4] = patch_dict
        except Exception as e:
            self.logger.warning(url + '--' + str(e))
            return False

        vul = db.session.query(Vulner).filter_by(cnnvd_id=cnnvd_id).first()
        if vul:
            print("查询成功")
        else:
            print("新数据")
            vul = Vulner()
        vul.title = title
        vul.vulner_source = vulner_source
        vul.cnnvd_id = cnnvd_id
        vul.url = url
        vul.level = simple_list[0]
        vul.cve_id = simple_list[1]
        vul.vulner_type = simple_list[2]
        vul.posted_time = simple_list[3]
        vul.threats_type = simple_list[4]
        vul.update_time = simple_list[5]
        vul.describe = list[0]
        vul.source = 'cnnvd'
        vul.solve_way = list[1]
        vul.refer_link = list[2]
        vul.affect_product = list[3]
        vul.patch = list[4]

        print(vul.title, '\n', vul.update_time, '\n', vul.cve_id, '\n',
              vul.url, '\n\n')

        self.safe_commit(vul)

        return False

# Example #23
from app.engines import db
from app.logger import ContextLogger
from app.models.patch import Patch

logger = ContextLogger('task_cnnvd')


def get_patch(title):
    try:
        patch = db.session.query(Patch).filter(Patch.title == title).first()
        return patch
    except Exception as e:
        logger.warning(e)
        return False


if __name__ == '__main__':
    print(get_patch('firefox-3.6.9.tests'))

# Example #24
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.cookies = {}
        self.headers = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.job_type + value.createTime + '提交成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.

        Returns:
            the HTML code for this url
        """
        # try:
        #     response = requests.get(url, headers=self.headers)
        # except Exception as e:
        #     self.logger.warning(url + ' ' + str(e))
        #     return False
        # return response.text
        response = self.get(url)
        return response

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html, tag):
        """
        The analysis page gets all the urls you need to crawl.

        Args:
            html: The page HTML that needs to be analyzed
        """
        pass

    def handle_info_html(self, html, url, tag):
        """
        The analysis page extracts all vulnerability information and commit to database.

        Args:
            html: The page HTML that needs to be analyzed
            url: The page url that needs to be analyzed
        """
        pass
from app.engines import db
from app.models import Show
from datetime import datetime, timedelta
from app.logger import ContextLogger

logger = ContextLogger('show')


def calc_ip_show_update_count(count):
    format_time = datetime.now().strftime("%Y-%m-%d")
    show = db.session.query(Show).filter_by(format_time=format_time,
                                            type='ip').first()
    if show:
        show.update_of_count = show.update_of_count + count
    else:
        try:
            show = Show()
            show.type = 'ip'
            show.update_of_count = count
        except Exception as e:
            logger.warning("计算展示数据更新次数出错" + str(e))

    safe_commit(show)


def calc_domain_show_update_count(count):
    format_time = datetime.now().strftime("%Y-%m-%d")
    show = db.session.query(Show).filter_by(format_time=format_time,
                                            type='domain').first()
    if show:
        show.update_of_count = show.update_of_count + count
    else:
        try:
            show = Show()
            show.type = 'domain'
            show.update_of_count = count
        except Exception as e:
            logger.warning("计算展示数据更新次数出错" + str(e))

    safe_commit(show)

# Example #27
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.header = {
            'Accept-Language':
            'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101'
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + '提交成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.

        Returns:
            the HTML code for this url
        """
        # try:
        #     response = requests.get(url, headers=self.headers)
        # except Exception as e:
        #     self.logger.warning(url + ' ' + str(e))
        #     return False
        # return response.text
        response = self.get(url)
        return response

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html):
        """
        The analysis page gets all the urls you need to crawl.

        Args:
            html: The page HTML that needs to be analyzed
        """
        pass

    def handle_info_html(self, html, url):
        """
        The analysis page extracts all vulnerability information and commit to database.

        Args:
            html: The page HTML that needs to be analyzed
            url: The page url that needs to be analyzed
        """
        pass

    def is_ip(self, _str):
        p = re.compile(
            r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$'
        )
        if p.match(_str):
            return True
        else:
            return False
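
# A quick illustration of the is_ip() check above (hypothetical values):
#   is_ip('8.8.8.8')     -> True
#   is_ip('999.1.1.1')   -> False
#   is_ip('example.com') -> False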

# Example #28 (only this __init__ fragment appears in the source; the enclosing
# CrawlZeustracker class name below is assumed for illustration)
class CrawlZeustracker(Task):
    def __init__(self):
        super().__init__('zeustracker 数据爬取')
        self.logger = ContextLogger('threat_domain')

# Example #30
class CrawlFreebuf(Task):
    def __init__(self):
        super().__init__('LaGou数据爬取')
        self.logger = ContextLogger('task_lagou')
        self.is_crawl = False
        self.headers = {
            'Host': 'www.lagou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer':
            'https://www.lagou.com/jobs/list_Python?px=new&city=%E5%85%A8%E5%9B%BD',
            'Connection': 'keep-alive',
            'Cookie':
            'user_trace_token=20171103191801-9206e24f-9ca2-40ab-95a3-23947c0b972a; _ga=GA1.2.545192972.1509707889; LGUID=20171103191805-a9838dac-c088-11e7-9704-5254005c3644; JSESSIONID=ABAAABAACDBABJB2EE720304E451B2CEFA1723CE83F19CC; _gat=1; LGSID=20171228225143-9edb51dd-ebde-11e7-b670-525400f775ce; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKkJPgBHAnny1nUKaLpx2oDfUXv9ItIF3kBAWM2-fDNu%26ck%3D3065.1.126.376.140.374.139.129%26shh%3Dwww.baidu.com%26sht%3Dmonline_3_dg%26wd%3D%26eqid%3Db0ec59d100013c7f000000055a4504f6; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20171228225224-b6cc7abd-ebde-11e7-9f67-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; SEARCH_ID=3ec21cea985a4a5fa2ab279d868560c8',
            'X-Requested-With': 'XMLHttpRequest',
            'X-Anit-Forge-Token': 'None',
            'X-Anit-Forge-Code': '0',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

    def process(self, page, tag):

        base_url = 'https://www.lagou.com/jobs/positionAjax.json?'
        params = {
            'px': 'new',
            'needAddtionalResult': 'false',
            'isSchoolJob': '0'
        }
        data = {'first': 'false', 'pn': str(page), 'kd': tag}
        html = self.post(base_url, data=data, params=params)

        if html:
            try:
                count = int(html['content']['positionResult']['resultSize'])
                job_list = html['content']['positionResult']['result']
                if count == 0:
                    return True
                for job in job_list:
                    self.storage(job, tag)
            except Exception as e:
                self.logger.warning('取值失败: ' + str(e))
                self.logger.warning(base_url)

    def storage(self, job_dict, tag):

        for key in job_dict:
            job_dict[key] = self.switch_str(job_dict[key])
        job = Job()
        try:
            job.companyId = job_dict['companyId']
            job.positionName = job_dict['positionName']
            job.workYear = job_dict['workYear']
            job.education = job_dict['education']
            job.jobNature = job_dict['jobNature']
            job.companyLogo = job_dict['companyLogo']
            job.salary = job_dict['salary']
            job.city = job_dict['city']
            job.financeStage = job_dict['financeStage']
            job.industryField = job_dict['industryField']
            job.positionId = job_dict['positionId']
            job.approve = job_dict['approve']
            job.createTime = job_dict['createTime']
            job.positionAdvantage = job_dict['positionAdvantage']
            job.companySize = job_dict['companySize']
            job.companyLabelList = job_dict['companyLabelList']
            job.publisherId = job_dict['publisherId']
            job.score = job_dict['score']
            job.district = job_dict['district']
            job.companyShortName = job_dict['companyShortName']
            job.positionLables = job_dict['positionLables']
            job.industryLables = job_dict['industryLables']
            job.businessZones = job_dict['businessZones']
            job.longitude = job_dict['longitude']
            job.latitude = job_dict['latitude']
            job.adWord = job_dict['adWord']
            job.formatCreateTime = job_dict['formatCreateTime']
            job.hitags = job_dict['hitags']
            job.resumeProcessRate = job_dict['resumeProcessRate']
            job.resumeProcessDay = job_dict['resumeProcessDay']
            job.companyFullName = job_dict['companyFullName']
            job.imState = job_dict['imState']
            job.lastLogin = job_dict['lastLogin']
            job.explain = job_dict['explain']
            job.plus = job_dict['plus']
            job.pcShow = job_dict['pcShow']
            job.appShow = job_dict['appShow']
            job.deliver = job_dict['deliver']
            job.gradeDescription = job_dict['gradeDescription']
            job.promotionScoreExplain = job_dict['promotionScoreExplain']
            job.firstType = job_dict['firstType']
            job.secondType = job_dict['secondType']
            job.isSchoolJob = job_dict['isSchoolJob']
            job.subwayline = job_dict['subwayline']
            job.stationname = job_dict['stationname']
            job.linestaion = job_dict['linestaion']
            job.job_type = tag
        except Exception as e:
            self.logger.warning('存储失败: ' + str(e))
            self.logger.warning(job_dict['publisherId'])
        self.safe_commit(job)

    def switch_str(self, value_list):
        value_str = ''
        try:
            if type(value_list) != list:
                return str(value_list)

            for value in value_list:
                value_str = value_str + value + ';'
        except Exception as e:
            self.logger.warning('转化失败: ' + str(e))
        return value_str

    def control(self):
        tag_list = []
        with open(FILE_PATH, 'r') as f:
            for line in f:
                tag_list.append(line.strip())
        for tag in tag_list:
            page = 1
            while (True):
                result = self.process(page, tag)
                if result:
                    break
                print("当前页数{}".format(page))
                page += 1
            self.logger.warning(tag + '共发起了' + str(page) + '请求')
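
# A minimal sketch of driving the LaGou crawler above, assuming FILE_PATH points to a
# text file with one search keyword per line and that the Job model and db session are
# configured:
#   crawler = CrawlFreebuf()
#   crawler.control()  # pages through positionAjax.json per keyword and stores Job rows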