class CrawlBadips(Task):
    def __init__(self):
        super().__init__('badips data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.badips.com/get/list/any/2?age=7d'
        source = 'www.badips.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.strip()
            block = [ip, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("badips 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("badips 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
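
# Hedged sketch (an assumption, not quoted from the feed): the badips list is one bare
# IP per line, with '#' comment lines and blank lines that run_crawl() above skips.
# A standalone rehearsal of that filtering on a hypothetical payload:
_sample_badips_feed = "# badips.com list\n192.0.2.1\n\n198.51.100.7\n"
_sample_badips_blocks = [[line.strip(), 'www.badips.com']
                         for line in _sample_badips_feed.split('\n')
                         if line and not line.startswith('#')]
print(_sample_badips_blocks)
# -> [['192.0.2.1', 'www.badips.com'], ['198.51.100.7', 'www.badips.com']]
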
class CrawlBambenekconsulting(Task):
    def __init__(self):
        super().__init__('bambenekconsulting data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []

        url = 'http://osint.bambenekconsulting.com/feeds/c2-dommasterlist-high.txt'
        source = 'osint.bambenekconsulting.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'): continue
            if _domain == '': continue
            domain = _domain.split(',')[0]
            description = re.search('Domain used by (.*)', _domain.split(',')[1]).group(1)
            updatetime = _domain.split(',')[2].split(' ')[0]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0

        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("bambenekconsulting 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("bambenekconsulting 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class CrawlMaxmind(Task):
    def __init__(self):
        super().__init__('maxmind data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.maxmind.com/en/high-risk-ip-sample-list'
        source = 'www.maxmind.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _info)
        description = 'Higher Risk IP'
        for ip in _ips:
            block = [ip, description, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("maxmind 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("maxmind  抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
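
# Hedged sketch (assumption): the MaxMind page is HTML, so run_crawl above pulls
# anything shaped like a dotted quad out of the markup. A standalone rehearsal on a
# hypothetical fragment; note the pattern does not range-check octets, so stricter
# validation (e.g. the is_ip() helper on the Task base class further down) may be wanted.
import re
_sample_maxmind_html = '<td>203.0.113.5</td><td>198.51.100.23</td>'
print(re.findall(r'[0-9]+(?:\.[0-9]+){3}', _sample_maxmind_html))
# -> ['203.0.113.5', '198.51.100.23']
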
class CrawlMalwaredomains(Task):
    def __init__(self):
        super().__init__('malwaredomains data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://mirror4.malwaredomains.com/files/justdomains'
        source = 'mirror4.malwaredomains.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for domain in info:
            # split('\n') leaves a trailing empty string; skip blank lines.
            if domain == '': continue
            block = [domain, source]
            domains.append(block)

        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("malwaredomains 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("malwaredomains 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
class CrawlCybersweat(Task):
    def __init__(self):
        super().__init__('cybersweat data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://cybersweat.shop/iprep/iprep_ramnode.txt' # ERROR
        source = 'cybersweat.shop'
        proxies = {'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'}
        _info = self.get(url=url, proxies=proxies)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.split(';')[0].strip()
            updatetime = _ip.split(';')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("cybersweat 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("cybersweat 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class CrawlGithubusercontent(Task):
    def __init__(self):
        super().__init__('githubusercontent data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/botscout_1d.ipset',
                'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/cruzit_web_attacks.ipset'
                ]
        source = 'raw.githubusercontent.com'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'): continue
                if _ip == '': continue
                ip = _ip.strip()
                block = [ip, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("githubusercontent 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("githubusercontent 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class Crawlemergingthreats(Task):
    def __init__(self):
        super().__init__('emergingthreats data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://rules.emergingthreats.net/open/suricata/rules/botcc.rules'
        source = 'rules.emergingthreats.net'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        blocks = re.findall(r'(\[.*\))', _info)
        for block in blocks:
            _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', block)
            description = re.search('classtype:(.*?);', block).group(1)
            for ip in _ips:
                # Append directly instead of rebinding the outer loop variable `block`.
                ips.append([ip, description, source])
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("emergingthreats 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("emergingthreats 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class CrawlRansomwaretracker(Task):
    def __init__(self):
        super().__init__('ransomwaretracker data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt'
        source = 'ransomwaretracker.abuse.ch'
        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        description = 'Ransomware'
        for domain in info:
            if domain.startswith('#'): continue
            if domain == '': continue
            block = [domain, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("ransomwaretracker 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("ransomwaretracker 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('sync_sqlite')
        self.name = name
        self.time = time

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def process(self):
        pass

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info('Sync succeeded')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def run(self):
        self.start()
        self.process()
        self.finish()
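
# Hedged sketch (illustrative only, not from the source): this Task variant drives
# process() rather than run_crawl(), so a concrete job overrides process() and funnels
# writes through safe_commit(). The class and task name below are hypothetical.
class _ExampleSyncTask(Task):
    def __init__(self):
        super().__init__('example sync')

    def process(self):
        # Build or update ORM objects here, then persist them via the base helper:
        # self.safe_commit(some_model_instance)
        pass
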
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.baidu.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.cookies = {}
        self.headers = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.job_type + value.createTime + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.

        Returns:
            the HTML code for this url
        """
        # try:
        #     response = requests.get(url, headers=self.headers)
        # except Exception as e:
        #     self.logger.warning(url + ' ' + str(e))
        #     return False
        # return response.text
        response = self.get(url)
        return response

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html, tag):
        """
        The analysis page gets all the urls you need to crawl.

        Args:
            html: The page HTML that needs to be analyzed
        """
        pass

    def handle_info_html(self, html, url, tag):
        """
        The analysis page extracts all vulnerability information and commit to database.

        Args:
            html: The page HTML that needs to be analyzed
            url: The page url that needs to be analyzed
        """
        pass
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.header = {
            'Accept-Language':
            'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101'
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.

        Returns:
            the HTML code for this url
        """
        # try:
        #     response = requests.get(url, headers=self.headers)
        # except Exception as e:
        #     self.logger.warning(url + ' ' + str(e))
        #     return False
        # return response.text
        response = self.get(url)
        return response

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html):
        """
        The analysis page gets all the urls you need to crawl.

        Args:
            html: The page HTML that needs to be analyzed
        """
        pass

    def handle_info_html(self, html, url):
        """
        The analysis page extracts all vulnerability information and commit to database.

        Args:
            html: The page HTML that needs to be analyzed
            url: The page url that needs to be analyzed
        """
        pass

    def is_ip(self, _str):
        # Use a raw string so the regex escapes are not mangled by Python.
        p = re.compile(
            r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$'
        )
        return bool(p.match(_str))
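
# Hedged usage sketch: is_ip() is used further down (e.g. by CrawlCybercrime) to skip
# rows whose "domain" column actually holds an IP address. A standalone rehearsal of
# the same anchored octet pattern:
import re
_sample_ip_pattern = re.compile(
    r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$')
print(bool(_sample_ip_pattern.match('198.51.100.7')))   # -> True
print(bool(_sample_ip_pattern.match('evil.example')))   # -> False
print(bool(_sample_ip_pattern.match('999.1.1.1')))      # -> False
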
class CrawlDataplane(Task):
    def __init__(self):
        super().__init__('dataplane data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://dataplane.org/dnsrd.txt',
                'https://dataplane.org/dnsrdany.txt',
                'https://dataplane.org/dnsversion.txt',
                'https://dataplane.org/sipinvitation.txt',
                'https://dataplane.org/sipquery.txt',
                'https://dataplane.org/sipregistration.txt',
                'https://dataplane.org/sshclient.txt',
                'https://dataplane.org/sshpwauth.txt',
                'https://dataplane.org/vncrfb.txt'
                ]
        source = 'dataplane.org'
        for url in urls:
            _info = self.get(url=url)

            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'): continue
                if _ip == '': continue
                asn = _ip.split('|')[0].strip()
                asname = _ip.split('|')[1].strip()
                ip = _ip.split('|')[2].strip()
                updatetime = _ip.split('|')[3].strip().split()[0]
                category = _ip.split('|')[4].strip()
                block = [ip, updatetime, source, asname, asn, category]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source, asname, asn, category in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        new_ip.asname = asname
                        new_ip.asn = asn
                        new_ip.category = category
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("dataplane 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("dataplane 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
class CrawlMalwaredomainlist(Task):
    def __init__(self):
        super().__init__('dshield data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        urls = [
            'https://secure.dshield.org/feeds/suspiciousdomains_High.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Medium.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Low.txt'
        ]
        source = 'secure.dshield.org'
        for url in urls:

            obj = urlparse(url)
            description = obj.path.split('/')[-1].split('.')[0]
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
            for domain in info:
                if domain.startswith('#'): continue
                if domain == '': continue
                if domain == 'Site': continue
                block = [domain, source, description]
                domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source, description in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        new_domain.description = description
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("dshield 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("dshield 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
class CrawlNetlab(Task):
    def __init__(self):
        super().__init__('netlab data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://data.netlab.360.com/feeds/dga/dga.txt'
        # url='http://67.209.191.170:81/1.txt'
        source = 'data.netlab.360.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'): continue
            if _domain == '': continue
            domain = _domain.split()[1]
            description = _domain.split()[0]
            updatetime = _domain.split()[2]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("netlab 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("netlab 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
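
# Hedged sketch (hypothetical line; the field order is an assumption taken from the
# whitespace-split indices in run_crawl above): how one 360 Netlab DGA entry becomes
# the [domain, updatetime, description, source] block used by CrawlNetlab.
_sample_dga_line = "examplefam   xqzkpmvd.example   2018-01-01 00:00:00   2018-01-02 00:00:00"
_sample_dga_parts = _sample_dga_line.split()
print([_sample_dga_parts[1], _sample_dga_parts[2], _sample_dga_parts[0], 'data.netlab.360.com'])
# -> ['xqzkpmvd.example', '2018-01-01', 'examplefam', 'data.netlab.360.com']
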
class CrawlAlienvault(Task):
    def __init__(self):
        super().__init__('alienvault data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://reputation.alienvault.com/reputation.generic'
        source = 'reputation.alienvault.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _ip)[0]
            des = re.findall(r'# (.*?)(([A-Z]{2})|(\s)?),', _ip)
            description = des[0][0]
            area = des[0][1] + ',' + re.findall(r',(.*?),', _ip)[0]
            block = [ip, description, area, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, area, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.area = area
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("alienvault 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("alienvault 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
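
# Hedged sketch (the sample line below is a hypothetical shape consistent with the
# regexes in CrawlAlienvault.run_crawl above, not a quoted reputation.generic record):
# how one entry would be decomposed into [ip, description, area, source]. Note the
# captured description keeps its trailing space.
import re
_sample_rep_line = "203.0.113.9 # Malicious Host US,Example City,0.0,0.0"
_sample_rep_ip = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _sample_rep_line)[0]
_sample_rep_des = re.findall(r'# (.*?)(([A-Z]{2})|(\s)?),', _sample_rep_line)
_sample_rep_area = _sample_rep_des[0][1] + ',' + re.findall(r',(.*?),', _sample_rep_line)[0]
print([_sample_rep_ip, _sample_rep_des[0][0], _sample_rep_area, 'reputation.alienvault.com'])
# -> ['203.0.113.9', 'Malicious Host ', 'US,Example City', 'reputation.alienvault.com']
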
class CrawlRulez(Task):
    def __init__(self):
        super().__init__('rulez data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://danger.rulez.sk/projects/bruteforceblocker/blist.php'
        source = 'danger.rulez.sk'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.split('#')[0].strip()
            updatetime = _ip.split('#')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("rulez 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("rulez  抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
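
# Hedged sketch (hypothetical line; the "ip # timestamp ..." shape is an assumption
# read off the split('#') indices in run_crawl above): one bruteforceblocker entry and
# the [ip, updatetime, source] block CrawlRulez builds from it.
_sample_bfb_line = "203.0.113.77   # 2018-01-01 06:07:08   12   id987654"
print([_sample_bfb_line.split('#')[0].strip(),
       _sample_bfb_line.split('#')[1].strip().split()[0].strip(),
       'danger.rulez.sk'])
# -> ['203.0.113.77', '2018-01-01', 'danger.rulez.sk']
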
class CrawlCybercrime(Task):
    def __init__(self):
        super().__init__('cybercrime data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://cybercrime-tracker.net/ccam.php'
        source = 'cybercrime-tracker.net'
        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        soup = BeautifulSoup(_info, 'lxml')
        table = soup.findChildren('tbody')[2]
        rows = table.findChildren('tr', attrs={'class': 'monitoring'})
        for row in rows:
            date_str = row.findChildren('td')[1].string
            time_obj = time.strptime(date_str, "%d/%m/%Y %H:%M:%S")
            updatetime = time.strftime("%Y-%m-%d", time_obj)
            domain = row.findChildren('td')[2].string
            hashstr = row.findChildren('td')[3].string
            if self.is_ip(domain): continue
            block = [domain, updatetime, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0

        if len(domains) > 0:
            try:
                for domain, updatetime, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("cybercrime 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("cybercrime 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
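
# Hedged sketch (hypothetical markup; the column meanings are assumptions inferred from
# the td indices in CrawlCybercrime.run_crawl above): how one monitoring row is read
# with BeautifulSoup and its date normalized to YYYY-MM-DD. bs4 and lxml are assumed
# installed, as the class above already relies on them.
import time
from bs4 import BeautifulSoup

_sample_ccam_table = (
    '<table><tbody><tr class="monitoring">'
    '<td>1</td><td>01/02/2018 10:11:12</td><td>panel.example</td><td>d41d8cd9</td>'
    '</tr></tbody></table>')
_sample_ccam_row = BeautifulSoup(_sample_ccam_table, 'lxml').find('tr', attrs={'class': 'monitoring'})
_sample_ccam_cells = _sample_ccam_row.findChildren('td')
print([_sample_ccam_cells[2].string,
       time.strftime("%Y-%m-%d", time.strptime(_sample_ccam_cells[1].string, "%d/%m/%Y %H:%M:%S")),
       'cybercrime-tracker.net'])
# -> ['panel.example', '2018-02-01', 'cybercrime-tracker.net']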