class CrawlBadips(Task):
    """Crawl the badips.com blocklist (any category, score >= 2, last 7 days)."""

    def __init__(self):
        super().__init__('badips data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.badips.com/get/list/any/2?age=7d'
        source = 'www.badips.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            ip = _ip.strip()
            block = [ip, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("badips collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("badips crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
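# A small illustration of reading the stored indicators back out of the same
# models the crawlers write to. check_ip is a hypothetical helper for this
# excerpt, not part of the project; it only touches attributes the crawlers
# above and below actually set (ip, source, updatetime).
def check_ip(address):
    """Return (source, updatetime) pairs for every feed that reported this IP."""
    rows = db.session.query(IP).filter(IP.ip == address).all()
    return [(row.source, row.updatetime) for row in rows]

# Example: check_ip('198.51.100.7') might return [('www.badips.com', '2018-05-01')].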
class CrawlBambenekconsulting(Task):
    """Crawl the Bambenek Consulting C2 domain master list."""

    def __init__(self):
        super().__init__('bambenekconsulting data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://osint.bambenekconsulting.com/feeds/c2-dommasterlist-high.txt'
        source = 'osint.bambenekconsulting.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'):
                continue
            if _domain == '':
                continue
            fields = _domain.split(',')
            domain = fields[0]
            description = re.search('Domain used by (.*)', fields[1]).group(1)
            updatetime = fields[2].split(' ')[0]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("bambenekconsulting collected {0} records, {1} new".format(all_count, avail_count))
        self.logger.info("bambenekconsulting crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlMaxmind(Task):
    """Crawl the MaxMind sample list of higher-risk IPs."""

    def __init__(self):
        super().__init__('maxmind data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.maxmind.com/en/high-risk-ip-sample-list'
        source = 'www.maxmind.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _info)
        description = 'Higher Risk IP'
        for ip in _ips:
            block = [ip, description, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("maxmind collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("maxmind crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
class CrawlMalwaredomains(Task):
    """Crawl the malwaredomains.com "justdomains" blocklist."""

    def __init__(self):
        super().__init__('malwaredomains data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://mirror4.malwaredomains.com/files/justdomains'
        source = 'mirror4.malwaredomains.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for domain in info:
            # skip comment lines and the trailing empty line
            if domain.startswith('#') or domain == '':
                continue
            block = [domain, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("malwaredomains collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("malwaredomains crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
class CrawlCybersweat(Task):
    """Crawl the cybersweat.shop IP reputation feed (fetched through a local SOCKS5 proxy)."""

    def __init__(self):
        super().__init__('cybersweat data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://cybersweat.shop/iprep/iprep_ramnode.txt'  # ERROR
        source = 'cybersweat.shop'
        proxies = {'http': 'socks5://127.0.0.1:1080',
                   'https': 'socks5://127.0.0.1:1080'}
        _info = self.get(url=url, proxies=proxies)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            fields = _ip.split(';')
            ip = fields[0].strip()
            updatetime = fields[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("cybersweat collected {0} records, {1} new".format(all_count, avail_count))
        self.logger.info("cybersweat crawl time {0}, storage time {1}".format(crawl_time, storage_time))
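# The HTTP helper self.get() used by every crawler is inherited from BaseReq,
# which is not part of this excerpt. The function below is a minimal sketch of
# what such a helper might look like, assuming the requests library; SOCKS5
# proxies (as used by CrawlCybersweat above) additionally require the
# requests[socks] / PySocks extra. fetch_text is a hypothetical name, not the
# project's actual implementation.
import requests

def fetch_text(url, headers=None, cookies=None, proxies=None, timeout=30):
    """Return the response body as text, or None on any request error."""
    try:
        resp = requests.get(url, headers=headers, cookies=cookies,
                            proxies=proxies, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None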
class CrawlGithubusercontent(Task):
    """Crawl selected firehol blocklist-ipsets published on raw.githubusercontent.com."""

    def __init__(self):
        super().__init__('githubusercontent data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/botscout_1d.ipset',
                'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/cruzit_web_attacks.ipset'
                ]
        source = 'raw.githubusercontent.com'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'):
                    continue
                if _ip == '':
                    continue
                ip = _ip.strip()
                block = [ip, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("githubusercontent collected {0} records, {1} new".format(all_count, avail_count))
        self.logger.info("githubusercontent crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class Crawlemergingthreats(Task):
    """Crawl C2 addresses from the Emerging Threats botcc.rules Suricata ruleset."""

    def __init__(self):
        super().__init__('emergingthreats data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://rules.emergingthreats.net/open/suricata/rules/botcc.rules'
        source = 'rules.emergingthreats.net'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        # each match spans the rule's IP list "[...]" up to the closing ")"
        rules = re.findall(r'(\[.*\))', _info)
        for rule in rules:
            _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', rule)
            description = re.search('classtype:(.*?);', rule).group(1)
            for ip in _ips:
                block = [ip, description, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("emergingthreats collected {0} records, {1} new".format(all_count, avail_count))
        self.logger.info("emergingthreats crawl time {0}, storage time {1}".format(crawl_time, storage_time))
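# botcc.rules is a Suricata ruleset whose destination lists carry the C2
# addresses; the regexes in Crawlemergingthreats pull each "[...] ... )" span
# and its classtype out of the rule text. A standalone illustration on a
# made-up rule fragment (documentation-range IPs, not real indicators):
import re

sample_rule = ('alert ip $HOME_NET any -> [192.0.2.1,192.0.2.2] any '
               '(msg:"ET CNC Example"; classtype:trojan-activity; sid:1;)')
span = re.findall(r'(\[.*\))', sample_rule)[0]
# re.findall(r'[0-9]+(?:\.[0-9]+){3}', span)    -> ['192.0.2.1', '192.0.2.2']
# re.search('classtype:(.*?);', span).group(1)  -> 'trojan-activity'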
class CrawlRansomwaretracker(Task):
    """Crawl the abuse.ch Ransomware Tracker domain blocklist (RW_DOMBL)."""

    def __init__(self):
        super().__init__('ransomwaretracker data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt'
        source = 'ransomwaretracker.abuse.ch'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        description = 'Ransomware'
        for domain in info:
            if domain.startswith('#'):
                continue
            if domain == '':
                continue
            block = [domain, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("ransomwaretracker collected {0} records, {1} new".format(all_count, avail_count))
        self.logger.info("ransomwaretracker crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class Task(BaseReq):
    """Base task for sync jobs: logs start/finish and commits records safely."""

    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('sync_sqlite')
        self.name = name
        self.time = time

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def process(self):
        pass

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info('sync succeeded')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def run(self):
        self.start()
        self.process()
        self.finish()
class Task(BaseReq):
    """Base task for crawlers that need browser-like request headers."""

    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.baidu.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()
class Task(BaseReq):
    """Base task for crawlers that paginate a listing and parse detail pages."""

    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.cookies = {}
        self.headers = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.job_type + value.createTime + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.
        Args:
            url: The url you need to crawl.
        Returns:
            The HTML code for this url.
        """
        return self.get(url)

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html, tag):
        """
        Analyze a listing page and collect all the urls that need to be crawled.
        Args:
            html: The page HTML that needs to be analyzed.
        """
        pass

    def handle_info_html(self, html, url, tag):
        """
        Analyze a detail page, extract the vulnerability information and commit it to the database.
        Args:
            html: The page HTML that needs to be analyzed.
            url: The page url that needs to be analyzed.
        """
        pass
class Task(BaseReq):
    """Base task for vulnerability crawlers; also provides a small IPv4 check helper."""

    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.header = {
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101'
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.
        Args:
            url: The url you need to crawl.
        Returns:
            The HTML code for this url.
        """
        return self.get(url)

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html):
        """
        Analyze a listing page and collect all the urls that need to be crawled.
        Args:
            html: The page HTML that needs to be analyzed.
        """
        pass

    def handle_info_html(self, html, url):
        """
        Analyze a detail page, extract the vulnerability information and commit it to the database.
        Args:
            html: The page HTML that needs to be analyzed.
            url: The page url that needs to be analyzed.
        """
        pass

    def is_ip(self, _str):
        # dotted-quad IPv4 check, each octet 0-255
        p = re.compile(
            r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$'
        )
        return p.match(_str) is not None
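# All of the Task base classes above follow the same template-method flow:
# run() calls start(), then the subclass's run_crawl(), then finish(), with
# get() and safe_commit() provided by the base. A minimal, self-contained
# illustration of that flow (EchoTask is hypothetical, not part of the project):
class EchoTask(Task):
    def __init__(self):
        super().__init__('echo demo')

    def run_crawl(self):
        # a real crawler would call self.get(url) here and parse the response
        self.logger.info('run_crawl called for: {}'.format(self.name))

# EchoTask().run()  # logs "Task Started: echo demo", the line above,
#                   # then "Task Finished: echo demo"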
class CrawlDataplane(Task):
    """Crawl the dataplane.org feeds (DNS, SIP, SSH and VNC abuse reports)."""

    def __init__(self):
        super().__init__('dataplane data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://dataplane.org/dnsrd.txt',
                'https://dataplane.org/dnsrdany.txt',
                'https://dataplane.org/dnsversion.txt',
                'https://dataplane.org/sipinvitation.txt',
                'https://dataplane.org/sipquery.txt',
                'https://dataplane.org/sipregistration.txt',
                'https://dataplane.org/sshclient.txt',
                'https://dataplane.org/sshpwauth.txt',
                'https://dataplane.org/vncrfb.txt'
                ]
        source = 'dataplane.org'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'):
                    continue
                if _ip == '':
                    continue
                fields = _ip.split('|')
                asn = fields[0].strip()
                asname = fields[1].strip()
                ip = fields[2].strip()
                updatetime = fields[3].strip().split()[0]
                category = fields[4].strip()
                block = [ip, updatetime, source, asname, asn, category]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source, asname, asn, category in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        new_ip.asname = asname
                        new_ip.asn = asn
                        new_ip.category = category
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("dataplane collected {0} records, {1} new".format(all_count, avail_count))
        self.logger.info("dataplane crawl time {0}, storage time {1}".format(crawl_time, storage_time))
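# Each dataplane.org feed line is pipe-delimited:
# "ASN | AS name | IP | last seen timestamp | category". A standalone
# illustration of the field handling in CrawlDataplane.run_crawl, using a
# made-up sample line (documentation-range ASN and IP):
def parse_dataplane_line(line):
    fields = [field.strip() for field in line.split('|')]
    asn, asname, ip, seen, category = fields
    return {'asn': asn, 'asname': asname, 'ip': ip,
            'updatetime': seen.split()[0], 'category': category}

sample = '64496 | EXAMPLE-AS | 192.0.2.10 | 2018-05-01 12:34:56 | sshpwauth'
# parse_dataplane_line(sample)['updatetime'] == '2018-05-01'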
class CrawlMalwaredomainlist(Task):
    """Crawl the dshield.org suspicious-domain feeds (High/Medium/Low sensitivity)."""

    def __init__(self):
        super().__init__('dshield data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        urls = [
            'https://secure.dshield.org/feeds/suspiciousdomains_High.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Medium.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Low.txt'
        ]
        source = 'secure.dshield.org'
        for url in urls:
            obj = urlparse(url)
            # e.g. "suspiciousdomains_High", used as the record description
            description = obj.path.split('/')[-1].split('.')[0]
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None " + source)
                return None
            info = _info.split('\n')
            for domain in info:
                if domain.startswith('#'):
                    continue
                # skip the "Site" column header and empty lines
                if domain == 'Site' or domain == '':
                    continue
                block = [domain, source, description]
                domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source, description in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        new_domain.description = description
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("dshield collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("dshield crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
class CrawlNetlab(Task):
    """Crawl the 360 Netlab DGA domain feed."""

    def __init__(self):
        super().__init__('netlab data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://data.netlab.360.com/feeds/dga/dga.txt'
        # url = 'http://67.209.191.170:81/1.txt'
        source = 'data.netlab.360.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'):
                continue
            if _domain == '':
                continue
            # feed line format: "<dga family> <domain> <start time> <end time>"
            fields = _domain.split()
            domain = fields[1]
            description = fields[0]
            updatetime = fields[2]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("netlab collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("netlab crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
class CrawlAlienvault(Task):
    """Crawl the AlienVault IP reputation feed (reputation.generic)."""

    def __init__(self):
        super().__init__('alienvault data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://reputation.alienvault.com/reputation.generic'
        source = 'reputation.alienvault.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            ip = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _ip)[0]
            # the comment part of each line carries the activity type and country code
            des = re.findall(r'# (.*?)(([A-Z]{2})|(\s)?),', _ip)
            description = des[0][0]
            area = des[0][1] + ',' + re.findall(r',(.*?),', _ip)[0]
            block = [ip, description, area, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, area, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.area = area
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("alienvault collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("alienvault crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
class CrawlRulez(Task):
    """Crawl the danger.rulez.sk BruteForceBlocker IP blocklist."""

    def __init__(self):
        super().__init__('rulez data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://danger.rulez.sk/projects/bruteforceblocker/blist.php'
        source = 'danger.rulez.sk'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            # each line is "<ip> # <last reported date> ..."
            ip = _ip.split('#')[0].strip()
            updatetime = _ip.split('#')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("rulez collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("rulez crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
class CrawlCybercrime(Task):
    """Crawl monitored C2 domains from the cybercrime-tracker.net CCAM page."""

    def __init__(self):
        super().__init__('cybercrime data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://cybercrime-tracker.net/ccam.php'
        source = 'cybercrime-tracker.net'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None " + source)
            return None
        soup = BeautifulSoup(_info, 'lxml')
        table = soup.findChildren('tbody')[2]
        rows = table.findChildren('tr', attrs={'class': 'monitoring'})
        for row in rows:
            cells = row.findChildren('td')
            date_str = cells[1].string
            time_obj = time.strptime(date_str, "%d/%m/%Y %H:%M:%S")
            updatetime = time.strftime("%Y-%m-%d", time_obj)
            domain = cells[2].string
            hashstr = cells[3].string  # hash column, currently unused
            if self.is_ip(domain):
                continue
            block = [domain, updatetime, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database " + str(e) + " " + source)
        else:
            self.logger.warning("No record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("cybercrime collected {0} records, {1} new".format(
            all_count, avail_count))
        self.logger.info("cybercrime crawl time {0}, storage time {1}".format(
            crawl_time, storage_time))
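# How these crawler tasks are scheduled is not shown in this excerpt. Below is
# a minimal sketch of one way to run them periodically with APScheduler; the
# library choice, the task selection and the 6-hour interval are assumptions,
# not the project's actual setup.
from apscheduler.schedulers.blocking import BlockingScheduler

def run_all_threat_crawlers():
    for task_cls in (CrawlBadips, CrawlMaxmind, CrawlNetlab,
                     CrawlDataplane, CrawlRulez, CrawlCybercrime):
        try:
            task_cls().run()
        except Exception as e:
            print('crawler failed: {} {}'.format(task_cls.__name__, e))

if __name__ == '__main__':
    scheduler = BlockingScheduler()
    scheduler.add_job(run_all_threat_crawlers, 'interval', hours=6)
    scheduler.start()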