class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('sync_sqlite')
        self.name = name
        self.time = time

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def process(self):
        pass

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info('Sync committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def run(self):
        self.start()
        self.process()
        self.finish()
class BaseReq:
    def __init__(self, is_crawl=True):
        self.ses = db.session
        self.is_crawl = is_crawl
        self.logger = ContextLogger('crawl')

    def _request(self, url, method='post', timeout=20, retry=5, **kwargs):
        # Pop headers/cookies out of kwargs so they are not passed twice to
        # requests.request() below, and re-pass them explicitly on retries.
        headers = kwargs.pop('headers', {})
        cookies = kwargs.pop('cookies', {})
        try:
            resp = requests.request(method, url, timeout=timeout,
                                    headers=headers, cookies=cookies, **kwargs)
        except Exception as e:
            self.logger.warning(e)
            if retry > 0:
                return self._request(url, method, timeout, retry=retry - 1,
                                     headers=headers, cookies=cookies, **kwargs)
            return None
        if resp.status_code != 200 and retry > 0:
            return self._request(url, method, timeout, retry=retry - 1,
                                 headers=headers, cookies=cookies, **kwargs)
        if self.is_crawl:
            return resp.text
        try:
            data = resp.json()
        except Exception as e:
            self.logger.warning(e)
            data = None
        return data

    def get(self, url, **kwargs):
        return self._request(url, method='get', **kwargs)

    def post(self, url, **kwargs):
        return self._request(url, method='post', **kwargs)
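# Usage sketch (not part of the original module): how BaseReq is typically
# driven. This assumes the app context used above (db, ContextLogger) is
# already configured; the URLs are placeholders, not real project endpoints.
if __name__ == '__main__':
    req = BaseReq(is_crawl=True)           # is_crawl=True -> raw response text
    text = req.get('http://example.com/feed.txt', timeout=10)
    if text is None:
        print('request failed after retries')

    api = BaseReq(is_crawl=False)          # is_crawl=False -> parsed JSON or None
    data = api.post('http://example.com/api', data={'page': 1})
    print(data)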
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.baidu.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()
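# Sketch of the subclass contract (illustrative only): a concrete crawler
# implements run_crawl(), and Task.run() wraps it with start/finish logging.
# EchoTask is a hypothetical example class, not part of the project.
class EchoTask(Task):
    def __init__(self):
        super().__init__('echo task')

    def run_crawl(self):
        # A real crawler would call self.get(...) and self.safe_commit(...)
        self.logger.info('run_crawl called for {}'.format(self.name))


if __name__ == '__main__':
    EchoTask().run()   # logs "Task Started", runs run_crawl, logs "Task Finished"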
class CrawlBambenekconsulting(Task):
    def __init__(self):
        super().__init__('bambenekconsulting data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://osint.bambenekconsulting.com/feeds/c2-dommasterlist-high.txt'
        source = 'osint.bambenekconsulting.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'):
                continue
            if _domain == '':
                continue
            domain = _domain.split(',')[0]
            description = re.search('Domain used by (.*)', _domain.split(',')[1]).group(1)
            updatetime = _domain.split(',')[2].split(' ')[0]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    # Insert new domains, refresh the timestamp of known ones.
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("bambenekconsulting collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("bambenekconsulting crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlBadips(Task):
    def __init__(self):
        super().__init__('badips data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.badips.com/get/list/any/2?age=7d'
        source = 'www.badips.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            ip = _ip.strip()
            block = [ip, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("badips collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("badips crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlMaxmind(Task):
    def __init__(self):
        super().__init__('maxmind data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.maxmind.com/en/high-risk-ip-sample-list'
        source = 'www.maxmind.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _info)
        description = 'Higher Risk IP'
        for ip in _ips:
            block = [ip, description, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("maxmind collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("maxmind crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class BaseReq:
    def __init__(self, is_crawl=True):
        self.ses = db.session
        self.is_crawl = is_crawl
        self.logger = ContextLogger('crawl')
        self.headers = {
            'Host': 'www.lagou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.lagou.com/jobs/list_Python?px=new&city=%E5%85%A8%E5%9B%BD',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Content-Type': 'multipart/form-data'
        }

    def _request(self, url, method='post', timeout=20, retry=5, **kwargs):
        # Pop cookies out of kwargs so they are not passed twice to
        # requests.request() below, and re-pass them explicitly on retries.
        cookies = kwargs.pop('cookies', {})
        try:
            resp = requests.request(method, url, timeout=timeout,
                                    headers=self.headers, cookies=cookies, **kwargs)
        except Exception as e:
            if retry > 0:
                return self._request(url, method, timeout, retry=retry - 1,
                                     cookies=cookies, **kwargs)
            self.logger.warning('Request failed: {}'.format(e))
            return None
        if resp.status_code != 200 and retry > 0:
            return self._request(url, method, timeout, retry=retry - 1,
                                 cookies=cookies, **kwargs)
        if self.is_crawl:
            return resp.text
        try:
            data = resp.json()
            if (data is None or data.get('success') is False) and retry > 0:
                time.sleep(3)
                return self._request(url, method, timeout, retry=retry - 1,
                                     cookies=cookies, **kwargs)
        except Exception as e:
            if retry > 0:
                return self._request(url, method, timeout, retry=retry - 1,
                                     cookies=cookies, **kwargs)
            self.logger.warning('Failed to parse response: {}'.format(e))
            self.logger.warning(url)
            data = None
        return data

    def get(self, url, **kwargs):
        return self._request(url, method='get', **kwargs)

    def post(self, url, **kwargs):
        return self._request(url, method='post', **kwargs)
class CrawlMalwaredomains(Task):
    def __init__(self):
        super().__init__('malwaredomains data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://mirror4.malwaredomains.com/files/justdomains'
        source = 'mirror4.malwaredomains.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for domain in info:
            if domain == '':
                # Skip blank lines so empty domain records are not stored.
                continue
            block = [domain, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("malwaredomains collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("malwaredomains crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlGithubusercontent(Task):
    def __init__(self):
        super().__init__('githubusercontent data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/botscout_1d.ipset',
                'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/cruzit_web_attacks.ipset'
                ]
        source = 'raw.githubusercontent.com'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None: " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'):
                    continue
                if _ip == '':
                    continue
                ip = _ip.strip()
                block = [ip, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("githubusercontent collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("githubusercontent crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class Crawlemergingthreats(Task):
    def __init__(self):
        super().__init__('emergingthreats data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://rules.emergingthreats.net/open/suricata/rules/botcc.rules'
        source = 'rules.emergingthreats.net'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        blocks = re.findall(r'(\[.*\))', _info)
        for block in blocks:
            _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', block)
            description = re.search('classtype:(.*?);', block).group(1)
            for ip in _ips:
                record = [ip, description, source]
                ips.append(record)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("emergingthreats collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("emergingthreats crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlCybersweat(Task):
    def __init__(self):
        super().__init__('cybersweat data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://cybersweat.shop/iprep/iprep_ramnode.txt'  # ERROR
        source = 'cybersweat.shop'
        proxies = {'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'}
        _info = self.get(url=url, proxies=proxies)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            ip = _ip.split(';')[0].strip()
            updatetime = _ip.split(';')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("cybersweat collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("cybersweat crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlRansomwaretracker(Task):
    def __init__(self):
        super().__init__('ransomwaretracker data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt'
        source = 'ransomwaretracker.abuse.ch'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        description = 'Ransomware'
        for domain in info:
            if domain.startswith('#'):
                continue
            if domain == '':
                continue
            block = [domain, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("ransomwaretracker collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("ransomwaretracker crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlRulez(Task):
    def __init__(self):
        super().__init__('rulez data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://danger.rulez.sk/projects/bruteforceblocker/blist.php'
        source = 'danger.rulez.sk'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            ip = _ip.split('#')[0].strip()
            updatetime = _ip.split('#')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("rulez collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("rulez crawl time {0}, storage time {1}".format(crawl_time, storage_time))
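# The save_info() methods above all follow the same query-then-insert pattern.
# A condensed sketch of that pattern, assuming the IP model and db session used
# above; illustrative only, not a drop-in replacement for save_info().
def upsert_ip(ip, source, updatetime):
    existing = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()
    if existing is None:
        record = IP()
        record.ip = ip
        record.source = source
        record.updatetime = updatetime
        db.session.add(record)
        return True           # new record
    existing.updatetime = updatetime
    db.session.add(existing)
    return False              # already known, timestamp refreshed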
class CrawlDouban(Task):
    def __init__(self):
        super().__init__('douban data crawl')
        self.logger = ContextLogger('douban')

    def run_crawl(self):
        begin_base_url = 'https://book.douban.com/tag/'
        middle_base_url = '?start='
        end_base_url = '&type=S'
        for tag in tag_list:
            start = 0
            while True:
                page_url = begin_base_url + tag + middle_base_url + str(start) + end_base_url
                print(page_url)
                html = self.get(page_url)
                start += 20
                if html:
                    result = self.handle_list_html(html, tag)
                    if result:
                        break

    def handle_list_html(self, html, tag):
        soup = BeautifulSoup(html, 'lxml')
        items = soup.find_all(class_='subject-item')
        if not items:
            return True
        href_list = []
        for item in items:
            href = item.div.a['href']
            href_list.append(href)
        for href in href_list:
            html = self.get(href)
            if html:
                result = self.handle_info_html(html, tag)
                if result:
                    continue
        return False

    def handle_info_html(self, html, type_tag):
        soup = BeautifulSoup(html, 'lxml')
        book = Book()
        # type_id = db.session.query(Type).filter_by(title=tag).first().id
        try:
            title = soup.h1.span.get_text()
            info = soup.find(class_='article').find(class_='indent').find(
                class_='subjectwrap clearfix').find(
                class_='subject clearfix').find(id='info')
            string = info.get_text().strip()
            string = string.replace(' ', '')
            string = string.replace(' ', '')
            string = string.replace('\n', '')
            tag_list = [
                '出版社:', '出品方:', '副标题:', '原作名:', '译者:', '出版年:',
                '页数:', '定价:', '装帧:', '丛书:', 'ISBN:'
            ]
            value_list = []
            if '作者:' in string:
                string = string.replace('作者:', '')
            flag = 0
            for tag in tag_list:
                if tag in string:
                    value = string.split(tag)[0]
                    value_list.append(value)
                    if flag != 0:
                        # Pad empty placeholders for the labels that were missing.
                        for i in range(flag):
                            value_list.append('')
                        flag = 0
                else:
                    flag += 1
                    continue
                string = string.split(tag)[1]
                if tag == 'ISBN:':
                    value_list.append(string)
            author = value_list[0]
            publisher = value_list[1]
            producer = value_list[2]
            subtitle = value_list[3]
            original_title = value_list[4]
            translator = value_list[5]
            year_of_publisher = value_list[6]
            pages = value_list[7]
            price = value_list[8]
            binding = value_list[9]
            series = value_list[10]
            isbn = value_list[11]
            pic_href = soup.find(class_='article').find(class_='indent').find(
                class_='subjectwrap clearfix').find(
                class_='subject clearfix').find(id='mainpic').a['href']
            score = soup.find(class_='rating_self clearfix').strong.get_text().strip()
            score_people = soup.find(class_='rating_people').get_text()
            related_info = soup.find(class_='related_info')
            infos = related_info.find_all(class_='indent')[:2]
            content_info = str(infos[0].find(class_='intro')).replace('<div class="intro">', '')
            author_info = str(infos[1].find(class_='intro')).replace('<div class="intro">', '')
            book.title = title
            book.author = author
            book.publisher = publisher
            book.producer = producer
            book.translator = translator
            book.subtitle = subtitle
            book.original_title = original_title
            book.year_of_publisher = year_of_publisher
            book.pages = pages
            book.price = price
            book.binding = binding
            book.series = series
            book.isbn = isbn
            book.score = score
            book.score_people = score_people
            book.type = type_tag
            book.content_info = content_info
            book.author_info = author_info
            book.pic_href = pic_href
            self.safe_commit(book)
        except Exception as e:
            self.logger.warning('Crawl failed: {}'.format(e))
            return True
        return False
class CrawlCnnvd(Task):
    def __init__(self):
        super().__init__('cnnvd data crawl')
        self.logger = ContextLogger('task_cnnvd')

    def get_api_id(self, cnnvd_str):
        str_1, str_2, str_3 = cnnvd_str.split('-')
        if len(str_3) == 3:
            return str_2 + '0' + str_3
        return str_2 + str_3

    def api_affect_product(self, id, name_str):
        data = {'cvCveid': id, 'counts': 5}
        json_datas = self.post('http://www.cnnvd.org.cn/web/xxk/getEntity.tag', data=data)
        if json_datas == '' or json_datas is None:
            return name_str
        json_datas = json.loads(json_datas)
        for json_data in json_datas:
            name_str = name_str + json_data['cpr_product_name'] + ';'
        return name_str

    def api_patchs(self, id, patch_dict):
        data = {'cvCveid': id, 'counts': 5}
        json_datas = self.post('http://www.cnnvd.org.cn/web/xxk/getEntity.tag', data=data)
        if json_datas == '' or json_datas is None:
            return patch_dict
        json_datas = json.loads(json_datas)
        for json_data in json_datas:
            patch_name = json_data['cp_cname']
            patch_url = 'http://www.cnnvd.org.cn' + '/web/xxk/bdxqById.tag?id=' + str(json_data['cp_id'])
            patch_dict[patch_name] = patch_url
        return patch_dict

    def run_crawl(self):
        begin_base_url = 'http://www.cnnvd.org.cn' + '/web/vulnerability/querylist.tag?pageno='
        end_base_url = '&repairLd='
        index = 1
        while index < 10597:
            next_page = begin_base_url + str(index) + end_base_url
            print(next_page)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break
            index += 1

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for url in soup.find(class_='list_list').ul.find_all('li'):
            vul_url = 'http://www.cnnvd.org.cn' + url.div.a['href']
            html = self.get_one_page(vul_url)
            if html:
                result = self.handle_info_html(html, vul_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        soup = BeautifulSoup(html, 'lxml')
        try:
            simple_list = []
            title = soup.find(class_="detail_xq w770").find('h2').get_text().strip()
            cnnvd_id = soup.find(class_="detail_xq w770").find('ul').find_all(
                'li')[0].span.get_text().strip().split(':')[1]
            for li in soup.find(class_="detail_xq w770").find('ul').find_all('li')[1:]:
                if li.a:
                    simple_list.append(li.a.get_text().strip())
            # If the update time is older than end_time (about one month), return True to stop crawling.
            if simple_list[5] < end_time:
                return True
            vulner_source = soup.find(class_="detail_xq w770").find(
                'ul').find_all('li')[-1].span.get_text().strip().split(':')[1]
            detail_list = []
            for div in soup.find(class_="fl w770").find_all(class_='d_ldjj'):
                str_value = ''
                for p in div.find_all('p'):
                    str_value += p.get_text().strip()
                detail_list.append(str_value)
            if detail_list[3] != '暂无':
                str_value = ''
                affect_products = soup.find(class_="fl w770").find_all(
                    class_='d_ldjj')[3].find_all(class_='a_title2')
                for i in affect_products:
                    str_value += i.get_text().strip() + ';'
                if len(affect_products) == 5:
                    str_value = self.api_affect_product(self.get_api_id(cnnvd_id), str_value)
                detail_list[3] = str_value
            if detail_list[4] != '暂无':
                patch_dict = {}
                patchs = soup.find(class_="fl w770").find_all(
                    class_='d_ldjj')[4].find_all(class_='a_title2')
                for i in patchs:
                    name = i.get_text().strip()
                    patch_url = 'http://www.cnnvd.org.cn' + i['href']
                    patch_dict[name] = patch_url
                if len(patchs) == 5:
                    patch_dict = self.api_patchs(self.get_api_id(cnnvd_id), patch_dict)
                detail_list[4] = patch_dict
        except Exception as e:
            self.logger.warning(url + '--' + str(e))
            return False
        vul = db.session.query(Vulner).filter_by(cnnvd_id=cnnvd_id).first()
        if vul:
            print("record already exists")
        else:
            print("new record")
            vul = Vulner()
        vul.title = title
        vul.vulner_source = vulner_source
        vul.cnnvd_id = cnnvd_id
        vul.url = url
        vul.level = simple_list[0]
        vul.cve_id = simple_list[1]
        vul.vulner_type = simple_list[2]
        vul.posted_time = simple_list[3]
        vul.threats_type = simple_list[4]
        vul.update_time = simple_list[5]
        vul.describe = detail_list[0]
        vul.source = 'cnnvd'
        vul.solve_way = detail_list[1]
        vul.refer_link = detail_list[2]
        vul.affect_product = detail_list[3]
        vul.patch = detail_list[4]
        print(vul.title, '\n', vul.update_time, '\n', vul.cve_id, '\n', vul.url, '\n\n')
        self.safe_commit(vul)
        return False
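# Illustration of how get_api_id() above maps a CNNVD identifier to the numeric
# id used for the getEntity.tag API: the trailing serial is zero-padded to four
# digits. The identifiers below are made-up examples.
def _demo_get_api_id(cnnvd_str):
    _, part2, part3 = cnnvd_str.split('-')
    if len(part3) == 3:
        return part2 + '0' + part3
    return part2 + part3


if __name__ == '__main__':
    assert _demo_get_api_id('CNNVD-201801-123') == '2018010123'
    assert _demo_get_api_id('CNNVD-201801-1234') == '2018011234'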
class CrawlAlienvault(Task):
    def __init__(self):
        super().__init__('alienvault data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://reputation.alienvault.com/reputation.generic'
        source = 'reputation.alienvault.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'):
                continue
            if _ip == '':
                continue
            ip = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _ip)[0]
            des = re.findall(r'# (.*?)(([A-Z]{2})|(\s)?),', _ip)
            description = des[0][0]
            area = des[0][1] + ',' + re.findall(r',(.*?),', _ip)[0]
            block = [ip, description, area, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, area, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.area = area
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("alienvault collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("alienvault crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.cookies = {}
        self.headers = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.job_type + ' ' + value.createTime + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.
        Returns:
            The HTML code for this url.
        """
        return self.get(url)

    def run_crawl(self):
        """Traverse all the pages you need to crawl."""
        pass

    def handle_list_html(self, html, tag):
        """Parse a list page and collect all the URLs that need to be crawled.

        Args:
            html: The page HTML that needs to be analyzed.
            tag: The search keyword this page belongs to.
        """
        pass

    def handle_info_html(self, html, url, tag):
        """Parse a detail page, extract the information, and commit it to the database.

        Args:
            html: The page HTML that needs to be analyzed.
            url: The page url that needs to be analyzed.
            tag: The search keyword this page belongs to.
        """
        pass
class CrawlHackernews(Task):
    def __init__(self):
        super().__init__('hackernews data crawl')
        self.logger = ContextLogger('task_hackernews')

    def run_crawl(self):
        base_url = 'http://hackernews.cc/page/'
        index = 1
        while index < 275:
            next_page = base_url + str(index)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break
            index += 1

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for article in soup.find(class_="classic-lists clearfix").find_all(id='article'):
            article_url = article.find(class_="classic-list-left").a['href']
            html = self.get_one_page(article_url)
            if html:
                result = self.handle_info_html(html, article_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        soup = BeautifulSoup(html, 'lxml')
        try:
            info_url = db.session.query(Infomation).filter_by(url=url).first()
            if info_url:
                return True
            title = soup.find(class_="post-details-right").find(
                class_="single-title").get_text().strip()
            posted_time = soup.find(class_="post-details-right").find(
                class_="light-post-meta").find_all('a')[1].get_text().strip()
            author = soup.find(class_="post-details-right").find(
                class_="light-post-meta").find_all('a')[0].get_text().strip()
            summary = soup.find(class_="post-body clearfix").find('p').get_text().strip()
            source = 'HackerNews'
            keys = ''
            for category in soup.find(class_="post-details-right").find(
                    class_="light-post-meta").find(
                    class_="post-category").find_all('a'):
                keys = keys + category.get_text().strip() + ';'
        except Exception as e:
            self.logger.warning(url + ' ' + str(e))
            return False
        info = Infomation()
        info.source = source
        info.summary = summary
        info.keys = keys
        info.author = author
        info.posted_time = posted_time
        info.title = title
        info.url = url
        print(info.title, '\n', info.author, '\n', info.posted_time, '\n',
              info.url, '\n', info.keys, '\n', info.source, '\n', info.summary, '\n')
        self.safe_commit(info)
        return False
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101'
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + ' committed successfully')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.
        Returns:
            The HTML code for this url.
        """
        return self.get(url)

    def run_crawl(self):
        """Traverse all the pages you need to crawl."""
        pass

    def handle_list_html(self, html):
        """Parse a list page and collect all the URLs that need to be crawled.

        Args:
            html: The page HTML that needs to be analyzed.
        """
        pass

    def handle_info_html(self, html, url):
        """Parse a detail page, extract the information, and commit it to the database.

        Args:
            html: The page HTML that needs to be analyzed.
            url: The page url that needs to be analyzed.
        """
        pass

    def is_ip(self, _str):
        p = re.compile(
            r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$')
        if p.match(_str):
            return True
        return False
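# Quick check of the is_ip() helper above, using a standalone copy of the same
# regex so it can run without the db/ContextLogger context; the sample values
# are arbitrary.
import re

_IP_RE = re.compile(
    r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$')

if __name__ == '__main__':
    assert _IP_RE.match('192.168.1.1') is not None
    assert _IP_RE.match('256.1.1.1') is None
    assert _IP_RE.match('evil-domain.example') is None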
class CrawlLagou(Task):
    def __init__(self):
        super().__init__('LaGou data crawl')
        self.logger = ContextLogger('task_lagou')
        self.is_crawl = False
        self.headers = {
            'Host': 'www.lagou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.lagou.com/jobs/list_Python?px=new&city=%E5%85%A8%E5%9B%BD',
            'Connection': 'keep-alive',
            'Cookie': 'user_trace_token=20171103191801-9206e24f-9ca2-40ab-95a3-23947c0b972a; _ga=GA1.2.545192972.1509707889; LGUID=20171103191805-a9838dac-c088-11e7-9704-5254005c3644; JSESSIONID=ABAAABAACDBABJB2EE720304E451B2CEFA1723CE83F19CC; _gat=1; LGSID=20171228225143-9edb51dd-ebde-11e7-b670-525400f775ce; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKkJPgBHAnny1nUKaLpx2oDfUXv9ItIF3kBAWM2-fDNu%26ck%3D3065.1.126.376.140.374.139.129%26shh%3Dwww.baidu.com%26sht%3Dmonline_3_dg%26wd%3D%26eqid%3Db0ec59d100013c7f000000055a4504f6; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20171228225224-b6cc7abd-ebde-11e7-9f67-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; SEARCH_ID=3ec21cea985a4a5fa2ab279d868560c8',
            'X-Requested-With': 'XMLHttpRequest',
            'X-Anit-Forge-Token': 'None',
            'X-Anit-Forge-Code': '0',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

    def process(self, page, tag):
        base_url = 'https://www.lagou.com/jobs/positionAjax.json?'
        params = {'px': 'new', 'needAddtionalResult': 'false', 'isSchoolJob': '0'}
        data = {'first': 'false', 'pn': str(page), 'kd': tag}
        html = self.post(base_url, data=data, params=params)
        if html:
            try:
                count = int(html['content']['positionResult']['resultSize'])
                job_list = html['content']['positionResult']['result']
                if count == 0:
                    return True
                for job in job_list:
                    self.storage(job, tag)
            except Exception as e:
                self.logger.warning('Failed to read response fields: {}'.format(e))
                self.logger.warning(base_url)

    def storage(self, job_dict, tag):
        for key in job_dict:
            job_dict[key] = self.switch_str(job_dict[key])
        job = Job()
        try:
            job.companyId = job_dict['companyId']
            job.positionName = job_dict['positionName']
            job.workYear = job_dict['workYear']
            job.education = job_dict['education']
            job.jobNature = job_dict['jobNature']
            job.companyLogo = job_dict['companyLogo']
            job.salary = job_dict['salary']
            job.city = job_dict['city']
            job.financeStage = job_dict['financeStage']
            job.industryField = job_dict['industryField']
            job.positionId = job_dict['positionId']
            job.approve = job_dict['approve']
            job.createTime = job_dict['createTime']
            job.positionAdvantage = job_dict['positionAdvantage']
            job.companySize = job_dict['companySize']
            job.companyLabelList = job_dict['companyLabelList']
            job.publisherId = job_dict['publisherId']
            job.score = job_dict['score']
            job.district = job_dict['district']
            job.companyShortName = job_dict['companyShortName']
            job.positionLables = job_dict['positionLables']
            job.industryLables = job_dict['industryLables']
            job.businessZones = job_dict['businessZones']
            job.longitude = job_dict['longitude']
            job.latitude = job_dict['latitude']
            job.adWord = job_dict['adWord']
            job.formatCreateTime = job_dict['formatCreateTime']
            job.hitags = job_dict['hitags']
            job.resumeProcessRate = job_dict['resumeProcessRate']
            job.resumeProcessDay = job_dict['resumeProcessDay']
            job.companyFullName = job_dict['companyFullName']
            job.imState = job_dict['imState']
            job.lastLogin = job_dict['lastLogin']
            job.explain = job_dict['explain']
            job.plus = job_dict['plus']
            job.pcShow = job_dict['pcShow']
            job.appShow = job_dict['appShow']
            job.deliver = job_dict['deliver']
            job.gradeDescription = job_dict['gradeDescription']
            job.promotionScoreExplain = job_dict['promotionScoreExplain']
            job.firstType = job_dict['firstType']
            job.secondType = job_dict['secondType']
            job.isSchoolJob = job_dict['isSchoolJob']
            job.subwayline = job_dict['subwayline']
            job.stationname = job_dict['stationname']
            job.linestaion = job_dict['linestaion']
            job.job_type = tag
        except Exception as e:
            self.logger.warning('Failed to store record: {}'.format(e))
            self.logger.warning(job_dict['publisherId'])
        self.safe_commit(job)

    def switch_str(self, value_list):
        value_str = ''
        try:
            if not isinstance(value_list, list):
                return str(value_list)
            for value in value_list:
                value_str = value_str + value + ';'
        except Exception as e:
            self.logger.warning('Failed to convert value: {}'.format(e))
        return value_str

    def control(self):
        tag_list = []
        with open(FILE_PATH, 'r') as f:
            for line in f:
                tag_list.append(line.strip())
        for tag in tag_list:
            page = 1
            while True:
                result = self.process(page, tag)
                if result:
                    break
                print("current page: {}".format(page))
                page += 1
            self.logger.warning(tag + ' issued ' + str(page) + ' requests in total')
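# Sketch of how control() is driven. FILE_PATH (defined elsewhere in the
# project) is assumed to point at a plain-text file with one search keyword per
# line, for example:
#
#     Python
#     Java
#     Go
#
# The whole LaGou crawl can then be kicked off like this:
if __name__ == '__main__':
    CrawlLagou().control()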
class CrawlZJ(Task):
    def __init__(self):
        super().__init__('Zhilian Zhaopin data crawl')
        self.logger = ContextLogger('task_zhilian')

    def run_crawl(self):
        tag_list = []
        with open(FILE_PATH, 'r') as f:
            for line in f:
                tag_list.append(line.strip().lower())
        for tag in tag_list:
            base_url1 = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%85%A8%E5%9B%BD&kw='
            base_url2 = '&sb=1&sm=0&isfilter=0&fl=489&isadv=0&sg=0e48987408d24799b14e9053dd9a00c7&p='
            index = 1
            while index < 91:
                next_page = base_url1 + tag + base_url2 + str(index)
                html = self.get_one_page(next_page)
                if html:
                    result = self.handle_list_html(html, tag)
                    if result:
                        break
                index += 1

    def handle_list_html(self, html, tag):
        soup = BeautifulSoup(html, 'lxml')
        try:
            list_content = soup.find(class_="newlist_list_content").find_all("table")
        except Exception as e:
            self.logger.warning(e)
            return False
        for i, table in enumerate(list_content):
            if i == 0:
                continue
            try:
                position_url = table.tr.td.div.a['href']
                position_name = table.tr.td.div.a.get_text().strip()
            except Exception as e:
                self.logger.warning(e)
                return False
            if tag not in position_name.lower():
                return True
            html = self.get_one_page(position_url)
            if html:
                result = self.handle_info_html(html, position_url, tag)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url, tag):
        soup = BeautifulSoup(html, 'lxml')
        try:
            url = url
            positionName = soup.find(class_="inner-left fl").h1.get_text().strip()
            companyFullName = soup.find(class_="inner-left fl").h2.get_text().strip()
            companyLabelList = ''
            for i in soup.find(class_="inner-left fl").div.find_all('span'):
                companyLabelList = companyLabelList + i.get_text().strip() + ';'
            list_attr = []
            for i in soup.find(class_="terminalpage-left").find(
                    class_='terminal-ul clearfix').find_all('li'):
                list_attr.append(i.strong.get_text().strip())
        except Exception as e:
            self.logger.warning(url + ' ' + str(e))
            return False
        job = ZlJob()
        job.url = url
        job.positionName = positionName
        job.companyFullName = companyFullName
        job.companyLabelList = companyLabelList
        job.salary = list_attr[0]
        job.city = list_attr[1]
        job.createTime = list_attr[2]
        job.jobNature = list_attr[3]
        job.workYear = list_attr[4]
        job.education = list_attr[5]
        job.positionCount = list_attr[6]
        job.firstType = list_attr[7]
        job.job_type = tag
        self.safe_commit(job)
        return False
class CrawlDataplane(Task):
    def __init__(self):
        super().__init__('dataplane data crawl')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://dataplane.org/dnsrd.txt',
                'https://dataplane.org/dnsrdany.txt',
                'https://dataplane.org/dnsversion.txt',
                'https://dataplane.org/sipinvitation.txt',
                'https://dataplane.org/sipquery.txt',
                'https://dataplane.org/sipregistration.txt',
                'https://dataplane.org/sshclient.txt',
                'https://dataplane.org/sshpwauth.txt',
                'https://dataplane.org/vncrfb.txt'
                ]
        source = 'dataplane.org'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None: " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'):
                    continue
                if _ip == '':
                    continue
                # Feed lines are pipe-delimited: ASN | AS name | IP | last seen | category
                asn = _ip.split('|')[0].strip()
                asname = _ip.split('|')[1].strip()
                ip = _ip.split('|')[2].strip()
                updatetime = _ip.split('|')[3].strip().split()[0]
                category = _ip.split('|')[4].strip()
                block = [ip, updatetime, source, asname, asn, category]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source, asname, asn, category in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()
                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        new_ip.asname = asname
                        new_ip.asn = asn
                        new_ip.category = category
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("dataplane collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("dataplane crawl time {0}, storage time {1}".format(crawl_time, storage_time))
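# Standalone illustration of the pipe-delimited parsing done in
# CrawlDataplane.run_crawl() above, using a made-up line with
# documentation-reserved values (AS64496, 192.0.2.10):
if __name__ == '__main__':
    sample = '64496 | EXAMPLE-AS | 192.0.2.10 | 2018-01-01 00:00:01 | sshpwauth'
    fields = [part.strip() for part in sample.split('|')]
    asn, asname, ip, updatetime, category = fields
    updatetime = updatetime.split()[0]   # keep only the date part
    print(asn, asname, ip, updatetime, category)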
class CrawlDshield(Task):
    def __init__(self):
        super().__init__('dshield data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        urls = [
            'https://secure.dshield.org/feeds/suspiciousdomains_High.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Medium.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Low.txt'
        ]
        source = 'secure.dshield.org'
        for url in urls:
            obj = urlparse(url)
            description = obj.path.split('/')[-1].split('.')[0]
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None: " + source)
                return None
            info = _info.split('\n')
            for domain in info:
                if domain.startswith('#'):
                    continue
                if domain == '' or domain == 'Site':
                    # Skip blank lines and the "Site" column header.
                    continue
                block = [domain, source, description]
                domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source, description in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        new_domain.description = description
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("dshield collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("dshield crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlCybercrime(Task):
    def __init__(self):
        super().__init__('cybercrime data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://cybercrime-tracker.net/ccam.php'
        source = 'cybercrime-tracker.net'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        soup = BeautifulSoup(_info, 'lxml')
        table = soup.findChildren('tbody')[2]
        rows = table.findChildren('tr', attrs={'class': 'monitoring'})
        for row in rows:
            date_str = row.findChildren('td')[1].string
            time_obj = time.strptime(date_str, "%d/%m/%Y %H:%M:%S")
            updatetime = time.strftime("%Y-%m-%d", time_obj)
            domain = row.findChildren('td')[2].string
            hashstr = row.findChildren('td')[3].string  # hash column, currently unused
            if self.is_ip(domain):
                continue
            block = [domain, updatetime, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("cybercrime collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("cybercrime crawl time {0}, storage time {1}".format(crawl_time, storage_time))
class CrawlFreebuf(Task):
    def __init__(self):
        super().__init__('freebuf data crawl')
        self.logger = ContextLogger('task_freebuf')

    def run_crawl(self):
        base_url = 'http://www.freebuf.com/news/page/'
        index = 1
        while index < 290:
            next_page = base_url + str(index)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break
            index += 1

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for tag_div in soup.find_all("div", class_="news-img"):
            article_url = tag_div.a['href']
            print(article_url)
            html = self.get_one_page(article_url)
            if html:
                result = self.handle_info_html(html, article_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        print(url)
        soup = BeautifulSoup(html, 'lxml')
        try:
            url = url
            info_url = db.session.query(Infomation).filter_by(url=url).first()
            if info_url:
                print("record already exists")
                return True
            posted_time = soup.find(class_="property").find(class_="time").get_text().strip()
            title = soup.find(class_="articlecontent").find(class_="title").h2.get_text().strip()
            author = soup.find(class_="property").find(rel="author").get_text().strip()
            keys_list = soup.find(class_="property").find(class_="tags").find_all('a')
            key_str = ''
            for key in keys_list:
                key_str = key_str + key.get_text().strip() + ';'
            summary = ''
            for p_string in soup.find(id="contenttxt").find_all(
                    style=re.compile(r"color: rgb\(0, 176, 80\);*$")):
                if p_string.get_text() is None:
                    continue
                summary += p_string.get_text().strip()
            if summary == '':
                summary = soup.find(id="contenttxt").find('p').get_text().strip()
        except Exception as e:
            self.logger.warning(url + ' ' + str(e))
            return False
        info = Infomation()
        info.title = title
        info.url = url
        info.posted_time = posted_time
        info.author = author
        info.summary = summary
        info.source = 'Freebuf'
        info.keys = key_str
        print(info.title, info.author, info.posted_time, info.url, key_str)
        self.safe_commit(info)
        return False
class CrawlNetlab(Task):
    def __init__(self):
        super().__init__('netlab data crawl')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://data.netlab.360.com/feeds/dga/dga.txt'
        # url = 'http://67.209.191.170:81/1.txt'
        source = 'data.netlab.360.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None: " + source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'):
                continue
            if _domain == '':
                continue
            domain = _domain.split()[1]
            description = _domain.split()[0]
            updatetime = _domain.split()[2]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + " seconds"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain, Domain.source == source).first()
                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: " + str(e) + ' ' + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + " seconds"
        self.logger.info("netlab collected {0} records in total, {1} new".format(all_count, avail_count))
        self.logger.info("netlab crawl time {0}, storage time {1}".format(crawl_time, storage_time))
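# A possible way to run several of the collectors above in one pass
# (illustrative only; in the real project the classes live in separate
# modules, so the imports and the exact task list are assumptions):
if __name__ == '__main__':
    tasks = [
        CrawlNetlab(),
        CrawlRulez(),
        CrawlDataplane(),
        CrawlRansomwaretracker(),
    ]
    for task in tasks:
        task.run()   # Task.run() -> start(), run_crawl(), finish()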