# Standard-library and third-party imports used throughout this module.
# Project-local helpers (Config, MyLog, CnnvdDao, CNVDDao, UrlKeyWordDao) are
# assumed to be importable from the project's own packages; their exact import
# paths are not shown in this listing.
import os
import platform
import re
import time
import urllib.error
import urllib.parse
from urllib import request

import bs4
from bs4 import BeautifulSoup
from selenium import webdriver


class SpiderUtil():
    def __init__(self):
        self.logger = MyLog().getLogger()

    # Fetch a URL with urllib, retrying up to MAX_NUM times, and return a soup
    def getSoup(self, url):
        req = request.Request(url, headers=Config().getHeader())
        for i in range(Config().getMAX_NUM()):
            try:
                resp = request.urlopen(req)
            except urllib.error.URLError as e:
                if i < Config().getMAX_NUM() - 1:
                    continue
                else:
                    self.logger.error("{}: still failing after {} attempts".format(
                        url, Config().getMAX_NUM()))
                    return
            content = resp.read()
            soup = BeautifulSoup(content, "lxml")
            return soup

    # Fetch a URL through the Selenium-backed Webdriver class and return a soup
    def getSoupByWebDriver(self, url):
        driver = Webdriver()
        content = driver.getPage_source(url)
        soup = BeautifulSoup(content, "lxml")
        driver.close()
        return soup
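
# A minimal usage sketch for SpiderUtil, assuming Config().getHeader() returns the
# request headers dict and Config().getMAX_NUM() the retry limit used above.
# getSoup() returns None when every retry fails, so callers should check for that.
# The helper name below is only illustrative.
def demoSpiderUtil():
    util = SpiderUtil()
    soup = util.getSoup("http://www.cnnvd.org.cn/web/vulnerability/querylist.tag")
    if soup is not None:
        # Quick sanity check: print the page title if one is present
        print(soup.title.getText() if soup.title else "page has no <title>")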
class Webdriver():
    # Initialize and launch the browser
    def __init__(self):
        __driver = "geckodriver"
        self.logger = MyLog().getLogger()
        # Locate the driver executable next to this file
        driverpath = ""
        if platform.system() == "Windows":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver + ".exe")
        elif platform.system() == "Linux":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver)
        else:
            self.logger.error("Browser driver error: driver file {} was not "
                              "found under folder {}".format(
                                  __driver, os.path.dirname(driverpath)))
        #print(driverpath)
        # Configure Firefox launch options
        options = webdriver.FirefoxOptions()
        #options.add_argument('-headless')
        self.browser = webdriver.Firefox(firefox_options=options,
                                         executable_path=driverpath)
        # Wait implicitly for up to 10 seconds so page elements can finish loading
        self.browser.implicitly_wait(10)

    # Return the page source of the given URL
    def getPage_source(self, url):
        self.browser.get(url)
        # Refresh to make sure the complete page is loaded
        self.browser.refresh()
        time.sleep(0.3)
        return self.browser.page_source

    # Return the cookies for the given URL as a single "name=value;" string
    def getCookies(self, url):
        self.browser.get(url)
        # Refresh to reload the page
        self.browser.refresh()
        time.sleep(0.3)
        res = self.browser.get_cookies()
        # (list) res = [{'domain': 'www.cnvd.org.cn', 'httpOnly': True,
        #   'expiry': 1527519798.543155, 'secure': False,
        #   'value': '1c652993f3cfb95e68057050a70b69ef', 'name': '__jsluid', 'path': '/'},
        #  {'domain': 'www.cnvd.org.cn', 'httpOnly': False, 'expiry': 1495987361,
        #   'secure': False,
        #   'value': '1495983761.518|0|lKyWZPLs%2FizLz8vTlbysQtasKFw%3D',
        #   'name': '__jsl_clearance', 'path': '/'}]
        cookie = ""
        for r in res:
            cookie += (r['name'] + "=" + r["value"] + ";")
        return cookie

    # Close the browser
    def close(self):
        self.browser.close()
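
# A minimal usage sketch for Webdriver, assuming geckodriver sits next to this file
# and Firefox is installed. The constructor above targets the Selenium 3 API, where
# firefox_options/executable_path are still accepted keyword arguments. The helper
# name and the URL below are only illustrative.
def demoWebdriver():
    wd = Webdriver()
    try:
        html = wd.getPage_source("http://ics.cnvd.org.cn/")
        cookie = wd.getCookies("http://ics.cnvd.org.cn/")
        print(len(html), cookie)
    finally:
        # Always release the browser, even if a page fails to load
        wd.close()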
class CnnvdSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnnvdDao = CnnvdDao()

    def spiderAll(self):
        start_url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
        urls = self.getUrls(start_url)
        for url in urls:
            data = self.getDetailData(url)
            self.cnnvdDao.insert(data)

    def getUrls(self, start_url):
        urls = []
        soup = SpiderUtil().getSoup(start_url)
        page = self.getTotalPage(soup)
        # Crawl in reverse order, i.e. starting from the last page
        for i in range(page)[::-1]:
            self.getDetailUrls(i, urls)
        return urls

    def getTotalPage(self, soup):
        # Read the total record count from the pager text
        pageText = soup.find('div', class_='page').getText().split("\n")
        totalNum = 0
        for text in pageText:
            if text != '':
                totalNum = int(re.sub(r"\D", "", text))
                break
        if totalNum == 0:
            self.logger.error("getTotalNum Error")
        if totalNum % 10 != 0:
            page = int(totalNum / 10 + 1)
        else:
            page = int(totalNum / 10)
        return page

    def getDetailUrls(self, page, urls):
        url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=" + str(page)
        soup = SpiderUtil().getSoup(url)
        list_list = soup.find('div', class_='list_list')
        urlList = list_list.findAll('div', class_='f1')
        for u in urlList:
            urls.append(u.a['href'])

    def getDetailData(self, url):
        data = {}
        data['detailUrl'] = url
        soup = SpiderUtil().getSoup(url)
        details = soup.find('div', class_='detail_xq w770')
        data['chname'] = details.h2.getText()
        for li in details.ul:
            if type(li) == bs4.element.Tag:
                texts = re.sub(r"(\t|\n|\r|\040)*", "", li.getText()).split(":")
                if texts[0] in Config().getCnnvdVulList():
                    codeName = Config().getCnnvdVulList()[texts[0]]
                    data[codeName] = texts[1]
                    print(codeName + ": " + data[codeName])
        # Vulnerability description
        vul_descriptions = soup.find('div', class_='d_ldjj').findAll(
            'p', style='text-indent:2em')
        data['vul_description'] = ''
        for vul_description in vul_descriptions:
            data['vul_description'] += re.sub(r"(\t|\n|\r|\040)*", "",
                                              vul_description.getText())
        # Vulnerability advisory, reference URLs, affected entities
        contents = soup.findAll('div', class_='d_ldjj m_t_20')
        for content in contents:
            title = content.find('div', class_='title_bt').getText()
            title = re.sub(r"(\t|\n|\r|\040)*", "", title)
            if title in Config().getCnnvdVulList():
                codeName = Config().getCnnvdVulList()[title]
                data[codeName] = ''
                p = content.findAll('p', style='text-indent:2em')
                for x in p:
                    data[codeName] += re.sub(r"(\t|\n|\r|\040)*", "", x.getText())
        return data
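
# A minimal driver sketch for CnnvdSpider. It follows the same flow as spiderAll()
# (collect listing URLs, then parse each detail page) but only touches the first few
# URLs and skips the database insert, which makes it handy for testing the parser.
# The helper name and the slice size are only illustrative.
def demoCnnvdSpider():
    spider = CnnvdSpider()
    urls = spider.getUrls("http://www.cnnvd.org.cn/web/vulnerability/querylist.tag")
    for url in urls[:3]:
        data = spider.getDetailData(url)
        # Print the vulnerability name and its detail URL as a sanity check
        print(data.get('chname'), data.get('detailUrl'))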
class CNVDSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnvdDao = CNVDDao()

    def getUrls(self, num):
        urls = []
        start_url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(num)
        self.logger.info("Start page: {}".format(start_url))
        soup = SpiderUtil().getSoup(start_url)
        results = soup.find_all('td', style="text-align:left;padding-left:10px;")
        self.logger.info("Number of URLs found on page {}: {}".format(start_url, len(results)))
        for result in results:
            urls.append(result.a['href'])
        return urls

    def getData(self, url):
        soup = SpiderUtil().getSoupByWebDriver(url)
        print(url)
        chname = soup.find("div", class_="blkContainerSblk").h1.getText()
        messageResult = {}
        messageResult['chname'] = chname
        tbody = soup.find("table", class_="gg_detail").tbody
        TRlist = tbody.find_all('tr')
        for trlist in TRlist[:-1]:
            if trlist.td.string == "影响产品":
                impact_productSum = ''
                if "impact_product" not in messageResult:
                    messageResult["impact_product"] = []
                for td in trlist.td.next_siblings:
                    if type(td) == bs4.element.Tag:
                        for k in td:
                            impact_product = ''
                            if type(k) == bs4.element.Tag:
                                impact_product = re.sub(
                                    r"(\t|\n|\r|\040)*", "", k.getText())
                            else:
                                impact_product = re.sub(
                                    r"(\t|\n|\r|\040)*", "", k.string)
                            if impact_product != "":
                                if impact_productSum == '':
                                    impact_productSum = impact_product
                                else:
                                    impact_productSum = impact_productSum + ',' + impact_product
                messageResult['impact_product'].append(impact_productSum)
            else:
                name = trlist.td.string
                if name in Config().getCnvdVulList():
                    codename = Config().getCnvdVulList()[name]
                    for td in trlist.td.next_siblings:
                        if type(td) == bs4.element.Tag:
                            tdText = re.sub(r"(\r|\t|\n|\040)*", "", td.getText())
                            if len(tdText):
                                if codename in messageResult:
                                    messageResult[codename] += tdText
                                else:
                                    messageResult[codename] = tdText
                else:
                    self.logger.warning("url: {}, chname: {}. Field not in the configured list: {}".format(
                        url, chname, name))
        for name in Config().getCnvdVulList():
            if Config().getCnvdVulList()[name] not in messageResult:
                messageResult[Config().getCnvdVulList()[name]] = None
        self.cnvdDao.insert(messageResult)

    # Check whether this record has already been crawled,
    # i.e. whether the cnvd-id already exists in the database
    def isExist(self, cnvd_id):
        records = self.cnvdDao.selectByCNVDId(cnvd_id)
        if len(records) == 1:
            return True   # the record exists
        elif len(records) == 0:
            return False  # the record does not exist
        else:
            self.logger.error("Query error: cnvd_id: {}, [ERROR]: result: {}".format(
                cnvd_id, records))
            return

    def getPageNum(self):
        soup = SpiderUtil().getSoupByWebDriver("http://ics.cnvd.org.cn/")
        step = soup.find_all("a", class_="step")
        pageNum = step[len(step) - 1].get_text()
        return int(pageNum)

    # Crawl everything
    def spiderAll(self):
        pageNum = self.getPageNum()
        # Start from the last page and work backwards
        for i in range(pageNum)[::-1]:
            urls = self.getUrls(i * 20)
            for url in urls[::-1]:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                if self.isExist(cnvdId) is False:
                    try:
                        # Not stored yet: fetch the detail page and insert it
                        self.getData(url)
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))

    # Incrementally update the database
    def update(self):
        pageNum = self.getPageNum()
        # Start updating from the first page
        for i in range(pageNum):
            urls = self.getUrls(i * 20)
            for url in urls:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                exists = self.isExist(cnvdId)
                if exists is False:
                    try:
                        # Not stored yet: fetch the detail page and insert it
                        self.getData(url)
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))
                elif exists is True:
                    return  # already stored: stop updating
        # Known issue: if the program is terminated before an update finishes and
        # the update is only rerun later, some records in between may be missed.
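
# A minimal driver sketch for CNVDSpider. spiderAll() does a full crawl from the
# last page backwards, while update() stops as soon as it meets a cnvd-id that is
# already stored, so it is the cheaper call for routine runs. The helper name and
# the full/incremental switch below are only illustrative.
def demoCnvdSpider(full=False):
    spider = CNVDSpider()
    if full:
        spider.spiderAll()  # first run: crawl everything
    else:
        spider.update()     # later runs: fetch only records not yet in the database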
class UrlKeyWordSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.urlKeyWordDao = UrlKeyWordDao()

    def getUrlsByBaidu(self, keyWord):
        datas = []
        # Percent-encode the keyword so Chinese characters are safe in the URL
        url = "/s?wd=" + urllib.parse.quote(keyWord)
        self.getByBaidu(url, keyWord, datas)
        self.urlKeyWordDao.insert(datas)

    def getUrlsByGoogle(self, keyWord):
        datas = []
        # Percent-encode the keyword so Chinese characters are safe in the URL
        start_url = "https://www.google.com.hk/search?q=" + urllib.parse.quote(keyWord)
        browser = webdriver.Chrome()
        browser.get(start_url)
        # Parse the first results page before paging through the rest
        soup = BeautifulSoup(browser.page_source, "html.parser")
        self.getDataByGoogle(soup, keyWord, datas)
        while self.isElementExist(browser, '下一页'):
            browser.find_element_by_link_text('下一页').click()
            soup = BeautifulSoup(browser.page_source, "html.parser")
            self.getDataByGoogle(soup, keyWord, datas)
            time.sleep(Config().getSleepTime())
        browser.close()
        self.urlKeyWordDao.insert(datas)

    def getDataByGoogle(self, soup, keyWord, datas):
        results = soup.findAll('div', class_='rc')
        for result in results:
            try:
                data = {}
                data['url'] = result.find('cite', class_='_Rm').getText()
                data['urlTitle'] = result.h3.getText()
                data['searchEngine'] = "Google"
                data['searchWord'] = keyWord
                datas.append(data)
            except Exception as e:
                self.logger.error("getData failed to extract data: [error]: {} ... result: {}".format(
                    e, str(result).replace(u'\xa0', u' ')))

    def getByBaidu(self, url, keyWord, datas):
        url = "https://www.baidu.com" + url
        soup = SpiderUtil().getSoup(url)
        self.getDataByBaidu(soup, keyWord, datas)
        nextUrl = self.getNextPageUrl(soup)
        if nextUrl != -1:
            self.getByBaidu(nextUrl, keyWord, datas)

    def getDataByBaidu(self, soup, keyWord, datas):
        results = soup.findAll('div', class_="f13")
        for result in results:
            try:
                data = {}
                data['url'] = result.a.getText()
                data['urlTitle'] = result.div['data-tools']
                data['searchEngine'] = "百度"
                data['searchWord'] = keyWord
                datas.append(data)
            except Exception as e:
                self.logger.error("getData failed to extract data: [error]: {} ... result: {}".format(
                    e, str(result).replace(u'\xa0', u' ')))

    def getNextPageUrl(self, soup):
        nextUrls = soup.find('div', id='page').findAll('a')
        if nextUrls is None:
            return -1
        if len(nextUrls) <= 0:
            return -1
        if nextUrls[len(nextUrls) - 1].getText() != '下一页>':
            return -1
        return nextUrls[len(nextUrls) - 1]['href']

    # Check whether a link with the given text exists on the current page
    def isElementExist(self, browser, element):
        flag = True
        try:
            browser.find_element_by_link_text(element)
        except Exception:
            flag = False
        return flag
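
# A minimal driver sketch for UrlKeyWordSpider: both entry points percent-encode the
# keyword themselves, so a plain Chinese string can be passed in directly. Note that
# getUrlsByGoogle() starts a local Chrome via Selenium, while getUrlsByBaidu() only
# needs urllib. The helper name and the keyword below are only illustrative.
def demoUrlKeyWordSpider():
    spider = UrlKeyWordSpider()
    spider.getUrlsByBaidu("工业控制系统 漏洞")
    spider.getUrlsByGoogle("工业控制系统 漏洞")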