Example #1
class SpiderUtil():
    def __init__(self):
        self.logger = MyLog().getLogger()

    def getSoup(self, url):
        req = request.Request(url, headers=Config().getHeader())
        resp = None
        for i in range(Config().getMAX_NUM()):
            try:
                resp = request.urlopen(req)
                break  # success, stop retrying
            except urllib.error.URLError:
                if i < Config().getMAX_NUM() - 1:
                    continue
                else:
                    self.logger.error("{}: still failed after {} attempts".format(
                        url,
                        Config().getMAX_NUM()))
                    return

        content = resp.read()
        soup = BeautifulSoup(content, "lxml")
        return soup

    def getSoupByWebDriver(self, url):
        webdriver = Webdriver()
        content = webdriver.getPage_source(url)
        soup = BeautifulSoup(content, "lxml")
        webdriver.close()
        return soup
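
A minimal, self-contained sketch of the fetch-with-retry idea used in SpiderUtil.getSoup, without the project-specific Config and MyLog helpers; the retry count and headers below are illustrative assumptions, not project values.

import logging
import urllib.error
from urllib import request

from bs4 import BeautifulSoup

MAX_RETRIES = 3  # assumed retry limit, stands in for Config().getMAX_NUM()
HEADERS = {"User-Agent": "Mozilla/5.0"}  # assumed headers, stands in for Config().getHeader()


def fetch_soup(url):
    req = request.Request(url, headers=HEADERS)
    for i in range(MAX_RETRIES):
        try:
            resp = request.urlopen(req)
            return BeautifulSoup(resp.read(), "lxml")
        except urllib.error.URLError as e:
            if i == MAX_RETRIES - 1:
                logging.error("%s: still failed after %d attempts (%s)", url, MAX_RETRIES, e)
    return None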
Example #2
class Webdriver():

    # Initialize and launch the browser
    def __init__(self):

        __driver = "geckodriver"
        self.logger = MyLog().getLogger()

        # Locate the driver executable
        driverpath = ""
        if platform.system() == "Windows":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver + ".exe")

        elif platform.system() == "Linux":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver)

        else:
            self.logger.error("浏览器驱动文件出错:未在以下文件夹下"
                              "查找到驱动文件{}:{}".format(
                                  __driver, os.path.dirname(driverpath)))

        #print(driverpath)

        # Configure Firefox launch options
        options = webdriver.FirefoxOptions()
        #options.add_argument('-headless')  # uncomment to run headless

        self.browser = webdriver.Firefox(firefox_options=options,
                                         executable_path=driverpath)

        # Implicit wait of up to 10 seconds for page elements to finish loading
        self.browser.implicitly_wait(10)

    # Fetch the page source
    def getPage_source(self, url):
        self.browser.get(url)
        # Refresh so the full page is loaded
        self.browser.refresh()
        time.sleep(0.3)
        return self.browser.page_source

    # Fetch cookies
    def getCookies(self, url):
        self.browser.get(url)
        # Refresh to reload the page
        self.browser.refresh()
        time.sleep(0.3)
        res = self.browser.get_cookies()
        # (list)res = [{'domain': 'www.cnvd.org.cn', 'httpOnly': True, 'expiry': 1527519798.543155, 'secure': False, 'value': '1c652993f3cfb95e68057050a70b69ef', 'name': '__jsluid', 'path': '/'}, {'domain': 'www.cnvd.org.cn', 'httpOnly': False, 'expiry': 1495987361, 'secure': False, 'value': '1495983761.518|0|lKyWZPLs%2FizLz8vTlbysQtasKFw%3D', 'name': '__jsl_clearance', 'path': '/'}]
        cookie = ""
        for r in res:
            cookie += (r['name'] + "=" + r["value"] + ";")
        return cookie

    # Close the browser
    def close(self):
        self.browser.close()
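
A short usage sketch of the Webdriver wrapper above; it assumes Firefox and a geckodriver binary next to the module (as the constructor expects), and the URL is only a placeholder.

wd = Webdriver()
try:
    html = wd.getPage_source("http://www.cnvd.org.cn/")       # rendered page source after the implicit wait
    cookie_header = wd.getCookies("http://www.cnvd.org.cn/")  # "name=value;" pairs joined into one string
    print(cookie_header)
finally:
    wd.close()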
Example #3
class CnnvdSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnnvdDao = CnnvdDao()

    def spiderAll(self):
        start_url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
        urls = self.getUrls(start_url)
        for url in urls:
            data = self.getDetailData(url)
            self.cnnvdDao.insert(data)

    def getUrls(self, start_url):
        urls = []
        soup = SpiderUtil().getSoup(start_url)

        page = self.getTotalPage(soup)

        # Crawl in reverse order, i.e. start from the last page
        for i in reversed(range(page)):
            self.getDetailUrls(i, urls)
        return urls

    def getTotalPage(self, soup):
        # Get the total number of entries
        pageText = soup.find('div', class_='page').getText().split("\n")
        totalNum = 0
        for text in pageText:
            if text != '':
                totalNum = int(re.sub(r"\D", "", text))
                break
        if totalNum == 0:
            self.logger.error("getTotalNum Error")

        if totalNum % 10 != 0:
            page = int(totalNum / 10 + 1)
        else:
            page = int(totalNum / 10)
        return page

    def getDetailUrls(self, page, urls):
        url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=" + str(
            page)
        soup = SpiderUtil().getSoup(url)

        list_list = soup.find('div', class_='list_list')
        urlList = list_list.findAll('div', class_='f1')
        for u in urlList:
            urls.append(u.a['href'])

    def getDetailData(self, url):
        data = {}
        data['detailUrl'] = url
        soup = SpiderUtil().getSoup(url)
        details = soup.find('div', class_='detail_xq w770')
        data['chname'] = details.h2.getText()
        for li in details.ul:
            if type(li) == bs4.element.Tag:
                texts = re.sub(r"(\t|\n|\r|\040)*", "", li.getText()).split(":")
                if texts[0] in Config().getCnnvdVulList():
                    codeName = Config().getCnnvdVulList()[texts[0]]
                    data[codeName] = texts[1]
                    print(codeName + ": " + data[codeName])
        # Vulnerability description
        vul_descriptions = soup.find('div', class_='d_ldjj').findAll(
            'p', style='text-indent:2em')
        data['vul_description'] = ''
        for vul_description in vul_descriptions:
            data['vul_description'] += re.sub(r"(\t|\n|\r|\040)*", "",
                                              vul_description.getText())
        # Vulnerability announcement, reference URLs, affected entities
        contents = soup.findAll('div', class_='d_ldjj m_t_20')
        for content in contents:
            title = content.find('div', class_='title_bt').getText()
            title = re.sub(r"(\t|\n|\r|\040)*", "", title)
            if title in Config().getCnnvdVulList():
                codeName = Config().getCnnvdVulList()[title]
                data[codeName] = ''
                p = content.findAll('p', style='text-indent:2em')
                for x in p:
                    data[codeName] += re.sub(r"(\t|\n|\r|\040)*", "",
                                             x.getText())
        return data
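
getTotalPage rounds the entry count up to a page count by hand; an equivalent, compact form of that calculation (assuming the same 10 entries per page) is ceiling division:

import math

def total_pages(total_num, per_page=10):
    # math.ceil(101 / 10) == 11, math.ceil(100 / 10) == 10
    return math.ceil(total_num / per_page)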
Example #4
class CNVDSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnvdDao = CNVDDao()

    def getUrls(self, num):
        urls = []
        start_url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(num)
        self.logger.info("开始页面:{}".format(start_url))
        soup = SpiderUtil().getSoup(start_url)

        results = soup.find_all('td',
                                style="text-align:left;padding-left:10px;")
        self.logger.info("{} 页面获取到的url个数:{}".format(start_url, len(results)))
        for result in results:
            urls.append(result.a['href'])

        return urls

    def getData(self, url):
        soup = SpiderUtil().getSoupByWebDriver(url)

        print(url)
        chname = soup.find("div", class_="blkContainerSblk").h1.getText()
        messageResult = {}
        messageResult['chname'] = chname

        tbody = soup.find("table", class_="gg_detail").tbody

        TRlist = tbody.find_all('tr')
        for trlist in TRlist[:-1]:
            if trlist.td.string == "影响产品":
                impact_productSum = ''
                if "影响产品" not in messageResult:
                    messageResult["impact_product"] = []
                for td in trlist.td.next_siblings:
                    if type(td) == bs4.element.Tag:
                        for k in td:
                            impact_product = ''
                            if type(k) == bs4.element.Tag:
                                impact_product = re.sub(
                                    r"(\t|\n|\r|\040)*", "", k.getText())
                            else:
                                impact_product = re.sub(
                                    r"(\t|\n|\r|\040)*", "", k.string)
                            if impact_product != "":
                                if impact_productSum == '':
                                    impact_productSum = impact_product
                                else:
                                    impact_productSum = impact_productSum + ',' + impact_product

                messageResult['impact_product'].append(impact_productSum)
            else:
                name = trlist.td.string
                if name in Config().getCnvdVulList():
                    codename = Config().getCnvdVulList()[name]
                    for td in trlist.td.next_siblings:
                        if type(td) == bs4.element.Tag:
                            tdText = re.sub(r"(\r|\t|\n|\040)*", "",
                                            td.getText())
                            if len(tdText):
                                if codename in messageResult:
                                    # value is stored as a string, so concatenate instead of append
                                    messageResult[codename] += tdText
                                else:
                                    messageResult[codename] = tdText
                else:
                    self.logger.warning("url:{}, Chname:{}。 未收入的标签:{}".format(
                        url, chname, name))

        for name in Config().getCnvdVulList():
            if Config().getCnvdVulList()[name] not in messageResult:
                messageResult[Config().getCnvdVulList()[name]] = None  # fill missing fields with None
        self.cnvdDao.insert(messageResult)

    # Check whether this entry has already been crawled,
    # i.e. whether the cnvd-id already exists in the database
    def isExist(self, cnvd_id):
        rows = self.cnvdDao.selectByCNVDId(cnvd_id)
        if len(rows) == 1:
            return True  # the entry exists
        elif len(rows) == 0:
            return False  # the entry does not exist
        else:
            self.logger.error("Query error: cnvd_id:{}, [ERROR]:rows:{}".format(
                cnvd_id, rows))
            return

    def getPageNum(self):
        soup = SpiderUtil().getSoupByWebDriver("http://ics.cnvd.org.cn/")
        step = soup.find_all("a", class_="step")
        pageNum = step[-1].get_text()
        return int(pageNum)

    # Crawl all entries
    def spiderAll(self):
        pageNum = self.getPageNum()
        # Start crawling from the last page
        for i in reversed(range(pageNum)):
            urls = self.getUrls(i * 20)

            for url in urls[::-1]:
                u = url.split("/")
                cnvdId = u[-1]
                if self.isExist(cnvdId) is False:
                    try:
                        self.getData(url)  # fetch and insert if the entry is not stored yet
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))

    # Update the data
    def update(self):
        pageNum = self.getPageNum()
        # Update starting from the first page
        for i in range(pageNum):
            urls = self.getUrls(i * 20)
            for url in urls:
                u = url.split("/")
                cnvdId = u[-1]
                exists = self.isExist(cnvdId)
                if exists is False:
                    try:
                        self.getData(url)  # fetch and insert if the entry is not stored yet
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))
                elif exists is True:
                    return  # the entry already exists, so stop updating


# Known issue: if the program is terminated before an update finishes and the update is rerun later, some data in between may be lost.
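
The duplicate check in spiderAll and update relies on the CNVD id being the last path segment of the detail URL; a tiny illustration of that split (the URL below is only an assumed example):

url = "http://www.cnvd.org.cn/flaw/show/CNVD-2018-00001"  # made-up example URL
cnvd_id = url.split("/")[-1]
print(cnvd_id)  # CNVD-2018-00001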
Example #5
class UrlKeyWordSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.urlKeyWordDao = UrlKeyWordDao()

    def getUrlsByBaidu(self, keyWord):
        datas = []
        # URL-encode the Chinese keyword
        url = "/s?wd=" + urllib.parse.quote(keyWord)
        self.getByBaidu(url, keyWord, datas)
        self.urlKeyWordDao.insert(datas)

    def getUrlsByGoogle(self, keyWord):
        datas = []
        # URL-encode the Chinese keyword
        start_url = "https://www.google.com.hk/search?q=" + urllib.parse.quote(
            keyWord)
        browser = webdriver.Chrome()
        browser.get(start_url)
        # Parse the first results page before clicking "next", otherwise it would be skipped
        soup = BeautifulSoup(browser.page_source, "html.parser")
        self.getDataByGoogle(soup, keyWord, datas)
        while self.isElementExist(browser, '下一页'):
            browser.find_element_by_link_text('下一页').click()
            soup = BeautifulSoup(browser.page_source, "html.parser")
            self.getDataByGoogle(soup, keyWord, datas)
            time.sleep(Config().getSleepTime())
        browser.close()
        self.urlKeyWordDao.insert(datas)

    def getDataByGoogle(self, soup, keyWord, datas):
        results = soup.findAll('div', class_='rc')
        for result in results:
            try:
                data = {}
                data['url'] = result.find('cite', class_='_Rm').getText()
                data['urlTitle'] = result.h3.getText()
                data['searchEngine'] = "Google"
                data['searchWord'] = keyWord
                datas.append(data)
            except Exception as e:
                self.logger.error("getData获取数据错误:[error]:{}……result:{}".format(
                    e,
                    str(result).replace(u'\xa0', u' ')))

    def getByBaidu(self, url, keyWord, datas):
        url = "https://www.baidu.com" + url
        soup = SpiderUtil().getSoup(url)

        self.getDataByBaidu(soup, keyWord, datas)

        nextUrl = self.getNextPageUrl(soup)

        if nextUrl != -1:
            self.getByBaidu(nextUrl, keyWord, datas)

    def getDataByBaidu(self, soup, keyWord, datas):
        results = soup.findAll('div', class_="f13")
        for result in results:
            try:
                data = {}
                data['url'] = result.a.getText()
                data['urlTitle'] = result.div['data-tools']
                data['searchEngine'] = "百度"
                data['searchWord'] = keyWord
                datas.append(data)
            except Exception as e:
                self.logger.error("getData获取数据错误:[error]:{}……result:{}".format(
                    e,
                    str(result).replace(u'\xa0', u' ')))

    def getNextPageUrl(self, soup):
        nextUrls = soup.find('div', id='page').findAll('a')
        if nextUrls is None:
            return -1
        if len(nextUrls) <= 0:
            return -1
        if nextUrls[-1].getText() != '下一页>':
            return -1
        return nextUrls[-1]['href']

    # Check whether an element with the given link text exists
    def isElementExist(self, browser, element):
        flag = True
        try:
            browser.find_element_by_link_text(element)
        except Exception:
            flag = False
        return flag
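
getUrlsByBaidu builds its start URL by percent-encoding the Chinese keyword before appending it to the query string; a minimal illustration of that encoding step (the keyword is only an example):

import urllib.parse

keyword = "工业控制系统"  # example keyword
url = "https://www.baidu.com/s?wd=" + urllib.parse.quote(keyword)
print(url)  # https://www.baidu.com/s?wd=%E5%B7%A5%E4%B8%9A%E6%8E%A7%E5%88%B6%E7%B3%BB%E7%BB%9F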