Example #1
# Standard-library/third-party imports; project-local modules (MyLog, Config, Webdriver) are assumed importable.
import urllib.error
from urllib import request

from bs4 import BeautifulSoup


class SpiderUtil():
    def __init__(self):
        self.logger = MyLog().getLogger()

    def getSoup(self, url):
        req = request.Request(url, headers=Config().getHeader())
        for i in range(Config().getMAX_NUM()):
            try:
                resp = request.urlopen(req)
            except urllib.error.URLError:
                if i < Config().getMAX_NUM() - 1:
                    continue
                # Give up after MAX_NUM failed attempts.
                self.logger.error("{}: still failing after {} attempts".format(
                    url, Config().getMAX_NUM()))
                return
            else:
                # The request succeeded, so stop retrying.
                break

        content = resp.read()
        soup = BeautifulSoup(content, "lxml")
        return soup

    def getSoupByWebDriver(self, url):
        webdriver = Webdriver()
        content = webdriver.getPage_source(url)
        soup = BeautifulSoup(content, "lxml")
        webdriver.close()
        return soup
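A minimal usage sketch for SpiderUtil. It assumes the class and its Config/MyLog/Webdriver helpers can be imported from a hypothetical spider_util module; adjust the import to the real project layout.

# Hypothetical module path used for illustration only.
from spider_util import SpiderUtil

util = SpiderUtil()
soup = util.getSoup("http://ics.cnvd.org.cn/")
if soup is not None:
    # getSoup returns None when every retry failed.
    print(soup.title.string if soup.title else "no <title> found")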
Example #2
    def __init__(self):

        __driver = "geckodriver"
        self.logger = MyLog().getLogger()

        # Locate the driver directory
        driverpath = ""
        if platform.system() == "Windows":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver + ".exe")

        elif platform.system() == "Linux":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver)

        else:
            self.logger.error("浏览器驱动文件出错:未在以下文件夹下"
                              "查找到驱动文件{}:{}".format(
                                  __driver, os.path.dirname(driverpath)))

        #print(driverpath)

        # Configure Firefox launch options
        options = webdriver.FirefoxOptions()
        #options.add_argument('-headless')

        self.browser = webdriver.Firefox(firefox_options=options,
                                         executable_path=driverpath)

        # Implicit wait of up to 10 seconds for all page elements to load
        self.browser.implicitly_wait(10)
Example #3
# Standard/third-party imports; the project-local MyLog module is assumed importable.
import os
import platform
import time

from selenium import webdriver


class Webdriver():

    # Initialize and launch the browser
    def __init__(self):

        __driver = "geckodriver"
        self.logger = MyLog().getLogger()

        # Locate the driver directory
        driverpath = ""
        if platform.system() == "Windows":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver + ".exe")

        elif platform.system() == "Linux":
            driverpath = os.path.join(
                os.path.abspath(os.path.dirname(__file__)), __driver)

        else:
            self.logger.error("浏览器驱动文件出错:未在以下文件夹下"
                              "查找到驱动文件{}:{}".format(
                                  __driver, os.path.dirname(driverpath)))

        #print(driverpath)

        # Configure Firefox launch options
        options = webdriver.FirefoxOptions()
        #options.add_argument('-headless')

        self.browser = webdriver.Firefox(firefox_options=options,
                                         executable_path=driverpath)

        # Implicit wait of up to 10 seconds for all page elements to load
        self.browser.implicitly_wait(10)

    # Fetch the page source
    def getPage_source(self, url):
        self.browser.get(url)
        # Refresh so the complete page is loaded
        self.browser.refresh()
        time.sleep(0.3)
        return self.browser.page_source

    # Fetch cookies
    def getCookies(self, url):
        self.browser.get(url)
        # Refresh to reload the page
        self.browser.refresh()
        time.sleep(0.3)
        res = self.browser.get_cookies()
        # (list)res = [{'domain': 'www.cnvd.org.cn', 'httpOnly': True, 'expiry': 1527519798.543155, 'secure': False, 'value': '1c652993f3cfb95e68057050a70b69ef', 'name': '__jsluid', 'path': '/'}, {'domain': 'www.cnvd.org.cn', 'httpOnly': False, 'expiry': 1495987361, 'secure': False, 'value': '1495983761.518|0|lKyWZPLs%2FizLz8vTlbysQtasKFw%3D', 'name': '__jsl_clearance', 'path': '/'}]
        cookie = ""
        for r in res:
            cookie += (r['name'] + "=" + r["value"] + ";")
        return cookie

    # Close the browser
    def close(self):
        self.browser.close()
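A short usage sketch for the Webdriver wrapper above. It assumes the class can be imported from a hypothetical web_driver module and that a matching geckodriver binary sits next to that file, as the constructor expects.

# Hypothetical module path used for illustration only.
from web_driver import Webdriver

driver = Webdriver()
try:
    html = driver.getPage_source("http://www.cnvd.org.cn/")
    cookie_header = driver.getCookies("http://www.cnvd.org.cn/")
    print(len(html), cookie_header)
finally:
    # Always release the browser, even if a request fails.
    driver.close()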
Example #4
 def __init__(self):
     self.header = Config().getHeader()
     self.logger = MyLog().getLogger()
     self.cnnvdDao = CnnvdDao()
Example #5
# Standard/third-party imports; project-local modules (Config, MyLog, CnnvdDao, SpiderUtil) are assumed importable.
import re

import bs4


class CnnvdSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnnvdDao = CnnvdDao()

    def spiderAll(self):
        start_url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
        urls = self.getUrls(start_url)
        for url in urls:
            data = self.getDetailData(url)
            self.cnnvdDao.insert(data)

    def getUrls(self, start_url):
        urls = []
        soup = SpiderUtil().getSoup(start_url)

        page = self.getTotalPage(soup)

        # Crawl in reverse order, i.e. starting from the last page (note: the loop below runs forward)
        for i in range(page):
            self.getDetailUrls(i, urls)
        return urls

    def getTotalPage(self, soup):
        # Get the total number of entries
        pageText = soup.find('div', class_='page').getText().split("\n")
        totalNum = 0
        for text in pageText:
            if text != '':
                totalNum = int(re.sub(r"\D", "", text))
                break
        if totalNum == 0:
            self.logger.error("getTotalNum Error")

        if totalNum % 10 != 0:
            page = int(totalNum / 10 + 1)
        else:
            page = int(totalNum / 10)
        return page

    def getDetailUrls(self, page, urls):
        url = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag?pageno=" + str(
            page)
        soup = SpiderUtil().getSoup(url)

        list_list = soup.find('div', class_='list_list')
        urlList = list_list.findAll('div', class_='f1')
        for u in urlList:
            urls.append(u.a['href'])

    def getDetailData(self, url):
        data = {}
        data['detailUrl'] = url
        soup = SpiderUtil().getSoup(url)
        details = soup.find('div', class_='detail_xq w770')
        data['chname'] = details.h2.getText()
        for li in details.ul:
            if type(li) == bs4.element.Tag:
                texts = re.sub("(\t|\n|\r|\040)*", "", li.getText()).split(":")
                if texts[0] in Config().getCnnvdVulList():
                    codeName = Config().getCnnvdVulList()[texts[0]]
                    data[codeName] = texts[1]
                    print(codeName + ": " + data[codeName])
        # Vulnerability description
        vul_descriptions = soup.find('div', class_='d_ldjj').findAll(
            'p', style='text-indent:2em')
        data['vul_description'] = ''
        for vul_description in vul_descriptions:
            data['vul_description'] += re.sub("(\t|\n|\r|\040)*", "",
                                              vul_description.getText())
        # Vulnerability announcement, reference URLs, affected entities
        contents = soup.findAll('div', class_='d_ldjj m_t_20')
        for content in contents:
            title = content.find('div', class_='title_bt').getText()
            title = re.sub("(\t|\n|\r|\040)*", "", title)
            if title in Config().getCnnvdVulList():
                codeName = Config().getCnnvdVulList()[title]
                data[codeName] = ''
                p = content.findAll('p', style='text-indent:2em')
                for x in p:
                    data[codeName] += re.sub("(\t|\n|\r|\040)*", "",
                                             x.getText())
        return data
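A usage sketch for CnnvdSpider, assuming it can be imported from a hypothetical cnnvd_spider module. The loop below is what spiderAll() does internally, spelled out so a run can be interrupted and resumed manually.

# Hypothetical module path used for illustration only.
from cnnvd_spider import CnnvdSpider

spider = CnnvdSpider()
urls = spider.getUrls("http://www.cnnvd.org.cn/web/vulnerability/querylist.tag")
for url in urls:
    record = spider.getDetailData(url)
    spider.cnnvdDao.insert(record)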
Example #6
 def __init__(self):
     self.logger = MyLog().getLogger()
Example #7
 def __init__(self):
     self.header = Config().getHeader()
     self.logger = MyLog().getLogger()
     self.gongKongDao = GongKongDao()
Example #8
# Standard/third-party imports; project-local modules (Config, MyLog, GongKongDao, SpiderUtil) are assumed importable.
import re
import time
import uuid

import bs4
from selenium import webdriver
from selenium.webdriver.support.ui import Select


class ChinaGKSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.gongKongDao = GongKongDao()

    def getUrls(self, page):
        urls = []
        start_url = "http://www.gongkong.com/select/QueryLink?pageIndex=" + str(
            page) + "&articleForm=productInfo"

        soup = SpiderUtil().getSoup(start_url)
        result = soup.findAll("div", class_="main_r_text")

        print(start_url, " number of urls found on this page: ", len(result))
        self.logger.info(start_url + " number of urls found on this page: " + str(len(result)))
        for i in result:
            urls.append(i.a['href'])
        return urls

    def getData(self, url):
        datas = {}
        datas['detailsLink'] = url
        detailsLink = url

        uuidURL = str(uuid.uuid1())

        # Look the url up in the database; parse it only if it is not stored yet, otherwise skip it
        if (len(self.gongKongDao.getOne(detailsLink)) == 0):
            self.gongKongDao.insertURL(detailsLink, uuidURL)
            soup = SpiderUtil().getSoup(url)

            tab_text = soup.find("div", class_="tab_text")

            if tab_text is None:
                datas['productName'] = soup.find(
                    'div', class_='product_title').h1.getText()
                table = soup.find('table', class_='dqfs1')
                for tr in table.children:
                    if type(tr) == bs4.element.Tag:
                        for td in tr.children:
                            if type(td) == bs4.element.Tag:
                                if td.string == '关键字:':
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['keyWord'] = t.getText(
                                            ).strip().replace("\n", "&&")
                                            # print("关键字:" + keyWord)
                                elif td.string == '产品分类:':
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas[
                                                'produceCategory'] = t.getText(
                                                ).strip().replace(
                                                    "\040", "&&")
                                            # print("产品分类:" + produceCategory)
                                elif td.string == '品牌:':
                                    for t in td.next_siblings:
                                        if type(t) == bs4.element.Tag:
                                            datas['brand'] = re.sub(
                                                "(\t|\n|\r|\040)*", "",
                                                t.getText())
                                            # print("品牌:" + brand)
                datas['produceInfo'] = soup.find(
                    'dd',
                    style='overflow: auto; line-height: 22px;').getText()
                # print("产品简介: " + productInfo)
            else:
                # When only a title is present, split it into brand, product name and category,
                # e.g. 'http://www.gongkong.com/ProductSeriesNew/Detail?id=31223&categoryId=808'
                for tab in tab_text.children:
                    if type(tab) == bs4.element.Tag:
                        te = re.sub("(\040)*", "", tab.getText())
                        tes = te.split(
                            "\xa0\xa0")  #\xa0是&nbsp的转义字符  使用repr()打印出来的
                        brand = tes[0]
                        productName = tes[1]
                        produceCategory = tes[2]
                        datas['brand'] = brand
                        datas['productName'] = productName
                        datas['produceCategory'] = produceCategory
                        print("产品名字:" + productName)
                        print("产品分类:" + produceCategory)
                        print("品牌:" + brand)
            self.gongKongDao.insertGongKong(datas, uuidURL)

    # Fetch the specific models
    def getDetail(self, url, id, type):
        detail = {}

        # When not running from the IDE:
        # path = os.getcwd()
        # executable_path = path + "\\chormedirver.exe"
        # print(executable_path)
        # browser = webdriver.Chrome(executable_path)

        browser = webdriver.Chrome()
        browser.get(url)
        categroySelect = Select(browser.find_element_by_id("categorySelect_0"))
        categroy = categroySelect.options[id].text
        if (categroy != '-请选择-'):
            detail['categroy'] = categroy
            categroySelect.select_by_visible_text(categroy.strip())
            time.sleep(Config().getSleepTime())
            brandSelect = Select(browser.find_element_by_id("brandSelect_0"))
            for brand in brandSelect.options:
                if (brand.text != '-请选择-'):
                    # Make the selection in the browser
                    detail['brand'] = brand.text
                    brandSelect.select_by_visible_text(brand.text)
                    time.sleep(Config().getSleepTime())
                    productSelect = Select(
                        browser.find_element_by_id("ProductSelect_0"))
                    for product in productSelect.options:
                        if (product.text != '-请选择-'):
                            detail['product'] = product.text
                            # Make the selection in the browser
                            productSelect.select_by_visible_text(product.text)
                            time.sleep(Config().getSleepTime())
                            modelSelect = Select(
                                browser.find_element_by_id("modelSelect_0"))
                            # Store every selection
                            for model in modelSelect.options:
                                if (model.text != '-请选择-'):
                                    detail['model'] = model.text
                                    # details.append(detail)
                                    print("类别: " + detail['categroy'] +
                                          " 品牌:" + detail['brand'] + " 系列:" +
                                          detail['product'] + " 型号:" +
                                          detail['model'])
                                    self.gongKongDao.insertDetail(detail, type)
        browser.close()
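A usage sketch for ChinaGKSpider, assuming it can be imported from a hypothetical gongkong_spider module. getData() checks the database first, so already-stored product pages are skipped.

# Hypothetical module path used for illustration only.
from gongkong_spider import ChinaGKSpider

spider = ChinaGKSpider()
for page in range(1, 4):           # first three listing pages (1-based index assumed)
    for url in spider.getUrls(page):
        spider.getData(url)        # parses and inserts only unseen urls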
Example #9
 def __init__(self):
     self.header = Config().getHeader()
     self.logger = MyLog().getLogger()
     self.urlKeyWordDao = UrlKeyWordDao()
Example #10
# Project-local modules (Config, MyLog, UrlKeyWordDao) are assumed importable; re is the only standard-library dependency.
import re


class KeyWordAnalyst():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.urlKeyWordDao = UrlKeyWordDao()

    def dataPreparation(self):
        dataCount = {}
        dataList = self.urlKeyWordDao.listURL()
        pattern = re.compile(Config().getFilterFile())
        for data in dataList:
            datas = data.url.split(".")
            for key in datas:
                match = pattern.match(key)
                if match is None and key != '':
                    if not dataCount.get(key):
                        dataCount[key] = 1
                    else:
                        dataCount[key] += 1
        sort = sorted(dataCount.items(), key=lambda e: e[1], reverse=True)
        for item in sort:
            print(item)
            self.logger.debug(item)

    def dataPreparationXXX(self):
        dataCount = {}
        dataList = self.urlKeyWordDao.listURL()
        pattern = re.compile(Config().getFilterFile2())
        for data in dataList:
            datas = data.url.split("/")
            for key in datas:
                match = pattern.match(key)
                if match is None and key != '':
                    if not dataCount.get(key):
                        dataCount[key] = 1
                    else:
                        dataCount[key] += 1
        sort = sorted(dataCount.items(), key=lambda e: e[1], reverse=True)
        for item in sort:
            print(item)
            self.logger.debug(item)

    def dataPreparationByEngine(self, engine):
        dataList = self.urlKeyWordDao.listURLByEngine(engine)
        self.xx(dataList)

    def dataPreparationBySearchWord(self, searchWord):
        dataList = self.urlKeyWordDao.listURLBySearchWord(searchWord)
        self.xx(dataList)

    def xx(self, dataList):
        dataCount = {}
        pattern = re.compile(Config().getFilterFile2())
        for data in dataList:
            datas = data.url.split("/")
            for key in datas:
                match = pattern.match(key)
                if match is None and key != '':
                    if not dataCount.get(key):
                        dataCount[key] = 1
                    else:
                        dataCount[key] += 1
        sort = sorted(dataCount.items(), key=lambda e: e[1], reverse=True)
        for item in sort:
            print(item)
            self.logger.debug(item)
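A usage sketch for KeyWordAnalyst, assuming it can be imported from a hypothetical keyword_analyst module. Each call prints url segments ranked by how often they occur in the stored search results.

# Hypothetical module path used for illustration only.
from keyword_analyst import KeyWordAnalyst

analyst = KeyWordAnalyst()
analyst.dataPreparation()                    # rank dot-separated host segments
analyst.dataPreparationByEngine("Google")    # restrict to urls collected via one engine
analyst.dataPreparationBySearchWord("PLC")   # "PLC" is an arbitrary example keyword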
Example #11
 def __init__(self):
     self.DBSession = sqlalchemy.orm.sessionmaker(bind=MysqlConfig().getEngine())
     self.session = self.DBSession()
     MysqlConfig().getBaseModel().metadata.create_all(MysqlConfig().getEngine())
     self.logger = MyLog().getLogger()
Example #12
# Standard/third-party imports; project-local modules (Config, MyLog, CNVDDao, SpiderUtil) are assumed importable.
import re

import bs4


class CNVDSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnvdDao = CNVDDao()

    def getUrls(self, num):
        urls = []
        start_url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(num)
        self.logger.info("开始页面:{}".format(start_url))
        soup = SpiderUtil().getSoup(start_url)

        results = soup.find_all('td',
                                style="text-align:left;padding-left:10px;")
        self.logger.info("{} 页面获取到的url个数:{}".format(start_url, len(results)))
        for result in results:
            urls.append(result.a['href'])

        return urls

    def getData(self, url):
        soup = SpiderUtil().getSoupByWebDriver(url)

        print(url)
        chname = soup.find("div", class_="blkContainerSblk").h1.getText()
        messageResult = {}
        messageResult['chname'] = chname

        tbody = soup.find("table", class_="gg_detail").tbody

        TRlist = tbody.find_all('tr')
        for trlist in TRlist[:-1]:
            if trlist.td.string == "影响产品":
                impact_productSum = ''
                if "影响产品" not in messageResult:
                    messageResult["impact_product"] = []
                for td in trlist.td.next_siblings:
                    if type(td) == bs4.element.Tag:
                        for k in td:
                            impact_product = ''
                            if type(k) == bs4.element.Tag:
                                impact_product = re.sub(
                                    "(\t|\n|\r|\040)*", "", k.getText())
                            else:
                                impact_product = re.sub(
                                    "(\t|\n|\r|\040)*", "", k.string)
                            if impact_product != "":
                                if impact_productSum == '':
                                    impact_productSum = impact_product
                                else:
                                    impact_productSum = impact_productSum + ',' + impact_product

                messageResult['impact_product'].append(impact_productSum)
            else:
                name = trlist.td.string
                if name in Config().getCnvdVulList():
                    codename = Config().getCnvdVulList()[name]
                    for td in trlist.td.next_siblings:
                        if type(td) == bs4.element.Tag:
                            tdText = re.sub(r"(\r|\t|\n|\040)*", "",
                                            td.getText())
                            if len(tdText):
                                if codename in messageResult:
                                    # The field holds a string, so concatenate instead of calling append().
                                    messageResult[codename] += tdText
                                else:
                                    messageResult[codename] = tdText
                else:
                    self.logger.warning("url:{}, chname:{}. Label not collected: {}".format(
                        url, chname, name))

        for name in Config().getCnvdVulList():
            if Config().getCnvdVulList()[name] not in messageResult:
                messageResult[Config().getCnvdVulList()[name]] = None
        self.cnvdDao.insert(messageResult)

    # Check whether this entry has already been crawled,
    # i.e. whether its cnvd-id already exists in the database
    def isExist(self, cnvd_id):
        rows = self.cnvdDao.selectByCNVDId(cnvd_id)
        if len(rows) == 1:
            return True  # the entry already exists
        elif len(rows) == 0:
            return False  # the entry does not exist
        else:
            self.logger.error("Query error: cnvd_id:{}, [ERROR] rows:{}".format(
                cnvd_id, rows))
            return

    def getPageNum(self):
        soup = SpiderUtil().getSoupByWebDriver("http://ics.cnvd.org.cn/")
        step = soup.find_all("a", class_="step")
        pageNum = step[len(step) - 1].get_text()
        return int(pageNum)

    # Crawl everything
    def spiderAll(self):
        pageNum = self.getPageNum()
        # Start crawling from the last page
        for i in range(pageNum)[::-1]:
            urls = self.getUrls(i * 20)

            for url in urls[::-1]:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                if not self.isExist(cnvdId):
                    try:
                        self.getData(url)  # Fetch and insert the record if it is not stored yet
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))

    # Update the data
    def update(self):
        pageNum = self.getPageNum()
        # Update starting from the first page
        for i in range(pageNum):
            urls = self.getUrls(i * 20)
            for url in urls:
                u = url.split("/")
                cnvdId = u[len(u) - 1]
                if not self.isExist(cnvdId):
                    try:
                        self.getData(url)  # Fetch and insert the record if it is not stored yet
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))
                elif self.isExist(cnvdId):
                    return  # The record already exists, so stop updating


# Known issue: if the program is killed before an update run finishes and is only re-run later, some records in between may be lost
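A usage sketch for CNVDSpider, assuming it can be imported from a hypothetical cnvd_spider module. A first run crawls the whole archive from the oldest page; later runs only fetch entries that are not stored yet.

# Hypothetical module path used for illustration only.
from cnvd_spider import CNVDSpider

spider = CNVDSpider()
spider.spiderAll()   # initial full crawl, oldest pages first
spider.update()      # later runs stop at the first already-stored cnvd-id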
Example #13
# Standard/third-party imports; project-local modules (Config, MyLog, UrlKeyWordDao, SpiderUtil) are assumed importable.
import time
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver


class UrlKeyWordSpider():
    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.urlKeyWordDao = UrlKeyWordDao()

    def getUrlsByBaidu(self, keyWord):
        datas = []
        # URL-encode the Chinese characters in the keyword
        url = "/s?wd=" + urllib.parse.quote(keyWord)
        self.getByBaidu(url, keyWord, datas)
        self.urlKeyWordDao.insert(datas)

    def getUrlsByGoogle(self, keyWord):
        datas = []
        # URL-encode the Chinese characters in the keyword
        start_url = "https://www.google.com.hk/search?q=" + urllib.parse.quote(
            keyWord)
        browser = webdriver.Chrome()
        browser.get(start_url)
        # '下一页' ("next page") is the link text on the results page, so it must stay in Chinese.
        while (self.isElementExist(browser, '下一页')):
            browser.find_element_by_link_text('下一页').click()
            soup = BeautifulSoup(browser.page_source,
                                 "html.parser",
                                 from_encoding='UTF-8')
            self.getDataByGoogle(soup, keyWord, datas)
            time.sleep(Config().getSleepTime())
        browser.close()
        self.urlKeyWordDao.insert(datas)

    def getDataByGoogle(self, soup, keyWord, datas):
        results = soup.findAll('div', class_='rc')
        for result in results:
            try:
                data = {}
                data['url'] = result.find('cite', class_='_Rm').getText()
                data['urlTitle'] = result.h3.getText()
                data['searchEngine'] = "Google"
                data['searchWord'] = keyWord
                datas.append(data)
            except Exception as e:
                self.logger.error("getData获取数据错误:[error]:{}……result:{}".format(
                    e,
                    str(result).replace(u'\xa0', u' ')))

    def getByBaidu(self, url, keyWord, datas):
        url = "https://www.baidu.com" + url
        soup = SpiderUtil().getSoup(url)

        self.getDataByBaidu(soup, keyWord, datas)

        nextUrl = self.getNextPageUrl(soup)

        if nextUrl != -1:
            self.getByBaidu(nextUrl, keyWord, datas)

    def getDataByBaidu(self, soup, keyWord, datas):
        results = soup.findAll('div', class_="f13")
        for result in results:
            try:
                data = {}
                data['url'] = result.a.getText()
                data['urlTitle'] = result.div['data-tools']
                data['searchEngine'] = "百度"
                data['searchWord'] = keyWord
                datas.append(data)
            except Exception as e:
                self.logger.error("getData获取数据错误:[error]:{}……result:{}".format(
                    e,
                    str(result).replace(u'\xa0', u' ')))

    def getNextPageUrl(self, soup):
        nextUrls = soup.find('div', id='page').findAll('a')
        if nextUrls is None:
            return -1
        if len(nextUrls) <= 0:
            return -1
        if nextUrls[len(nextUrls) - 1].getText() != '下一页>':
            return -1
        return nextUrls[len(nextUrls) - 1]['href']

    # Check whether an element with the given link text exists on the page
    def isElementExist(self, browser, element):
        flag = True
        try:
            browser.find_element_by_link_text(element)
        except Exception:
            flag = False
        return flag
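A usage sketch for UrlKeyWordSpider, assuming it can be imported from a hypothetical url_keyword_spider module. The Baidu path only needs urllib; the Google path drives a local Chrome through Selenium, so chromedriver must be on the PATH.

# Hypothetical module path used for illustration only.
from url_keyword_spider import UrlKeyWordSpider

spider = UrlKeyWordSpider()
spider.getUrlsByBaidu("industrial control vulnerability")   # example keyword
spider.getUrlsByGoogle("ICS vulnerability")                  # example keyword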