예제 #1
0
class ChinaGKSpider():
    """Crawler for product pages on www.gongkong.com.

    Listing and detail pages are fetched through ``SpiderUtil`` and parsed
    with BeautifulSoup; the model drill-down in ``getDetail`` drives a
    Chrome browser through Selenium. Everything that is extracted is handed
    to ``GongKongDao``.
    """

    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.gongKongDao = GongKongDao()

    def getUrls(self, page):
        """Return the ``href`` of every product link on listing page *page*.

        Args:
            page: Value placed in the ``pageIndex`` query parameter.

        Returns:
            A list with one ``href`` per ``div.main_r_text`` element found.
        """
        start_url = ("http://www.gongkong.com/select/QueryLink?pageIndex=" +
                     str(page) + "&articleForm=productInfo")

        soup = SpiderUtil().getSoup(start_url)
        result = soup.findAll("div", class_="main_r_text")

        print(start_url, " 页面获取到的url个数: ", len(result))
        self.logger.info(start_url + " 页面获取到的url个数: " + str(len(result)))
        return [entry.a['href'] for entry in result]

    def getData(self, url):
        """Parse one product page and store it, unless the URL is known.

        The URL is looked up through ``gongKongDao.getOne``; a non-empty
        result means it was handled before and nothing is done. Otherwise
        the URL is recorded first and the page is parsed afterwards.

        NOTE(review): because the URL is recorded before parsing, a page
        whose parsing raises is not retried on a later run - confirm this
        is intended.

        Args:
            url: Address of the product detail page.
        """
        datas = {'detailsLink': url}
        uuidURL = str(uuid.uuid1())

        # Look the URL up first; only parse pages that are not stored yet.
        if len(self.gongKongDao.getOne(url)) != 0:
            return

        self.gongKongDao.insertURL(url, uuidURL)
        soup = SpiderUtil().getSoup(url)
        tab_text = soup.find("div", class_="tab_text")

        if tab_text is None:
            datas['productName'] = soup.find(
                'div', class_='product_title').h1.getText()
            self._readAttributeTable(
                soup.find('table', class_='dqfs1'), datas)
            datas['produceInfo'] = soup.find(
                'dd', style='overflow: auto; line-height: 22px;').getText()
        else:
            # The page only carries a title line; split that title into
            # brand, product name and category. Example page:
            # 'http://www.gongkong.com/ProductSeriesNew/Detail?id=31223&categoryId=808'
            self._readTitleTabs(tab_text, datas)
        self.gongKongDao.insertGongKong(datas, uuidURL)

    @staticmethod
    def _readAttributeTable(table, datas):
        """Copy keyword, category and brand cells of *table* into *datas*."""
        for tr in table.children:
            if not isinstance(tr, bs4.element.Tag):
                continue
            for td in tr.children:
                if not isinstance(td, bs4.element.Tag):
                    continue
                label = td.string
                if label not in ('关键字:', '产品分类:', '品牌:'):
                    continue
                # Every following sibling tag overwrites the previous one,
                # so the last tag after the label cell wins.
                for sibling in td.next_siblings:
                    if not isinstance(sibling, bs4.element.Tag):
                        continue
                    text = sibling.getText()
                    if label == '关键字:':
                        datas['keyWord'] = text.strip().replace("\n", "&&")
                    elif label == '产品分类:':
                        datas['produceCategory'] = text.strip().replace(
                            " ", "&&")
                    else:
                        # Drop tabs, newlines, carriage returns and spaces.
                        datas['brand'] = re.sub(r"[\t\n\r ]+", "", text)

    @staticmethod
    def _readTitleTabs(tab_text, datas):
        """Split each child tag's text into brand / name / category."""
        for tab in tab_text.children:
            if not isinstance(tab, bs4.element.Tag):
                continue
            title = tab.getText().replace(" ", "")
            # "\xa0" is the character behind &nbsp; (found via repr()).
            parts = title.split("\xa0\xa0")
            brand = parts[0]
            productName = parts[1]
            produceCategory = parts[2]
            datas['brand'] = brand
            datas['productName'] = productName
            datas['produceCategory'] = produceCategory
            print("产品名字:" + productName)
            print("产品分类:" + produceCategory)
            print("品牌:" + brand)

    # Fetch the concrete models.
    def getDetail(self, url, id, type):
        """Walk the category/brand/product/model selects and store each model.

        Opens *url* in Chrome, picks the category option at index *id* and
        then iterates every brand, product and model option offered by the
        page, skipping the '-请选择-' placeholder entries. Each combination
        is passed to ``gongKongDao.insertDetail`` together with *type*.

        The browser is shut down in a ``finally`` clause so that a failing
        lookup or insert does not leave a Chrome session behind.

        Args:
            url: Page containing the four ``<select>`` elements.
            id: Index into the options of ``categorySelect_0``.
            type: Passed through unchanged to ``insertDetail``.
        """
        detail = {}
        placeholder = '-请选择-'

        # When not run from the IDE the driver path was given explicitly:
        # path = os.getcwd()
        # executable_path = path + "\\chormedirver.exe"
        # print(executable_path)
        # browser = webdriver.Chrome(executable_path)

        browser = webdriver.Chrome()
        try:
            browser.get(url)
            categroySelect = Select(
                browser.find_element_by_id("categorySelect_0"))
            categroy = categroySelect.options[id].text
            if categroy == placeholder:
                return

            detail['categroy'] = categroy
            categroySelect.select_by_visible_text(categroy.strip())
            # Pause after each selection before reading the next select.
            # NOTE(review): presumably lets the page reload its options.
            time.sleep(Config.getSleepTime())
            brandSelect = Select(browser.find_element_by_id("brandSelect_0"))
            for brand in brandSelect.options:
                if brand.text == placeholder:
                    continue
                # Select in the browser.
                detail['brand'] = brand.text
                brandSelect.select_by_visible_text(brand.text)
                time.sleep(Config.getSleepTime())
                productSelect = Select(
                    browser.find_element_by_id("ProductSelect_0"))
                for product in productSelect.options:
                    if product.text == placeholder:
                        continue
                    detail['product'] = product.text
                    # Select in the browser.
                    productSelect.select_by_visible_text(product.text)
                    time.sleep(Config.getSleepTime())
                    modelSelect = Select(
                        browser.find_element_by_id("modelSelect_0"))
                    # Store every selectable model.
                    for model in modelSelect.options:
                        if model.text == placeholder:
                            continue
                        detail['model'] = model.text
                        print("类别: " + detail['categroy'] +
                              " 品牌:" + detail['brand'] + " 系列:" +
                              detail['product'] + " 型号:" +
                              detail['model'])
                        self.gongKongDao.insertDetail(detail, type)
        finally:
            # quit() ends the whole WebDriver session, not just the window.
            browser.quit()
예제 #2
0
class CNVDSpider():
    """Crawler for vulnerability entries listed on ics.cnvd.org.cn.

    Listing pages yield detail URLs, each detail page is parsed into a
    dictionary and handed to ``CNVDDao``. ``spiderAll`` walks every page
    from the last to the first; ``update`` walks from the first page and
    stops at the first entry that is already stored.
    """

    def __init__(self):
        self.header = Config().getHeader()
        self.logger = MyLog().getLogger()
        self.cnvdDao = CNVDDao()

    def getUrls(self, num):
        """Return the detail links of the listing page starting at *num*.

        Args:
            num: Value placed in the ``offset`` query parameter (the page
                size in the URL is fixed at 20).

        Returns:
            A list with the ``href`` of the link in every matching cell.
        """
        start_url = "http://ics.cnvd.org.cn/?max=20&offset=" + str(num)
        self.logger.info("开始页面:{}".format(start_url))
        soup = SpiderUtil().getSoup(start_url)

        results = soup.find_all('td',
                                style="text-align:left;padding-left:10px;")
        self.logger.info("{} 页面获取到的url个数:{}".format(start_url, len(results)))
        return [cell.a['href'] for cell in results]

    def getData(self, url):
        """Parse one vulnerability page and insert it through the DAO.

        Every row of the ``gg_detail`` table except the last is read. The
        "影响产品" row is collected into the ``impact_product`` list; rows
        whose label is a key of ``Config().getCnvdVulList()`` are stored
        under the mapped code name; other labels are logged as warnings.
        Code names that never appeared are filled with ``NULL``.

        When a row has several non-empty value cells they are joined with
        "," so the stored value stays a string.

        NOTE(review): ``NULL`` is not defined in the visible part of the
        file - confirm it is imported or defined elsewhere.

        Args:
            url: Address of the vulnerability detail page.
        """
        soup = SpiderUtil().getSoupByWebDriver(url)

        print(url)
        chname = soup.find("div", class_="blkContainerSblk").h1.getText()
        messageResult = {'chname': chname}
        # Fetch the label -> code-name mapping once instead of per row.
        vulFields = Config().getCnvdVulList()

        tbody = soup.find("table", class_="gg_detail").tbody
        for row in tbody.find_all('tr')[:-1]:
            name = row.td.string
            if name == "影响产品":
                # setdefault keeps products from earlier rows; the list is
                # keyed by 'impact_product', not by the Chinese label.
                messageResult.setdefault('impact_product', []).append(
                    self._collectImpactProducts(row))
            elif name in vulFields:
                codename = vulFields[name]
                for td in row.td.next_siblings:
                    if not isinstance(td, bs4.element.Tag):
                        continue
                    tdText = self._stripWhitespace(td.getText())
                    if not tdText:
                        continue
                    if codename in messageResult:
                        messageResult[codename] = (
                            messageResult[codename] + "," + tdText)
                    else:
                        messageResult[codename] = tdText
            else:
                self.logger.warning("url:{}, Chname:{}。 未收入的标签:{}".format(
                    url, chname, name))

        for name in vulFields:
            codename = vulFields[name]
            if codename not in messageResult:
                messageResult[codename] = NULL
        self.cnvdDao.insert(messageResult)

    @staticmethod
    def _stripWhitespace(text):
        """Return *text* without tabs, newlines, carriage returns, spaces."""
        return re.sub(r"[\t\n\r ]+", "", text)

    @classmethod
    def _collectImpactProducts(cls, row):
        """Join the non-empty product names of an "影响产品" row with ','."""
        products = []
        for td in row.td.next_siblings:
            if not isinstance(td, bs4.element.Tag):
                continue
            for node in td:
                if isinstance(node, bs4.element.Tag):
                    raw = node.getText()
                else:
                    raw = node.string
                cleaned = cls._stripWhitespace(raw)
                if cleaned:
                    products.append(cleaned)
        return ','.join(products)

    # Check whether an entry was crawled before,
    # i.e. whether its cnvd-id is already stored.
    def isExist(self, cnvd_id):
        """Tell whether *cnvd_id* is already stored.

        Returns:
            ``True`` when exactly one record is found, ``False`` when none
            is found, and ``None`` (after logging an error) when the DAO
            returns more than one record.
        """
        rows = self.cnvdDao.selectByCNVDId(cnvd_id)
        if len(rows) == 1:
            return True  # the entry exists
        if len(rows) == 0:
            return False  # the entry does not exist
        self.logger.error("查询出错:cnvd_id:{}, [ERROR]:list:{}".format(
            cnvd_id, rows))
        return None

    def getPageNum(self):
        """Return the number shown on the last ``a.step`` pager link."""
        soup = SpiderUtil().getSoupByWebDriver("http://ics.cnvd.org.cn/")
        step = soup.find_all("a", class_="step")
        return int(step[-1].get_text())

    # Crawl everything.
    def spiderAll(self):
        """Crawl every listing page, last page first, storing new entries.

        Errors raised while parsing a single entry are logged and the crawl
        continues with the next URL.
        """
        pageNum = self.getPageNum()
        # Start from the last page.
        for i in reversed(range(pageNum)):
            for url in reversed(self.getUrls(i * 20)):
                cnvdId = url.split("/")[-1]
                if self.isExist(cnvdId) is False:
                    try:
                        self.getData(url)  # not stored yet: fetch and insert
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))

    # Update the data.
    def update(self):
        """Crawl from the first page until an already stored entry is met.

        The existence lookup is done once per URL. When ``isExist`` reports
        an error (``None``) the URL is skipped and the walk continues.
        """
        pageNum = self.getPageNum()
        # Start from the first page.
        for i in range(pageNum):
            for url in self.getUrls(i * 20):
                cnvdId = url.split("/")[-1]
                exists = self.isExist(cnvdId)
                if exists is False:
                    try:
                        self.getData(url)  # not stored yet: fetch and insert
                    except Exception as excep:
                        self.logger.error("getDataError{}".format(excep))
                elif exists is True:
                    return  # already stored: stop updating


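# Known issue: if the program is terminated before an update has finished and the update is only re-run afterwards, some of the entries in between are lost.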