Beispiel #1
0
 def scratch(self, url):
     """Download ``url`` and return its body as text.

     The response bytes are decoded as UTF-8; bytes that cannot be decoded
     are dropped. Any exception raised while requesting, reading or
     decoding is logged under the "CnbetaSpider" tag and ``None`` is
     returned instead of propagating.
     """
     try:
         req = urllib.request.Request(url)
         # Read inside a ``with`` block so the response (and its underlying
         # connection) is closed deterministically, even if read() fails.
         with urllib.request.urlopen(req) as response:
             result = response.read()
         return result.decode("utf-8", 'ignore')
     except Exception as e:
         logutil.log("CnbetaSpider", e)
         return None
Beispiel #2
0
    def fetchItemList(self):
        """Parse ``self.mainHtml`` and collect the listed articles.

        Looks up the element with class "items_area" inside the element with
        class "alllist", then builds one ``CnbetaArticle`` (title, url,
        cover, brief) for every descendant with class "item" and appends it
        to ``self.itemList``. An item that fails to parse is logged and
        skipped; it is never appended partially filled.

        NOTE(review): the two container lookups are not guarded. If either
        element is absent the chained ``find`` call fails on ``None``; that
        behaviour is left unchanged here.
        """
        htmlSoup = BeautifulSoup(self.mainHtml, "html.parser")
        listDiv = htmlSoup.find(attrs={"class": "alllist"}).find(attrs={"class": "items_area"})
        itemRawList = listDiv.find_all(attrs={"class": "item"})
        for item in itemRawList:
            try:
                article = CnbetaArticle()
                titleTag = item.find(attrs={"class": "title"}).find("a")
                contentTag = item.find("span", attrs={"class": "newsinfo"}).find("p")

                article.title = "".join(titleTag.contents)
                # The href is appended to the site's base URL as-is.
                article.url = self.targetMainUrl + titleTag["href"]
                article.cover = item.find("div", attrs={"class": "pic"}).find("a").find("img")['src']
                # Keep the inner markup of the summary paragraph as a string.
                article.brief = contentTag.renderContents().decode("utf-8")

                self.itemList.append(article)
            except Exception as e:
                # Best effort: one malformed item must not abort the whole
                # listing, but record why it was skipped instead of
                # swallowing the error silently.
                logutil.log("CnbetaSpider", e)

        logutil.log("CnbetaSpider", "getItemList finished")