Python getSoup Examples, hp.getSoup Python Examples

Example #1

0

Show file

File: shubao2.py Project: wean/PersimmonCakes

 def download(self):
     self.soup = hp.getSoup(self.url)
     if self.soup == None:
         print("download book err %s"%(url))
     else:
         options = self.soup.select(".option a")
         for option in options:
             if "下载" in option.text:
                 self.downloadSoup = hp.getSoup(option.attrs["href"])
                 if self.downloadSoup != None:
                     for lk in self.downloadSoup.select(".main .downlink a"):
                         if hp.downloadFile(lk.attrs["href"], self.dirPath, self.url):
                             return

Example #2

0

Show file

File: sbf.py Project: wean/PersimmonCakes

 def parse(self):
     soup = hp.getSoup(self.baseUrl+self.clsUrl)
     if soup == None:
         return
     links = soup.select("html #content a")
     for link in links:
         if link.attrs["href"][0:4] == "/p2p":
             ClsPage(self.baseUrl, link.attrs["href"]).parse()

Example #3

0

Show file

File: shubao2.py Project: wean/PersimmonCakes

 def getBookList(self):
     
     while True:
         if len(self.url)==0:
             break
         geturl = self.url[0]
         del(self.url[0])
         self.soup = hp.getSoup(geturl)
         if self.soup == None:
             print("get book list err %s"%(geturl))
         else:
             for lk in self.soup.select("html body .name a"):
                 self.downloadBook(lk.text, lk.attrs["href"])
             pagelinks = self.soup.select("html body .pages a")
             print(pagelinks)
             if len(pagelinks) > 2 and pagelinks[-2].text == ">":
                 self.url.append(pagelinks[-2].attrs["href"])

Example #4

0

Show file

File: sbf.py Project: wean/PersimmonCakes

 def loadCls(self):
     self.baseUrls = []
     self.clsUrls = []
     f = open("99.url")
     for url in f.readlines():
         self.baseUrls.append(url.strip("\n"))
     f.close()
     for url in self.baseUrls:
         text = hp.getText(url+"/hgc1.js")
         urls = []
         if text != None:
             urls = re.findall("<a href=(.*) target=_blank>", text)
         if len(urls) == 0:
             text = hp.getText(url+"/sgc2.js")
             if text == None:
                 continue
             urls = re.findall("src=\\\\\"(.*)/hgc1.js\\\\\">", text)
         print(urls)
         for newUrl in urls:
             newUrl = newUrl.replace("\\/", "/")
             if (newUrl in self.baseUrls) == False:
                 self.baseUrls.append(newUrl)
     
     for url in self.baseUrls:
         soup = hp.getSoup(url)
         if soup == None:
             self.baseUrls.remove(url)
             continue
         clsLinks = soup.select("html body div table tr a")
         for clsLink in clsLinks:
             if clsLink.attrs["href"][0:2].lower() == "00":
                 self.clsUrls.append(clsLink.attrs["href"])
         if len(self.clsUrls) > 0:
             self.baseUrl = url
             break
                 
     f = open('99.url', 'w')
     for url in self.baseUrls:
         f.write(url+"\n")
     f.close()

Example #5

0

Show file

File: sbf.py Project: wean/PersimmonCakes

    def parse(self):
        self.fileName = os.path.split(self.pageUrl)[1]
        self.dirName = os.path.splitext(self.fileName)[0]
        self.dirPath = os.path.join(os.path.abspath("."), "sb", self.dirName)
        if len(glob.glob(self.dirPath + "/*")) > 0:
            return
        if os.path.exists(self.dirPath) == False:
            os.makedirs(self.dirPath)
        soup = hp.getSoup(self.baseUrl+self.pageUrl)
        if soup == None:
            return
        f = open(os.path.join(self.dirPath, "README"), "w")
        f.write(str(soup))
        f.close()
        contents = soup.select("#content")
        if len(contents) == 0:
            return
        content = contents[0]
        self.nextDir()
        self.cnts = []
        self.collectContent(content.contents)

        for cnt in self.cnts:
            strContent=""
            if cnt == None:
                continue
            elif isinstance(cnt, bs4.element.Tag):
                if cnt.name.lower() == "img":
                    if self.dirPicPath != None:
                        f = open(self.dirPicPath, "a")
                        f.write(cnt.attrs["src"] + "\n")
                        f.close()
                elif cnt.name.lower() == "a":
                    if len(cnt.contents) == 1 and isinstance(cnt.contents[0], bs4.element.Tag) and cnt.contents[0].name == "img":
                        if self.dirPicPath != None:
                            f = open(self.dirPicPath, "a")
                            f.write(cnt.contents[0].attrs["src"] + "\n")
                            f.close()
                    else:
                        if self.dirLinkPath != None:
                            self.hasLink = True
                            f = open(self.dirLinkPath, "a")
                            f.write(cnt.attrs["href"] + "\n")
                            f.close()
                else:
                    strContent=cnt.text
            elif isinstance(cnt, bs4.element.NavigableString):
                strContent=str(cnt)

            if strContent != None:
                strContent = strContent.strip()
            if strContent!="":
                if self.hasLink == True:
                    strToNextA=""
                    nxtCnt = cnt
                    while True:
                        if nxtCnt == None:
                            break
                        if isinstance(nxtCnt, bs4.element.Tag):
                            if nxtCnt.name.lower() == "a":
                                if len(nxtCnt.contents) == 1 and isinstance(nxtCnt.contents[0], bs4.element.Tag) and nxtCnt.contents[0].name == "img":
                                    pass
                                else:
                                    break
                            elif nxtCnt.name.lower() == "img":
                                pass
                            else:
                                strToNextA += nxtCnt.text
                        elif isinstance(nxtCnt, bs4.element.NavigableString):
                            strToNextA += str(nxtCnt)
                        nxtCnt = nxtCnt.next
                    if len(strToNextA) > 50:
                        self.nextDir()

                if len(strContent) > 8:
                    isSharp = True
                    for s in strContent:
                        if s != '=':
                            isSharp = False
                            break
                    if isSharp == True:
                        self.nextDir()
                        continue
                
                    isMinus = True
                    for s in strContent:
                        if s != '-':
                            isMinus = False
                            break
                    if isMinus == True:
                        self.nextDir()
                        continue

                if self.dirName == None:
                    self.dirName = strContent.replace("/", "")
                    if len(self.dirName) > 30:
                        self.dirName = self.dirName[0:30]
                    self.dirDirPath = os.path.join(self.dirPath, self.dirName)
                    if os.path.exists(self.dirDirPath) == False:
                        os.makedirs(self.dirDirPath)
                    self.dirReadMePath = os.path.join(self.dirDirPath, "README")
                    self.dirPicPath = os.path.join(self.dirDirPath, "PICURL")
                    self.dirLinkPath = os.path.join(self.dirDirPath, "LINKURL")
                if self.dirReadMePath != None:
                    f = open(self.dirReadMePath, "a")
                    f.write(strContent + "\n")
                    f.close()

Example #6

0

Show file

File: shubao2.py Project: wean/PersimmonCakes

 def getClsList(self):
     return hp.getSoup("http://www.shubao2.com").select("html body .daohang ul li a")