Exemples Python de getHTML (spider.getHTML)

Exemple #1

0

Afficher le fichier

Fichier : core.py Projet : binaryer/documentDownloader

    def __getNextPage(self, imgUrl):
        """Request the page record that follows ``imgUrl`` and store it.

        The ``GetPage`` URL is built from ``self.domain`` and the ``Url``,
        ``ReadLimit`` and ``Furl`` entries of ``self.pdfInfo``. The request
        is repeated, without any limit, until the response text starts with
        ``{`` and ends with ``}``; each rejected response is printed to
        stderr.

        Side effects: sets ``self.total`` from ``PageCount`` (only while it
        is still 0), sets ``self.index`` from ``PageIndex``, appends the
        ``NextPage`` value to ``self.imgList`` and prints a progress line.

        Returns:
            The object decoded from the JSON response.
        """
        endpoint = 'https://' + self.domain + '.book118.com/PW/GetPage/?'
        params = {
            'f': self.pdfInfo['Url'],
            'img': imgUrl,
            'isMobile': 'false',
            'isNet': 'True',
            'readLimit': self.pdfInfo['ReadLimit'],
            'furl': self.pdfInfo['Furl'],
        }
        url = makeURL(endpoint, params)

        # Keep asking until the body at least looks like a JSON object.
        payload = getHTML(url)
        while not (payload.startswith('{') and payload.endswith('}')):
            print(payload, file=sys.stderr)
            payload = getHTML(url)

        page = json.loads(payload)

        # The page count is only recorded once, on the first answer.
        if self.total == 0:
            self.total = page['PageCount']
        self.index = page['PageIndex']

        nextToken = page['NextPage']
        self.imgList.append(nextToken)

        print(self.index, '/', self.total, 'url finish', nextToken)

        return page

Exemple #2

0

Afficher le fichier

Fichier : core.py Projet : binaryer/documentDownloader

 def __getPdfInfo(self):
     """Populate ``self.domain`` and ``self.pdfInfo`` for document ``self.pid``.

     First asks the ``viewUrl`` endpoint for the viewer location, which is
     used here as a scheme-less URL (``'https:'`` is prepended to it). The
     text between ``//`` and the following ``.`` in that location becomes
     ``self.domain``. The viewer page is then fetched, and every hidden
     ``<input>`` with an ``id`` and a ``value`` is copied into
     ``self.pdfInfo`` as ``id -> value``.

     Raises:
         IndexError: if the viewer location contains no ``//<name>.`` part.
     """
     query = {
         'g': 'Home',
         'm': 'View',
         'a': 'viewUrl',
         'cid': str(self.pid),
         'flag': '1',
     }
     viewerLocation = getHTML(makeURL('https://max.book118.com/index.php?', query))

     # Non-greedy capture: shortest text between '//' and the next '.'.
     hostLabels = re.findall(r'//(.*?)\..*', viewerLocation)
     self.domain = hostLabels[0]

     viewerHTML = getHTML('https:' + viewerLocation)
     hiddenInputs = re.findall(
         r'<input type="hidden" id="(.*?)" value="(.*?)".*?/>', viewerHTML)
     for fieldId, fieldValue in hiddenInputs:
         self.pdfInfo[fieldId] = fieldValue

Exemple #3

0

Afficher le fichier

    def __getIMG(self):
        """Download every entry of ``self.imgList`` into a fresh ``./temp``.

        Any existing ``./temp`` directory is deleted and recreated. Each
        entry is fetched as bytes from the ``img`` endpoint of
        ``self.domain`` and written to ``./temp/<n>.jpg``, where ``n`` is the
        1-based position in ``self.imgList``. After each file a progress line
        is printed and the file path is appended to ``self.imgFileList``.
        """
        tempDir = './temp'
        if os.path.exists(tempDir):
            shutil.rmtree(tempDir)
        os.makedirs(tempDir)

        for pageNo, token in enumerate(self.imgList, start=1):
            fileName = str(pageNo) + '.jpg'
            filePath = './temp/' + fileName

            # Example query string:
            # ?img=Hs92T42xAvsP_ycWPqjcj8Iw69WUDaxvq4HtxAb3Zl3WYzxX1hdIsZzydhmmGAtm
            imgUrl = makeURL(
                'http://' + self.domain + '.book118.com/img/?', {'img': token})
            data = getHTML(imgUrl, byte=True)

            with open(filePath, 'wb') as out:
                out.write(data)

            print(pageNo, '/', self.total, 'download finish', fileName)
            self.imgFileList.append(filePath)

Exemple #4

0

Afficher le fichier

Fichier : core.py Projet : vsmawoex/documentDownloader

    def __getNextPage(self, imgUrl):
        """Request the page record that follows ``imgUrl`` and store it.

        The ``GetNextPage`` URL is built from ``self.domain`` and the
        ``Url``, ``ReadLimit`` and ``Furl`` entries of ``self.pdfInfo``. The
        response is fetched once and parsed as JSON with no validation.

        Side effects: sets ``self.total`` from ``PageCount`` (only while it
        is still 0), sets ``self.index`` from ``PageIndex``, appends the
        ``NextPage`` value to ``self.imgList`` and prints a progress line.

        Returns:
            The object decoded from the JSON response.
        """
        endpoint = 'https://' + self.domain + '.book118.com/pdf/GetNextPage/?'
        params = {
            'f': self.pdfInfo['Url'],
            'img': imgUrl,
            'isMobile': 'false',
            'isNet': 'True',
            'readLimit': self.pdfInfo['ReadLimit'],
            'furl': self.pdfInfo['Furl'],
        }
        page = json.loads(getHTML(makeURL(endpoint, params)))

        # The page count is only recorded once, on the first answer.
        if self.total == 0:
            self.total = page['PageCount']
        self.index = page['PageIndex']

        nextToken = page['NextPage']
        self.imgList.append(nextToken)

        print(self.index, '/', self.total, 'url finish', nextToken)

        return page