Esempio n. 1
0
def get_page_base(url, headers=None):
    """Fetch *url* and log whether the request succeeded.

    Parameters:
        url: address to download.
        headers: HTTP headers dict; when omitted, a fresh
            ``HttpHeader.LoginHeader()`` is built per call.  The previous
            default was evaluated once at definition time and shared by
            every call — the ``None`` sentinel avoids that.

    Returns:
        requests.Response: the response object, returned to the caller
        whether or not the status was successful.
    """
    if headers is None:
        headers = HttpHeader.LoginHeader()
    resp = requests.get(url=url, headers=headers)

    # Any 2xx status counts as success; the old check compared the first
    # character of str(status_code), which this range test makes explicit.
    if 200 <= resp.status_code < 300:
        log.info('下载成功,状态为%s,url:%s:。' % (resp.status_code, url))
    else:
        log.warning('下载不成功,状态为%s,url:%s:。' % (resp.status_code, url))
    return resp
Esempio n. 2
0
 def run_down(self, resp):
     """Collect the page URLs for *resp*, then download each page in turn,
     sleeping TIME_WITE seconds between pages to stay polite."""
     self.get_page_url(resp)
     for num, link in self.page_url.items():
         log.info('正在下载文档(%s)第%s页,url:%s' %
                  (self.document.id, num, link))
         self.down_page(link)
         log.info('文档(%s)第%s页下载完成,url:%s' %
                  (self.document.id, num, link))
         time.sleep(TIME_WITE)
Esempio n. 3
0
 def run(self):
     """Resolve the document's info, then dispatch to the spider class
     registered for its type; unsupported types are only logged."""
     self.get_doc_info()
     log.info(
         '你下载的文档ID为:%s,文档标题为:%s,文档类型为:%s' %
         (self.document.id, self.document.title, self.document.docType))
     spider_cls = self.WkInfo_docType[self.document.docTypeNum]
     if not spider_cls:
         log.info('你的文档类型不支持下载,文档类型为:%s' % self.document.docType)
         return
     spider_cls(self.document).run_down(self.resp)
Esempio n. 4
0
    def get_page_url(self, resp):
        """Extract per-page content URLs from the document's HTML page.

        Decodes the response body as GBK, scans it for
        (page-number, json-url) pairs, strips escape backslashes and the
        doubled ``//wk/`` path segment from each URL, and stores the
        result in ``self.page_url``.

        Parameters:
            resp: a requests.Response whose ``content`` holds the
                GBK-encoded document page.
        """
        page = resp.content.decode('gbk')
        # NOTE(review): the pattern captures a 1-3 digit page number and a
        # token-bearing .json URL; it is tightly coupled to Baidu's page
        # markup — confirm against a live page before altering it.
        self.page_url = {
            num: link.replace('\\', '').replace('//wk/', '/wk/')
            for num, link in re.findall(
                r'[,\[]{.{10,30}:(\d{1,3}).{10,30}(https:.*?\.json\?.*?token.*?)\\x22}',
                page)
        }
        # len() is the idiomatic spelling of the old .__len__() call.
        log.info('文档ID:%s,完成下载网址获取,共%s个链接。' %
                 (self.document.id, len(self.page_url)))
Esempio n. 5
0
    def down_page(self, url):
        """Download one content page and persist each text fragment.

        Parameters:
            url: the page-content JSON URL produced by get_page_url().
        """
        resp = self.get_page_base(url, HttpHeader.TextHeader())

        # unicode_escape turns the \\uXXXX sequences of the payload into
        # real characters; 'ignore' drops anything undecodable.
        cont = resp.content.decode('unicode_escape', 'ignore').replace('\r\n', '')

        # Raw string added: '\d' inside a plain literal is an invalid
        # escape and raises a SyntaxWarning on modern Python; the pattern
        # itself is unchanged.
        datas = re.findall(r'{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', cont)

        for data, index in datas:
            # Delegate persistence of the text fragment to filesave.
            filesave.Save(data, self.document)
            log.info('文档(%s)正在写入第%s页' % (self.docId, index))
Esempio n. 6
0
    def get_page_url(self,page):
        """Build the single text-content URL for this document.

        Queries Baidu's getdocinfo API, pulls md5sum / totalPageNum /
        rsign out of the response body with regexes, and stores the
        assembled content URL in ``self.page_url`` under key ``'1'``.

        Parameters:
            page: unused here — presumably kept so the signature mirrors
                the other spiders' ``get_page_url(resp)``; verify callers.
        """
        getdocinfo_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=%s' % self.docId

        resp = self.get_page_base(getdocinfo_url,HttpHeader.LoginHeader())

        content = resp.content.decode('utf-8')

        # Extract the URL parameters; [0] assumes every key is always
        # present — an IndexError here means the API answer changed shape.
        md5 = re.findall('"md5sum":"(.*?)"',content)[0]
        pn = re.findall('"totalPageNum":"(.*?)"', content)[0]
        rsign = re.findall('"rsign":"(.*?)"', content)[0]
        # Assemble the retype/text URL.
        # NOTE(review): '&type=txt' + md5 concatenates the md5 value with
        # no parameter name (yields '...type=txtMD5...'); this looks like
        # a missing '&md5sum=' — confirm against the live API before fixing.
        content_url = 'https://wkretype.bdimg.com/retype/text/' + self.docId + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
        self.page_url = {'1': content_url}
        log.info('文档ID(%s)下载网址获取完成,url:%s。' % (self.docId, content_url))
Esempio n. 7
0
 def make_path(self):
     """Create the target directory ``self.fp`` (and parents) unless it
     already exists, logging the creation."""
     if os.path.isdir(self.fp):
         return
     os.makedirs(self.fp)
     log.info('创建目录%s' % self.fp)
Esempio n. 8
0
 def run_down(self):
     """Persist the document metadata as JSON, then download its single
     text page (stored under key '1')."""
     target = self.page_url['1']
     filesave.Save(json.dumps(self.document.__dict__, ensure_ascii=False), self.document)
     log.info('文档(%s)正在下载,url:%s' % (self.docId, target))
     self.down_page(target)
     log.info('文档(%s)下载完成,url:%s' % (self.docId, target))
Esempio n. 9
0
        datas = re.findall('{"c":"(.*?)".*?"}.*?(\d+).*?\d+}', cont)

        for data,index in datas:
            # 调用 filesave 实例处理文档内容保存
            filesave.Save(data, self.document)
            log.info('文档(%s)正在写入第%s页' % (self.docId, index))

    def run_down(self):
        """Write the document metadata out as JSON, then fetch the one
        text page this document type exposes."""
        link = self.page_url['1']
        filesave.Save(json.dumps(self.document.__dict__, ensure_ascii=False), self.document)
        log.info('文档(%s)正在下载,url:%s' % (self.docId, link))
        self.down_page(link)
        log.info('文档(%s)下载完成,url:%s' % (self.docId, link))



if __name__ == '__main__':

    # Demo run against a TXT-type document.
    url = 'https://wenku.baidu.com/view/f22deb61d1f34693dbef3e8b.html?from=search'
    spider = txtSpider(url)
    spider.get_doc_info()
    spider.run_down()
    # Dump the resolved document attributes for inspection.
    for key, value in spider.document.__dict__.items():
        print(key, ':', value)
    log.info(url + ' 下载完成')


Esempio n. 10
0
 def run_down(self):
     """Download every page listed in ``self.page_url``, pausing
     TIME_WITE seconds between pages."""
     for num, link in self.page_url.items():
         log.info('正在下载文档(%s)第%s页,url:%s' % (self.docId, num, link))
         self.down_page(link)
         log.info('文档(%s)第%s页下载完成,url:%s' % (self.docId, num, link))
         time.sleep(TIME_WITE)