Ejemplo n.º 1
0
 def get_year_of_nianjian(self):
     """Fetch the yearbook index page and crawl every listed year.

     Downloads ``self.url``, selects every ``.list_h li a`` anchor and, for
     each one, creates a folder under ``self.root`` named after the anchor
     text (passed through ``myutils.filenameCheck``). A ``YearPage`` worker
     is queued for every folder that was created. Workers are started in
     batches covering up to five anchors, and every worker of a batch is
     joined before the next batch begins.

     Anchors whose folder cannot be created (for example because it already
     exists) are reported on stdout and skipped.

     Errors raised while fetching or parsing the index page are not caught
     here and propagate to the caller.
     """
     batch_size = 5
     base_url = "http://tongji.cnki.net/kns55/Navi/"

     req = urllib2.Request(self.url, None, self.req_header)
     response = urllib2.urlopen(req)
     rawdata = myutils.ungzip(response)
     # Decode and parse the page once instead of once per anchor.
     anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
     year_num = len(anchors)

     for start in range(0, year_num, batch_size):
         stop = min(start + batch_size, year_num)
         for index in range(start, stop):
             li = PyQuery(anchors[index])
             folder = os.path.join(
                 self.root, myutils.filenameCheck(li.text()))
             try:
                 os.mkdir(folder)
             except Exception:
                 # Best effort: report the failure and move on.
                 print("%s created error" % (folder))
                 continue
             href = base_url + li.attr("href")
             # Hand the worker the directory that was actually created
             # (sanitized name), not the raw anchor text.
             self.threads.append(YearPage(folder, href))

         for t in self.threads:
             if not t.isAlive():
                 t.start()
         # Wait for every worker in the batch, not only the last one; this
         # is also safe when the batch produced no workers at all.
         for t in self.threads:
             t.join()
         self.threads = []
Ejemplo n.º 2
0
    def download_result(self, fileHref):
        """Download one result file and pass it to ``self.save_response``.

        The request URL is built from ``fileHref`` with its leading dots
        removed, and is opened with the cookies in ``self.cookieJar``.

        * If the server answers with an HTML page containing the login
          marker text, the cookie jar is refreshed through
          ``myutils.get_cookieJar()`` and the download is retried, at most
          three times (tracked in ``self.failTimes``). After the third
          failure the counter is reset and the method gives up silently.
        * If the Content-Type mentions ``pdf`` or ``octet-stream``, the
          response is handed to ``self.save_response``.

        Any exception raised along the way is swallowed; a line describing
        the failed download is appended to ``download_error.txt`` under
        ``self.root`` instead.

        :param fileHref: relative link of the file, e.g. starting with "..".
        """
        # Only the leading dots of the relative link are dropped; stripping
        # both ends would also eat dots at the end of the link.
        url = "http://tongji.cnki.net/kns55" + fileHref.lstrip(".")
        req = urllib2.Request(url=url, data=None, headers=self.req_header)

        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJar))
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36')]
        try:
            response = opener.open(req)
            try:
                content_type = response.info().getheader("Content-Type")
                if content_type is None:
                    # Nothing can be decided without the header; treat it
                    # as a failed download so it ends up in the error log.
                    raise ValueError("response has no Content-Type header")

                if "text/html" in content_type:
                    data = myutils.ungzip(response).decode('utf-8')
                    if u'用户登录' in data:
                        if self.failTimes >= 3:
                            self.failTimes = 0
                            return
                        # Session expired: refresh the cookies and retry.
                        self.cookieJar = myutils.get_cookieJar()
                        self.failTimes = self.failTimes + 1
                        self.download_result(fileHref)

                if "pdf" in content_type or "octet-stream" in content_type:
                    self.save_response(response)
            finally:
                response.close()
            # Throttle consecutive downloads.
            time.sleep(1)
        except Exception:
            error_log = os.path.join(self.root, "download_error.txt")
            with open(error_log, 'a') as f:
                f.write('%s---%s---%s---%s\n' % (self.root, self.pfolder, self.filename, url))
Ejemplo n.º 3
0
 def get_year_of_nianjian(self):
     """Fetch the yearbook index page and crawl every listed year.

     Downloads ``self.url``, selects every ``.list_h li a`` anchor and, for
     each one, creates a folder under ``self.root`` named after the anchor
     text (passed through ``myutils.filenameCheck``). A ``YearPage`` worker
     is queued for every folder that was created. Workers are started in
     batches covering up to five anchors, and every worker of a batch is
     joined before the next batch begins.

     Anchors whose folder cannot be created (for example because it already
     exists) are reported on stdout and skipped.

     Errors raised while fetching or parsing the index page are not caught
     here and propagate to the caller.
     """
     batch_size = 5
     base_url = "http://tongji.cnki.net/kns55/Navi/"

     req = urllib2.Request(self.url, None, self.req_header)
     response = urllib2.urlopen(req)
     rawdata = myutils.ungzip(response)
     # Decode and parse the page once instead of once per anchor.
     anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
     year_num = len(anchors)

     for start in range(0, year_num, batch_size):
         stop = min(start + batch_size, year_num)
         for index in range(start, stop):
             li = PyQuery(anchors[index])
             folder = os.path.join(
                 self.root, myutils.filenameCheck(li.text()))
             try:
                 os.mkdir(folder)
             except Exception:
                 # Best effort: report the failure and move on.
                 print("%s created error" % (folder))
                 continue
             href = base_url + li.attr("href")
             # Hand the worker the directory that was actually created
             # (sanitized name), not the raw anchor text.
             self.threads.append(YearPage(folder, href))

         for t in self.threads:
             if not t.isAlive():
                 t.start()
         # Wait for every worker in the batch, not only the last one; this
         # is also safe when the batch produced no workers at all.
         for t in self.threads:
             t.join()
         self.threads = []
Ejemplo n.º 4
0
 def run(self):
     """Entry point of the worker: parse the yearbook page for this year."""
     parse_page = self.parse_yearbook_page
     parse_page()
     
     
 def stop(self):
     """Presumably meant to stop this worker; currently does nothing useful.

     NOTE(review): the body is a bare ``self.thread_`` attribute access with
     no call or assignment. It has no effect when the attribute exists and
     raises ``AttributeError`` when it does not. The statement looks
     unfinished -- confirm the intended behavior before relying on it.
     """
     self.thread_
 def parse_yearbook_page(self):
     req = urllib2.Request(self.url,None,self.req_header)
     try:
         response = urllib2.urlopen(req)
     except Exception,e:
         f = open(os.path.join(self.root,"year_error.txt"),'a')
         f.write('%s---%s\n' % (self.root,self.url))
         f.close()
     else:
         rawdata= myutils.ungzip(response)
 #         print rawdata
         pquery = PyQuery(rawdata.decode('utf-8'))
         for li in pquery(".TreeList li"):
 
             self.pfolder = myutils.filenameCheck(PyQuery(li)("a").text())
             
             while os.path.exists(os.path.join(self.root,self.pfolder)):
                 self.pfolder = self.pfolder + "_2"
             try:
                 os.mkdir(os.path.join(self.root,self.pfolder))
             except Exception,e:
                 print "%s created error" %(os.path.join(self.root,self.pfolder))
             else:
                 strParam = PyQuery(li)("a").attr('onclick')
                 aParam = strParam.split('(')[1].strip(')').split(',')