Beispiel #1
0
 def get_year_of_nianjian(self):
     """Crawl every year linked from the index page, five at a time.

     Downloads ``self.url``, selects all ``.list_h li a`` anchors, creates
     one folder per anchor under ``self.root`` and queues a ``YearPage``
     worker in ``self.threads`` for each folder that could be created.
     Anchors are processed in consecutive groups of five (an anchor whose
     folder cannot be created still occupies a slot in its group); the
     workers of one group are started and all of them are joined before
     the next group is handled.
     """
     req = urllib2.Request(self.url, None, self.req_header)
     response = urllib2.urlopen(req)
     try:
         rawdata = myutils.ungzip(response)
     finally:
         # Release the connection even if decompression fails.
         response.close()

     # Parse the document once; it does not change between iterations.
     anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
     year_num = len(anchors)
     batch_size = 5

     for start in range(0, year_num, batch_size):
         for i in range(start, min(start + batch_size, year_num)):
             li = PyQuery(anchors[i])
             folder = os.path.join(
                 self.root, myutils.filenameCheck(li.text()))
             try:
                 os.mkdir(folder)
             except Exception:
                 # Best effort: report the failure and skip this year.
                 print("%s created error" % (folder))
             else:
                 href = "http://tongji.cnki.net/kns55/Navi/" + li.attr(
                     "href")
                 # NOTE(review): the worker receives the unsanitised
                 # link text as its root, while the directory created
                 # above uses the filenameCheck() result -- confirm the
                 # two always agree.
                 self.threads.append(
                     YearPage(os.path.join(self.root, li.text()), href))

         for t in self.threads:
             if not t.isAlive():
                 t.start()
         # Wait for every worker of this group, not just the last one.
         # Iterating also makes an empty group (all mkdir calls failed)
         # a no-op instead of a NameError on an unbound loop variable.
         for t in self.threads:
             t.join()
         self.threads = []
Beispiel #2
0
 def get_year_of_nianjian(self):
     """Crawl every year linked from the index page, five at a time.

     Downloads ``self.url``, selects all ``.list_h li a`` anchors, creates
     one folder per anchor under ``self.root`` and queues a ``YearPage``
     worker in ``self.threads`` for each folder that could be created.
     Anchors are processed in consecutive groups of five (an anchor whose
     folder cannot be created still occupies a slot in its group); the
     workers of one group are started and all of them are joined before
     the next group is handled.
     """
     req = urllib2.Request(self.url, None, self.req_header)
     response = urllib2.urlopen(req)
     try:
         rawdata = myutils.ungzip(response)
     finally:
         # Release the connection even if decompression fails.
         response.close()

     # Parse the document once; it does not change between iterations.
     anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
     year_num = len(anchors)
     batch_size = 5

     for start in range(0, year_num, batch_size):
         for i in range(start, min(start + batch_size, year_num)):
             li = PyQuery(anchors[i])
             folder = os.path.join(
                 self.root, myutils.filenameCheck(li.text()))
             try:
                 os.mkdir(folder)
             except Exception:
                 # Best effort: report the failure and skip this year.
                 print("%s created error" % (folder))
             else:
                 href = "http://tongji.cnki.net/kns55/Navi/" + li.attr("href")
                 # NOTE(review): the worker receives the unsanitised
                 # link text as its root, while the directory created
                 # above uses the filenameCheck() result -- confirm the
                 # two always agree.
                 self.threads.append(
                     YearPage(os.path.join(self.root, li.text()), href))

         for t in self.threads:
             if not t.isAlive():
                 t.start()
         # Wait for every worker of this group, not just the last one.
         # Iterating also makes an empty group (all mkdir calls failed)
         # a no-op instead of a NameError on an unbound loop variable.
         for t in self.threads:
             t.join()
         self.threads = []
Beispiel #3
0
     self.thread_
 def parse_yearbook_page(self):
     req = urllib2.Request(self.url,None,self.req_header)
     try:
         response = urllib2.urlopen(req)
     except Exception,e:
         f = open(os.path.join(self.root,"year_error.txt"),'a')
         f.write('%s---%s\n' % (self.root,self.url))
         f.close()
     else:
         rawdata= myutils.ungzip(response)
 #         print rawdata
         pquery = PyQuery(rawdata.decode('utf-8'))
         for li in pquery(".TreeList li"):
 
             self.pfolder = myutils.filenameCheck(PyQuery(li)("a").text())
             
             while os.path.exists(os.path.join(self.root,self.pfolder)):
                 self.pfolder = self.pfolder + "_2"
             try:
                 os.mkdir(os.path.join(self.root,self.pfolder))
             except Exception,e:
                 print "%s created error" %(os.path.join(self.root,self.pfolder))
             else:
                 strParam = PyQuery(li)("a").attr('onclick')
                 aParam = strParam.split('(')[1].strip(')').split(',')
                 param = {}
                 param["id"] = aParam[0].strip().strip("'")
                 param["code"] = aParam[1].strip().strip("'")+ "?"
                 param["type"] = aParam[2].strip().strip("'")
                 param["fileid"] = aParam[3].strip().strip("'")