def get_year_of_nianjian(self, batch_size=5):
    """Download the yearbook index page and crawl every year it links to.

    Fetches ``self.url`` with ``self.req_header``, decompresses the response
    with ``myutils.ungzip`` and selects every ``.list_h li a`` anchor.  For
    each anchor a folder is created under ``self.root`` and a ``YearPage``
    worker is queued for the linked page.  Workers are run in batches: all
    workers of a batch are started, then the batch is waited for before the
    next one begins.

    Args:
        batch_size: Maximum number of anchors handled (and therefore workers
            running at the same time) per batch.  Defaults to 5, the value
            that used to be hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    req = urllib2.Request(self.url, None, self.req_header)
    response = urllib2.urlopen(req)
    rawdata = myutils.ungzip(response)

    # Parse the document once; it used to be decoded and re-parsed for
    # every single anchor.
    anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
    year_num = len(anchors)

    for batch_start in range(0, year_num, batch_size):
        batch_end = min(batch_start + batch_size, year_num)
        for index in range(batch_start, batch_end):
            li = PyQuery(anchors[index])
            # NOTE(review): filenameCheck presumably turns the link text
            # into a usable directory name - confirm in myutils.
            folder = os.path.join(self.root, myutils.filenameCheck(li.text()))
            try:
                os.mkdir(folder)
            except Exception:
                # Best effort: a year whose folder cannot be created is
                # reported and skipped, but still counts towards the batch.
                print("%s created error" % (folder))
                continue
            href = "http://tongji.cnki.net/kns55/Navi/" + li.attr("href")
            # Hand the worker the folder that was actually created; the
            # unchecked link text may name a different path.
            self.threads.append(YearPage(folder, href))

        # Start the whole batch before waiting on any of it.  Joining right
        # after each start made the workers run one after another.
        for t in self.threads:
            if not t.isAlive():
                t.start()
        for t in self.threads:
            t.join()
        self.threads = []
def get_year_of_nianjian(self, batch_size=5):
    """Download the yearbook index page and crawl every year it links to.

    Fetches ``self.url`` with ``self.req_header``, decompresses the response
    with ``myutils.ungzip`` and selects every ``.list_h li a`` anchor.  For
    each anchor a folder is created under ``self.root`` and a ``YearPage``
    worker is queued for the linked page.  Workers are run in batches: all
    workers of a batch are started, then the batch is waited for before the
    next one begins.

    Args:
        batch_size: Maximum number of anchors handled (and therefore workers
            running at the same time) per batch.  Defaults to 5, the value
            that used to be hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    req = urllib2.Request(self.url, None, self.req_header)
    response = urllib2.urlopen(req)
    rawdata = myutils.ungzip(response)

    # Parse the document once; it used to be decoded and re-parsed for
    # every single anchor.
    anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
    year_num = len(anchors)

    for batch_start in range(0, year_num, batch_size):
        batch_end = min(batch_start + batch_size, year_num)
        for index in range(batch_start, batch_end):
            li = PyQuery(anchors[index])
            # NOTE(review): filenameCheck presumably turns the link text
            # into a usable directory name - confirm in myutils.
            folder = os.path.join(self.root, myutils.filenameCheck(li.text()))
            try:
                os.mkdir(folder)
            except Exception:
                # Best effort: a year whose folder cannot be created is
                # reported and skipped, but still counts towards the batch.
                print("%s created error" % (folder))
                continue
            href = "http://tongji.cnki.net/kns55/Navi/" + li.attr("href")
            # Hand the worker the folder that was actually created; the
            # unchecked link text may name a different path.
            self.threads.append(YearPage(folder, href))

        # Start the whole batch before waiting on any of it.  Joining right
        # after each start made the workers run one after another.
        for t in self.threads:
            if not t.isAlive():
                t.start()
        for t in self.threads:
            t.join()
        self.threads = []
self.thread_ def parse_yearbook_page(self): req = urllib2.Request(self.url,None,self.req_header) try: response = urllib2.urlopen(req) except Exception,e: f = open(os.path.join(self.root,"year_error.txt"),'a') f.write('%s---%s\n' % (self.root,self.url)) f.close() else: rawdata= myutils.ungzip(response) # print rawdata pquery = PyQuery(rawdata.decode('utf-8')) for li in pquery(".TreeList li"): self.pfolder = myutils.filenameCheck(PyQuery(li)("a").text()) while os.path.exists(os.path.join(self.root,self.pfolder)): self.pfolder = self.pfolder + "_2" try: os.mkdir(os.path.join(self.root,self.pfolder)) except Exception,e: print "%s created error" %(os.path.join(self.root,self.pfolder)) else: strParam = PyQuery(li)("a").attr('onclick') aParam = strParam.split('(')[1].strip(')').split(',') param = {} param["id"] = aParam[0].strip().strip("'") param["code"] = aParam[1].strip().strip("'")+ "?" param["type"] = aParam[2].strip().strip("'") param["fileid"] = aParam[3].strip().strip("'")