def get_year_of_nianjian(self):
    """Fetch the yearbook index page and crawl the listed years in batches.

    Downloads ``self.url``, selects every ``.list_h li a`` anchor and walks
    the anchors in batches of up to five.  For each anchor a folder named
    after the (sanitised) anchor text is created under ``self.root``; when
    that succeeds a ``YearPage`` is queued on ``self.threads`` for the
    anchor's href.  All queued objects of a batch are started together and
    the batch is joined before the next one begins.  ``self.threads`` is
    emptied after every batch.

    Anchors whose folder cannot be created are reported on stdout and
    skipped; they still count towards the batch size.
    """
    req = urllib2.Request(self.url, None, self.req_header)
    response = urllib2.urlopen(req)
    rawdata = myutils.ungzip(response)

    # Decode and parse the page once instead of once per anchor.
    anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
    year_num = len(anchors)
    batch_size = 5

    for batch_start in range(0, year_num, batch_size):
        batch_end = min(batch_start + batch_size, year_num)
        for i in range(batch_start, batch_end):
            li = PyQuery(anchors[i])
            folder = myutils.filenameCheck(li.text())
            folder = os.path.join(self.root, folder)
            try:
                os.mkdir(folder)
            except Exception:
                # Best effort: report and move on to the next year.
                print("%s created error" % (folder))
                continue
            href = "http://tongji.cnki.net/kns55/Navi/" + li.attr("href")
            # Hand over the directory that was actually created, not the
            # unsanitised anchor text, so the two cannot diverge.
            self.threads.append(YearPage(folder, href))

        # Start the whole batch first, then wait for it; joining right after
        # each start() would run the batch one thread at a time.
        started = []
        for t in self.threads:
            if not t.is_alive():
                t.start()
                started.append(t)
        for t in started:
            t.join()
        self.threads = []
def download_result(self, fileHref):
    """Download one result file and pass it to ``self.save_response``.

    The request URL is ``http://tongji.cnki.net/kns55`` followed by
    ``fileHref`` with its leading dots removed.  The response is handled by
    its ``Content-Type``:

    * ``text/html`` containing the login marker text: a fresh cookie jar is
      fetched with ``myutils.get_cookieJar()`` and the download is retried,
      at most three times (tracked in ``self.failTimes``).  When the retry
      budget is exhausted the counter is reset and the failure is recorded.
    * ``pdf`` / ``octet-stream``: the response is saved, the retry counter
      is reset and the method sleeps for one second.
    * anything else is ignored.

    Any exception raised while downloading is caught and recorded as a line
    ``root---pfolder---filename---url`` appended to ``download_error.txt``
    under ``self.root``.

    Args:
        fileHref: href of the file to download; presumably relative and
            starting with ``..`` (verify against the caller).
    """
    # Only the leading ".." should go; strip("..") treats its argument as a
    # character set and would also eat dots at the end of the href.
    url = "http://tongji.cnki.net/kns55" + fileHref.lstrip(".")
    req = urllib2.Request(url=url, data=None, headers=self.req_header)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJar))
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36')]

    response = None
    try:
        response = opener.open(req)
        content_type = response.info().getheader("Content-Type")
        if content_type is None:
            raise IOError("response carries no Content-Type header")

        if "text/html" in content_type:
            data = myutils.ungzip(response).decode('utf-8')
            if u'用户登录' in data:
                if self.failTimes < 3:
                    # Session expired: get fresh cookies and try again.
                    self.cookieJar = myutils.get_cookieJar()
                    self.failTimes = self.failTimes + 1
                    self.download_result(fileHref)
                    return
                # Out of retries: reset the budget for the next file and
                # route this one to the error log below instead of
                # dropping it silently.
                self.failTimes = 0
                raise IOError("still on the login page after 3 re-logins")
        elif "pdf" in content_type or "octet-stream" in content_type:
            self.save_response(response)
            # A successful download must not consume the next file's
            # retry budget.
            self.failTimes = 0
            time.sleep(1)
    except Exception:
        error_log = os.path.join(self.root, "download_error.txt")
        with open(error_log, 'a') as f:
            f.write('%s---%s---%s---%s\n' % (self.root, self.pfolder, self.filename, url))
    finally:
        if response is not None:
            response.close()
def get_year_of_nianjian(self):
    """Fetch the yearbook index page and crawl the listed years in batches.

    Downloads ``self.url``, selects every ``.list_h li a`` anchor and walks
    the anchors in batches of up to five.  For each anchor a folder named
    after the (sanitised) anchor text is created under ``self.root``; when
    that succeeds a ``YearPage`` is queued on ``self.threads`` for the
    anchor's href.  All queued objects of a batch are started together and
    the batch is joined before the next one begins.  ``self.threads`` is
    emptied after every batch.

    Anchors whose folder cannot be created are reported on stdout and
    skipped; they still count towards the batch size.
    """
    req = urllib2.Request(self.url, None, self.req_header)
    response = urllib2.urlopen(req)
    rawdata = myutils.ungzip(response)

    # Decode and parse the page once instead of once per anchor.
    anchors = PyQuery(rawdata.decode('utf-8'))(".list_h li a")
    year_num = len(anchors)
    batch_size = 5

    for batch_start in range(0, year_num, batch_size):
        batch_end = min(batch_start + batch_size, year_num)
        for i in range(batch_start, batch_end):
            li = PyQuery(anchors[i])
            folder = myutils.filenameCheck(li.text())
            folder = os.path.join(self.root, folder)
            try:
                os.mkdir(folder)
            except Exception:
                # Best effort: report and move on to the next year.
                print("%s created error" % (folder))
                continue
            href = "http://tongji.cnki.net/kns55/Navi/" + li.attr("href")
            # Hand over the directory that was actually created, not the
            # unsanitised anchor text, so the two cannot diverge.
            self.threads.append(YearPage(folder, href))

        # Start the whole batch first, then wait for it; joining right after
        # each start() would run the batch one thread at a time.
        started = []
        for t in self.threads:
            if not t.is_alive():
                t.start()
                started.append(t)
        for t in started:
            t.join()
        self.threads = []
def run(self): self.parse_yearbook_page() def stop(self): self.thread_ def parse_yearbook_page(self): req = urllib2.Request(self.url,None,self.req_header) try: response = urllib2.urlopen(req) except Exception,e: f = open(os.path.join(self.root,"year_error.txt"),'a') f.write('%s---%s\n' % (self.root,self.url)) f.close() else: rawdata= myutils.ungzip(response) # print rawdata pquery = PyQuery(rawdata.decode('utf-8')) for li in pquery(".TreeList li"): self.pfolder = myutils.filenameCheck(PyQuery(li)("a").text()) while os.path.exists(os.path.join(self.root,self.pfolder)): self.pfolder = self.pfolder + "_2" try: os.mkdir(os.path.join(self.root,self.pfolder)) except Exception,e: print "%s created error" %(os.path.join(self.root,self.pfolder)) else: strParam = PyQuery(li)("a").attr('onclick') aParam = strParam.split('(')[1].strip(')').split(',')