reg = r'doOpen\(\'(.*?)\'\)' pattern = re.compile(reg) pri = pattern.findall(str(priName))[0] regID = regIDlist[i].contents[0] entdict = dict(Name=Name, regID=regID, Date=cdate, pri=pri) self.PrintInfo(entdict) except Exception: self.printitemerror(pageNos, i) continue if br == 1: break def PrintInfo(self, ent): req = Request(url='http://gsxt.scaic.gov.cn/ztxy.do', data=self.getinfopostdata(ent.get('pri')), headers={'User-Agent': 'Magic Browser'}) inforesult = self.gethtml(req) infolist = inforesult.find('tr', attrs={'name': 'yc'}).findAll('td') self.gendown(ent, infolist) if __name__ == '__main__': location = '四川' YCParser = GetYCParser() YCParser.GetYC(location, startdate=date(1900, 11, 1), enddate=date.today() - timedelta(days=0), fmode='a')
info = infolist[i] reg = r'"specause":"(.*?)"' pattern = re.compile(reg) inreason = pattern.findall(info) f.write(inreason[0] + '|') reg = r'"abntime":"(.*?)"' pattern = re.compile(reg) intime = pattern.findall(info) f.write(intime[0] + '|') reg = r'"remexcpres":"(.*?)"' pattern = re.compile(reg) outreason = pattern.findall(info) if outreason: f.write(outreason[0]) f.write('|') reg = r'"remdate":"(.*?)"' pattern = re.compile(reg) outtime = pattern.findall(info) if outtime: f.write(outtime[0]) f.write('|') reg = r'"decorg":"(.*?)"' pattern = re.compile(reg) org = pattern.findall(info) f.write(org[0] + '|') f.write('\n') if __name__ == '__main__': location = '重庆' YCParser = GetYCParser() YCParser.GetYC(location, startdate=date(1900, 10, 9), enddate=date.today())
if br==1:break def PrintInfo(self,ent,f): time.sleep(2) req=Request( url='http://aic.hainan.gov.cn:1888/aiccips/GSpublicity/GSpublicityList.html?service=cipUnuDirInfo', data=self.getinfopostdata(ent), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0', 'Content-Length':'71', 'Cookie':self.gen_cookie(), 'Content-Type': 'application/x-www-form-urlencoded'}) inforesult=self.gethtml(req) infolist=inforesult.findAll('td') l=int(len(infolist)/6) for j in range(l): f.write(self.to_utf8(ent.get('Name'))+'|') f.write(self.to_utf8(ent.get('reg').strip())+'|') for k in range(6): i=j*6+k infostr=infolist[i].contents if infostr: infostr=infostr[0] f.write(self.to_utf8(infostr.replace('\n','').strip())) f.write('|') f.write('\n') if __name__=='__main__': location='海南' YCParser=GetYCParser() YCParser.GetYC(location,startdate=date(2015,8,10),enddate=date.today()-timedelta(days=0))
if self.checkname(Name) == False: continue regID = regIDlist[i].contents[0] regID = self.dealID(regID) href = infolist[i].get('href') entdict = dict(Name=Name, regID=regID, Date=cdate, href=href) self.PrintInfo(entdict) except Exception as e: print e print traceback.print_exc() self.printitemerror(pageNos, i) continue if br == 1: break def PrintInfo(self, ent): req = Request(url='http://www.ahcredit.gov.cn' + ent.get('href'), headers={'User-Agent': 'Magic Browser'}) inforesult = self.gethtml(req) infolist = inforesult.find('table', attrs={ 'id': 'excTab' }).findAll('td') self.gendown(ent, infolist) if __name__ == '__main__': location = '安徽' YCParser = GetYCParser() YCParser.GetYC(location, startdate=date(2015, 01, 1), enddate=date.today())
if k==1:break def PrintInfo(self,ent,f): #取得注册号 infourl='http://tjcredit.gov.cn/platform/saic/viewBaseExc.ftl?entId='+ent.get('entId') inforesult=self.gethtml(infourl) id=inforesult.findAll('span')[1].contents[0][5:] #取得经营异常信息 infourl='http://tjcredit.gov.cn/platform/saic/baseInfo.json?entId='+ent.get('entId')+'&departmentId=scjgw&infoClassId=qyjyycmlxx' inforesult=self.gethtml(infourl) infolist=inforesult.findAll('td',attrs={'class':''}) l=int(len(infolist)/6) for j in range(l): f.write(ent.get('Name')+'|') f.write(id+'|') for k in range(6): i=j*6+k infostr=infolist[i].contents if infostr: infostr=infostr[0] if i==2:f.write(str(self.changedate(str(infostr)))) else:f.write(infostr.replace('\n','').strip()) f.write('|') f.write('\n') if __name__=='__main__': location='天津' YCParser=GetYCParser() YCParser.GetYC(location,startdate=date(1900,10,8),enddate=date.today()-timedelta(days=0))
day=int(cdate[k2+1:k3-1]) else: k1=cdate.find('-') if k1!=-1: k2=cdate[5:].find('-')+5 month=int(cdate[5:k2]) day=int(cdate[k2+1:k3]) else: print(cdate+'\n') year=input('year=') month=input('month=') day=input('day=') return date(year,month,day) if __name__=='__main__': yc=YCParser() rs='D:/GSXT/GSXTresult/' dirlist=os.listdir(rs) frecord=open('D:/GSXT/GSXT整理temp.txt','w') total=0 ntotal=0 idlist={} for dirr in dirlist: f=open(rs+dirr,'r') k=dirr.find('.') prov=dirr[:k] #取省份 recordlist={} #记录字典 yclist={} for line in f.readlines(): total+=1 if total % 10000==0:print(total)
else: k1 = cdate.find('-') if k1 != -1: k2 = cdate[5:].find('-') + 5 month = int(cdate[5:k2]) day = int(cdate[k2 + 1:k3]) else: print(cdate + '\n') year = input('year=') month = input('month=') day = input('day=') return date(year, month, day) if __name__ == '__main__': yc = YCParser() rs = 'D:/GSXT/GSXTresult/' dirlist = os.listdir(rs) frecord = open('D:/GSXT/GSXT整理temp.txt', 'w') total = 0 ntotal = 0 idlist = {} for dirr in dirlist: f = open(rs + dirr, 'r') k = dirr.find('.') prov = dirr[:k] #取省份 recordlist = {} #记录字典 yclist = {} for line in f.readlines(): total += 1 if total % 10000 == 0: print(total)
href=href) self.PrintInfo(entdict) except Exception: self.printitemerror(pageNos, i) continue if br == 1: break def PrintInfo(self, ent): req = urllib.request.Request( url='http://222.143.24.157' + ent.get('href'), headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0' }) inforesult = self.gethtml(req) infolist = inforesult.find('table', attrs={ 'id': 'excTab' }).findAll('td') self.gendown(ent, infolist) if __name__ == '__main__': location = '河南' YCParser = GetYCParser() YCParser.GetYC(location, startdate=date(1900, 10, 8), enddate=date.today() - timedelta(days=0), fmode='a', pagemode='a', itemmode='a')