def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.result_json_dict = {} self.id = self.get_id_num(findCode) print self.id resp = self.reqst.get('http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_one(self.one_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_two(self.two_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_three(self.three_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_four(self.four_dict, soup.find_all('table')) CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def main():
    """CLI entry point.

    Usage (from the message below): ``run.py [check] [max_crawl_time province...]``.
    ``check`` mode runs the Checker; otherwise spawns per-province crawls,
    killed by a watchdog timer after ``max_crawl_time``.
    """
    config_logging()
    if not os.path.exists(settings.json_restore_path):
        CrawlerUtils.make_dir(settings.json_restore_path)
    # NOTE(review): this assignment creates a *local* cur_date that is never
    # read in this function; crawl_province() reads a cur_date too — confirm a
    # module-level cur_date exists elsewhere, otherwise this line is dead.
    cur_date = CrawlerUtils.get_cur_y_m_d()
    set_codecracker()
    # "check" sub-command: verify yesterday's (or a given date's) results.
    if len(sys.argv) >= 2 and sys.argv[1] == "check":
        dt = None
        if len(sys.argv) == 3:
            dt = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d")
        checker = Checker(dt)
        checker.run()
        return
    if len(sys.argv) < 3:
        print 'usage: run.py [check] [max_crawl_time(minutes) province...] \n\tmax_crawl_time 最大爬取秒数,以秒计;\n\tprovince 是所要爬取的省份列表 用空格分开, all表示爬取全部)'
        return
    try:
        max_crawl_time = int(sys.argv[1])
        # NOTE(review): stored as *minutes* here, but threading.Timer below
        # and the log message treat max_crawl_time as *seconds* — one of the
        # two units is wrong; confirm which is intended.
        settings.max_crawl_time = datetime.timedelta(minutes=max_crawl_time)
    except ValueError as e:
        settings.logger.error('invalid max_crawl_time, should be a integer')
        os._exit(1)
    # Watchdog: force the whole process group down when time is up.
    timer = threading.Timer(max_crawl_time, force_exit)
    timer.start()
    settings.logger.info(u'即将开始爬取,最长爬取时间为 %s 秒' % settings.max_crawl_time)
    settings.start_crawl_time = datetime.datetime.now()
    if sys.argv[2] == 'all':
        # Crawl every supported province in parallel via the process pool.
        args = [p for p in sorted(province_crawler.keys())]
        process_pool = MyPool()
        process_pool.map(crawl_province, args)
        process_pool.close()
        settings.logger.info("wait processes....")
        process_pool.join()
    else:
        # Crawl only the provinces named on the command line, sequentially.
        provinces = sys.argv[2:]
        for p in provinces:
            if not p in province_crawler.keys():
                settings.logger.warn('province %s is not supported currently' % p)
                continue
            crawl_province(p)
def down_yesterday_pdf(yesterday): yesterday = yesterday abs_yesterday_json_url = '%s/%s/%s/%s/%s' % (settings.host, settings.ID, yesterday[:4], yesterday[4:6], yesterday[6:]) # print 'abs_yesterday_json_url:', abs_yesterday_json_url need_down_json_file_name = get_need_down_json_file_name( abs_yesterday_json_url) if need_down_json_file_name is None: print '-error__from_%s____no_data' % abs_yesterday_json_url return else: abs_yesterday_json_url = '%s/%s' % (abs_yesterday_json_url, need_down_json_file_name) # print 'abs_yesterday_json_url:',abs_yesterday_json_url abs_json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_dir, yesterday[:4], yesterday[4:6], yesterday[6:]) if not os.path.exists(abs_json_restore_dir): CrawlerUtils.make_dir(abs_json_restore_dir) abs_pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir, yesterday[:4], yesterday[4:6], yesterday[6:]) if not os.path.exists(abs_pdf_restore_dir): CrawlerUtils.make_dir(abs_pdf_restore_dir) # print 'abs_json_restore_dir:', abs_json_restore_dir get_json_file_OK = get_data_json_file(abs_yesterday_json_url, abs_json_restore_dir, need_down_json_file_name) if get_json_file_OK is False: print '-error--nodata_from_%s%s' % (abs_json_restore_dir, need_down_json_file_name) return else: abs_yesterday_json_gz_file_name = '%s/%s' % ( abs_json_restore_dir, need_down_json_file_name) abs_yesterday_json_file_name = '%s/%s%s' % (abs_json_restore_dir, yesterday, '.json') # print 'abs_yesterday_json_file_name:',abs_yesterday_json_file_name # print 'abs_yesterday_json_gz_file_name:', abs_yesterday_json_gz_file_name g = gzip.GzipFile(mode='rb', fileobj=open(abs_yesterday_json_gz_file_name, 'rb')) open(abs_yesterday_json_file_name, 'wb').write(g.read()) if os.path.isfile(abs_yesterday_json_gz_file_name): os.remove(abs_yesterday_json_gz_file_name) get_pdfs_from_data_json(abs_pdf_restore_dir, abs_yesterday_json_file_name) pass
def crawl_province(province): settings.logger.info('ready to clawer %s' % province) #创建存储路径 json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_path, province, cur_date[0], cur_date[1]) if not os.path.exists(json_restore_dir): CrawlerUtils.make_dir(json_restore_dir) #获取企业名单 enterprise_list_path = settings.enterprise_list_path + province + '.txt' #json存储文件名 json_restore_path = '%s/%s.json' % (json_restore_dir, cur_date[2]) with open(enterprise_list_path) as f: for line in f: fields = line.strip().split(",") if len(fields) < 3: continue no = fields[2] process = multiprocessing.Process(target=crawl_work, args=(province, json_restore_path, no)) process.daemon = True process.start() process.join(300) settings.logger.info('All %s crawlers work over' % province) #压缩保存 if not os.path.exists(json_restore_path): settings.logger.warn('json restore path %s does not exist!' % json_restore_path) os._exit(1) return with open(json_restore_path, 'r') as f: data = f.read() compressed_json_restore_path = json_restore_path + '.gz' with gzip.open(compressed_json_restore_path, 'wb') as cf: cf.write(data) #删除json文件,只保留 .gz 文件 os.remove(json_restore_path) os._exit(0)
def run(self, ent_name=None):
    """Resolve an enterprise name via the name-to-id crawler and persist the
    parsed search results; returns True on completion, False for no name."""
    if ent_name is None:
        return False
    crawler = NameToIDCrawler(
        './enterprise_crawler/nametoid/name_to_id.json')
    cleaned_name = str(ent_name).strip(' ').strip('\n').strip(' ')
    crawler.ent_name = cleaned_name
    # One html cache directory per enterprise name.
    self.html_restore_path = self.html_restore_path + cleaned_name + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    search_page = crawler.crawl_page_by_get_params(cleaned_name)
    crawler.results = crawler.parser.parse_search_page(page=search_page)
    # Several worker threads share the output file, so serialize the dump.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {cleaned_name: crawler.results})
    self.write_file_mutex.release()
    return True
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.uuid = self.get_id_num(findCode) print self.uuid self.result_json_dict = {} tableone = self.get_tables(self.uuid + '&tab=01') self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.uuid + '&tab=02') self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.uuid + '&tab=03') self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.uuid + '&tab=06') self.get_json_four(self.four_dict, tablefour) CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode):
    """Crawl the four gxqyxygs.gov.cn publicity pages for ``findCode``.

    Returns a JSON string ``{ent_number: sections}``; when no id can be
    resolved, returns ``{ent_number: {}}``.
    """
    self.ent_number = str(findCode)
    if not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    self.result_json_dict = {}
    self.id = self.get_id_num(findCode)
    if self.id is None:
        # BUG FIX: original read ``sjon.dumps`` — a NameError at runtime
        # instead of the intended empty-result json string.
        return json.dumps({self.ent_number: {}})
    # print self.id
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_one(self.one_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_two(self.two_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_three(self.three_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_four(self.four_dict, soup.find_all('table'))
    return json.dumps({self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.id = self.get_id_num(findCode) print self.id self.result_json_dict = {} #self.result_json_dict[findCode] = {} tableone = self.get_tables(self.mysearchdict['businessPublicity'] + 'id=' + self.id) self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.mysearchdict['enterprisePublicity'] + 'id=' + self.id) self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.mysearchdict['otherDepartment'] + 'id=' + self.id) self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.mysearchdict['justiceAssistance'] + 'id=' + self.id) self.get_json_four(self.four_dict, tablefour) #self.write_file_mutex.acquire() print {self.ent_number: self.result_json_dict} CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, ent_number=0):
    """Crawl Chongqing data for ``ent_number`` via ChongqingClawer; returns
    True on success, False when any crawl/parse step raises."""
    crawler = ChongqingClawer(
        './enterprise_crawler/chongqing/chongqing.json')
    crawler.ent_number = str(ent_number)
    # Each enterprise gets its own html cache directory.
    # NOTE(review): the per-enterprise path is stored on *crawler* but the
    # existence check / make_dir below use *self*.html_restore_path — looks
    # inconsistent; confirm which object is supposed to own the path.
    crawler.html_restore_path = self.html_restore_path + crawler.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    # NOTE(review): duplicate of the assignment above.
    crawler.ent_number = str(ent_number)
    page = crawler.crawl_check_page()
    try:
        crawler.crawl_page_jsons(page)
        crawler.parser.parse_jsons()
        crawler.parser.merge_jsons()
    except Exception as e:
        # Any failure in crawl/parse/merge is swallowed and reported as False.
        # settings.logger.error('error')
        return False
    # Multiple threads write the same file, so the dump is mutex-protected.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_number: crawler.json_dict})
    self.write_file_mutex.release()
    return True
def get_pdf(save_path, list_dict):
    # Download every pdf listed in list_dict (items with a 'pdf_url' key)
    # into a year/month/day directory derived from save_path
    # (assumes save_path is a 'YYYYMMDD' string — TODO confirm with callers).
    pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir,
                                       save_path[:4], save_path[4:6],
                                       save_path[6:])
    if not os.path.exists(pdf_restore_dir):
        CrawlerUtils.make_dir(pdf_restore_dir)
    for item in list_dict:
        pdf_url = item['pdf_url']
        count = 0
        # Retry each url up to 10 times; a non-200 or empty body counts as
        # one failed attempt.
        while count < 10:
            resp = reqst.get(pdf_url)
            if resp.status_code == 200 and resp.content:
                # Save under the url's last path component.
                with open(
                        os.path.join(pdf_restore_dir,
                                     pdf_url.rsplit('/')[-1]), 'wb') as f:
                    f.write(resp.content)
                break
            else:
                count += 1
        # All 10 attempts failed: report and move to the next pdf.
        if count == 10:
            print '%s,get-error' % pdf_url
            # settings.logger.info('%s,get-error' % pdf_url)
            continue
def run(self, findCode):
    """Crawl the four publicity sections keyed by ``search_dict`` for
    ``findCode``; returns the combined result as a JSON string (an empty
    section map when no id can be resolved)."""
    self.ent_number = str(findCode)
    if not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    self.id = self.get_id_num(findCode)
    if self.id is None:
        return json.dumps({self.ent_number: {}})
    # print self.id
    self.result_json_dict = {}
    # The same four fetch/parse pairs as before, expressed as one table.
    pages = (
        ('businessPublicity', self.one_dict, self.get_json_one),
        ('enterprisePublicity', self.two_dict, self.get_json_two),
        ('otherDepartment', self.three_dict, self.get_json_three),
        ('justiceAssistance', self.four_dict, self.get_json_four),
    )
    for page_key, section_dict, extract in pages:
        tables = self.get_tables(self.search_dict[page_key] + 'id=' + self.id)
        extract(section_dict, tables)
    return json.dumps({self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.pripid = self.get_id_num(findCode) print findCode, self.pripid self.result_json_dict = {} data = { 'method': 'qyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk1', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) # print BeautifulSoup(resp.content).prettify self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'基本信息', u'股东信息', u'变更信息') data = { 'method': 'baInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk2', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'主要人员信息', u'分支机构信息', u'清算信息') data = { 'method': 'dcdyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=120) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'动产抵押登记信息') data = { 'method': 'gqczxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'股权出质登记信息') data = { 'method': 'jyycInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk6', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'经营异常信息') data = { 'method': 'yzwfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk14', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', 
data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'严重违法信息') data = { 'method': 'cfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk3', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'ccjcInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk7', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'抽查检查信息') data = { 'method': 'qygsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk8', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'企业年报') data = { 'method': 'qygsForTzrxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk12', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股东及出资信息', u'变更信息') data = { 'method': 'cqygsForTzrbgxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk15', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股权变更信息') data = { 'method': 'qygsForXzxkInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk10', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qygsForZzcqInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk11', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, 
timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'知识产权出质登记信息') data = { 'method': 'qygsForXzcfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk13', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'qtgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk9', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qtgsForCfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk16', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'sfgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk17', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股权冻结信息') data = { 'method': 'sfgsbgInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk18', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股东变更登记信息') self.result_json_dict[ 'ind_comm_pub_reg_basic'] = self.result_json_dict[ 'ind_comm_pub_reg_basic'][0] if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys( ) and len(self.result_json_dict['ind_comm_pub_arch_liquidation']) > 0: self.result_json_dict[ 'ind_comm_pub_arch_liquidation'] = self.result_json_dict[ 'ind_comm_pub_arch_liquidation'][0] CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) nbxh = self.get_id_num(findCode) self.nbxh = nbxh result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '5') print result_dict self.get_json_one(allths=[ u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本', u'成立日期', u'住所', u'营业期限自', u'营业期限至', u'经营范围', u'登记机关', u'核准日期', u'登记状态' ], alltds=result_dict, alltds_keys=[ u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb', u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw', u'djjgmc', u'hzrq', u'mclxmc' ], head='ind_comm_pub_reg_basic') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '3') print result_dict self.get_json_one(allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'], alltds=result_dict, alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'], head='ind_comm_pub_reg_modify') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '2', '3') print result_dict self.get_json_one( allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'], alltds=result_dict, alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'], head='ind_comm_pub_reg_shareholder') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '8') print result_dict self.get_json_one(allths=[u'序号', u'姓名', u'职务'], alltds=result_dict, alltds_keys=[u'rownum', u'xm', u'zwmc'], head='ind_comm_pub_arch_key_persons') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '36') print result_dict self.get_json_one(allths=[u'清算负责人', u'清算组成员'], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_arch_liquidation') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '9') print 
result_dict self.get_json_one( allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'], alltds=result_dict, alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'], head='ind_comm_pub_arch_branch') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '25') print result_dict self.get_json_one(allths=[ u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态', u'公示日期', u'详情' ], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_movable_property_reg') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '4') print result_dict self.get_json_one(allths=[ u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人', u'证照/证件号码', u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况' ], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_equity_ownership_reg') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '1') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_administration_sanction') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '33') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_business_exception') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '34') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_serious_violate_law') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '35') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_spot_check') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '13') print result_dict self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'], alltds=result_dict, alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'], 
head='ent_pub_ent_annual_report') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '40') print result_dict self.get_json_two(allths=[ u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)', u'认缴出资日期', u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)', u'实缴出资日期', u'实缴公示日期' ], alltds=result_dict, alltds_keys=[ u'tzrmc', u'ljrje', u'ljsje', u'rjczfs', u'rjcze', u'rjczrq', u'rjgsrq', u'sjczfs', u'sjcze', u'sjczrq', u'sjgsrq' ], head='ent_pub_shareholder_capital_contribution') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '23') print result_dict self.get_json_two( allths=[u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期', u'公示日期'], alltds=result_dict, alltds_keys=[], head='ent_pub_equity_change') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '20') print result_dict self.get_json_two(allths=[ u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关', u'许可内容', u'状态', u'公示日期', u'详情' ], alltds=result_dict, alltds_keys=[ u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx', u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq', u'lsh' ], head='ent_pub_administration_license') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '21') print result_dict self.get_json_two(allths=[], alltds=result_dict, alltds_keys=[], head='ent_pub_knowledge_property') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '22') print result_dict self.get_json_two(allths=[], alltds=result_dict, alltds_keys=[], head='ent_pub_shareholder_modify') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh, '0', '37') print result_dict self.get_json_three(allths=[ u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期', u'许可机关', u'许可内容', u'状态', u'详情' ], alltds=result_dict, alltds_keys=[ u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1', u'yxq2', u'yxq', u'xkjg', u'xknr', u'zt', 
u'zt' ], head='other_dept_pub_administration_license') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh, '0', '38') print result_dict self.get_json_two(allths=[ u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容', u'作出行政处罚决定机关名称', u'作出行政处罚决定日期' ], alltds=result_dict, alltds_keys=[], head='other_dept_pub_administration_sanction') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '49') print result_dict self.get_json_four(allths=[ u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态', u'详情' ], alltds=result_dict, alltds_keys=[], head='judical_assist_pub_equity_freeze') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '53') print result_dict self.get_json_four( allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'], alltds=result_dict, alltds_keys=[], head='judical_assist_pub_shareholder_modify') CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})