def get_pdfs_from_data_json(abs_pdf_restore_dir, json_file_name): f = open(json_file_name, 'r') for line in f.readlines(): list_dict = json.loads(line)['list'] for i, item in enumerate(list_dict): # print i,'---------' # print item pdf_url = item['pdf_url'] count = 0 resp = None while count < 10: resp = reqst.get(pdf_url) if resp.status_code == 200 and resp.content: with open( '%s/%s' % (abs_pdf_restore_dir, pdf_url.rsplit('/')[-1]), 'wb') as f: f.write(resp.content) break else: count += 1 if count == 10: print '%s, get_error_pdf' % pdf_url continue if count != 10: list_dict[i]['abs_path'] = '%s/%s' % (abs_pdf_restore_dir, pdf_url.rsplit('/')[-1]) # print list_dict CrawlerUtils.json_dump_to_file( '%s%s%s' % (json_file_name[:-5], '_insert', json_file_name[-5:]), {'list': list_dict}) f.close()
def test_parse_shareholder_detail_page(self):
    """Parse the saved shareholder-detail fixture and dump the result."""
    fixture = './enterprise_crawler/zongju/shareholder_detail.html'
    with open(fixture) as fp:
        html = fp.read()
    parsed = self.parser.parse_ind_comm_pub_shareholder_detail_page(html)
    CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                   {self.crawler.ent_number: parsed})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.result_json_dict = {} self.id = self.get_id_num(findCode) print self.id resp = self.reqst.get('http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_one(self.one_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_two(self.two_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_three(self.three_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_four(self.four_dict, soup.find_all('table')) CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, ent_name=None):
    """Search an enterprise by name and persist the parsed results.

    Returns False when no name is given, True after a successful dump.
    """
    if ent_name is None:
        return False
    crawler = NameToIDCrawler(
        './enterprise_crawler/nametoid/name_to_id.json')
    crawler.ent_name = str(ent_name).strip(' ').strip('\n').strip(' ')
    # Each enterprise gets its own directory for saved html pages.
    self.html_restore_path = self.html_restore_path + crawler.ent_name + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    search_page = crawler.crawl_page_by_get_params(crawler.ent_name)
    crawler.results = crawler.parser.parse_search_page(page=search_page)
    # Crawling is multi-threaded, so the shared output file is written
    # under the lock.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_name: crawler.results})
    self.write_file_mutex.release()
    return True
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.uuid = self.get_id_num(findCode) print self.uuid self.result_json_dict = {} tableone = self.get_tables(self.uuid + '&tab=01') self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.uuid + '&tab=02') self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.uuid + '&tab=03') self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.uuid + '&tab=06') self.get_json_four(self.four_dict, tablefour) CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.id = self.get_id_num(findCode) print self.id self.result_json_dict = {} #self.result_json_dict[findCode] = {} tableone = self.get_tables(self.mysearchdict['businessPublicity'] + 'id=' + self.id) self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.mysearchdict['enterprisePublicity'] + 'id=' + self.id) self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.mysearchdict['otherDepartment'] + 'id=' + self.id) self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.mysearchdict['justiceAssistance'] + 'id=' + self.id) self.get_json_four(self.four_dict, tablefour) #self.write_file_mutex.acquire() print {self.ent_number: self.result_json_dict} CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, ent_number=0):
    """Crawl Chongqing data for one enterprise.

    Returns True when the crawl/parse/merge pipeline succeeded and the
    result was written, False on any failure inside the pipeline.
    """
    crawler = ChongqingClawer(
        './enterprise_crawler/chongqing/chongqing.json')
    crawler.ent_number = str(ent_number)
    # Each enterprise gets its own directory for saved html pages.
    # BUG FIX: the per-enterprise path is stored on the crawler, so the
    # existence check / mkdir must target that path; the original tested
    # and created self.html_restore_path, leaving the per-enterprise
    # directory missing.  (Also dropped a duplicate ent_number assignment.)
    crawler.html_restore_path = self.html_restore_path + crawler.ent_number + '/'
    if settings.save_html and not os.path.exists(crawler.html_restore_path):
        CrawlerUtils.make_dir(crawler.html_restore_path)
    page = crawler.crawl_check_page()
    try:
        crawler.crawl_page_jsons(page)
        crawler.parser.parse_jsons()
        crawler.parser.merge_jsons()
    except Exception:
        # Best-effort: any crawl/parse failure aborts this enterprise.
        # settings.logger.error('error')
        return False
    # Crawling is multi-threaded, so the shared output file is written
    # under the lock.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_number: crawler.json_dict})
    self.write_file_mutex.release()
    return True
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.pripid = self.get_id_num(findCode) print findCode, self.pripid self.result_json_dict = {} data = { 'method': 'qyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk1', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) # print BeautifulSoup(resp.content).prettify self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'基本信息', u'股东信息', u'变更信息') data = { 'method': 'baInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk2', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'主要人员信息', u'分支机构信息', u'清算信息') data = { 'method': 'dcdyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=120) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'动产抵押登记信息') data = { 'method': 'gqczxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'股权出质登记信息') data = { 'method': 'jyycInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk6', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'经营异常信息') data = { 'method': 'yzwfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk14', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', 
data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'严重违法信息') data = { 'method': 'cfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk3', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'ccjcInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk7', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'抽查检查信息') data = { 'method': 'qygsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk8', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'企业年报') data = { 'method': 'qygsForTzrxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk12', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股东及出资信息', u'变更信息') data = { 'method': 'cqygsForTzrbgxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk15', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股权变更信息') data = { 'method': 'qygsForXzxkInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk10', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qygsForZzcqInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk11', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, 
timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'知识产权出质登记信息') data = { 'method': 'qygsForXzcfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk13', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'qtgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk9', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qtgsForCfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk16', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'sfgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk17', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股权冻结信息') data = { 'method': 'sfgsbgInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk18', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股东变更登记信息') self.result_json_dict[ 'ind_comm_pub_reg_basic'] = self.result_json_dict[ 'ind_comm_pub_reg_basic'][0] if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys( ) and len(self.result_json_dict['ind_comm_pub_arch_liquidation']) > 0: self.result_json_dict[ 'ind_comm_pub_arch_liquidation'] = self.result_json_dict[ 'ind_comm_pub_arch_liquidation'][0] CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode):
    """Crawl every Guizhou publicity section for one enterprise.

    Each section is fetched by POSTing the enterprise's internal id
    (``nbxh``) plus a pair of magic type codes to the search endpoint.
    ``allths`` lists the display column headers, ``alltds_keys`` the
    matching field names in the JSON response; an empty ``alltds_keys``
    presumably lets the handler derive fields itself — TODO confirm.
    The accumulated ``self.result_json_dict`` is dumped at the end.
    """
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for saved html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    nbxh = self.get_id_num(findCode)
    self.nbxh = nbxh
    # Basic registration info (type codes '0'/'5').
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '5')
    print result_dict
    self.get_json_one(allths=[
        u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本',
        u'成立日期', u'住所', u'营业期限自', u'营业期限至', u'经营范围', u'登记机关',
        u'核准日期', u'登记状态'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb',
                          u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw',
                          u'djjgmc', u'hzrq', u'mclxmc'
                      ],
                      head='ind_comm_pub_reg_basic')
    # Registration modifications.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '3')
    print result_dict
    self.get_json_one(allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'],
                      alltds=result_dict,
                      alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'],
                      head='ind_comm_pub_reg_modify')
    # Shareholder list (note the first type code is '2' here).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '2', '3')
    print result_dict
    self.get_json_one(
        allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'],
        alltds=result_dict,
        alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'],
        head='ind_comm_pub_reg_shareholder')
    # Key personnel.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '8')
    print result_dict
    self.get_json_one(allths=[u'序号', u'姓名', u'职务'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'xm', u'zwmc'],
                      head='ind_comm_pub_arch_key_persons')
    # Liquidation records.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '36')
    print result_dict
    self.get_json_one(allths=[u'清算负责人', u'清算组成员'],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_arch_liquidation')
    # Branch offices.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '9')
    print result_dict
    self.get_json_one(
        allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'],
        alltds=result_dict,
        alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'],
        head='ind_comm_pub_arch_branch')
    # Movable property (chattel mortgage) registrations.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '25')
    print result_dict
    self.get_json_one(allths=[
        u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态',
        u'公示日期', u'详情'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_movable_property_reg')
    # Equity pledge registrations.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '4')
    print result_dict
    self.get_json_one(allths=[
        u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人',
        u'证照/证件号码', u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_equity_ownership_reg')
    # Administrative sanctions.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '1')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_administration_sanction')
    # Business abnormality list.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '33')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_business_exception')
    # Serious law violations.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '34')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_serious_violate_law')
    # Spot checks.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '35')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_spot_check')
    # Annual reports (enterprise self-published section starts here).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '13')
    print result_dict
    self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'],
                      head='ent_pub_ent_annual_report')
    # Shareholder capital contributions.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '40')
    print result_dict
    self.get_json_two(allths=[
        u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)',
        u'认缴出资日期', u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)',
        u'实缴出资日期', u'实缴公示日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'tzrmc', u'ljrje', u'ljsje', u'rjczfs', u'rjcze',
                          u'rjczrq', u'rjgsrq', u'sjczfs', u'sjcze',
                          u'sjczrq', u'sjgsrq'
                      ],
                      head='ent_pub_shareholder_capital_contribution')
    # Equity changes.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '23')
    print result_dict
    self.get_json_two(
        allths=[u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期',
                u'公示日期'],
        alltds=result_dict,
        alltds_keys=[],
        head='ent_pub_equity_change')
    # Administrative licenses (enterprise-published).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '20')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关',
        u'许可内容', u'状态', u'公示日期', u'详情'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx',
                          u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq',
                          u'lsh'
                      ],
                      head='ent_pub_administration_license')
    # Intellectual-property pledges.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '21')
    print result_dict
    self.get_json_two(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_knowledge_property')
    # Shareholder modifications (enterprise-published).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '22')
    print result_dict
    self.get_json_two(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_shareholder_modify')
    # Other-department licenses come from the *old data* endpoint.
    # NOTE(review): the keys list ends with u'zt' twice — looks like a
    # copy-paste slip in the upstream mapping; kept verbatim here.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh,
        '0', '37')
    print result_dict
    self.get_json_three(allths=[
        u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期',
        u'许可机关', u'许可内容', u'状态', u'详情'
    ],
                        alltds=result_dict,
                        alltds_keys=[
                            u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1',
                            u'yxq2', u'yxq', u'xkjg', u'xknr', u'zt',
                            u'zt'
                        ],
                        head='other_dept_pub_administration_license')
    # Other-department sanctions (old data endpoint; routed through
    # get_json_two — presumably intentional, TODO confirm).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh,
        '0', '38')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容',
        u'作出行政处罚决定机关名称', u'作出行政处罚决定日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='other_dept_pub_administration_sanction')
    # Judicial-assistance equity freezes.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '49')
    print result_dict
    self.get_json_four(allths=[
        u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态',
        u'详情'
    ],
                       alltds=result_dict,
                       alltds_keys=[],
                       head='judical_assist_pub_equity_freeze')
    # Judicial-assistance shareholder changes.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '53')
    print result_dict
    self.get_json_four(
        allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'],
        alltds=result_dict,
        alltds_keys=[],
        head='judical_assist_pub_shareholder_modify')
    CrawlerUtils.json_dump_to_file(
        self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode):
    """Crawl Yunnan data for one enterprise, caching download args in the DB.

    Fast path: when a ``CrawlerDownloadArgs`` row with a stored ``uuid``
    already exists, the four tab pages are fetched directly and a
    single-element result list is returned.  Slow path: resolve the id via
    the search page (after captcha cracking), upsert one args row per
    search hit, crawl each hit, and return the list of per-enterprise
    results.  On id-resolution failure a JSON string of an empty result is
    returned — NOTE(review): the return type thus varies (list vs str);
    confirm callers handle both.
    """
    self.ent_number = findCode
    # Look the enterprise up by any of its three identifiers.
    id_args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number).first() \
        or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
        or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
    print id_args
    if id_args and id_args.download_args.get('uuid'):
        # Fast path: reuse the cached uuid, no search / captcha needed.
        self.result_json_dict = {}
        self.uuid = id_args.download_args['uuid']
        tableone = self.get_tables(self.uuid + '&tab=01')
        self.get_json_one(self.one_dict, tableone)
        tabletwo = self.get_tables(self.uuid + '&tab=02')
        self.get_json_two(self.two_dict, tabletwo)
        tablethree = self.get_tables(self.uuid + '&tab=03')
        self.get_json_three(self.three_dict, tablethree)
        tablefour = self.get_tables(self.uuid + '&tab=06')
        self.get_json_four(self.four_dict, tablefour)
        CrawlerUtils.json_dump_to_file(
            'yunnan.json', {self.ent_number: self.result_json_dict})
        print json.dumps({self.ent_number: self.result_json_dict})
        return [{self.ent_number: self.result_json_dict}]
    else:
        # Create the per-province output directory.
        html_restore_path = self.json_restore_path + '/yunnan/'
        if not os.path.exists(html_restore_path):
            os.makedirs(html_restore_path)
        # get_id_num drives the search + captcha flow and, as a side
        # effect, fills self.after_crack_checkcode_page — TODO confirm.
        self.uuid = self.get_id_num(findCode)
        if self.uuid is None:
            return json.dumps({self.ent_number: {}})
        self.result_json_dict_list = []
        # Each 'list-item' div is one search hit.
        for div in BeautifulSoup(self.after_crack_checkcode_page,
                                 'html.parser').find_all(
                                     'div', attrs={'class': 'list-item'}):
            hrefa = div.find_all('a', attrs={'target': '_blank'})[0]
            if hrefa:
                # The uuid is the href up to the first query separator.
                self.uuid = hrefa['href'].split('&')[0]
                self.enterprise_name = div.find_all(
                    'div', attrs={'class': 'link'})[0].get_text().strip()
                self.ent_number = div.find_all(
                    'span')[0].get_text().strip()
                # Replace any stale args row for this enterprise.
                # NOTE(review): the first filter lacks .first() unlike
                # the others — args may be a queryset here; delete()
                # still works on querysets, but confirm intent.
                args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number)\
                    or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
                    or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
                if args:
                    args.delete()
                args = CrawlerDownloadArgs(
                    province='yunnan',
                    register_number=self.ent_number,
                    unifield_number=self.ent_number,
                    enterprise_name=self.enterprise_name,
                    download_args={'uuid': self.uuid})
                args.save()
            else:
                continue
            print self.uuid
            self.result_json_dict = {}
            tableone = self.get_tables(self.uuid + '&tab=01')
            self.get_json_one(self.one_dict, tableone)
            tabletwo = self.get_tables(self.uuid + '&tab=02')
            self.get_json_two(self.two_dict, tabletwo)
            tablethree = self.get_tables(self.uuid + '&tab=03')
            self.get_json_three(self.three_dict, tablethree)
            tablefour = self.get_tables(self.uuid + '&tab=06')
            self.get_json_four(self.four_dict, tablefour)
            CrawlerUtils.json_dump_to_file(
                'yunnan.json', {self.ent_number: self.result_json_dict})
            print json.dumps({self.ent_number: self.result_json_dict})
            self.result_json_dict_list.append(
                {self.ent_number: self.result_json_dict})
        return self.result_json_dict_list
def test_parse_annual_report_page(self):
    """Parse the saved annual-report fixture and dump the result."""
    fixture = './enterprise_crawler/zongju/annual_report.html'
    with open(fixture) as fp:
        html = fp.read()
    parsed = self.parser.parse_ent_pub_annual_report_page(html)
    CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                   {self.crawler.ent_number: parsed})
# Fragment: enclosing scope (the open handle ``f``, ``json_file_item``,
# ``max_crawl_time``) is defined before this view — indentation below is
# reconstructed; TODO confirm against the full file.
for line in f.readlines():
    # One child process per JSON line downloads that line's PDFs,
    # bounded by max_crawl_time.
    process = multiprocessing.Process(
        target=get_pdf, args=(json_file_item, json.loads(line)['list']))
    process.daemon = True
    process.start()
    process.join(max_crawl_time)
    print 'child process exit'
    # settings.logger.info('child process exit')
f.close()
# Map each downloaded PDF's public URL to its absolute on-disk path.
# json_file_item is presumably a YYYYMMDD-style name sliced into
# year/month/day path segments — TODO confirm.
need_dict = {}
for pdf_item in os.listdir(
        '%s/%s/%s/%s/' % (settings.pdf_restore_dir, json_file_item[:4],
                          json_file_item[4:6], json_file_item[6:])):
    if pdf_item.split('.')[1] == 'pdf':
        need_dict[
            'http://rmfygg.court.gov.cn/psca/lgnot/bulletin/download/' +
            pdf_item] = '%s/%s/%s/%s/%s' % (
                os.path.abspath(os.curdir), json_file_item[:4],
                json_file_item[4:6], json_file_item[6:], pdf_item)
# Persist the URL→path index next to the PDFs.
CrawlerUtils.json_dump_to_file(
    '%s/%s/%s/%s/%s%s' % (settings.pdf_restore_dir, json_file_item[:4],
                          json_file_item[4:6], json_file_item[6:],
                          json_file_item, '_pdf.json'), need_dict)