def get_pdfs_from_data_json(abs_pdf_restore_dir, json_file_name): f = open(json_file_name, 'r') for line in f.readlines(): list_dict = json.loads(line)['list'] for i, item in enumerate(list_dict): # print i,'---------' # print item pdf_url = item['pdf_url'] count = 0 resp = None while count < 10: resp = reqst.get(pdf_url) if resp.status_code == 200 and resp.content: with open( '%s/%s' % (abs_pdf_restore_dir, pdf_url.rsplit('/')[-1]), 'wb') as f: f.write(resp.content) break else: count += 1 if count == 10: print '%s, get_error_pdf' % pdf_url continue if count != 10: list_dict[i]['abs_path'] = '%s/%s' % (abs_pdf_restore_dir, pdf_url.rsplit('/')[-1]) # print list_dict CrawlerUtils.json_dump_to_file( '%s%s%s' % (json_file_name[:-5], '_insert', json_file_name[-5:]), {'list': list_dict}) f.close()
def test_parse_shareholder_detail_page(self):
    """Parse the saved shareholder-detail fixture and dump the result."""
    fixture = './enterprise_crawler/zongju/shareholder_detail.html'
    with open(fixture) as fp:
        html = fp.read()
    parsed = self.parser.parse_ind_comm_pub_shareholder_detail_page(html)
    CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                   {self.crawler.ent_number: parsed})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.result_json_dict = {} self.id = self.get_id_num(findCode) print self.id resp = self.reqst.get('http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_one(self.one_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_two(self.two_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_three(self.three_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_four(self.four_dict, soup.find_all('table')) CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, ent_name=None):
    """Search an enterprise by name and persist the parsed results.

    Returns False when no name is given, True after a successful dump.
    """
    if ent_name is None:
        return False
    crawler = NameToIDCrawler(
        './enterprise_crawler/nametoid/name_to_id.json')
    crawler.ent_name = str(ent_name).strip(' ').strip('\n').strip(' ')
    # Each enterprise gets its own directory for saved html pages.
    self.html_restore_path = self.html_restore_path + crawler.ent_name + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    search_page = crawler.crawl_page_by_get_params(crawler.ent_name)
    crawler.results = crawler.parser.parse_search_page(page=search_page)
    # Crawling is multi-threaded, so the shared output file is written
    # under the lock.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_name: crawler.results})
    self.write_file_mutex.release()
    return True
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.uuid = self.get_id_num(findCode) print self.uuid self.result_json_dict = {} tableone = self.get_tables(self.uuid + '&tab=01') self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.uuid + '&tab=02') self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.uuid + '&tab=03') self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.uuid + '&tab=06') self.get_json_four(self.four_dict, tablefour) CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.id = self.get_id_num(findCode) print self.id self.result_json_dict = {} #self.result_json_dict[findCode] = {} tableone = self.get_tables(self.mysearchdict['businessPublicity'] + 'id=' + self.id) self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.mysearchdict['enterprisePublicity'] + 'id=' + self.id) self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.mysearchdict['otherDepartment'] + 'id=' + self.id) self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.mysearchdict['justiceAssistance'] + 'id=' + self.id) self.get_json_four(self.four_dict, tablefour) #self.write_file_mutex.acquire() print {self.ent_number: self.result_json_dict} CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, ent_number=0):
    """Crawl Chongqing data for one enterprise.

    Returns True when the crawl/parse/merge pipeline succeeded and the
    result was written, False on any failure inside the pipeline.
    """
    crawler = ChongqingClawer(
        './enterprise_crawler/chongqing/chongqing.json')
    crawler.ent_number = str(ent_number)
    # Each enterprise gets its own directory for saved html pages.
    # BUG FIX: the per-enterprise path is stored on the crawler, so the
    # existence check / mkdir must target that path; the original tested
    # and created self.html_restore_path, leaving the per-enterprise
    # directory missing.  (Also dropped a duplicate ent_number assignment.)
    crawler.html_restore_path = self.html_restore_path + crawler.ent_number + '/'
    if settings.save_html and not os.path.exists(crawler.html_restore_path):
        CrawlerUtils.make_dir(crawler.html_restore_path)
    page = crawler.crawl_check_page()
    try:
        crawler.crawl_page_jsons(page)
        crawler.parser.parse_jsons()
        crawler.parser.merge_jsons()
    except Exception:
        # Best-effort: any crawl/parse failure aborts this enterprise.
        # settings.logger.error('error')
        return False
    # Crawling is multi-threaded, so the shared output file is written
    # under the lock.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_number: crawler.json_dict})
    self.write_file_mutex.release()
    return True
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.pripid = self.get_id_num(findCode) print findCode, self.pripid self.result_json_dict = {} data = { 'method': 'qyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk1', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) # print BeautifulSoup(resp.content).prettify self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'基本信息', u'股东信息', u'变更信息') data = { 'method': 'baInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk2', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'主要人员信息', u'分支机构信息', u'清算信息') data = { 'method': 'dcdyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=120) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'动产抵押登记信息') data = { 'method': 'gqczxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'股权出质登记信息') data = { 'method': 'jyycInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk6', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'经营异常信息') data = { 'method': 'yzwfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk14', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', 
data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'严重违法信息') data = { 'method': 'cfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk3', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'ccjcInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk7', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'抽查检查信息') data = { 'method': 'qygsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk8', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'企业年报') data = { 'method': 'qygsForTzrxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk12', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股东及出资信息', u'变更信息') data = { 'method': 'cqygsForTzrbgxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk15', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股权变更信息') data = { 'method': 'qygsForXzxkInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk10', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qygsForZzcqInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk11', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, 
timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'知识产权出质登记信息') data = { 'method': 'qygsForXzcfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk13', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'qtgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk9', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qtgsForCfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk16', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'sfgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk17', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股权冻结信息') data = { 'method': 'sfgsbgInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk18', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股东变更登记信息') self.result_json_dict[ 'ind_comm_pub_reg_basic'] = self.result_json_dict[ 'ind_comm_pub_reg_basic'][0] if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys( ) and len(self.result_json_dict['ind_comm_pub_arch_liquidation']) > 0: self.result_json_dict[ 'ind_comm_pub_arch_liquidation'] = self.result_json_dict[ 'ind_comm_pub_arch_liquidation'][0] CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode):
    """Crawl every Guizhou publicity section for one enterprise.

    Each section is fetched by POSTing the enterprise's internal id
    (``nbxh``) plus a pair of magic type codes to the search endpoint.
    ``allths`` lists the display column headers, ``alltds_keys`` the
    matching field names in the JSON response; an empty ``alltds_keys``
    presumably lets the handler derive fields itself — TODO confirm.
    The accumulated ``self.result_json_dict`` is dumped at the end.
    """
    self.ent_number = str(findCode)
    # Each enterprise gets its own directory for saved html pages.
    self.html_restore_path = self.html_restore_path + self.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    nbxh = self.get_id_num(findCode)
    self.nbxh = nbxh
    # Basic registration info (type codes '0'/'5').
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '5')
    print result_dict
    self.get_json_one(allths=[
        u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本',
        u'成立日期', u'住所', u'营业期限自', u'营业期限至', u'经营范围', u'登记机关',
        u'核准日期', u'登记状态'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb',
                          u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw',
                          u'djjgmc', u'hzrq', u'mclxmc'
                      ],
                      head='ind_comm_pub_reg_basic')
    # Registration modifications.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '3')
    print result_dict
    self.get_json_one(allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'],
                      alltds=result_dict,
                      alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'],
                      head='ind_comm_pub_reg_modify')
    # Shareholder list (note the first type code is '2' here).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '2', '3')
    print result_dict
    self.get_json_one(
        allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'],
        alltds=result_dict,
        alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'],
        head='ind_comm_pub_reg_shareholder')
    # Key personnel.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '8')
    print result_dict
    self.get_json_one(allths=[u'序号', u'姓名', u'职务'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'xm', u'zwmc'],
                      head='ind_comm_pub_arch_key_persons')
    # Liquidation records.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '36')
    print result_dict
    self.get_json_one(allths=[u'清算负责人', u'清算组成员'],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_arch_liquidation')
    # Branch offices.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '9')
    print result_dict
    self.get_json_one(
        allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'],
        alltds=result_dict,
        alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'],
        head='ind_comm_pub_arch_branch')
    # Movable property (chattel mortgage) registrations.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '25')
    print result_dict
    self.get_json_one(allths=[
        u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态',
        u'公示日期', u'详情'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_movable_property_reg')
    # Equity pledge registrations.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '4')
    print result_dict
    self.get_json_one(allths=[
        u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人',
        u'证照/证件号码', u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_equity_ownership_reg')
    # Administrative sanctions.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '1')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_administration_sanction')
    # Business abnormality list.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '33')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_business_exception')
    # Serious law violations.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '34')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_serious_violate_law')
    # Spot checks.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '35')
    print result_dict
    self.get_json_one(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ind_comm_pub_spot_check')
    # Annual reports (enterprise self-published section starts here).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '13')
    print result_dict
    self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'],
                      alltds=result_dict,
                      alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'],
                      head='ent_pub_ent_annual_report')
    # Shareholder capital contributions.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '40')
    print result_dict
    self.get_json_two(allths=[
        u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)',
        u'认缴出资日期', u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)',
        u'实缴出资日期', u'实缴公示日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'tzrmc', u'ljrje', u'ljsje', u'rjczfs', u'rjcze',
                          u'rjczrq', u'rjgsrq', u'sjczfs', u'sjcze',
                          u'sjczrq', u'sjgsrq'
                      ],
                      head='ent_pub_shareholder_capital_contribution')
    # Equity changes.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '23')
    print result_dict
    self.get_json_two(
        allths=[u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期',
                u'公示日期'],
        alltds=result_dict,
        alltds_keys=[],
        head='ent_pub_equity_change')
    # Administrative licenses (enterprise-published).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '20')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关',
        u'许可内容', u'状态', u'公示日期', u'详情'
    ],
                      alltds=result_dict,
                      alltds_keys=[
                          u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx',
                          u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq',
                          u'lsh'
                      ],
                      head='ent_pub_administration_license')
    # Intellectual-property pledges.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '21')
    print result_dict
    self.get_json_two(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_knowledge_property')
    # Shareholder modifications (enterprise-published).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '22')
    print result_dict
    self.get_json_two(allths=[],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='ent_pub_shareholder_modify')
    # Other-department licenses come from the *old data* endpoint.
    # NOTE(review): the keys list ends with u'zt' twice — looks like a
    # copy-paste slip in the upstream mapping; kept verbatim here.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh,
        '0', '37')
    print result_dict
    self.get_json_three(allths=[
        u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期',
        u'许可机关', u'许可内容', u'状态', u'详情'
    ],
                        alltds=result_dict,
                        alltds_keys=[
                            u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1',
                            u'yxq2', u'yxq', u'xkjg', u'xknr', u'zt',
                            u'zt'
                        ],
                        head='other_dept_pub_administration_license')
    # Other-department sanctions (old data endpoint; routed through
    # get_json_two — presumably intentional, TODO confirm).
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh,
        '0', '38')
    print result_dict
    self.get_json_two(allths=[
        u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容',
        u'作出行政处罚决定机关名称', u'作出行政处罚决定日期'
    ],
                      alltds=result_dict,
                      alltds_keys=[],
                      head='other_dept_pub_administration_sanction')
    # Judicial-assistance equity freezes.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '49')
    print result_dict
    self.get_json_four(allths=[
        u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态',
        u'详情'
    ],
                       alltds=result_dict,
                       alltds_keys=[],
                       head='judical_assist_pub_equity_freeze')
    # Judicial-assistance shareholder changes.
    result_dict = self.send_post(
        'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh,
        '0', '53')
    print result_dict
    self.get_json_four(
        allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'],
        alltds=result_dict,
        alltds_keys=[],
        head='judical_assist_pub_shareholder_modify')
    CrawlerUtils.json_dump_to_file(
        self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode):
    """Crawl Yunnan data for one enterprise, caching download args in the DB.

    Fast path: when a ``CrawlerDownloadArgs`` row with a stored ``uuid``
    already exists, the four tab pages are fetched directly and a
    single-element result list is returned.  Slow path: resolve the id via
    the search page (after captcha cracking), upsert one args row per
    search hit, crawl each hit, and return the list of per-enterprise
    results.  On id-resolution failure a JSON string of an empty result is
    returned — NOTE(review): the return type thus varies (list vs str);
    confirm callers handle both.
    """
    self.ent_number = findCode
    # Look the enterprise up by any of its three identifiers.
    id_args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number).first() \
        or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
        or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
    print id_args
    if id_args and id_args.download_args.get('uuid'):
        # Fast path: reuse the cached uuid, no search / captcha needed.
        self.result_json_dict = {}
        self.uuid = id_args.download_args['uuid']
        tableone = self.get_tables(self.uuid + '&tab=01')
        self.get_json_one(self.one_dict, tableone)
        tabletwo = self.get_tables(self.uuid + '&tab=02')
        self.get_json_two(self.two_dict, tabletwo)
        tablethree = self.get_tables(self.uuid + '&tab=03')
        self.get_json_three(self.three_dict, tablethree)
        tablefour = self.get_tables(self.uuid + '&tab=06')
        self.get_json_four(self.four_dict, tablefour)
        CrawlerUtils.json_dump_to_file(
            'yunnan.json', {self.ent_number: self.result_json_dict})
        print json.dumps({self.ent_number: self.result_json_dict})
        return [{self.ent_number: self.result_json_dict}]
    else:
        # Create the per-province output directory.
        html_restore_path = self.json_restore_path + '/yunnan/'
        if not os.path.exists(html_restore_path):
            os.makedirs(html_restore_path)
        # get_id_num drives the search + captcha flow and, as a side
        # effect, fills self.after_crack_checkcode_page — TODO confirm.
        self.uuid = self.get_id_num(findCode)
        if self.uuid is None:
            return json.dumps({self.ent_number: {}})
        self.result_json_dict_list = []
        # Each 'list-item' div is one search hit.
        for div in BeautifulSoup(self.after_crack_checkcode_page,
                                 'html.parser').find_all(
                                     'div', attrs={'class': 'list-item'}):
            hrefa = div.find_all('a', attrs={'target': '_blank'})[0]
            if hrefa:
                # The uuid is the href up to the first query separator.
                self.uuid = hrefa['href'].split('&')[0]
                self.enterprise_name = div.find_all(
                    'div', attrs={'class': 'link'})[0].get_text().strip()
                self.ent_number = div.find_all(
                    'span')[0].get_text().strip()
                # Replace any stale args row for this enterprise.
                # NOTE(review): the first filter lacks .first() unlike
                # the others — args may be a queryset here; delete()
                # still works on querysets, but confirm intent.
                args = CrawlerDownloadArgs.objects.filter(register_number=self.ent_number)\
                    or CrawlerDownloadArgs.objects.filter(unifield_number=self.ent_number).first() \
                    or CrawlerDownloadArgs.objects.filter(enterprise_name=self.ent_number).first()
                if args:
                    args.delete()
                args = CrawlerDownloadArgs(
                    province='yunnan',
                    register_number=self.ent_number,
                    unifield_number=self.ent_number,
                    enterprise_name=self.enterprise_name,
                    download_args={'uuid': self.uuid})
                args.save()
            else:
                continue
            print self.uuid
            self.result_json_dict = {}
            tableone = self.get_tables(self.uuid + '&tab=01')
            self.get_json_one(self.one_dict, tableone)
            tabletwo = self.get_tables(self.uuid + '&tab=02')
            self.get_json_two(self.two_dict, tabletwo)
            tablethree = self.get_tables(self.uuid + '&tab=03')
            self.get_json_three(self.three_dict, tablethree)
            tablefour = self.get_tables(self.uuid + '&tab=06')
            self.get_json_four(self.four_dict, tablefour)
            CrawlerUtils.json_dump_to_file(
                'yunnan.json', {self.ent_number: self.result_json_dict})
            print json.dumps({self.ent_number: self.result_json_dict})
            self.result_json_dict_list.append(
                {self.ent_number: self.result_json_dict})
        return self.result_json_dict_list
def test_parse_annual_report_page(self):
    """Parse the saved annual-report fixture and dump the result."""
    fixture = './enterprise_crawler/zongju/annual_report.html'
    with open(fixture) as fp:
        html = fp.read()
    parsed = self.parser.parse_ent_pub_annual_report_page(html)
    CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                   {self.crawler.ent_number: parsed})
# Fragment: enclosing scope (the open handle ``f``, ``json_file_item``,
# ``max_crawl_time``) is defined before this view — indentation below is
# reconstructed; TODO confirm against the full file.
for line in f.readlines():
    # One child process per JSON line downloads that line's PDFs,
    # bounded by max_crawl_time.
    process = multiprocessing.Process(
        target=get_pdf, args=(json_file_item, json.loads(line)['list']))
    process.daemon = True
    process.start()
    process.join(max_crawl_time)
    print 'child process exit'
    # settings.logger.info('child process exit')
f.close()
# Map each downloaded PDF's public URL to its absolute on-disk path.
# json_file_item is presumably a YYYYMMDD-style name sliced into
# year/month/day path segments — TODO confirm.
need_dict = {}
for pdf_item in os.listdir(
        '%s/%s/%s/%s/' % (settings.pdf_restore_dir, json_file_item[:4],
                          json_file_item[4:6], json_file_item[6:])):
    if pdf_item.split('.')[1] == 'pdf':
        need_dict[
            'http://rmfygg.court.gov.cn/psca/lgnot/bulletin/download/' +
            pdf_item] = '%s/%s/%s/%s/%s' % (
                os.path.abspath(os.curdir), json_file_item[:4],
                json_file_item[4:6], json_file_item[6:], pdf_item)
# Persist the URL→path index next to the PDFs.
CrawlerUtils.json_dump_to_file(
    '%s/%s/%s/%s/%s%s' % (settings.pdf_restore_dir, json_file_item[:4],
                          json_file_item[4:6], json_file_item[6:],
                          json_file_item, '_pdf.json'), need_dict)