def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.result_json_dict = {} self.id = self.get_id_num(findCode) print self.id resp = self.reqst.get('http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_one(self.one_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_two(self.two_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_three(self.three_dict, soup.find_all('table')) resp = self.reqst.get('http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id, timeout=120) soup = BeautifulSoup(resp.content) self.get_json_four(self.four_dict, soup.find_all('table')) CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def main():
    """CLI entry point.

    Usage (from the message below): ``run.py [check] [max_crawl_time province...]``.
    ``check`` mode runs the Checker; otherwise spawns per-province crawls,
    killed by a watchdog timer after ``max_crawl_time``.
    """
    config_logging()
    if not os.path.exists(settings.json_restore_path):
        CrawlerUtils.make_dir(settings.json_restore_path)
    # NOTE(review): this assignment creates a *local* cur_date that is never
    # read in this function; crawl_province() reads a cur_date too — confirm a
    # module-level cur_date exists elsewhere, otherwise this line is dead.
    cur_date = CrawlerUtils.get_cur_y_m_d()
    set_codecracker()
    # "check" sub-command: verify yesterday's (or a given date's) results.
    if len(sys.argv) >= 2 and sys.argv[1] == "check":
        dt = None
        if len(sys.argv) == 3:
            dt = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d")
        checker = Checker(dt)
        checker.run()
        return
    if len(sys.argv) < 3:
        print 'usage: run.py [check] [max_crawl_time(minutes) province...] \n\tmax_crawl_time 最大爬取秒数,以秒计;\n\tprovince 是所要爬取的省份列表 用空格分开, all表示爬取全部)'
        return
    try:
        max_crawl_time = int(sys.argv[1])
        # NOTE(review): stored as *minutes* here, but threading.Timer below
        # and the log message treat max_crawl_time as *seconds* — one of the
        # two units is wrong; confirm which is intended.
        settings.max_crawl_time = datetime.timedelta(minutes=max_crawl_time)
    except ValueError as e:
        settings.logger.error('invalid max_crawl_time, should be a integer')
        os._exit(1)
    # Watchdog: force the whole process group down when time is up.
    timer = threading.Timer(max_crawl_time, force_exit)
    timer.start()
    settings.logger.info(u'即将开始爬取,最长爬取时间为 %s 秒' % settings.max_crawl_time)
    settings.start_crawl_time = datetime.datetime.now()
    if sys.argv[2] == 'all':
        # Crawl every supported province in parallel via the process pool.
        args = [p for p in sorted(province_crawler.keys())]
        process_pool = MyPool()
        process_pool.map(crawl_province, args)
        process_pool.close()
        settings.logger.info("wait processes....")
        process_pool.join()
    else:
        # Crawl only the provinces named on the command line, sequentially.
        provinces = sys.argv[2:]
        for p in provinces:
            if not p in province_crawler.keys():
                settings.logger.warn('province %s is not supported currently' % p)
                continue
            crawl_province(p)
def down_yesterday_pdf(yesterday): yesterday = yesterday abs_yesterday_json_url = '%s/%s/%s/%s/%s' % (settings.host, settings.ID, yesterday[:4], yesterday[4:6], yesterday[6:]) # print 'abs_yesterday_json_url:', abs_yesterday_json_url need_down_json_file_name = get_need_down_json_file_name( abs_yesterday_json_url) if need_down_json_file_name is None: print '-error__from_%s____no_data' % abs_yesterday_json_url return else: abs_yesterday_json_url = '%s/%s' % (abs_yesterday_json_url, need_down_json_file_name) # print 'abs_yesterday_json_url:',abs_yesterday_json_url abs_json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_dir, yesterday[:4], yesterday[4:6], yesterday[6:]) if not os.path.exists(abs_json_restore_dir): CrawlerUtils.make_dir(abs_json_restore_dir) abs_pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir, yesterday[:4], yesterday[4:6], yesterday[6:]) if not os.path.exists(abs_pdf_restore_dir): CrawlerUtils.make_dir(abs_pdf_restore_dir) # print 'abs_json_restore_dir:', abs_json_restore_dir get_json_file_OK = get_data_json_file(abs_yesterday_json_url, abs_json_restore_dir, need_down_json_file_name) if get_json_file_OK is False: print '-error--nodata_from_%s%s' % (abs_json_restore_dir, need_down_json_file_name) return else: abs_yesterday_json_gz_file_name = '%s/%s' % ( abs_json_restore_dir, need_down_json_file_name) abs_yesterday_json_file_name = '%s/%s%s' % (abs_json_restore_dir, yesterday, '.json') # print 'abs_yesterday_json_file_name:',abs_yesterday_json_file_name # print 'abs_yesterday_json_gz_file_name:', abs_yesterday_json_gz_file_name g = gzip.GzipFile(mode='rb', fileobj=open(abs_yesterday_json_gz_file_name, 'rb')) open(abs_yesterday_json_file_name, 'wb').write(g.read()) if os.path.isfile(abs_yesterday_json_gz_file_name): os.remove(abs_yesterday_json_gz_file_name) get_pdfs_from_data_json(abs_pdf_restore_dir, abs_yesterday_json_file_name) pass
def crawl_province(province): settings.logger.info('ready to clawer %s' % province) #创建存储路径 json_restore_dir = '%s/%s/%s/%s' % (settings.json_restore_path, province, cur_date[0], cur_date[1]) if not os.path.exists(json_restore_dir): CrawlerUtils.make_dir(json_restore_dir) #获取企业名单 enterprise_list_path = settings.enterprise_list_path + province + '.txt' #json存储文件名 json_restore_path = '%s/%s.json' % (json_restore_dir, cur_date[2]) with open(enterprise_list_path) as f: for line in f: fields = line.strip().split(",") if len(fields) < 3: continue no = fields[2] process = multiprocessing.Process(target=crawl_work, args=(province, json_restore_path, no)) process.daemon = True process.start() process.join(300) settings.logger.info('All %s crawlers work over' % province) #压缩保存 if not os.path.exists(json_restore_path): settings.logger.warn('json restore path %s does not exist!' % json_restore_path) os._exit(1) return with open(json_restore_path, 'r') as f: data = f.read() compressed_json_restore_path = json_restore_path + '.gz' with gzip.open(compressed_json_restore_path, 'wb') as cf: cf.write(data) #删除json文件,只保留 .gz 文件 os.remove(json_restore_path) os._exit(0)
def run(self, ent_name=None):
    """Resolve an enterprise name via the name-to-id crawler and persist the
    parsed search results; returns True on completion, False for no name."""
    if ent_name is None:
        return False
    crawler = NameToIDCrawler(
        './enterprise_crawler/nametoid/name_to_id.json')
    cleaned_name = str(ent_name).strip(' ').strip('\n').strip(' ')
    crawler.ent_name = cleaned_name
    # One html cache directory per enterprise name.
    self.html_restore_path = self.html_restore_path + cleaned_name + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    search_page = crawler.crawl_page_by_get_params(cleaned_name)
    crawler.results = crawler.parser.parse_search_page(page=search_page)
    # Several worker threads share the output file, so serialize the dump.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {cleaned_name: crawler.results})
    self.write_file_mutex.release()
    return True
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.uuid = self.get_id_num(findCode) print self.uuid self.result_json_dict = {} tableone = self.get_tables(self.uuid + '&tab=01') self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.uuid + '&tab=02') self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.uuid + '&tab=03') self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.uuid + '&tab=06') self.get_json_four(self.four_dict, tablefour) CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode):
    """Crawl the four gxqyxygs.gov.cn publicity pages for ``findCode``.

    Returns a JSON string ``{ent_number: sections}``; when no id can be
    resolved, returns ``{ent_number: {}}``.
    """
    self.ent_number = str(findCode)
    if not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    self.result_json_dict = {}
    self.id = self.get_id_num(findCode)
    if self.id is None:
        # BUG FIX: original read ``sjon.dumps`` — a NameError at runtime
        # instead of the intended empty-result json string.
        return json.dumps({self.ent_number: {}})
    # print self.id
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/businessPublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_one(self.one_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/enterprisePublicity.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_two(self.two_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/otherDepartment.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_three(self.three_dict, soup.find_all('table'))
    resp = self.reqst.get(
        'http://gxqyxygs.gov.cn/justiceAssistance.jspx?' + self.id,
        timeout=120)
    soup = BeautifulSoup(resp.content)
    self.get_json_four(self.four_dict, soup.find_all('table'))
    return json.dumps({self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.id = self.get_id_num(findCode) print self.id self.result_json_dict = {} #self.result_json_dict[findCode] = {} tableone = self.get_tables(self.mysearchdict['businessPublicity'] + 'id=' + self.id) self.get_json_one(self.one_dict, tableone) tabletwo = self.get_tables(self.mysearchdict['enterprisePublicity'] + 'id=' + self.id) self.get_json_two(self.two_dict, tabletwo) tablethree = self.get_tables(self.mysearchdict['otherDepartment'] + 'id=' + self.id) self.get_json_three(self.three_dict, tablethree) tablefour = self.get_tables(self.mysearchdict['justiceAssistance'] + 'id=' + self.id) self.get_json_four(self.four_dict, tablefour) #self.write_file_mutex.acquire() print {self.ent_number: self.result_json_dict} CrawlerUtils.json_dump_to_file(self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, ent_number=0):
    """Crawl Chongqing data for ``ent_number`` via ChongqingClawer; returns
    True on success, False when any crawl/parse step raises."""
    crawler = ChongqingClawer(
        './enterprise_crawler/chongqing/chongqing.json')
    crawler.ent_number = str(ent_number)
    # Each enterprise gets its own html cache directory.
    # NOTE(review): the per-enterprise path is stored on *crawler* but the
    # existence check / make_dir below use *self*.html_restore_path — looks
    # inconsistent; confirm which object is supposed to own the path.
    crawler.html_restore_path = self.html_restore_path + crawler.ent_number + '/'
    if settings.save_html and not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    # NOTE(review): duplicate of the assignment above.
    crawler.ent_number = str(ent_number)
    page = crawler.crawl_check_page()
    try:
        crawler.crawl_page_jsons(page)
        crawler.parser.parse_jsons()
        crawler.parser.merge_jsons()
    except Exception as e:
        # Any failure in crawl/parse/merge is swallowed and reported as False.
        # settings.logger.error('error')
        return False
    # Multiple threads write the same file, so the dump is mutex-protected.
    self.write_file_mutex.acquire()
    CrawlerUtils.json_dump_to_file(self.json_restore_path,
                                   {crawler.ent_number: crawler.json_dict})
    self.write_file_mutex.release()
    return True
def get_pdf(save_path, list_dict):
    # Download every pdf listed in list_dict (items with a 'pdf_url' key)
    # into a year/month/day directory derived from save_path
    # (assumes save_path is a 'YYYYMMDD' string — TODO confirm with callers).
    pdf_restore_dir = '%s/%s/%s/%s' % (settings.pdf_restore_dir,
                                       save_path[:4], save_path[4:6],
                                       save_path[6:])
    if not os.path.exists(pdf_restore_dir):
        CrawlerUtils.make_dir(pdf_restore_dir)
    for item in list_dict:
        pdf_url = item['pdf_url']
        count = 0
        # Retry each url up to 10 times; a non-200 or empty body counts as
        # one failed attempt.
        while count < 10:
            resp = reqst.get(pdf_url)
            if resp.status_code == 200 and resp.content:
                # Save under the url's last path component.
                with open(
                        os.path.join(pdf_restore_dir,
                                     pdf_url.rsplit('/')[-1]), 'wb') as f:
                    f.write(resp.content)
                break
            else:
                count += 1
        # All 10 attempts failed: report and move to the next pdf.
        if count == 10:
            print '%s,get-error' % pdf_url
            # settings.logger.info('%s,get-error' % pdf_url)
            continue
def run(self, findCode):
    """Crawl the four publicity sections keyed by ``search_dict`` for
    ``findCode``; returns the combined result as a JSON string (an empty
    section map when no id can be resolved)."""
    self.ent_number = str(findCode)
    if not os.path.exists(self.html_restore_path):
        CrawlerUtils.make_dir(self.html_restore_path)
    self.id = self.get_id_num(findCode)
    if self.id is None:
        return json.dumps({self.ent_number: {}})
    # print self.id
    self.result_json_dict = {}
    # The same four fetch/parse pairs as before, expressed as one table.
    pages = (
        ('businessPublicity', self.one_dict, self.get_json_one),
        ('enterprisePublicity', self.two_dict, self.get_json_two),
        ('otherDepartment', self.three_dict, self.get_json_three),
        ('justiceAssistance', self.four_dict, self.get_json_four),
    )
    for page_key, section_dict, extract in pages:
        tables = self.get_tables(self.search_dict[page_key] + 'id=' + self.id)
        extract(section_dict, tables)
    return json.dumps({self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) self.pripid = self.get_id_num(findCode) print findCode, self.pripid self.result_json_dict = {} data = { 'method': 'qyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk1', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) # print BeautifulSoup(resp.content).prettify self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'基本信息', u'股东信息', u'变更信息') data = { 'method': 'baInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk2', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'主要人员信息', u'分支机构信息', u'清算信息') data = { 'method': 'dcdyInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=120) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'动产抵押登记信息') data = { 'method': 'gqczxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk4', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'股权出质登记信息') data = { 'method': 'jyycInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk6', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'经营异常信息') data = { 'method': 'yzwfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk14', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', 
data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'严重违法信息') data = { 'method': 'cfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk3', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'ccjcInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk7', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.one_dict, BeautifulSoup(resp.content).find_all('table'), u'抽查检查信息') data = { 'method': 'qygsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk8', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'企业年报') data = { 'method': 'qygsForTzrxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk12', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股东及出资信息', u'变更信息') data = { 'method': 'cqygsForTzrbgxxInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk15', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'股权变更信息') data = { 'method': 'qygsForXzxkInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk10', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qygsForZzcqInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk11', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, 
timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'知识产权出质登记信息') data = { 'method': 'qygsForXzcfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk13', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.two_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'qtgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk9', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政许可信息') data = { 'method': 'qtgsForCfInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk16', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.three_dict, BeautifulSoup(resp.content).find_all('table'), u'行政处罚信息') data = { 'method': 'sfgsInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk17', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股权冻结信息') data = { 'method': 'sfgsbgInfo', 'maent.pripid': self.pripid, 'czmk': 'czmk18', 'random': self.cur_time } resp = self.reqst.post('http://gsxt.scaic.gov.cn/ztxy.do', data=data, timeout=180) self.get_json_one(self.four_dict, BeautifulSoup(resp.content).find_all('table'), u'司法股东变更登记信息') self.result_json_dict[ 'ind_comm_pub_reg_basic'] = self.result_json_dict[ 'ind_comm_pub_reg_basic'][0] if 'ind_comm_pub_arch_liquidation' in self.result_json_dict.keys( ) and len(self.result_json_dict['ind_comm_pub_arch_liquidation']) > 0: self.result_json_dict[ 'ind_comm_pub_arch_liquidation'] = self.result_json_dict[ 'ind_comm_pub_arch_liquidation'][0] CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})
def run(self, findCode): self.ent_number = str(findCode) #对每个企业都指定一个html的存储目录 self.html_restore_path = self.html_restore_path + self.ent_number + '/' if settings.save_html and not os.path.exists(self.html_restore_path): CrawlerUtils.make_dir(self.html_restore_path) nbxh = self.get_id_num(findCode) self.nbxh = nbxh result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '5') print result_dict self.get_json_one(allths=[ u'注册号/统一社会信用代码', u'名称', u'类型', u'法定代表人', u'注册资本', u'成立日期', u'住所', u'营业期限自', u'营业期限至', u'经营范围', u'登记机关', u'核准日期', u'登记状态' ], alltds=result_dict, alltds_keys=[ u'zch', u'qymc', u'qylxmc', u'fddbr', u'zczb', u'clrq', u'zs', u'yyrq1', u'yyrq2', u'jyfw', u'djjgmc', u'hzrq', u'mclxmc' ], head='ind_comm_pub_reg_basic') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '3') print result_dict self.get_json_one(allths=[u'变更事项', u'变更前内容', u'变更后内容', u'变更日期'], alltds=result_dict, alltds_keys=[u'bcsxmc', u'bcnr', u'bghnr', u'hzrq'], head='ind_comm_pub_reg_modify') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '2', '3') print result_dict self.get_json_one( allths=[u'股东类型', u'股东', u'证照/证件类型', u'证照/证件号码', u'详情'], alltds=result_dict, alltds_keys=[u'tzrlxmc', u'czmc', u'zzlxmc', u'zzbh'], head='ind_comm_pub_reg_shareholder') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '8') print result_dict self.get_json_one(allths=[u'序号', u'姓名', u'职务'], alltds=result_dict, alltds_keys=[u'rownum', u'xm', u'zwmc'], head='ind_comm_pub_arch_key_persons') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '36') print result_dict self.get_json_one(allths=[u'清算负责人', u'清算组成员'], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_arch_liquidation') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '9') print 
result_dict self.get_json_one( allths=[u'序号', u'注册号/统一社会信用代码', u'名称', u'登记机关'], alltds=result_dict, alltds_keys=[u'rownum', u'fgszch', u'fgsmc', u'fgsdjjgmc'], head='ind_comm_pub_arch_branch') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '25') print result_dict self.get_json_one(allths=[ u'序号', u'登记编号', u'登记日期', u'登记机关', u'被担保债权数额', u'状态', u'公示日期', u'详情' ], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_movable_property_reg') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '4') print result_dict self.get_json_one(allths=[ u'序号', u'登记编号', u'出质人', u'证照/证件号码', u'出质股权数额', u'质权人', u'证照/证件号码', u'股权出质设立登记日期', u'状态', u'公示日期', u'变化情况' ], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_equity_ownership_reg') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '1') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_administration_sanction') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '33') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_business_exception') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '34') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_serious_violate_law') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '35') print result_dict self.get_json_one(allths=[], alltds=result_dict, alltds_keys=[], head='ind_comm_pub_spot_check') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '13') print result_dict self.get_json_two(allths=[u'序号', u'详情', u'报送年度', u'发布日期'], alltds=result_dict, alltds_keys=[u'rownum', u'lsh', u'nd', u'rq'], 
head='ent_pub_ent_annual_report') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '40') print result_dict self.get_json_two(allths=[ u'股东', u'认缴额(万元)', u'实缴额(万元)', u'认缴出资方式', u'认缴出资额(万元)', u'认缴出资日期', u'认缴公示日期', u'实缴出资方式', u'实缴出资额(万元)', u'实缴出资日期', u'实缴公示日期' ], alltds=result_dict, alltds_keys=[ u'tzrmc', u'ljrje', u'ljsje', u'rjczfs', u'rjcze', u'rjczrq', u'rjgsrq', u'sjczfs', u'sjcze', u'sjczrq', u'sjgsrq' ], head='ent_pub_shareholder_capital_contribution') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '23') print result_dict self.get_json_two( allths=[u'序号', u'股东', u'变更前股权比例', u'变更后股权比例', u'股权变更日期', u'公示日期'], alltds=result_dict, alltds_keys=[], head='ent_pub_equity_change') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '20') print result_dict self.get_json_two(allths=[ u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'许可机关', u'许可内容', u'状态', u'公示日期', u'详情' ], alltds=result_dict, alltds_keys=[ u'rownum', u'xkwjbh', u'xkwjmc', u'ksyxqx', u'jsyxqx', u'xkjg', u'xknr', u'zt', u'gsrq', u'lsh' ], head='ent_pub_administration_license') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '21') print result_dict self.get_json_two(allths=[], alltds=result_dict, alltds_keys=[], head='ent_pub_knowledge_property') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '22') print result_dict self.get_json_two(allths=[], alltds=result_dict, alltds_keys=[], head='ent_pub_shareholder_modify') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh, '0', '37') print result_dict self.get_json_three(allths=[ u'序号', u'许可文件编号', u'许可文件名称', u'有效期自', u'有效期至', u'有效期', u'许可机关', u'许可内容', u'状态', u'详情' ], alltds=result_dict, alltds_keys=[ u'rownum', u'xkwjbh', u'xkwjmc', u'yxq1', u'yxq2', u'yxq', u'xkjg', u'xknr', u'zt', 
u'zt' ], head='other_dept_pub_administration_license') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchOldData.shtml', nbxh, '0', '38') print result_dict self.get_json_two(allths=[ u'序号', u'行政处罚决定书文号', u'违法行为类型', u'行政处罚内容', u'作出行政处罚决定机关名称', u'作出行政处罚决定日期' ], alltds=result_dict, alltds_keys=[], head='other_dept_pub_administration_sanction') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '49') print result_dict self.get_json_four(allths=[ u'序号', u'被执行人', u'股权数额', u'执行法院', u'协助公示通知书文号', u'状态', u'详情' ], alltds=result_dict, alltds_keys=[], head='judical_assist_pub_equity_freeze') result_dict = self.send_post( 'http://gsxt.gzgs.gov.cn/nzgs/search!searchData.shtml', nbxh, '0', '53') print result_dict self.get_json_four( allths=[u'序号', u'被执行人', u'股权数额', u'受让人', u'执行法院', u'详情'], alltds=result_dict, alltds_keys=[], head='judical_assist_pub_shareholder_modify') CrawlerUtils.json_dump_to_file( self.json_restore_path, {self.ent_number: self.result_json_dict})