Exemple #1
0
        'open_detail_info_entry': ''
    }

    def __init__(self, json_restore_path):
        """Initialize the Hunan crawler.

        Args:
            json_restore_path: path of the JSON file where crawl results
                are dumped / restored from.
        """
        # Explicit old-style parent-class initialization (kept as-is).
        ZongjuCrawler.__init__(self, json_restore_path)
        # NOTE(review): the parent __init__ already receives the same path,
        # so this re-assignment is presumably redundant — confirm against
        # ZongjuCrawler before removing.
        self.json_restore_path = json_restore_path
        # Province-specific parser; it keeps a back-reference to this crawler.
        self.parser = HunanParser(self)


class HunanParser(ZongjuParser):
    """Hunan-specific page parser.

    Adds nothing beyond the inherited ZongjuParser behavior; it only keeps
    a back-reference to the crawler that owns it.
    """

    def __init__(self, crawler):
        # Back-reference to the owning crawler instance.
        self.crawler = crawler


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the hunan
    # id file, logging progress for each one.
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    HunanCrawler.code_cracker = CaptchaRecognition('hunan')

    crawler = HunanCrawler('./enterprise_crawler/hunan.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/hunan.txt')
    # enterprise_list = ['430000000011972']
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace,
        # either of which would corrupt the enterprise id.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        settings.logger.info(
            '###################   Start to crawl enterprise with id %s   ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
Exemple #2
0
class TestParser(unittest.TestCase):
    """Unit-test fixture for the Chongqing crawler's parser."""

    def setUp(self):
        """Build a crawler/parser pair wired to a fixed test enterprise id."""
        super(TestParser, self).setUp()
        from CaptchaRecognition import CaptchaRecognition
        self.crawler = ChongqingClawer('./enterprise_crawler/chongqing.json')
        self.parser = self.crawler.parser
        ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
        self.crawler.json_dict = {}
        self.crawler.ent_number = '500232000003942'


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the
    # chongqing id file.
    import sys
    reload(sys)  # Python 2 idiom: re-expose sys.setdefaultencoding
    sys.setdefaultencoding("utf-8")
    from CaptchaRecognition import CaptchaRecognition

    ChongqingClawer.code_cracker = CaptchaRecognition('chongqing')
    crawler = ChongqingClawer('./enterprise_crawler/chongqing/chongqing.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/chongqing.txt')
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        print(
            '############   Start to crawl enterprise with id %s   ################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
Exemple #3
0
            page = f.read()
            result = self.parser.parse_ent_pub_annual_report_page(page)
            CrawlerUtils.json_dump_to_file(self.crawler.json_restore_path,
                                           {self.crawler.ent_number: result})

    def test_parse_shareholder_detail_page(self):
        """Parse the saved shareholder-detail fixture and dump the result."""
        with open('./enterprise_crawler/zongju/shareholder_detail.html') as f:
            html = f.read()
        parsed = self.parser.parse_ind_comm_pub_shareholder_detail_page(html)
        CrawlerUtils.json_dump_to_file(
            self.crawler.json_restore_path,
            {self.crawler.ent_number: parsed})


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the zongju
    # id file, logging progress for each one.
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    ZongjuCrawler.code_cracker = CaptchaRecognition('zongju')

    crawler = ZongjuCrawler('./enterprise_crawler/zongju.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/zongju.txt')
    # enterprise_list = ['100000000018305']
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        settings.logger.info(
            '###############   Start to crawl enterprise with id %s   ################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
Exemple #4
0
        'http://218.26.1.108/QueryYearExamineDetail.jspx?id=',  # 企业年报详情
    }

    def __init__(self, json_restore_path):
        """Initialize the Shanxi crawler.

        Args:
            json_restore_path: path of the JSON file where crawl results
                are dumped / restored from.
        """
        # Explicit old-style parent-class initialization (kept as-is).
        HeilongjiangClawer.__init__(self, json_restore_path)
        # NOTE(review): the parent __init__ already receives the same path,
        # so this re-assignment is presumably redundant — confirm against
        # HeilongjiangClawer before removing.
        self.json_restore_path = json_restore_path
        # Province-specific parser; it keeps a back-reference to this crawler.
        self.parser = ShanxiParser(self)


class ShanxiParser(HeilongjiangParser):
    """Shanxi-specific page parser.

    Adds nothing beyond the inherited HeilongjiangParser behavior; it only
    keeps a back-reference to the crawler that owns it.
    """

    def __init__(self, crawler):
        # Back-reference to the owning crawler instance.
        self.crawler = crawler


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the shanxi
    # id file, logging progress for each one.
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    ShanxiCrawler.code_cracker = CaptchaRecognition('shanxi')

    crawler = ShanxiCrawler('./enterprise_crawler/shanxi.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/shanxi.txt')
    # enterprise_list = ['310000000007622']
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        settings.logger.info(
            '###################   Start to crawl enterprise with id %s   ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
Exemple #5
0
        'http://gsxt.xzaic.gov.cn/QueryYearExamineDetail.jspx?id=',  # 企业年报详情
    }

    def __init__(self, json_restore_path):
        """Initialize the Xizang crawler.

        Args:
            json_restore_path: path of the JSON file where crawl results
                are dumped / restored from.
        """
        # Explicit old-style parent-class initialization (kept as-is).
        HeilongjiangClawer.__init__(self, json_restore_path)
        # NOTE(review): the parent __init__ already receives the same path,
        # so this re-assignment is presumably redundant — confirm against
        # HeilongjiangClawer before removing.
        self.json_restore_path = json_restore_path
        # Province-specific parser; it keeps a back-reference to this crawler.
        self.parser = XizangParser(self)


class XizangParser(HeilongjiangParser):
    """Xizang-specific page parser.

    Adds nothing beyond the inherited HeilongjiangParser behavior; it only
    keeps a back-reference to the crawler that owns it.
    """

    def __init__(self, crawler):
        # Back-reference to the owning crawler instance.
        self.crawler = crawler


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the xizang
    # id file. Captcha cracking is deliberately disabled here (commented
    # out) — presumably the xizang site needs none; confirm before enabling.
    # from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    # XizangCrawler.code_cracker = CaptchaRecognition('xizang')

    crawler = XizangCrawler('./enterprise_crawler/xizang.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/xizang.txt')
    # enterprise_list = ['5400001000374']
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        settings.logger.info(
            '###################   Start to crawl enterprise with id %s   ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)
            table_tds = tr.find_all("td")
            for td in table_tds:
                if 'colspan' in td.attrs:
                    continue
                else:
                    table_td.append(td.text.strip())
            if table_td:
                for i in range(0, len(table_th)):
                    table_save = {}
                    table_save[table_th[i]] = table_td[i]
                    total.append(table_save)

        return total


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the liaoning
    # id file, logging progress for each one.
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    LiaoningCrawler.code_cracker = CaptchaRecognition('liaoning')

    crawler = LiaoningCrawler('./enterprise_crawler/liaoning.json')
    enterprise_list = CrawlerUtils.get_enterprise_list('./enterprise_list/liaoning.txt')
    # enterprise_list = ['210000004920321']
    # enterprise_list = ['210200400016720']
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        settings.logger.info('###################   Start to crawl enterprise with id %s   ###################\n' %
                             ent_number)
        crawler.run(ent_number=ent_number)
Exemple #7
0
        'http://218.95.241.36/QueryYearExamineDetail.jspx?id=',  # 企业年报详情
    }

    def __init__(self, json_restore_path):
        """Initialize the Qinghai crawler.

        Args:
            json_restore_path: path of the JSON file where crawl results
                are dumped / restored from.
        """
        # Explicit old-style parent-class initialization (kept as-is).
        HeilongjiangClawer.__init__(self, json_restore_path)
        # NOTE(review): the parent __init__ already receives the same path,
        # so this re-assignment is presumably redundant — confirm against
        # HeilongjiangClawer before removing.
        self.json_restore_path = json_restore_path
        # Province-specific parser; it keeps a back-reference to this crawler.
        self.parser = QinghaiParser(self)


class QinghaiParser(HeilongjiangParser):
    """Qinghai-specific page parser.

    Adds nothing beyond the inherited HeilongjiangParser behavior; it only
    keeps a back-reference to the crawler that owns it.
    """

    def __init__(self, crawler):
        # Back-reference to the owning crawler instance.
        self.crawler = crawler


if __name__ == '__main__':
    # Script entry point: crawl every enterprise id listed in the qinghai
    # id file, logging progress for each one.
    from CaptchaRecognition import CaptchaRecognition
    import run
    run.config_logging()
    QinghaiCrawler.code_cracker = CaptchaRecognition('qinghai')

    crawler = QinghaiCrawler('./enterprise_crawler/qinghai.json')
    enterprise_list = CrawlerUtils.get_enterprise_list(
        './enterprise_list/qinghai.txt')
    # enterprise_list = ["630000400003574"]
    for ent_number in enterprise_list:
        # strip() instead of rstrip('\n'): also removes the '\r' left by
        # CRLF (Windows) line endings and stray surrounding whitespace.
        ent_number = ent_number.strip()
        if not ent_number:
            continue  # ignore blank lines in the id list
        settings.logger.info(
            '###################   Start to crawl enterprise with id %s   ###################\n'
            % ent_number)
        crawler.run(ent_number=ent_number)