Esempio n. 1
0
 def get_dict_table_items(self, table_tag):
     """Build the skeleton of a dict-style table.

     For every row that has both headers and cells, maps each <td>'s
     cleaned raw text (surrounding braces stripped, 'PAPERS_' prefix
     removed) to the raw text of the <th> at the same position.
     """
     items = {}
     for row in table_tag.find_all('tr'):
         if not (row.find('th') and row.find('td')):
             continue
         headers = row.find_all('th')
         cells = row.find_all('td')
         for pos, cell in enumerate(cells):
             key = CrawlerUtils.get_raw_text_in_bstag(cell)
             key = key.strip('{}').replace('PAPERS_', '')
             items[key] = CrawlerUtils.get_raw_text_in_bstag(headers[pos])
     return items
Esempio n. 2
0
    def parse_ind_comm_pub_reg_modify_table(self, bs_table, table_name, page):
        """Parse the industrial & commercial publicity "modification records"
        registration table.

        Handled separately from generic tables because each record may link
        to a detail page holding the before/after snapshots of the change.

        Fix: the original parsed the before-modify table twice (the first
        result was bound to an unused local and thrown away); the duplicate
        parse is removed.

        Args:
            bs_table: BeautifulSoup tag of the table to parse.
            table_name: logical table name (kept for a uniform signature;
                not used inside this method).
            page: raw html the table was extracted from.

        Returns:
            A list of dicts, one per record, or None when the table has no
            tbody (preserves the original implicit-None behavior).
        """
        tbody = bs_table.find('tbody')
        if not tbody:
            return None

        columns = self.get_columns_of_record_table(bs_table, page)
        column_size = len(columns)
        item_array = []

        for tr in tbody.find_all('tr'):
            if not tr.find('td'):
                continue
            col_count = 0
            item = {}
            for td in tr.find_all('td'):
                if td.find('a'):
                    # try to retrieve detail link from page
                    next_url = self.get_detail_link(td.find('a'), page)
                    # has detail link
                    if next_url:
                        detail_page = self.crawler.crawl_page_by_url(
                            next_url).content
                        detail_soup = BeautifulSoup(
                            detail_page, 'html.parser')
                        # detail page: table[1] is the state before the
                        # modification, table[2] the state after; they fill
                        # two consecutive columns of the record.
                        before_modify_table = detail_soup.body.find_all(
                            'table')[1]
                        item[columns[col_count][0]] = self.parse_table(
                            before_modify_table, 'before_modify',
                            detail_page)
                        col_count += 1
                        after_modify_table = detail_soup.body.find_all(
                            'table')[2]
                        item[columns[col_count][0]] = self.parse_table(
                            after_modify_table, 'after_modify',
                            detail_page)
                    else:
                        item[columns[col_count]
                             [0]] = CrawlerUtils.get_raw_text_in_bstag(td)
                else:
                    item[columns[col_count]
                         [0]] = CrawlerUtils.get_raw_text_in_bstag(td)

                col_count += 1
                if col_count == column_size:
                    # record complete: emit a copy and start the next one
                    item_array.append(item.copy())
                    col_count = 0
        return item_array
Esempio n. 3
0
 def get_list_table_items(self, table_tag):
     """获取记录类型的表格的结构
     """
     table_items = {}
     if len(table_tag.find_all('tr')) != 3:
         print 'abnormal list table skeleton, table_tag = ', table_tag
         return table_items
     ths = table_tag.find_all('tr')[1].find_all('th')
     tds = table_tag.find_all('tr')[2].find_all('td')
     if len(ths) != len(tds):
         print 'abnormal list table skeleton, table_tag = ', table_tag
         return table_items
     for index, td in enumerate(tds):
         table_items[
             CrawlerUtils.get_raw_text_in_bstag(td).strip('{}').replace(
                 'PAPERS_',
                 '')] = CrawlerUtils.get_raw_text_in_bstag(ths[index])
     return table_items
Esempio n. 4
0
    def parse_list_table_without_sub_list(self, records_tag, table_name, page,
                                          columns):
        """Extract a record-style table that has no sub-columns.

        Args:
            records_tag: the table's records tag, as extracted from the
                html with beautiful soup.
            table_name: name of the table being parsed; two table names
                get dedicated detail-page parsers below.
            page: the original html page (used to resolve detail links).
            columns: the table's header structure; each entry is read as
                columns[i][0] (column name) and columns[i][1] (column
                meta handed to get_column_data).
        Returns:
            item_array: the extracted table data as a list whose elements
                are python dicts, one per non-empty row.
        """
        item_array = []
        for tr in records_tag.find_all('tr'):
            col_count = 0
            item = {}
            # recursive=False: only the row's direct cells, not cells of
            # any nested tables
            for td in tr.find_all('td', recursive=False):
                if td.find('a'):
                    # try to retrieve detail link from page
                    next_url = self.get_detail_link(td.find('a'), page)
                    # has detail link
                    if next_url:
                        detail_page = self.crawler.crawl_page_by_url(next_url)
                        if table_name == 'ent_pub_ent_annual_report':
                            # annual report rows: cell text is the filing
                            # year, detail page parsed separately
                            page_data = self.parse_ent_pub_annual_report_page(
                                detail_page)
                            item[u'报送年度'] = CrawlerUtils.get_raw_text_in_bstag(
                                td)
                            item[
                                u'详情'] = page_data  # this may be a detail page data
                        elif table_name == 'ind_comm_pub_reg_shareholder':
                            page_data = self.parse_ind_comm_pub_shareholder_detail_page(
                                detail_page)
                            item[u'详情'] = {u"投资人及出资信息": page_data}
                        else:
                            page_data = self.parse_page(
                                detail_page, table_name + '_detail')
                            item[columns[col_count][
                                0]] = page_data  # this may be a detail page data
                    else:
                        # anchor without a resolvable link: treat as plain data
                        item[columns[col_count][0]] = self.get_column_data(
                            columns[col_count][1], td)
                else:
                    item[columns[col_count][0]] = self.get_column_data(
                        columns[col_count][1], td)
                col_count += 1
            if item:
                item_array.append(item)

        return item_array
Esempio n. 5
0
 def parse_annual_report_shareholder_info(self, page):
     """Parse the investor (shareholder) info out of an annual report.

     The endpoint returns json whose 'items' each carry a 'D1' html
     fragment; the fragment's <td> cells map positionally onto the
     fixed column names below.

     Robustness fixes: a missing/None 'items' no longer raises TypeError,
     and fragments with fewer cells than columns no longer raise
     IndexError (extra columns are simply absent from that record).

     Args:
         page: json string returned by the shareholder-info request.

     Returns:
         A list of dicts, one per shareholder record; records without a
         'D1' fragment or without <td> cells are skipped.
     """
     shareholder_info = []
     record_columns = [
         u'股东', u'认缴出资额', u'认缴出资时间', u'认缴出资方式', u'实缴出资额', u'实缴出资时间',
         u'实缴出资方式'
     ]
     json_obj = json.loads(page)
     for record in json_obj.get('items', []):
         if not record.get('D1'):
             continue
         soup = BeautifulSoup(record.get('D1'), 'html.parser')
         tds = soup.find_all('td')
         if not tds:
             continue
         result = {}
         # zip truncates at the shorter sequence, so a short fragment
         # yields a partial record instead of an IndexError
         for column, td in zip(record_columns, tds):
             result[column] = CrawlerUtils.get_raw_text_in_bstag(td)
         shareholder_info.append(result)
     return shareholder_info
Esempio n. 6
0
 def get_year_of_annual_report(page):
     """Return the raw text of the first row of the first table in *page*.

     That first <tr> carries the annual report's year heading.
     """
     first_table = BeautifulSoup(page, 'html.parser').body.find('table')
     first_row = first_table.find('tr')
     return CrawlerUtils.get_raw_text_in_bstag(first_row)
Esempio n. 7
0
    def parse_annual_report_skeleton(self, page):
        """Parse the structure (skeleton) of an enterprise annual-report page.

        For each known section table found in *page*, records its column
        structure into self.parse_table_items under a fixed key, then sets
        self.annual_report_skeleton_built. Returns early (skeleton not
        built) when the basic-info div is missing.
        """
        # Enterprise basic information
        soup = BeautifulSoup(page, 'html.parser')
        # NOTE(review): this local is never used anywhere below
        annual_report_table_items = {}
        tag = soup.find('div', attrs={'id': 'qyjbxx'})
        if not tag:
            print 'parse annual report skeleton failed, do not find qyjbxx table'
            return
        table = tag.find('table', attrs={'class': 'detailsList'})
        if table:
            ent_basic_info_table = {}
            for tr in table.find_all('tr'):
                if tr.find('th') and tr.find('td'):
                    ths = tr.find_all('th')
                    tds = tr.find_all('td')
                    for index, td in enumerate(tds):
                        # map each cell's id attribute to the raw text of
                        # the header at the same position
                        ent_basic_info_table[td.get(
                            'id')] = CrawlerUtils.get_raw_text_in_bstag(
                                ths[index])
            self.parse_table_items[
                'annual_report_ent_basic_info'] = ent_basic_info_table

        # Website / online-shop information
        table = soup.find('table',
                          attrs={
                              'id': 'web',
                              'name': 'applicationList1TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_web_info'] = self.get_list_table_items(table)

        # Shareholder & capital-contribution information
        # NOTE(review): this branch builds an empty dict and never fills or
        # stores it -- the shareholder table's skeleton is never registered
        # in self.parse_table_items. Looks unfinished; confirm intent.
        table = soup.find('table',
                          attrs={
                              'id': 'touziren',
                              'name': 'applicationList4TAB'
                          })
        if table:
            shareholder_info_table = {}

        # Outbound investment information
        # (key spelling 'abord' kept as-is; consumers may depend on it)
        table = soup.find('table',
                          attrs={
                              'id': 'duiwaitouzi',
                              'name': 'applicationList3TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_investment_abord_info'] = self.get_list_table_items(
                    table)

        # Enterprise asset status information: this table has no stable id,
        # so locate it by its first header's text
        for table in soup.find_all('table'):
            tr = table.find('tr')
            if tr and tr.find('th') and tr.find('th').text == u'企业资产状况信息':
                ent_property_info_table = {}
                for tr in table.find_all('tr'):
                    if tr.find('th') and tr.find('td'):
                        ths = tr.find_all('th')
                        tds = tr.find_all('td')
                        for index, td in enumerate(tds):
                            ent_property_info_table[td.get(
                                'id')] = CrawlerUtils.get_raw_text_in_bstag(
                                    ths[index])
                self.parse_table_items[
                    'annual_report_ent_property_info'] = ent_property_info_table
                break

        # External guarantee information
        table = soup.find('table',
                          attrs={
                              'id': 'duiwaidanbao',
                              'name': 'applicationList6TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_external_guarantee_info'] = self.get_list_table_items(
                    table)

        # Equity change information
        table = soup.find('table',
                          attrs={
                              'id': 'guquanchange',
                              'name': 'applicationList5TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_equity_modify_info'] = self.get_list_table_items(
                    table)

        # Modification records
        table = soup.find('table',
                          attrs={
                              'id': 'modifyRecord',
                              'name': 'applicationList2TAB'
                          })
        if table:
            self.parse_table_items[
                'annual_report_modify_record'] = self.get_list_table_items(
                    table)

        self.annual_report_skeleton_built = True