class IfrsOperatingRevenueAssembler():
    """Assemble an OperatingRevenueDao from an IFRS operating revenue page.

    The layout expected under ``<body>`` is a ``hasBorder`` table whose first
    row is the header and whose remaining rows hold one item each, plus at
    least three ``noBorder`` tables; the snapshot date is read from the
    second ``td`` of the third one. Layout violations raise AssertionError.
    """

    def __init__(self):
        self.base_xpath = '//html/body'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse the page carried by ``param`` into an OperatingRevenueDao.

        Args:
            param: mapping with the keys ``'content'`` (HTML text),
                ``'stock_symbol'`` and ``'date'``.

        Returns:
            OperatingRevenueDao built from the column name list, the row
            list, the stock symbol and the date.

        Raises:
            KeyError: when one of the three keys is missing.
            AssertionError: when the page does not have the expected layout.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        # NOTE(review): screen() presumably raises for content that must not
        # be parsed -- confirm against ContentScreener.
        self.content_screener.screen(param)
        content = self.string_utils.normalize_string(content)
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return OperatingRevenueDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first element matching ``self.base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return ``[account, snapdate]`` read from the page header.

        ``account`` is the first ``th`` text of the first ``hasBorder`` row;
        ``snapdate`` is the converted text of the second ``td`` of the third
        ``noBorder`` table.
        """
        # header row of the data table
        tr_tags = relative_html_object.xpath('./table[@class="hasBorder"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        th_texts = tr_tags[0].xpath('./th/text()')
        assert len(th_texts) == 2, 'invalid th_texts size, should be 2'
        # should be account
        account = th_texts[0]

        table_tags = relative_html_object.xpath('./table[@class="noBorder"]')
        # The third table is indexed below, so at least three must exist.
        assert len(table_tags) > 2, 'invalid table_tags'
        td_tags = table_tags[2].xpath('./td')
        # The second cell is indexed below, so at least two must exist.
        assert len(td_tags) > 1, 'invalid td_tags'
        # should be snapdate
        snapdate = self.string_utils.from_local_string_to_date(td_tags[1].text)

        return [account, snapdate]

    def __assemble_row_list(self, relative_html_object):
        """Return one ``[item, number]`` row per data row of the table."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="hasBorder"]/tr')[1:]
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags]

    def __assemble_row(self, relative_html_object):
        """Return ``[item, number]`` for a single ``tr`` element.

        Raises:
            AssertionError: unless the row has exactly one ``th`` text and
                exactly one ``td`` text.
        """
        # should be item
        th_texts = relative_html_object.xpath('./th/text()')
        assert len(th_texts) == 1, 'invalid th_texts size, should be 1'
        item = th_texts[0]

        # should be number (operating revenue); check the td texts themselves
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 1, 'invalid td_texts size, should be 1'
        number = self.string_utils.normalize_number(td_texts[0])

        return [item, number]
class XbrlIncomeStatementAssembler():
    """Turn an XBRL comprehensive income statement page into an IncomeStatementDao."""

    def __init__(self):
        self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="main_table hasBorder"]'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse ``param['content']`` and return the resulting IncomeStatementDao.

        ``param`` must provide the keys ``'content'``, ``'stock_symbol'`` and
        ``'date'``; layout violations raise AssertionError.
        """
        raw_content = param['content']
        stock_symbol = param['stock_symbol']
        date = param['date']

        document = lxml.html.fromstring(self.string_utils.normalize_string(raw_content))
        statement_table = self.__traverse_to_relative_html_object(document)
        return IncomeStatementDao(
            self.__assemble_column_name_list(statement_table),
            self.__assemble_row_list(statement_table),
            stock_symbol,
            date,
        )

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first element matched by ``self.base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the account-type label followed by one date period per column."""
        head_rows = relative_html_object.xpath('./tr[@class="tblHead"]')
        assert len(head_rows) == 2, 'invalid tr_tags'

        # the second head row names the statement and must be the expected one
        statement_names = head_rows[1].xpath('./th/text()')
        assert len(statement_names) == 1, 'invalid statement_th_texts'
        assert unicode(statement_names[0]) == u'綜合損益表', 'invalid statement_th_texts[0]'

        # the first head row holds the account-type label, then the periods
        header_texts = head_rows[0].xpath('./th/text()')
        to_period = self.string_utils.from_local_string_to_date_period
        return [header_texts[0]] + [to_period(text) for text in header_texts[1:]]

    def __assemble_row_list(self, relative_html_object):
        """Return one parsed row per ``tr`` after the two header rows."""
        body_rows = relative_html_object.xpath('./tr')[2:]
        return [self.__assemble_row(body_row) for body_row in body_rows]

    def __assemble_row(self, relative_html_object):
        """Return ``[account_type, number, ...]`` for a single ``tr`` element."""
        cell_texts = relative_html_object.xpath('./td/text()')
        label = cell_texts[0].strip()
        to_number = self.string_utils.normalize_number
        return [label] + [to_number(cell_text) for cell_text in cell_texts[1:]]
# Example #3
# 0
class TaurusParser():
    """Parse a page of per-industry tables into column names, rows and a release date.

    Each data row is expected to carry eleven cell texts: two items, eight
    numbers and a trailing comment that is dropped.
    """

    def __init__(self):
        self.base_xpath = '//html/body/center'
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Return ``(column_name_list, row_list, release_date)`` for ``content``."""
        root = self.__traverse_to_relative_html_object(self.__get_html_object(content))
        column_name_list = self.__parse_column_name_list(root)
        row_list = self.__parse_row_list(root)
        release_date = self.__parse_release_date(root)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize ``content``, drop ``<br>`` tags and parse it with lxml."""
        normalized = self.string_utils.normalize_string(content)
        return lxml.html.fromstring(normalized.replace(u'<br>', u''))

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first element matched by ``self.base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __find_inner_table_tags(self, relative_html_object):
        """Return the nested tables found inside the second top-level table."""
        outer_tables = relative_html_object.xpath('./table')
        assert len(outer_tables) > 1, 'invalid table_tags'

        # the first table only describes IFRS, the data lives in the second
        inner_tables = outer_tables[1].xpath('./tr/td/table/tr/td/table')
        assert len(inner_tables) > 0, 'invalid inner_table_tags'
        return inner_tables

    def __parse_column_name_list(self, relative_html_object):
        """Return the ``th`` texts of the second row of the first inner table."""
        first_inner_table = self.__find_inner_table_tags(relative_html_object)[0]
        header_rows = first_inner_table.xpath('./tr')
        assert len(header_rows) > 1, 'invalid tr_tags'
        return header_rows[1].xpath('./th/text()')

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every inner (per-industry) table."""
        data_rows = []
        for industry_table in self.__find_inner_table_tags(relative_html_object):
            table_rows = industry_table.xpath('./tr')
            assert len(table_rows) > 2, 'invalid tr_tags'
            # drop the two header rows and the closing total row (u'合計')
            data_rows.extend(table_rows[2:-1])
        return [self.__parse_row(data_row) for data_row in data_rows]

    def __parse_row(self, relative_html_object):
        """Return the two item texts followed by the normalized numbers."""
        cell_texts = relative_html_object.xpath('./td/text()')
        # the eleventh entry is a comment and is not part of the result
        assert len(cell_texts) == 11, 'invalid td_texts size, should be 11'
        to_number = self.string_utils.normalize_number
        return cell_texts[:2] + [to_number(cell_text) for cell_text in cell_texts[2:-1]]

    def __parse_release_date(self, relative_html_object):
        """Return the date matched in the text of the last ``div``."""
        divs = relative_html_object.xpath('./div')
        assert len(divs) > 0, 'invalid div_tags'

        footer_text = divs[-1].text.strip()
        groups = self.string_utils.match(u'^出表日期:(.*)$', footer_text)
        assert len(groups) > 0, 'could not match ^出表日期:(.*)$'
        return self.string_utils.from_local_string_to_date(groups[0])
class LegacyIncomeStatementAssembler():
    """Assemble an IncomeStatementDao from a legacy income statement page.

    The statement is the single table matched by ``base_xpath``. Its fourth
    row holds the column headers and the data rows start after the fifth
    row. Layout violations raise AssertionError.
    """

    def __init__(self):
        self.base_xpath = '//html/body/center/table'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse the page carried by ``param`` into an IncomeStatementDao.

        Args:
            param: mapping with the keys ``'content'`` (HTML text),
                ``'stock_symbol'`` and ``'date'``.

        Returns:
            IncomeStatementDao built from the column name list, the row
            list, the stock symbol and the date.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        content = self.string_utils.normalize_string(content)
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return IncomeStatementDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the element matched by ``self.base_xpath``; exactly one must match."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the account-type label followed by one date per column.

        Raises:
            AssertionError: when the table has fewer than four rows.
        """
        tr_tags = relative_html_object.xpath('./tr')
        # The header texts are read from the fourth row, so it must exist.
        assert len(tr_tags) > 3, 'invalid tr_tags'

        column_th_texts = tr_tags[3].xpath('./th/b/text()')
        # should be account type (of unicode type)
        column_name_list = [column_th_texts[0]]
        # remaining headers are dates (of datetime.date type)
        column_name_list.extend(
            self.string_utils.from_local_string_to_date(local_string)
            for local_string in column_th_texts[1:]
        )
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return one parsed row per ``tr`` after the five header rows."""
        # skip column part (5 rows)
        tr_tags = relative_html_object.xpath('./tr')[5:]
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags]

    def __assemble_row(self, relative_html_object):
        """Return ``[account_type, number, ...]`` for a single ``tr`` element."""
        td_texts = relative_html_object.xpath('./td/text()')
        # blank cell texts are dropped before the cells are interpreted
        td_texts = self.__remove_empty_string_from_string_list(td_texts)

        # should be account type
        row = [td_texts[0].strip()]
        # should be number
        row.extend(
            self.string_utils.normalize_number(number_string)
            for number_string in td_texts[1:]
        )
        return row

    def __remove_empty_string_from_string_list(self, string_list):
        """Return the strings that are not empty after stripping whitespace."""
        return [string for string in string_list if string.strip()]
class DividendPolicyAssembler():
    """Assemble a DividendPolicyDao from a dividend policy HTML page.

    The table matched by ``base_xpath`` is read as: two leading rows that
    are skipped, a third row of seven column names, then data rows of seven
    cells each. Layout violations raise AssertionError.
    """

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse the page carried by ``param`` into a DividendPolicyDao.

        Args:
            param: mapping with the keys ``'content'`` (HTML text) and
                ``'stock_symbol'``.

        Returns:
            DividendPolicyDao built from the column name list, the row list
            and the stock symbol.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        # NOTE(review): screen() presumably raises for content that must not
        # be parsed -- confirm against ContentScreener.
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol)

    def __get_html_object(self, content):
        """Normalize ``content``, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first element matched by ``self.base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'

        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the seven stripped column names from the third table row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        # the first two rows precede the column names
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Return one parsed row per ``tr`` after the first three rows."""
        # skip the first row of header
        # skip the second row of empty lines
        # skip the third row of column name list
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Return ``[stmt_date, number x 5, ratio]`` for a single ``tr`` element.

        The last cell is a percentage and is scaled by 0.01. A cell that
        ``normalize_number`` cannot turn into a number (it yields None)
        stays None instead of failing the multiplication.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        # should be stmt_date
        row = [self.string_utils.from_local_string_to_date(td_texts[0])]

        # should be number
        row.extend(
            self.string_utils.normalize_number(number_string)
            for number_string in td_texts[1:-1]
        )

        # should be number in percentage; keep a missing value as None
        percentage = self.string_utils.normalize_number(td_texts[-1])
        row.append(None if percentage is None else percentage * 0.01)

        return row
# Example #6
# 0
class AriesParser():
    """Parse a plain-text two-period statement into column names and rows.

    The text is tokenized line by line. The first line that looks like a
    header and whose values convert to date periods becomes the column name
    list; every later line must match one of the known token patterns.
    """

    def __init__(self, text):
        self.text = text
        self.head_splitted_account = None
        self.account_utils = AccountUtils()
        self.string_utils = StringUtils()

    def parse(self):
        """Return ``(column_name_list, row_list)`` parsed from ``self.text``.

        Raises:
            ValueError: when a line after the header has an unknown token
                pattern.
            AssertionError: when no header or no row could be parsed.
        """
        text = self.__preprocess_text(self.text)
        lines = self.__scan_lines(text)
        return self.__parse_lines(lines)

    def __preprocess_text(self, text):
        """Run the two AccountUtils clean-up passes over ``text``."""
        text = self.account_utils.concat_account(text)
        text = self.account_utils.remove_eten_separation(text)
        return text

    def __scan_lines(self, text):
        """Tokenize ``text`` and group the tokens into lines ending with TK_EOL.

        Tokens that follow the last TK_EOL are not returned.
        """
        scanner = Scanner(Source(text))
        scanner.scan()

        lines = []
        tokens_in_line = []
        for token in scanner.get_tokens():
            tokens_in_line.append(token)
            if token.get_token_type() == 'TK_EOL':
                lines.append(tokens_in_line)
                tokens_in_line = []
        return lines

    def __parse_lines(self, lines):
        """Return ``(column_name_list, row_list)`` for the tokenized ``lines``.

        Raises:
            ValueError: when a line after the header matches no known token
                pattern; the message names the offending pattern.
            AssertionError: when no header or no row was parsed.
        """
        ignorable_patterns = (('TK_EOL',), ('TK_SEPERATION', 'TK_EOL'))
        # header pattern -> slice of the line holding the two statement dates
        header_slices = {
            ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL'): slice(0, 2),
            ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL'): slice(1, 3),
        }
        # a header-shaped line seen after the header is silently dropped
        skipped_row_pattern = ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL')
        row_parsers = {
            ('TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL'):
                self.__parse_account_number_number_line,
            ('TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_NUMBER', 'TK_EOL'):
                self.__parse_account_paren_number_number_line,
            ('TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL'):
                self.__parse_account_number_paren_number_line,
            ('TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN',
             'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL'):
                self.__parse_account_paren_number_paren_number_line,
            ('TK_ACCOUNT', 'TK_EOL'):
                self.__parse_account_line,
        }

        column_name_list = None
        row_list = []
        for line in lines:
            pattern = tuple(token.get_token_type() for token in line)
            if pattern in ignorable_patterns:
                continue

            # Until the header is found, every other line is only a candidate.
            if column_name_list is None:
                header_slice = header_slices.get(pattern)
                if header_slice is None:
                    continue
                try:
                    column_name_list = self.__parse_column_name_list(line[header_slice])
                except Exception:
                    # Best effort on purpose: a header-shaped line whose
                    # values are not date periods is not the header yet.
                    pass
                continue

            if pattern == skipped_row_pattern:
                continue
            row_parser = row_parsers.get(pattern)
            if row_parser is None:
                raise ValueError('unexpected token pattern: %r' % (list(pattern),))
            row = row_parser(line)
            if row:
                row_list.append(row)

        assert column_name_list is not None, 'We should parse column name list'
        assert len(row_list) > 0, 'We should parse some rows'
        return column_name_list, row_list

    def __parse_column_name_list(self, stmt_date_list):
        """Return the account label followed by the end date of each period.

        ``stmt_date_list`` must hold exactly two tokens whose values convert
        to date periods.
        """
        column_name_list = [u'會計科目']
        assert len(stmt_date_list) == 2, 'There shouble be 2 statement dates'
        for stmt_date_token in stmt_date_list:
            date_period = self.string_utils.from_local_string_to_date_period(stmt_date_token.get_value())
            # only the end of the period names the column
            column_name_list.append(date_period[1])
        return column_name_list

    # ['TK_ACCOUNT', 'TK_EOL']
    def __parse_account_line(self, line):
        """Return a row that holds only the account name."""
        return [line[0].get_value()]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_number_number_line(self, line):
        """Return ``[account, number, number]``."""
        return [
            line[0].get_value(),
            self.string_utils.normalize_number(line[1].get_value()),
            self.string_utils.normalize_number(line[2].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_paren_number_number_line(self, line):
        """Return ``[account, -number, number]``; parentheses mark a negative."""
        return [
            line[0].get_value(),
            -self.string_utils.normalize_number(line[2].get_value()),
            self.string_utils.normalize_number(line[4].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_number_paren_number_line(self, line):
        """Return ``[account, number, -number]``; parentheses mark a negative."""
        return [
            line[0].get_value(),
            self.string_utils.normalize_number(line[1].get_value()),
            -self.string_utils.normalize_number(line[3].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_paren_number_paren_number_line(self, line):
        """Return ``[account, -number, -number]``; parentheses mark a negative."""
        return [
            line[0].get_value(),
            -self.string_utils.normalize_number(line[2].get_value()),
            -self.string_utils.normalize_number(line[5].get_value())
        ]
class StringUtilsTest(unittest.TestCase):
    """Table-driven checks of the StringUtils number, date and regex helpers."""

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def test_normalize_arabic_number(self):
        # grouping commas, signs, parentheses (negative) and decimals
        cases = [
            ("33,825,315", 33825315),
            ("0", 0),
            ("-115,859,592", -115859592),
            ("(27,540)", -27540),
            ("2.85", 2.85),
            ("170,270,395.00", 170270395),
            ("(  10,117,111)", -10117111),
        ]
        for raw, expected in cases:
            self.assertEqual(self.string_utils.normalize_number(raw), expected)

    def test_normalize_none_number(self):
        # placeholders that carry no number
        for raw in (u"-", u"", u"不適用"):
            self.assertEqual(self.string_utils.normalize_number(raw), None)

    def test_normalize_chinese_number(self):
        cases = [
            (u"九十九", 99),
            (u"九十", 90),
            (u"三", 3),
        ]
        for raw, expected in cases:
            self.assertEqual(self.string_utils.normalize_number(raw), expected)

    def test_normalize_percentage(self):
        self.assertAlmostEqual(self.string_utils.normalize_number(u"20.92%"), 0.2092)

    def test_from_local_string_to_date(self):
        cases = [
            (u"2013年12月31日", datetime.date(2013, 12, 31)),
            (u"2012年01月01日", datetime.date(2012, 1, 1)),
            ("1962/02/09", datetime.date(1962, 2, 9)),
            (u"2015/08/13", datetime.date(2015, 8, 13)),
            (u"民國103年09月", datetime.date(2014, 9, 30)),
            (u"104", datetime.date(2015, 12, 31)),
        ]
        for raw, expected in cases:
            self.assertEqual(self.string_utils.from_local_string_to_date(raw), expected)

    def test_roc_era_from_local_string_to_date(self):
        cases = [
            (u"99年09月30日", datetime.date(2010, 9, 30)),
            (u"102/05/07", datetime.date(2013, 5, 7)),
        ]
        for raw, expected in cases:
            self.assertEqual(self.string_utils.from_local_string_to_date(raw), expected)

    def test_from_local_string_to_date_interval(self):
        period = self.string_utils.from_local_string_to_date_period(u"2013年01月01日至2013年12月31日")
        self.assertEqual(period, (datetime.date(2013, 1, 1), datetime.date(2013, 12, 31)))

    def test_roc_era_from_local_string_to_date_period(self):
        cases = [
            (u"九十八年前三季", (datetime.date(2009, 1, 1), datetime.date(2009, 9, 30))),
            (u"九十八年第一季", (datetime.date(2009, 1, 1), datetime.date(2009, 3, 31))),
            (u"100年第一季", (datetime.date(2011, 1, 1), datetime.date(2011, 3, 31))),
            (u"100年上半年度", (datetime.date(2011, 1, 1), datetime.date(2011, 6, 30))),
            (u"99年上半年度", (datetime.date(2010, 1, 1), datetime.date(2010, 6, 30))),
            (u"100年前三季", (datetime.date(2011, 1, 1), datetime.date(2011, 9, 30))),
            (u"100年度", (datetime.date(2011, 1, 1), datetime.date(2011, 12, 31))),
        ]
        for raw, expected in cases:
            self.assertEqual(self.string_utils.from_local_string_to_date_period(raw), expected)

    def test_from_date_to_roc_era_string(self):
        era = self.string_utils.from_date_to_roc_era_string(datetime.date(2001, 1, 1))
        self.assertEqual(era, "90")

    def test_from_date_to_2_digit_month_string(self):
        cases = [
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 10, 31), "10"),
        ]
        for day, expected in cases:
            self.assertEqual(self.string_utils.from_date_to_2_digit_month_string(day), expected)

    def test_from_date_to_2_digit_quarter_string(self):
        # first and last day of each quarter
        cases = [
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 3, 31), "01"),
            (datetime.date(2001, 4, 1), "02"),
            (datetime.date(2001, 6, 30), "02"),
            (datetime.date(2001, 7, 1), "03"),
            (datetime.date(2001, 9, 30), "03"),
            (datetime.date(2001, 10, 1), "04"),
            (datetime.date(2001, 12, 31), "04"),
        ]
        for day, expected in cases:
            self.assertEqual(self.string_utils.from_date_to_2_digit_quarter_string(day), expected)

    def test_from_date_to_1_digit_quarter_string(self):
        # first and last day of each quarter
        cases = [
            (datetime.date(2001, 1, 1), "1"),
            (datetime.date(2001, 3, 31), "1"),
            (datetime.date(2001, 4, 1), "2"),
            (datetime.date(2001, 6, 30), "2"),
            (datetime.date(2001, 7, 1), "3"),
            (datetime.date(2001, 9, 30), "3"),
            (datetime.date(2001, 10, 1), "4"),
            (datetime.date(2001, 12, 31), "4"),
        ]
        for day, expected in cases:
            self.assertEqual(self.string_utils.from_date_to_1_digit_quarter_string(day), expected)

    def test_is_match_seperation(self):
        separator_pattern = u"^(-| |=)*$"
        for separator in (u"======      ======", u"------      ------"):
            self.assertTrue(self.string_utils.is_match(separator_pattern, separator))
        self.assertFalse(self.string_utils.is_match(separator_pattern, u"同時影響現金及非現金項目之投資活動:"))

    def test_match_account(self):
        groups = self.string_utils.match(u"^([^\s]*):$", u"營業活動之現金流量:")
        self.assertEqual(groups, [u"營業活動之現金流量"])
# Example #8
# 0
class GeminiParser:
    """Parse a page of per-industry tables into column names, rows and a release date.

    Each data row is expected to carry ten cell texts: two items followed by
    eight numbers.
    """

    def __init__(self):
        self.base_xpath = "//html/body/center"
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Return ``(column_name_list, row_list, release_date)`` for ``content``."""
        root = self.__traverse_to_relative_html_object(self.__get_html_object(content))
        column_name_list = self.__parse_column_name_list(root)
        row_list = self.__parse_row_list(root)
        release_date = self.__parse_release_date(root)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize ``content``, drop ``<br>`` tags and parse it with lxml."""
        normalized = self.string_utils.normalize_string(content)
        return lxml.html.fromstring(normalized.replace(u"<br>", u""))

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first element matched by ``self.base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, "invalid base_xpath"
        return matches[0]

    def __find_inner_table_tags(self, relative_html_object):
        """Return the nested tables found inside the second top-level table."""
        outer_tables = relative_html_object.xpath("./table")
        assert len(outer_tables) > 1, "invalid table_tags"

        # the first table only describes IFRS, the data lives in the second
        inner_tables = outer_tables[1].xpath("./tr/td/table/tr/td/table")
        assert len(inner_tables) > 0, "invalid inner_table_tags"
        return inner_tables

    def __parse_column_name_list(self, relative_html_object):
        """Return the ``th`` texts of the second row of the first inner table."""
        first_inner_table = self.__find_inner_table_tags(relative_html_object)[0]
        header_rows = first_inner_table.xpath("./tr")
        assert len(header_rows) > 1, "invalid tr_tags"
        return header_rows[1].xpath("./th/text()")

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every inner (per-industry) table."""
        data_rows = []
        for industry_table in self.__find_inner_table_tags(relative_html_object):
            table_rows = industry_table.xpath("./tr")
            assert len(table_rows) > 2, "invalid tr_tags"
            # drop the two header rows and the closing total row (u'合計')
            data_rows.extend(table_rows[2:-1])
        return [self.__parse_row(data_row) for data_row in data_rows]

    def __parse_row(self, relative_html_object):
        """Return the two item texts followed by the normalized numbers."""
        cell_texts = relative_html_object.xpath("./td/text()")
        assert len(cell_texts) == 10, "invalid td_texts size, should be 10"
        to_number = self.string_utils.normalize_number
        return cell_texts[:2] + [to_number(cell_text) for cell_text in cell_texts[2:]]

    def __parse_release_date(self, relative_html_object):
        """Return the date matched in the text of the last ``div``."""
        divs = relative_html_object.xpath("./div")
        assert len(divs) > 0, "invalid div_tags"

        footer_text = divs[-1].text.strip()
        groups = self.string_utils.match(u"^出表日期:(.*)$", footer_text)
        assert len(groups) > 0, "could not match ^出表日期:(.*)$"
        return self.string_utils.from_local_string_to_date(groups[0])