class IfrsOperatingRevenueAssembler():
    """Assemble an OperatingRevenueDao from an IFRS operating revenue page.

    Layout expectations, all enforced with assertions:
    * the first row of the ``hasBorder`` table under the body holds exactly
      two ``th`` texts, the first of which is used as the account column name;
    * every following row holds exactly one ``th`` text (the item) and exactly
      one ``td`` text (the number);
    * the third ``noBorder`` table under the body carries the snapshot date in
      the text of its second ``td``.
    """

    def __init__(self):
        self.base_xpath = '//html/body'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param``.

        ``param`` must provide the keys 'content', 'stock_symbol' and 'date'.
        The whole ``param`` is passed to the content screener before parsing.
        Raises AssertionError when the page does not have the expected layout.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        self.content_screener.screen(param)
        content = self.string_utils.normalize_string(content)
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return OperatingRevenueDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return ``[account, snapdate]`` read from the page header."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./table[@class="hasBorder"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        # traverse and sanity check
        th_texts = tr_tags[0].xpath('./th/text()')
        assert len(th_texts) == 2, 'invalid th_texts size, should be 2'
        # should be account
        account = th_texts[0]

        # traverse and sanity check
        table_tags = relative_html_object.xpath('./table[@class="noBorder"]')
        # the third table is read below, so at least three must be present
        assert len(table_tags) > 2, 'invalid table_tags'
        td_tags = table_tags[2].xpath('./td')
        # the second cell is read below, so at least two must be present
        assert len(td_tags) > 1, 'invalid td_tags'
        # should be snapdate
        snapdate = self.string_utils.from_local_string_to_date(td_tags[1].text)

        return [account, snapdate]

    def __assemble_row_list(self, relative_html_object):
        """Return one ``[item, number]`` row per table row after the header."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="hasBorder"]/tr')[1:]
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags]

    def __assemble_row(self, relative_html_object):
        """Return ``[item, number]`` for a single ``tr`` element."""
        # should be item
        th_texts = relative_html_object.xpath('./th/text()')
        assert len(th_texts) == 1, 'invalid th_texts size, should be 1'
        item = th_texts[0]

        # should be number (operating revenue)
        td_texts = relative_html_object.xpath('./td/text()')
        # check td_texts itself; the th_texts check above already passed
        assert len(td_texts) == 1, 'invalid td_texts size, should be 1'
        number_string = td_texts[0]
        number = self.string_utils.normalize_number(number_string)

        return [item, number]
Example #2
0
class Spider():
    """Base class that records crawl targets in a SpiderStorage.

    Subclasses supply build_url() and build_key(); both receive the extended
    parameter dict derived from the caller's ``param``.
    """

    def __init__(self):
        self.storage = SpiderStorage()
        self.string_utils = StringUtils()

    def crawl(self, param):
        """Store the URL built for ``param`` under the key built for it."""
        extended = self.__extend_param(param)
        url = self.build_url(extended)
        key = self.build_key(extended)
        self.storage.set(key, url)

    def is_crawled(self, param):
        """Return whether the storage contains the key built for ``param``."""
        key = self.build_key(self.__extend_param(param))
        return self.storage.contains(key)

    def get_crawled(self, param):
        """Return what the storage holds under the key built for ``param``."""
        key = self.build_key(self.__extend_param(param))
        return self.storage.get(key)

    def __extend_param(self, param):
        """Derive the URL/key building fields from the keys present in ``param``.

        Only 'stock_symbol', 'date' and 'market_type' are considered; any of
        them may be absent, in which case its derived fields are omitted.
        """
        extended = {}
        if 'stock_symbol' in param:
            extended['stock_symbol'] = param['stock_symbol']
        if 'date' in param:
            extended.update(self.__extend_date(param['date']))
        if 'market_type' in param:
            market_type = param['market_type']
            extended.update({
                'market_type': market_type,
                'market_code': self.__extend_market_code(market_type),
                'market_mode': self.__extend_market_mode(market_type),
            })
        return extended

    def __extend_date(self, date):
        """Return the string fields derived from ``date``."""
        utils = self.string_utils
        return {
            'roc_era': utils.from_date_to_roc_era_string(date),
            'year': str(date.year),
            'quarter': utils.from_date_to_2_digit_quarter_string(date),
            'quarter_xbrl': utils.from_date_to_1_digit_quarter_string(date),
            'month': utils.from_date_to_2_digit_month_string(date),
        }

    def __extend_market_code(self, market_type):
        """Map a market type to its code; raises KeyError for unknown types."""
        return {
            'stock_exchange_market': 'sii',
            'otc_market': 'otc',
        }[market_type]

    def __extend_market_mode(self, market_type):
        """Map a market type to its mode; raises KeyError for unknown types."""
        return {
            'stock_exchange_market': '2',
            'otc_market': '4',
        }[market_type]

    def build_url(self, param):
        """Return the URL for the extended ``param``; subclasses must override."""
        raise NotImplementedError

    def build_key(self, param):
        """Return the storage key for the extended ``param``; subclasses must override."""
        raise NotImplementedError
class LegacyCashFlowAssembler():
    """Assemble a CashFlowDao from the text of the page's single ``pre`` node."""

    def __init__(self):
        self.base_xpath = '//html/body/table[@class="hasBorder"]/tr/td/pre'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param`` ('content', 'stock_symbol', 'date')."""
        content = param['content']
        stock_symbol = param['stock_symbol']
        date = param['date']

        normalized = self.string_utils.normalize_string(content)
        pre_tag = self.__traverse_to_relative_html_object(lxml.html.fromstring(normalized))

        # the cash flow statement is plain text; AriesParser turns it into columns and rows
        column_name_list, row_list = self.__assemble_summary(pre_tag.text)
        return CashFlowDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the node matched by ``base_xpath``; exactly one must match."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) == 1, 'invalid base_xpath'
        return matches[0]

    def __assemble_summary(self, text):
        """Delegate parsing of the statement text to AriesParser."""
        parser = AriesParser(text)
        return parser.parse()
        
 # Orphaned constructor fragment; its enclosing class header is not visible here.
 # Stores the XPath string in base_xpath and creates the ContentScreener and
 # StringUtils helpers.
 def __init__(self):
     self.base_xpath = '//html/body'
     self.content_screener = ContentScreener()
     self.string_utils = StringUtils()
 # Orphaned constructor fragment; its enclosing class header is not visible here.
 # Stores the XPath string in base_xpath and creates the StringUtils helper.
 def __init__(self):
     self.base_xpath = '//html/body/center/table'
     self.string_utils = StringUtils()
class LegacyIncomeStatementAssembler():
    """Assemble an IncomeStatementDao from a legacy income statement page.

    The statement is read from the single table matched by ``base_xpath``:
    the fourth row supplies the column names and every row from the sixth
    onward supplies one data row.
    """

    def __init__(self):
        self.base_xpath = '//html/body/center/table'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param`` ('content', 'stock_symbol', 'date').

        Raises AssertionError when the page does not have the expected layout.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        content = self.string_utils.normalize_string(content)
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return IncomeStatementDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the node matched by ``base_xpath``; exactly one must match."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the account-type name followed by one date per remaining column."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        # the fourth row is read below, so at least four must be present
        assert len(tr_tags) > 3, 'invalid tr_tags'

        column_th_texts = tr_tags[3].xpath('./th/b/text()')
        assert len(column_th_texts) > 0, 'invalid column_th_texts'

        # should be account type (of unicode type)
        column_name_list = [column_th_texts[0]]

        # remaining header cells are converted to datetime.date
        for local_string in column_th_texts[1:]:
            snapshot_date = self.string_utils.from_local_string_to_date(local_string)
            column_name_list.append(snapshot_date)

        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return one row per table row after the column part."""
        # skip column part (5 rows)
        tr_tags = relative_html_object.xpath('./tr')[5:]
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags]

    def __assemble_row(self, relative_html_object):
        """Return ``[account_type, number, ...]`` for a single ``tr`` element."""
        td_texts = relative_html_object.xpath('./td/text()')
        td_texts = self.__remove_empty_string_from_string_list(td_texts)

        # should be account type
        row = [td_texts[0].strip()]

        # should be number
        for number_string in td_texts[1:]:
            row.append(self.string_utils.normalize_number(number_string))

        return row

    def __remove_empty_string_from_string_list(self, string_list):
        """Drop the entries that are empty or whitespace-only."""
        return [text for text in string_list if text.strip()]
 # Orphaned constructor fragment; its enclosing class header is not visible here.
 # Stores the XPath string in base_xpath and creates the ContentScreener and
 # StringUtils helpers.
 def __init__(self):
     self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="result_table hasBorder"]'
     self.content_screener = ContentScreener()
     self.string_utils = StringUtils()
 # Orphaned constructor fragment; its enclosing class header is not visible here.
 # Stores the XPath string in base_xpath and creates the ContentScreener and
 # StringUtils helpers.
 def __init__(self):
     self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
     self.content_screener = ContentScreener()
     self.string_utils = StringUtils()
class DividendPolicyAssembler():
    """Assemble a DividendPolicyDao from a dividend policy page.

    The data is read from the first table matched by ``base_xpath``: its third
    row supplies seven column names and every later row supplies seven cells
    (a statement date, five numbers and a final percentage).
    """

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param`` ('content', 'stock_symbol').

        The whole ``param`` is passed to the content screener before parsing.
        Raises AssertionError when the page does not have the expected layout.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol)

    def __get_html_object(self, content):
        """Normalize ``content``, drop ``<br>`` markers and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'

        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the seven stripped column names found in the third row."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        # the column names live in the third row
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row per table row after the column names."""
        # skip the first row of header
        # skip the second row of empty lines
        # skip the third row of column name list
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Return ``[stmt_date, number x5, ratio]`` for a single ``tr`` element.

        The last cell is a percentage and is scaled by 0.01. When
        ``normalize_number`` yields ``None`` for it (it does so for
        placeholders such as ``-`` or an empty string), ``None`` is kept
        instead of attempting the multiplication.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        # should be stmt_date
        row = [self.string_utils.from_local_string_to_date(td_texts[0])]

        # should be number
        for number_string in td_texts[1:-1]:
            row.append(self.string_utils.normalize_number(number_string))

        # should be number in percentage
        percentage = self.string_utils.normalize_number(td_texts[-1])
        row.append(None if percentage is None else percentage * 0.01)

        return row
Example #10
0
class StringUtilsTest(unittest.TestCase):
    """Table-driven checks of the StringUtils number, date and pattern helpers."""

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def _check_conversions(self, method_name, cases):
        """Call the named StringUtils method on each input, in order, and
        compare the result with the paired expected value."""
        for raw, expected in cases:
            actual = getattr(self.string_utils, method_name)(raw)
            self.assertEqual(actual, expected)

    def test_normalize_arabic_number(self):
        self._check_conversions('normalize_number', [
            ("33,825,315", 33825315),
            ("0", 0),
            ("-115,859,592", -115859592),
            ("(27,540)", -27540),
            ("2.85", 2.85),
            ("170,270,395.00", 170270395),
            ("(  10,117,111)", -10117111),
        ])

    def test_normalize_none_number(self):
        self._check_conversions('normalize_number', [
            (u"-", None),
            (u"", None),
            (u"不適用", None),
        ])

    def test_normalize_chinese_number(self):
        self._check_conversions('normalize_number', [
            (u"九十九", 99),
            (u"九十", 90),
            (u"三", 3),
        ])

    def test_normalize_percentage(self):
        actual = self.string_utils.normalize_number(u"20.92%")
        self.assertAlmostEqual(actual, 0.2092)

    def test_from_local_string_to_date(self):
        self._check_conversions('from_local_string_to_date', [
            (u"2013年12月31日", datetime.date(2013, 12, 31)),
            (u"2012年01月01日", datetime.date(2012, 1, 1)),
            ("1962/02/09", datetime.date(1962, 2, 9)),
            (u"2015/08/13", datetime.date(2015, 8, 13)),
            (u"民國103年09月", datetime.date(2014, 9, 30)),
            (u"104", datetime.date(2015, 12, 31)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        self._check_conversions('from_local_string_to_date', [
            (u"99年09月30日", datetime.date(2010, 9, 30)),
            (u"102/05/07", datetime.date(2013, 5, 7)),
        ])

    def test_from_local_string_to_date_interval(self):
        self._check_conversions('from_local_string_to_date_period', [
            (u"2013年01月01日至2013年12月31日",
             (datetime.date(2013, 1, 1), datetime.date(2013, 12, 31))),
        ])

    def test_roc_era_from_local_string_to_date_period(self):
        self._check_conversions('from_local_string_to_date_period', [
            (u"九十八年前三季", (datetime.date(2009, 1, 1), datetime.date(2009, 9, 30))),
            (u"九十八年第一季", (datetime.date(2009, 1, 1), datetime.date(2009, 3, 31))),
            (u"100年第一季", (datetime.date(2011, 1, 1), datetime.date(2011, 3, 31))),
            (u"100年上半年度", (datetime.date(2011, 1, 1), datetime.date(2011, 6, 30))),
            (u"99年上半年度", (datetime.date(2010, 1, 1), datetime.date(2010, 6, 30))),
            (u"100年前三季", (datetime.date(2011, 1, 1), datetime.date(2011, 9, 30))),
            (u"100年度", (datetime.date(2011, 1, 1), datetime.date(2011, 12, 31))),
        ])

    def test_from_date_to_roc_era_string(self):
        self._check_conversions('from_date_to_roc_era_string', [
            (datetime.date(2001, 1, 1), "90"),
        ])

    def test_from_date_to_2_digit_month_string(self):
        self._check_conversions('from_date_to_2_digit_month_string', [
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 10, 31), "10"),
        ])

    def test_from_date_to_2_digit_quarter_string(self):
        self._check_conversions('from_date_to_2_digit_quarter_string', [
            # spring
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 3, 31), "01"),
            # summer
            (datetime.date(2001, 4, 1), "02"),
            (datetime.date(2001, 6, 30), "02"),
            # fall
            (datetime.date(2001, 7, 1), "03"),
            (datetime.date(2001, 9, 30), "03"),
            # winter
            (datetime.date(2001, 10, 1), "04"),
            (datetime.date(2001, 12, 31), "04"),
        ])

    def test_from_date_to_1_digit_quarter_string(self):
        self._check_conversions('from_date_to_1_digit_quarter_string', [
            # spring
            (datetime.date(2001, 1, 1), "1"),
            (datetime.date(2001, 3, 31), "1"),
            # summer
            (datetime.date(2001, 4, 1), "2"),
            (datetime.date(2001, 6, 30), "2"),
            # fall
            (datetime.date(2001, 7, 1), "3"),
            (datetime.date(2001, 9, 30), "3"),
            # winter
            (datetime.date(2001, 10, 1), "4"),
            (datetime.date(2001, 12, 31), "4"),
        ])

    def test_is_match_seperation(self):
        pattern = u"^(-| |=)*$"
        is_match = self.string_utils.is_match
        self.assertTrue(is_match(pattern, u"======      ======"))
        self.assertTrue(is_match(pattern, u"------      ------"))
        self.assertFalse(is_match(pattern, u"同時影響現金及非現金項目之投資活動:"))

    def test_match_account(self):
        pattern = u"^([^\s]*):$"
        actual = self.string_utils.match(pattern, u"營業活動之現金流量:")
        self.assertEqual(actual, [u"營業活動之現金流量"])
class XbrlIncomeStatementAssembler():
    """Assemble an IncomeStatementDao from an XBRL income statement page."""

    def __init__(self):
        self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="main_table hasBorder"]'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param`` ('content', 'stock_symbol', 'date')."""
        content = param['content']
        stock_symbol = param['stock_symbol']
        date = param['date']

        normalized = self.string_utils.normalize_string(content)
        table = self.__traverse_to_relative_html_object(lxml.html.fromstring(normalized))
        column_name_list = self.__assemble_column_name_list(table)
        row_list = self.__assemble_row_list(table)
        return IncomeStatementDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the account-type name followed by one date period per column.

        Exactly two ``tblHead`` rows are required: the second must hold the
        single statement title, the first holds the column headers.
        """
        head_rows = relative_html_object.xpath('./tr[@class="tblHead"]')
        assert len(head_rows) == 2, 'invalid tr_tags'

        statement_titles = head_rows[1].xpath('./th/text()')
        assert len(statement_titles) == 1, 'invalid statement_th_texts'
        assert unicode(statement_titles[0]) == u'綜合損益表', 'invalid statement_th_texts[0]'

        header_texts = head_rows[0].xpath('./th/text()')
        # first header is the account type (unicode); the rest are converted to
        # (datetime.date, datetime.date) periods
        account_type = header_texts[0]
        periods = [
            self.string_utils.from_local_string_to_date_period(header_text)
            for header_text in header_texts[1:]
        ]
        return [account_type] + periods

    def __assemble_row_list(self, relative_html_object):
        """Return one row per ``tr`` after the first two rows."""
        # the first two rows are the statement name and the column name list
        data_rows = relative_html_object.xpath('./tr')[2:]
        return [self.__assemble_row(data_row) for data_row in data_rows]

    def __assemble_row(self, relative_html_object):
        """Return ``[account_type, number, ...]`` for a single ``tr`` element."""
        cell_texts = relative_html_object.xpath('./td/text()')
        # first cell is the account type, the remaining cells are numbers
        row = [cell_texts[0].strip()]
        row.extend(
            self.string_utils.normalize_number(cell_text)
            for cell_text in cell_texts[1:]
        )
        return row
Example #12
0
class TaurusParser():
    """Parse a page of per-industry tables into column names, rows and a release date."""

    def __init__(self):
        self.base_xpath = '//html/body/center'
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Return ``(column_name_list, row_list, release_date)`` for ``content``."""
        center_tag = self.__traverse_to_relative_html_object(self.__get_html_object(content))
        column_name_list = self.__parse_column_name_list(center_tag)
        row_list = self.__parse_row_list(center_tag)
        release_date = self.__parse_release_date(center_tag)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize ``content``, drop ``<br>`` markers and parse it with lxml."""
        normalized = self.string_utils.normalize_string(content)
        return lxml.html.fromstring(normalized.replace(u'<br>', u''))

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __find_inner_table_tags(self, relative_html_object):
        """Return the nested tables found inside the second top-level table."""
        # traverse and sanity check
        outer_tables = relative_html_object.xpath('./table')
        assert len(outer_tables) > 1, 'invalid table_tags'

        # the first table only describes IFRS, the data sits in the second one
        inner_tables = outer_tables[1].xpath('./tr/td/table/tr/td/table')
        assert len(inner_tables) > 0, 'invalid inner_table_tags'
        return inner_tables

    def __parse_column_name_list(self, relative_html_object):
        """Return the ``th`` texts of the second row of the first inner table."""
        first_inner_table = self.__find_inner_table_tags(relative_html_object)[0]

        rows = first_inner_table.xpath('./tr')
        assert len(rows) > 1, 'invalid tr_tags'

        return rows[1].xpath('./th/text()')

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every inner table, in document order."""
        data_rows = []
        # every inner table represents an industry
        for inner_table in self.__find_inner_table_tags(relative_html_object):
            rows = inner_table.xpath('./tr')
            assert len(rows) > 2, 'invalid tr_tags'
            # drop the two header rows and the trailing total row
            data_rows.extend(rows[2:-1])
        return [self.__parse_row(data_row) for data_row in data_rows]

    def __parse_row(self, relative_html_object):
        """Return two leading items followed by the normalized numbers of a row."""
        td_texts = relative_html_object.xpath('./td/text()')
        # record contains extra entry about comment
        assert len(td_texts) == 11, 'invalid td_texts size, should be 11'

        # the last entry is the comment and is skipped
        numbers = [
            self.string_utils.normalize_number(td_text)
            for td_text in td_texts[2:-1]
        ]
        return td_texts[:2] + numbers

    def __parse_release_date(self, relative_html_object):
        """Return the date captured from the text of the last ``div``."""
        div_tags = relative_html_object.xpath('./div')
        assert len(div_tags) > 0, 'invalid div_tags'

        last_div_text = div_tags[-1].text.strip()
        groups = self.string_utils.match(u'^出表日期:(.*)$', last_div_text)
        assert len(groups) > 0, 'could not match ^出表日期:(.*)$'

        return self.string_utils.from_local_string_to_date(groups[0])
 # Orphaned constructor fragment; its enclosing class header is not visible here.
 # Stores the XPath string in base_xpath and creates the StringUtils helper.
 def __init__(self):
     self.base_xpath = '//html/body'
     self.string_utils = StringUtils()
class StockSymbolAssembler():
    """Assemble a StockSymbolDao from a stock symbol listing page.

    The release date is read from the second ``h2`` of the first table under
    the body; column names and rows are read from the table of class ``h4``.
    """

    def __init__(self):
        self.base_xpath = '//html/body'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param['content']``.

        Raises AssertionError when the page does not have the expected layout.
        """
        content = self.string_utils.normalize_string(param['content'])
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        release_date = self.__assemble_release_date(relative_html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return StockSymbolDao(column_name_list, row_list, release_date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the node matched by ``base_xpath``; exactly one must match."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_release_date(self, relative_html_object):
        """Return the release date captured from the second headline."""
        # try to get release date
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 0, 'invalid table_tags'

        headline_tags = table_tags[0].xpath('./h2')
        # the second headline is read below, so at least two must be present
        assert len(headline_tags) > 1, 'invalid headline_tags'

        headline_texts = headline_tags[1].xpath('./strong/center')
        assert len(headline_texts) > 0, 'invalid headline_texts'
        groups = self.string_utils.match(u'^最近更新日期:(.*)$', headline_texts[0].text.strip())
        assert len(groups) > 0, 'could not match ^最近更新日期:(.*)$'

        release_date = self.string_utils.from_local_string_to_date(groups[0])
        return release_date

    def __assemble_column_name_list(self, relative_html_object):
        """Return the eight column names, splitting the combined first one."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        # traverse and sanity check
        original_column_name_list = tr_tags[0].xpath('./td/text()')

        # handle the first column name: '有價證券代號及名稱'
        combined_column_name = original_column_name_list[0].strip()
        assert combined_column_name == u'有價證券代號及名稱', 'should be 有價證券代號及名稱 in unicode'
        # the chinese character '及' means 'and' so we need to separate this column name
        seperated_column_name_list = combined_column_name.split(u'及')
        assert len(seperated_column_name_list) == 2

        column_name_list = seperated_column_name_list + original_column_name_list[1:]
        assert len(column_name_list) == 8, 'invalid column_name_list size, should be 8'
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return the assembled rows, leaving out single-cell rows."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')[1:]

        row_list = []
        for tr_tag in tr_tags:
            row = self.__assemble_row(tr_tag)
            # if there is only one cell '股票' in row, skip it
            if row:
                row_list.append(row)
        return row_list

    def __assemble_row(self, relative_html_object):
        """Return the row for a single ``tr`` element, or None for a single-cell row.

        The first cell is split on whitespace into stock symbol and stock name,
        and the third cell is converted to a date.
        """
        td_tags = relative_html_object.xpath('./td')

        # we could not handle empty string between td tag if we use xpath './td/text()'
        # so we need to check each td.text one by one.
        td_texts = self.__get_lxml_text_list(td_tags)

        # if there is only one cell '股票', return None
        if len(td_texts) == 1:
            return None

        # sanity check
        assert len(td_texts) == 7

        # handle the first cell: '有價證券代號及名稱'
        # it should be separated as stock symbol and stock name
        combined_cell = td_texts[0].strip()
        seperated_cell_list = combined_cell.split()
        assert len(seperated_cell_list) == 2

        # convert to datetime.date type
        listing_date = self.string_utils.from_local_string_to_date(td_texts[2])

        row = seperated_cell_list + [td_texts[1]] + [listing_date] + td_texts[3:]
        return row

    def __get_lxml_text_list(self, tag_list):
        """Return each tag's text, with '' standing in for a missing text."""
        return ['' if tag.text is None else tag.text for tag in tag_list]
 # Orphaned constructor fragment; its enclosing class header is not visible here.
 # Stores the XPath string in base_xpath and creates the StringUtils helper.
 def __init__(self):
     self.base_xpath = '//html/body/table[@class="hasBorder"]/tr/td/pre'
     self.string_utils = StringUtils()
Example #16
0
class AriesParser():
    """Parse a plain-text statement into a column-name list and a row list.

    The text is preprocessed by ``AccountUtils``, tokenised by ``Scanner`` and
    grouped into lines at every ``TK_EOL`` token.  The first line that parses
    as a header yields the column names; each later line is matched against a
    fixed set of token layouts (an account followed by two numbers, where a
    number wrapped in parentheses is negated).
    """

    def __init__(self, text):
        """Keep the raw text and create the helper objects.

        :param text: raw statement text that ``parse`` works on.
        """
        self.text = text
        # initialised here but not used by any method of this class
        self.head_splitted_account = None
        self.account_utils = AccountUtils()
        self.string_utils = StringUtils()

    def parse(self):
        """Preprocess, scan and parse ``self.text``.

        :returns: tuple ``(column_name_list, row_list)``.
        :raises ValueError: a line after the header has an unsupported layout.
        :raises AssertionError: no header or no row could be parsed.
        """
        text = self.__preprocess_text(self.text)
        lines = self.__scan_lines(text)
        return self.__parse_lines(lines)

    def __preprocess_text(self, text):
        """Run the two ``AccountUtils`` clean-up passes over the text.

        NOTE(review): what ``concat_account`` and ``remove_eten_separation``
        change exactly is defined in ``AccountUtils`` -- see that class.
        """
        text = self.account_utils.concat_account(text)
        text = self.account_utils.remove_eten_separation(text)
        return text

    def __scan_lines(self, text):
        """Tokenise the text and group the tokens into lines.

        A line is closed by (and includes) a ``TK_EOL`` token.  Tokens that
        follow the last ``TK_EOL`` do not end up in any returned line.

        :returns: list of token lists.
        """
        scanner = Scanner(Source(text))
        scanner.scan()
        tokens = scanner.get_tokens()

        lines = []
        tokens_in_line = []
        for token in tokens:
            tokens_in_line.append(token)
            if token.get_token_type() == 'TK_EOL':
                lines.append(tokens_in_line)
                tokens_in_line = []
        return lines

    def __parse_lines(self, lines):
        """Turn token lines into the column-name list and the row list.

        Lines holding only ``TK_EOL`` or ``TK_SEPERATION, TK_EOL`` are skipped.
        Until a header has been parsed, only lines made of two or three
        ``TK_ACCOUNT`` tokens are tried as a header and everything else is
        ignored.  After the header, every line must match one of the known
        row layouts.

        :param lines: list of token lists as returned by ``__scan_lines``.
        :returns: tuple ``(column_name_list, row_list)``.
        :raises ValueError: a line after the header has an unknown layout; the
            message lists the token types of that line.
        :raises AssertionError: no header was found or no row was parsed.
        """
        # layouts without any content
        skipped_layouts = (
            ('TK_EOL',),
            ('TK_SEPERATION', 'TK_EOL'),
        )
        # header layout -> position of the two statement-date tokens
        header_layouts = {
            ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL'): slice(0, 2),
            ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL'): slice(1, 3),
        }
        # three accounts in a row after the header carry no row data
        ignored_row_layout = ('TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL')
        # row layout -> method that converts such a line into a row
        row_parsers = {
            ('TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL'):
                self.__parse_account_number_number_line,
            ('TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN',
             'TK_NUMBER', 'TK_EOL'):
                self.__parse_account_paren_number_number_line,
            ('TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER',
             'TK_RIGHT_PAREN', 'TK_EOL'):
                self.__parse_account_number_paren_number_line,
            ('TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN',
             'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL'):
                self.__parse_account_paren_number_paren_number_line,
            ('TK_ACCOUNT', 'TK_EOL'):
                self.__parse_account_line,
        }

        column_name_list = None
        visited_column_name_list = False
        row_list = []
        for line in lines:
            type_list = tuple(token.get_token_type() for token in line)
            if type_list in skipped_layouts:
                continue

            # try to parse column name list
            if not visited_column_name_list:
                stmt_date_slice = header_layouts.get(type_list)
                if stmt_date_slice is None:
                    continue
                try:
                    column_name_list = self.__parse_column_name_list(line[stmt_date_slice])
                except Exception:
                    # Deliberate best effort: a line of accounts that does not
                    # hold two parsable statement dates is simply not the
                    # header, so keep looking.
                    continue
                visited_column_name_list = True
                continue

            # try to parse rest row list
            if type_list == ignored_row_layout:
                continue
            row_parser = row_parsers.get(type_list)
            if row_parser is None:
                raise ValueError('unsupported token layout: %r' % (list(type_list),))
            row = row_parser(line)
            if row:
                row_list.append(row)

        assert visited_column_name_list, 'We should parse column name list'
        assert len(row_list) > 0, 'We should parse some rows'
        return column_name_list, row_list

    def __parse_column_name_list(self, stmt_date_list):
        """Build the column names from the two statement-date tokens.

        Each token value is converted with
        ``string_utils.from_local_string_to_date_period`` and the element at
        index 1 of the result is used (presumably the end of the period --
        confirm in ``StringUtils``).

        :param stmt_date_list: the two tokens holding the statement dates.
        :returns: the account column label followed by the two dates.
        :raises AssertionError: ``stmt_date_list`` does not hold two tokens.
        """
        column_name_list = [u'會計科目']
        assert len(stmt_date_list) == 2, 'There shouble be 2 statement dates' 
        for stmt_date_token in stmt_date_list:
            date_period = self.string_utils.from_local_string_to_date_period(stmt_date_token.get_value())
            column_name_list.append(date_period[1])
        return column_name_list

    # ['TK_ACCOUNT', 'TK_EOL']
    def __parse_account_line(self, line):
        """Return a one-item row holding only the account value."""
        return [line[0].get_value()]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_number_number_line(self, line):
        """Return ``[account, number, number]``; both numbers keep their sign."""
        return [
            line[0].get_value(),
            self.string_utils.normalize_number(line[1].get_value()),
            self.string_utils.normalize_number(line[2].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_paren_number_number_line(self, line):
        """Return ``[account, -number, number]``; the parenthesised first number is negated."""
        return [
            line[0].get_value(),
            -self.string_utils.normalize_number(line[2].get_value()),
            self.string_utils.normalize_number(line[4].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_number_paren_number_line(self, line):
        """Return ``[account, number, -number]``; the parenthesised second number is negated."""
        return [
            line[0].get_value(),
            self.string_utils.normalize_number(line[1].get_value()),
            -self.string_utils.normalize_number(line[3].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_paren_number_paren_number_line(self, line):
        """Return ``[account, -number, -number]``; both parenthesised numbers are negated."""
        return [
            line[0].get_value(),
            -self.string_utils.normalize_number(line[2].get_value()),
            -self.string_utils.normalize_number(line[5].get_value())
        ]
Example #17
0
 def __init__(self, text):
     """Keep the raw text and create the AccountUtils / StringUtils helpers.

     :param text: value stored unchanged on ``self.text``.
     """
     self.text = text
     # starts out as None; nothing in this fragment assigns it later
     self.head_splitted_account = None
     self.account_utils = AccountUtils()
     self.string_utils = StringUtils()
Example #18
0
 def setUp(self):
     """Create the StringUtils instance stored on ``self.string_utils``.

     NOTE(review): the name suggests a unittest fixture hook; the enclosing
     class is not visible here -- confirm.
     """
     self.string_utils = StringUtils()
Example #19
0
 def __init__(self):
     """Store the base XPath and create the DateUtils / StringUtils helpers."""
     # XPath addressing the <center> element(s) directly under <body>
     self.base_xpath = '//html/body/center'
     self.date_utils = DateUtils()
     self.string_utils = StringUtils()
Example #20
0
 def __init__(self):
     """Create the SpiderStorage and StringUtils objects kept on the instance."""
     self.storage = SpiderStorage()
     self.string_utils = StringUtils()
 def __init__(self):
     """Store the base XPath and create the StringUtils helper."""
     # XPath addressing the table with class "main_table hasBorder" inside
     # <center> of the <body> whose id is "content_d"
     self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="main_table hasBorder"]'
     self.string_utils = StringUtils()
Example #22
0
class GeminiParser:
    """Parse an HTML page into column names, data rows and a release date.

    All lookups start at the first node matching ``base_xpath``.  The second
    ``<table>`` below that node wraps one nested table per industry; the
    first ``<table>`` (a description about IFRS) is skipped.
    """

    def __init__(self):
        """Store the base XPath and create the helper objects."""
        self.base_xpath = "//html/body/center"
        # created here but not used by the methods of this class
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Parse the page.

        :param content: HTML text of the page.
        :returns: tuple ``(column_name_list, row_list, release_date)``.
        :raises AssertionError: the page does not have the expected structure.
        """
        root = self.__traverse_to_relative_html_object(self.__get_html_object(content))
        column_name_list = self.__parse_column_name_list(root)
        row_list = self.__parse_row_list(root)
        release_date = self.__parse_release_date(root)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize the text, drop every ``<br>`` and build the lxml tree."""
        normalized = self.string_utils.normalize_string(content)
        without_breaks = normalized.replace(u"<br>", u"")
        return lxml.html.fromstring(without_breaks)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matching ``base_xpath``.

        :raises AssertionError: nothing matches ``base_xpath``.
        """
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, "invalid base_xpath"
        return matches[0]

    def __find_industry_tables(self, relative_html_object):
        """Return the nested tables (one per industry) of the second table.

        :raises AssertionError: fewer than two tables, or no nested table.
        """
        # traverse and sanity check
        outer_tables = relative_html_object.xpath("./table")
        assert len(outer_tables) > 1, "invalid table_tags"

        # index 1: the first table only holds a description about IFRS
        industry_tables = outer_tables[1].xpath("./tr/td/table/tr/td/table")
        assert len(industry_tables) > 0, "invalid inner_table_tags"
        return industry_tables

    def __parse_column_name_list(self, relative_html_object):
        """Return the ``<th>`` texts of the second row of the first industry table.

        :raises AssertionError: the expected tables or rows are missing.
        """
        first_industry_table = self.__find_industry_tables(relative_html_object)[0]

        header_rows = first_industry_table.xpath("./tr")
        assert len(header_rows) > 1, "invalid tr_tags"

        return header_rows[1].xpath("./th/text()")

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of all industry tables.

        The ``<tr>`` nodes of every table are collected first and converted
        afterwards, so structural assertions fire before any row is parsed.

        :raises AssertionError: the expected tables or rows are missing.
        """
        data_rows = []
        for industry_table in self.__find_industry_tables(relative_html_object):
            table_rows = industry_table.xpath("./tr")
            assert len(table_rows) > 2, "invalid tr_tags"
            # the first two rows are headers, the last one is the total row
            data_rows.extend(table_rows[2:-1])
        return list(map(self.__parse_row, data_rows))

    def __parse_row(self, relative_html_object):
        """Convert one ``<tr>`` into a row of ten values.

        The first two cell texts are kept as they are; the other eight go
        through ``string_utils.normalize_number``.

        :raises AssertionError: the row does not yield exactly ten cell texts.
        """
        cell_texts = relative_html_object.xpath("./td/text()")
        assert len(cell_texts) == 10, "invalid td_texts size, should be 10"

        normalize = self.string_utils.normalize_number
        return cell_texts[:2] + [normalize(cell_text) for cell_text in cell_texts[2:]]

    def __parse_release_date(self, relative_html_object):
        """Read the release date from the text of the last ``<div>``.

        The stripped text is matched against the release-date pattern and the
        first captured group is converted with
        ``string_utils.from_local_string_to_date``.

        :raises AssertionError: there is no ``<div>`` or the pattern yields no group.
        """
        divs = relative_html_object.xpath("./div")
        assert len(divs) > 0, "invalid div_tags"

        pattern = u"^出表日期:(.*)$"
        groups = self.string_utils.match(pattern, divs[-1].text.strip())
        assert len(groups) > 0, "could not match ^出表日期:(.*)$"

        return self.string_utils.from_local_string_to_date(groups[0])