class StockPriceAssembler():
    """Turn comma-separated price text into a ``StockPriceDao``."""

    def __init__(self):
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param['content']`` and ``param['stock_symbol']``.

        The first line of the content supplies the column names and every
        later line becomes one row. ``'D'`` is passed as the fourth DAO
        argument.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        lines = content.splitlines()
        header = self.__assemble_column_name_list(lines)
        body = self.__assemble_row_list(lines)
        return StockPriceDao(header, body, stock_symbol, 'D')

    def __assemble_column_name_list(self, lines):
        """Return the comma-separated fields of the first line."""
        first_line = lines[0]
        return first_line.split(',')

    def __assemble_row_list(self, lines):
        """Convert every line after the first into a row."""
        row_list = []
        for line in lines[1:]:
            row_list.append(self.__assemble_row(line))
        return row_list

    def __assemble_row(self, line):
        """Parse one line: the first field as a date, the others as numbers."""
        fields = line.split(',')
        row = [self.string_utils.from_local_string_to_date(fields[0])]
        row.extend(
            self.string_utils.normalize_number(field) for field in fields[1:])
        return row
class StockPriceAssembler():
    """Assemble a ``StockPriceDao`` out of comma-separated text content."""

    def __init__(self):
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Return a ``StockPriceDao`` for the given content and stock symbol.

        ``param`` must provide the keys ``'content'`` and ``'stock_symbol'``.
        Line one of the content is the header; the rest are data rows.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        all_lines = content.splitlines()
        names = self.__assemble_column_name_list(all_lines)
        rows = self.__assemble_row_list(all_lines)
        return StockPriceDao(names, rows, stock_symbol, 'D')

    def __assemble_column_name_list(self, lines):
        """Split the header line on commas."""
        header_line = lines[0]
        return header_line.split(',')

    def __assemble_row_list(self, lines):
        """Build one row per line, skipping the header line."""
        data_lines = lines[1:]
        return list(map(self.__assemble_row, data_lines))

    def __assemble_row(self, line):
        """Convert the first field to a date and the remaining ones to numbers."""
        pieces = line.split(',')
        to_date = self.string_utils.from_local_string_to_date
        result = [to_date(pieces[0])]
        for piece in pieces[1:]:
            result.append(self.string_utils.normalize_number(piece))
        return result
class OperatingRevenueAssembler():
    """Parse an operating-revenue HTML page into an ``OperatingRevenueDao``."""

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/form/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def assemble(self, param):
        """Screen ``param``, parse its content and return the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``. ``'M'``
        is passed as the fourth DAO argument.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        self.content_screener.screen(param)
        table = self.__traverse_to_relative_html_object(
            self.__get_html_object(content))
        header = self.__assemble_column_name_list(table)
        body = self.__assemble_row_list(table)
        return OperatingRevenueDao(header, body, stock_symbol, 'M')

    def __get_html_object(self, content):
        """Normalize the text, drop ``<br>`` tags and parse it with lxml."""
        normalized = self.string_utils.normalize_string(content)
        return lxml.html.fromstring(normalized.replace(u'<br>', u''))

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``self.base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath (table_tags)'
        return matches[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped cell texts of the row at index 5."""
        rows = relative_html_object.xpath('./tr')
        assert len(rows) > 5, 'invalid tr_tags'

        header_texts = rows[5].xpath('./td/text()')
        assert len(header_texts) == 7, 'invalid td_texts size, should be 7'

        names = []
        for header_text in header_texts:
            names.append(header_text.strip())
        return names

    def __assemble_row_list(self, relative_html_object):
        """Build a row for every ``tr`` from index 6 onwards."""
        rows = relative_html_object.xpath('./tr')
        assert len(rows) > 5, 'invalid tr_tags'

        row_list = []
        for data_row in rows[6:]:
            row_list.append(self.__assemble_row(data_row))
        return row_list

    def __assemble_row(self, relative_html_object):
        """Convert the seven cells of a ``tr``: one date, then six numbers."""
        cells = self.lxml_utils.get_text_list(
            relative_html_object.xpath('./td'))
        assert len(cells) == 7, 'invalid td_texts size, should be 7'

        # first cell is the statement date, the remaining cells are numbers
        row = [self.string_utils.from_local_string_to_date(cells[0])]
        row.extend(
            self.string_utils.normalize_number(cell) for cell in cells[1:])
        return row
Exemple #4
0
class FinancialStatementAssembler():
    """Parse an HTML financial-statement table into column names and rows.

    ``base_xpath_param`` selects one of two known page layouts (``'basic'``
    or ``'form'``); ``column_name_pos`` is the index of the ``tr`` that
    holds the column names. Every ``tr`` after it is treated as a data row.
    """

    def __init__(self, base_xpath_param, column_name_pos=0):
        self.base_xpath = self.__init_base_xpath(base_xpath_param)
        self.column_name_pos = column_name_pos
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def __init_base_xpath(self, base_xpath_param):
        """Map a layout name to the XPath of its data table.

        Raises:
            ValueError: if ``base_xpath_param`` is neither ``'basic'`` nor
                ``'form'``. Failing here avoids storing ``None`` as the
                XPath and crashing later inside ``xpath()``.
        """
        if base_xpath_param == 'basic':
            return '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        elif base_xpath_param == 'form':
            return '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/form/table/tr/td/table'
        raise ValueError(
            "unknown base_xpath_param: %r (expected 'basic' or 'form')"
            % (base_xpath_param,))

    def assemble(self, param):
        """Screen ``param``, parse its content and return the parsed table.

        Args:
            param: mapping with the keys ``'content'`` and ``'stock_symbol'``.

        Returns:
            A dict with the keys ``'stock_symbol'``, ``'column_name_list'``
            and ``'row_list'``.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(
            html_object)
        column_name_list = self.__assemble_column_name_list(
            relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return {
            'stock_symbol': stock_symbol,
            'column_name_list': column_name_list,
            'row_list': row_list,
        }

    def __get_html_object(self, content):
        """Normalize the text, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``self.base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(
            relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'

        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the account header followed by one date per later cell."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > self.column_name_pos, 'invalid tr_tags'

        td_texts = tr_tags[self.column_name_pos].xpath('./td/text()')

        # the first entry should be account
        column_name_list = [td_texts[0]]

        # the rest should be stmt_date
        for text in td_texts[1:]:
            stmt_date = self.string_utils.from_local_string_to_date(text)
            column_name_list.append(stmt_date)

        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Build a row for every ``tr`` after the column-name row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > self.column_name_pos + 1, 'invalid tr_tags'

        return [
            self.__assemble_row(tr_tag)
            for tr_tag in tr_tags[self.column_name_pos + 1:]
        ]

    def __assemble_row(self, relative_html_object):
        """Return ``[account, number, ...]`` for a ``tr``; ``[]`` if it has no cells."""
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)

        # return empty row if empty
        if not td_texts:
            return []

        # the first entry should be account
        row = [td_texts[0].strip()]

        # the rest should be numbers
        for number_string in td_texts[1:]:
            number = self.string_utils.normalize_number(number_string)
            row.append(number)

        return row
Exemple #5
0
 def __init__(self, base_xpath_param, column_name_pos=0):
     """Store the parsing configuration and create the helper objects.

     ``base_xpath_param`` is resolved through ``self.__init_base_xpath``
     (defined outside this excerpt) and the result is kept as
     ``self.base_xpath``; ``column_name_pos`` is stored unchanged.
     """
     self.base_xpath = self.__init_base_xpath(base_xpath_param)
     self.column_name_pos = column_name_pos
     self.content_screener = ContentScreener()
     self.string_utils = StringUtils()
     self.lxml_utils = LxmlUtils()
Exemple #6
0
class DividendPolicyAssembler():
    """Parse a dividend-policy HTML page into a ``DividendPolicyDao``."""

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Screen ``param``, parse its content and return the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``. ``'Y'``
        is passed as the fourth DAO argument.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(
            html_object)
        column_name_list = self.__assemble_column_name_list(
            relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol, 'Y')

    def __get_html_object(self, content):
        """Normalize the text, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``self.base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(
            relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'

        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped cell texts of the row at index 2."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        # rows 0 and 1 are skipped; row 2 carries the column names
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Build a row for every ``tr`` from index 3 onwards."""
        # skip the first row of header
        # skip the second row of empty lines
        # skip the third row of column name list
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Convert the seven cell texts of a ``tr`` into a row.

        Cell 0 becomes a date, cells 1-5 become numbers, and the last cell
        is a number scaled by 0.01. A last cell that ``normalize_number``
        maps to ``None`` stays ``None`` instead of raising ``TypeError`` on
        the multiplication.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        # should be stmt_date
        row = [self.string_utils.from_local_string_to_date(td_texts[0])]

        # should be number
        for number_string in td_texts[1:-1]:
            row.append(self.string_utils.normalize_number(number_string))

        # should be number in percentage; keep a missing value as None
        percentage = self.string_utils.normalize_number(td_texts[-1])
        row.append(None if percentage is None else percentage * 0.01)

        return row
 def setUp(self):
     """Create a ``StringUtils`` instance and keep it on ``self.string_utils``."""
     self.string_utils = StringUtils()
class DividendPolicyAssembler():
    """Assemble a ``DividendPolicyDao`` from a dividend-policy HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Screen ``param``, parse ``param['content']`` and return the DAO.

        The DAO receives the column names, the rows, ``param['stock_symbol']``
        and the literal ``'Y'``.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol, 'Y')

    def __get_html_object(self, content):
        """Normalize the text, remove ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``self.base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'

        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped cell texts of the third ``tr`` (index 2)."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        # rows 0 and 1 are skipped; row 2 carries the column names
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Build a row for every ``tr`` after the column-name row."""
        # skip the first row of header
        # skip the second row of empty lines
        # skip the third row of column name list
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'

        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Convert the seven cell texts of a ``tr`` into a row.

        The first cell is converted to a date, the middle five to numbers and
        the last one to a number multiplied by 0.01. When ``normalize_number``
        yields ``None`` for the last cell the row keeps ``None`` there; the
        multiplication would otherwise raise ``TypeError``.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'

        row = []

        # should be stmt_date
        row.append(self.string_utils.from_local_string_to_date(td_texts[0]))

        # should be number
        for number_string in td_texts[1:-1]:
            row.append(self.string_utils.normalize_number(number_string))

        # should be number in percentage; a missing value stays None
        last_number = self.string_utils.normalize_number(td_texts[-1])
        if last_number is not None:
            last_number = last_number * 0.01
        row.append(last_number)

        return row
Exemple #9
0
 def __init__(self):
     """Set the XPath of the target table and create the helper objects."""
     self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/form/table/tr/td/table'
     self.content_screener = ContentScreener()
     self.string_utils = StringUtils()
     self.lxml_utils = LxmlUtils()
 def setUp(self):
     """Create a ``StringUtils`` instance and keep it on ``self.string_utils``."""
     self.string_utils = StringUtils()
class StringUtilsTest(unittest.TestCase):
    """Exercise the number, date and pattern helpers of ``StringUtils``."""

    # (month, day, quarter number): first and last day of each quarter
    _QUARTER_BOUNDARIES = (
        (1, 1, 1), (3, 31, 1),      # spring
        (4, 1, 2), (6, 30, 2),      # summer
        (7, 1, 3), (9, 30, 3),      # fall
        (10, 1, 4), (12, 31, 4),    # winter
    )

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def _check_cases(self, convert, cases):
        """Assert ``convert(argument) == expected`` for each pair, in order."""
        for argument, expected in cases:
            self.assertEqual(convert(argument), expected,
                             'input: %r' % (argument,))

    def test_normalize_arabic_number(self):
        self._check_cases(self.string_utils.normalize_number, [
            ('33,825,315', 33825315),
            ('0', 0),
            ('-115,859,592', -115859592),
            ('(27,540)', -27540),
            ('2.85', 2.85),
            ('170,270,395.00', 170270395),
            ('(  10,117,111)', -10117111),
        ])

    def test_normalize_none_number(self):
        self._check_cases(self.string_utils.normalize_number, [
            (u'-', None),
            (u'', None),
            (u'不適用', None),
            (u'N/A', None),
        ])

    def test_normalize_percentage(self):
        actual = self.string_utils.normalize_number(u'20.92%')
        expected = 0.2092
        self.assertAlmostEqual(actual, expected)

    def test_from_local_string_to_date(self):
        self._check_cases(self.string_utils.from_local_string_to_date, [
            (u'2013年12月31日', datetime.date(2013, 12, 31)),
            (u'2012年01月01日', datetime.date(2012, 1, 1)),
            ('1962/02/09', datetime.date(1962, 2, 9)),
            (u'2015/08/13', datetime.date(2015, 8, 13)),
            (u'民國103年09月', datetime.date(2014, 9, 30)),
            (u'104', datetime.date(2015, 12, 31)),
            (u'104.2Q', datetime.date(2015, 6, 30)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        self._check_cases(self.string_utils.from_local_string_to_date, [
            (u'99年09月30日', datetime.date(2010, 9, 30)),
            (u'102/05/07', datetime.date(2013, 5, 7)),
        ])

    def test_from_date_to_roc_era_string(self):
        self._check_cases(self.string_utils.from_date_to_roc_era_string, [
            (datetime.date(2001, 1, 1), '90'),
        ])

    def test_from_date_to_2_digit_month_string(self):
        self._check_cases(
            self.string_utils.from_date_to_2_digit_month_string, [
                (datetime.date(2001, 1, 1), '01'),
                (datetime.date(2001, 10, 31), '10'),
            ])

    def test_from_date_to_2_digit_quarter_string(self):
        self._check_cases(
            self.string_utils.from_date_to_2_digit_quarter_string,
            [(datetime.date(2001, month, day), '0%d' % quarter)
             for month, day, quarter in self._QUARTER_BOUNDARIES])

    def test_from_date_to_1_digit_quarter_string(self):
        self._check_cases(
            self.string_utils.from_date_to_1_digit_quarter_string,
            [(datetime.date(2001, month, day), '%d' % quarter)
             for month, day, quarter in self._QUARTER_BOUNDARIES])

    def test_match_account(self):
        # the backslash is doubled so that the literal carries a real "\s"
        # without relying on an invalid string escape sequence
        pattern = u'^([^\\s]*):$'
        actual = self.string_utils.match(pattern, u'營業活動之現金流量:')
        expected = [u'營業活動之現金流量']
        self.assertEqual(actual, expected)
class BalanceSheetSummaryAssembler():
    """Parse a balance-sheet summary page into a ``BalanceSheetSummaryDao``.

    ``period`` is ``'quarterly'`` or ``'yearly'``. It selects which of the
    tables matched by ``base_xpath`` is parsed (index 0 or 1) and which
    short code (``'Q'`` or ``'Y'``) is handed to the DAO.
    """

    def __init__(self, period):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()
        self.period = period

    def assemble(self, param):
        """Screen ``param``, parse its content and return the DAO.

        Args:
            param: mapping with the keys ``'content'`` and ``'stock_symbol'``.

        Raises:
            ValueError: if ``self.period`` is not a supported period.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(
            html_object)
        column_name_list = self.__assemble_column_name_list(
            relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        short_period = None
        if self.period == 'quarterly':
            short_period = 'Q'
        elif self.period == 'yearly':
            short_period = 'Y'
        return BalanceSheetSummaryDao(column_name_list, row_list, stock_symbol,
                                      short_period)

    def __get_html_object(self, content):
        """Normalize the text, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the table for ``self.period`` among the XPath matches.

        Raises:
            ValueError: if ``self.period`` is neither ``'quarterly'`` nor
                ``'yearly'``. Without this the method would return ``None``
                and the caller would fail with an unrelated error.
        """
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(
            relative_html_object_list) > 1, 'invalid base_xpath (table_tags)'
        if self.period == 'quarterly':
            return relative_html_object_list[0]
        elif self.period == 'yearly':
            return relative_html_object_list[1]
        raise ValueError(
            "unsupported period: %r (expected 'quarterly' or 'yearly')"
            % (self.period,))

    def __assemble_column_name_list(self, relative_html_object):
        """Return the account header followed by one date per later cell."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        td_texts = tr_tags[0].xpath('./td/text()')

        # the first entry should be account
        column_name_list = [td_texts[0]]

        # the rest should be stmt_date
        for text in td_texts[1:]:
            stmt_date = self.string_utils.from_local_string_to_date(text)
            column_name_list.append(stmt_date)

        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Build a row for every ``tr`` after the first one."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 1, 'invalid tr_tags'

        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[1:]]

    def __assemble_row(self, relative_html_object):
        """Return ``[account, number, ...]`` built from the cells of a ``tr``."""
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)

        # the first entry should be account
        row = [td_texts[0]]

        # the rest should be numbers
        for number_string in td_texts[1:]:
            number = self.string_utils.normalize_number(number_string)
            row.append(number)

        return row
Exemple #13
0
 def __init__(self):
     """Create the ``SpiderStorage`` and ``StringUtils`` helpers used by this object."""
     self.storage = SpiderStorage()
     self.string_utils = StringUtils()
 def __init__(self):
     """Create the ``StringUtils`` helper used by this object."""
     self.string_utils = StringUtils()
 def __init__(self):
     """Create the ``StringUtils`` helper used by this object."""
     self.string_utils = StringUtils()
Exemple #16
0
class OperatingRevenueAssembler():
    """Assemble an ``OperatingRevenueDao`` from an operating-revenue HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/form/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def assemble(self, param):
        """Screen ``param``, parse ``param['content']`` and return the DAO.

        The DAO receives the column names, the rows, ``param['stock_symbol']``
        and the literal ``'M'``.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        self.content_screener.screen(param)
        document = self.__get_html_object(content)
        table = self.__traverse_to_relative_html_object(document)
        names = self.__assemble_column_name_list(table)
        rows = self.__assemble_row_list(table)
        return OperatingRevenueDao(names, rows, stock_symbol, 'M')

    def __get_html_object(self, content):
        """Normalize the text, remove ``<br>`` tags and parse it with lxml."""
        cleaned = self.string_utils.normalize_string(content).replace(
            u'<br>', u'')
        return lxml.html.fromstring(cleaned)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``self.base_xpath``."""
        candidates = html_object.xpath(self.base_xpath)
        assert len(candidates) > 0, 'invalid base_xpath (table_tags)'
        return candidates[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped texts of the seven cells in row index 5."""
        table_rows = relative_html_object.xpath('./tr')
        assert len(table_rows) > 5, 'invalid tr_tags'

        name_texts = table_rows[5].xpath('./td/text()')
        assert len(name_texts) == 7, 'invalid td_texts size, should be 7'

        stripped = []
        for name_text in name_texts:
            stripped.append(name_text.strip())
        return stripped

    def __assemble_row_list(self, relative_html_object):
        """Build a row for every ``tr`` from index 6 onwards."""
        table_rows = relative_html_object.xpath('./tr')
        assert len(table_rows) > 5, 'invalid tr_tags'

        return list(map(self.__assemble_row, table_rows[6:]))

    def __assemble_row(self, relative_html_object):
        """Convert the seven cells of a ``tr``: a date followed by six numbers."""
        cell_tags = relative_html_object.xpath('./td')
        cell_texts = self.lxml_utils.get_text_list(cell_tags)
        assert len(cell_texts) == 7, 'invalid td_texts size, should be 7'

        # the first cell is the statement date
        result = [self.string_utils.from_local_string_to_date(cell_texts[0])]

        # every remaining cell is a number
        for cell_text in cell_texts[1:]:
            result.append(self.string_utils.normalize_number(cell_text))
        return result
Exemple #17
0
class StockSymbolAssembler():
    """Parse a stock-symbol listing page into a ``StockSymbolDao``."""

    def __init__(self):
        self.base_xpath = '//html/body'
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def assemble(self, param):
        """Parse ``param['content']`` and return the DAO.

        The DAO receives the column names, the rows and the release date
        extracted from the page headline.
        """
        content = self.string_utils.normalize_string(param['content'])
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        release_date = self.__assemble_release_date(relative_html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return StockSymbolDao(column_name_list, row_list, release_date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the single node matched by ``self.base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_release_date(self, relative_html_object):
        """Extract the release date from the second ``h2`` of the first table.

        Raises:
            AssertionError: if the expected tags are missing or the headline
                text does not match the release-date pattern.
        """
        # try to get release date
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 0, 'invalid table_tags'

        # the second headline (index 1) is read below, so at least two
        # headlines must exist
        headline_tags = table_tags[0].xpath('./h2')
        assert len(headline_tags) > 1, 'invalid headline_tags'

        headline_texts = headline_tags[1].xpath('./strong/center')
        assert len(headline_texts) > 0, 'invalid headline_texts'

        groups = self.string_utils.match(u'^最近更新日期:(.*)$', headline_texts[0].text.strip())
        assert len(groups) > 0, 'could not match ^最近更新日期:(.*)$'

        release_date = self.string_utils.from_local_string_to_date(groups[0])
        return release_date

    def __assemble_column_name_list(self, relative_html_object):
        """Return the eight column names, splitting the combined first header."""
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        original_column_name_list = tr_tags[0].xpath('./td/text()')

        # the first header combines the symbol and name columns
        combined_column_name = original_column_name_list[0].strip()
        assert combined_column_name == u'有價證券代號及名稱', 'should be 有價證券代號及名稱 in unicode'
        # the joining character means 'and', so splitting on it yields the
        # two separate column names
        seperated_column_name_list = combined_column_name.split(u'及')
        assert len(seperated_column_name_list) == 2

        column_name_list = seperated_column_name_list + original_column_name_list[1:]
        assert len(column_name_list) == 8, 'invalid column_name_list size, should be 8'
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Build a row per ``tr`` after the header, dropping single-cell rows."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')[1:]

        row_list = []
        for tr_tag in tr_tags:
            row = self.__assemble_row(tr_tag)
            # single-cell (section title) rows come back as None; skip them
            if row:
                row_list.append(row)
        return row_list

    def __assemble_row(self, relative_html_object):
        """Convert the seven cells of a ``tr`` into an eight-entry row.

        Returns ``None`` when the ``tr`` has exactly one cell.
        """
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)

        # a row with a single cell carries no record
        if len(td_texts) == 1:
            return None

        # sanity check
        assert len(td_texts) == 7

        # the first cell holds symbol and name separated by whitespace
        combined_cell = td_texts[0].strip()
        seperated_cell_list = combined_cell.split()
        assert len(seperated_cell_list) == 2

        # convert to datetime.date type
        listing_date = self.string_utils.from_local_string_to_date(td_texts[2])

        row = seperated_cell_list + [td_texts[1]] + [listing_date] + td_texts[3:]
        return row
Exemple #18
0
 def __init__(self):
     """Set the XPath of the page body and create the helper objects."""
     self.base_xpath = '//html/body'
     self.string_utils = StringUtils()
     self.lxml_utils = LxmlUtils()
class BalanceSheetSummaryAssembler():
    """Assemble a BalanceSheetSummaryDao from a balance sheet summary page.

    The tables matched by ``base_xpath`` must number at least two: the first
    one is read for the 'quarterly' period and the second one for 'yearly'.
    """

    # period -> (index of the table to read among the base_xpath matches,
    #            short period code handed to BalanceSheetSummaryDao)
    _PERIOD_SETTINGS = {
        'quarterly': (0, 'Q'),
        'yearly': (1, 'Y'),
    }

    def __init__(self, period):
        """Create an assembler for ``period`` ('quarterly' or 'yearly')."""
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()
        self.period = period

    def assemble(self, param):
        """Build the DAO from ``param['content']`` and ``param['stock_symbol']``.

        ``param`` is first passed to the content screener, then the HTML is
        parsed and the table matching the configured period is converted into
        a column name list and a row list.

        Raises ValueError when the assembler was created with a period other
        than 'quarterly' or 'yearly'.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        # the period was already validated while selecting the table above
        _, short_period = self._PERIOD_SETTINGS[self.period]
        return BalanceSheetSummaryDao(column_name_list, row_list, stock_symbol, short_period)

    def __get_html_object(self, content):
        """Normalize ``content``, drop its ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the table element that belongs to the configured period.

        Raises ValueError for an unsupported period instead of silently
        returning None, which used to fail later with an unrelated
        AttributeError.
        """
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 1, 'invalid base_xpath (table_tags)'
        try:
            table_index, _ = self._PERIOD_SETTINGS[self.period]
        except KeyError:
            raise ValueError('unsupported period: %r' % (self.period,))
        return relative_html_object_list[table_index]

    def __assemble_column_name_list(self, relative_html_object):
        """Read the column names from the first row of the table.

        The first entry is kept as text (the account column); every other
        entry is converted to a statement date.
        """
        # traverse and sanity check
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'

        td_texts = tr_tags[0].xpath('./td/text()')

        # the first entry should be account, the rest should be stmt_date
        to_date = self.string_utils.from_local_string_to_date
        return [td_texts[0]] + [to_date(text) for text in td_texts[1:]]

    def __assemble_row_list(self, relative_html_object):
        """Assemble one row per ``tr`` after the column name row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 1, 'invalid tr_tags'

        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[1:]]

    def __assemble_row(self, relative_html_object):
        """Assemble a single row from a ``tr`` element.

        The first cell is kept as text (the account); every other cell is
        converted with ``normalize_number``.
        """
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)

        # the first entry should be account, the rest should be numbers
        to_number = self.string_utils.normalize_number
        return [td_texts[0]] + [to_number(number_string) for number_string in td_texts[1:]]
 def __init__(self):
     """Store the base XPath and create the ContentScreener and StringUtils helpers."""
     # NOTE(review): the enclosing class is not visible in this fragment;
     # presumably base_xpath locates the table(s) to be assembled - confirm.
     self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
     self.content_screener = ContentScreener()
     self.string_utils = StringUtils()
class StringUtilsTest(unittest.TestCase):
    def setUp(self):
        """Create the StringUtils instance exercised by the test methods."""
        self.string_utils = StringUtils()

    def tearDown(self):
        """Drop the reference to the StringUtils instance created in setUp."""
        self.string_utils = None

    def test_normalize_arabic_number(self):
        """normalize_number handles separators, signs, parentheses and decimals."""
        cases = [
            # thousands separators
            ('33,825,315', 33825315),
            ('0', 0),
            # leading minus sign
            ('-115,859,592', -115859592),
            # parentheses denote a negative number
            ('(27,540)', -27540),
            # decimals
            ('2.85', 2.85),
            ('170,270,395.00', 170270395),
            # parentheses with inner padding
            ('(  10,117,111)', -10117111),
        ]
        for number_string, expected in cases:
            actual = self.string_utils.normalize_number(number_string)
            self.assertEqual(actual, expected)

    def test_normalize_none_number(self):
        """normalize_number yields None for placeholders that carry no number."""
        placeholders = (u'-', u'', u'不適用', u'N/A')
        for placeholder in placeholders:
            actual = self.string_utils.normalize_number(placeholder)
            self.assertEqual(actual, None)

    def test_normalize_percentage(self):
        """normalize_number converts a percentage string to its ratio."""
        ratio = self.string_utils.normalize_number(u'20.92%')
        self.assertAlmostEqual(ratio, 0.2092)

    def test_from_local_string_to_date(self):
        """from_local_string_to_date parses each supported local date format."""
        cases = [
            # year / month / day written with Chinese unit characters
            (u'2013年12月31日', datetime.date(2013, 12, 31)),
            (u'2012年01月01日', datetime.date(2012, 1, 1)),
            # slash separated dates
            ('1962/02/09', datetime.date(1962, 2, 9)),
            (u'2015/08/13', datetime.date(2015, 8, 13)),
            # ROC-era year and month only: expected date is the month's last day
            (u'民國103年09月', datetime.date(2014, 9, 30)),
            # ROC-era year only: expected date is the year's last day
            (u'104', datetime.date(2015, 12, 31)),
            # ROC-era year and quarter: expected date is the quarter's last day
            (u'104.2Q', datetime.date(2015, 6, 30)),
        ]
        for local_string, expected in cases:
            actual = self.string_utils.from_local_string_to_date(local_string)
            self.assertEqual(actual, expected)

    def test_roc_era_from_local_string_to_date(self):
        """from_local_string_to_date parses full dates written with an ROC-era year."""
        cases = [
            (u'99年09月30日', datetime.date(2010, 9, 30)),
            (u'102/05/07', datetime.date(2013, 5, 7)),
        ]
        for local_string, expected in cases:
            actual = self.string_utils.from_local_string_to_date(local_string)
            self.assertEqual(actual, expected)

    def test_from_date_to_roc_era_string(self):
        """from_date_to_roc_era_string renders the year 2001 as ROC-era '90'."""
        new_year_2001 = datetime.date(2001, 1, 1)
        roc_era = self.string_utils.from_date_to_roc_era_string(new_year_2001)
        self.assertEqual(roc_era, '90')

    def test_from_date_to_2_digit_month_string(self):
        """from_date_to_2_digit_month_string zero-pads the month to two digits."""
        cases = [
            (datetime.date(2001, 1, 1), '01'),
            (datetime.date(2001, 10, 31), '10'),
        ]
        for date, expected in cases:
            actual = self.string_utils.from_date_to_2_digit_month_string(date)
            self.assertEqual(actual, expected)

    def test_from_date_to_2_digit_quarter_string(self):
        """The first and last day of each quarter map to its two-digit string."""
        cases = [
            # spring
            (datetime.date(2001, 1, 1), '01'),
            (datetime.date(2001, 3, 31), '01'),
            # summer
            (datetime.date(2001, 4, 1), '02'),
            (datetime.date(2001, 6, 30), '02'),
            # fall
            (datetime.date(2001, 7, 1), '03'),
            (datetime.date(2001, 9, 30), '03'),
            # winter
            (datetime.date(2001, 10, 1), '04'),
            (datetime.date(2001, 12, 31), '04'),
        ]
        for date, expected in cases:
            actual = self.string_utils.from_date_to_2_digit_quarter_string(date)
            self.assertEqual(actual, expected)

    def test_from_date_to_1_digit_quarter_string(self):
        """The first and last day of each quarter map to its one-digit string."""
        cases = [
            # spring
            (datetime.date(2001, 1, 1), '1'),
            (datetime.date(2001, 3, 31), '1'),
            # summer
            (datetime.date(2001, 4, 1), '2'),
            (datetime.date(2001, 6, 30), '2'),
            # fall
            (datetime.date(2001, 7, 1), '3'),
            (datetime.date(2001, 9, 30), '3'),
            # winter
            (datetime.date(2001, 10, 1), '4'),
            (datetime.date(2001, 12, 31), '4'),
        ]
        for date, expected in cases:
            actual = self.string_utils.from_date_to_1_digit_quarter_string(date)
            self.assertEqual(actual, expected)

    def test_match_account(self):
        """match() captures the account name that precedes a trailing colon."""
        # The backslash is escaped explicitly: the pattern text is unchanged,
        # but a bare '\s' in a non-raw literal is an invalid escape sequence
        # that newer Python versions warn about. A raw u-prefixed literal is
        # avoided because Python 3 does not accept the ur'' prefix.
        pattern = u'^([^\\s]*):$'
        actual = self.string_utils.match(pattern, u'營業活動之現金流量:')
        expected = [u'營業活動之現金流量']
        self.assertEqual(actual, expected)