class StockPriceAssembler():
    """Assemble a ``StockPriceDao`` from comma-separated stock price text.

    The first line of the content supplies the column names.  Every later
    line is one row: its first field is converted to a date and the remaining
    fields are converted to numbers.
    """

    def __init__(self):
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Build the DAO from ``param['content']`` and ``param['stock_symbol']``.

        The DAO is created with the period code ``'D'``.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        lines = content.splitlines()
        return StockPriceDao(
            self.__assemble_column_name_list(lines),
            self.__assemble_row_list(lines),
            stock_symbol,
            'D',
        )

    def __assemble_column_name_list(self, lines):
        """Return the comma-separated fields of the first line."""
        header = lines[0]
        return header.split(',')

    def __assemble_row_list(self, lines):
        """Return one assembled row for every line after the header."""
        rows = []
        for line in lines[1:]:
            rows.append(self.__assemble_row(line))
        return rows

    def __assemble_row(self, line):
        """Convert one line into ``[date, number, number, ...]``."""
        fields = line.split(',')
        utils = self.string_utils
        # The leading field is the statement date, the rest are numbers.
        row = [utils.from_local_string_to_date(fields[0])]
        row.extend(utils.normalize_number(text) for text in fields[1:])
        return row
class OperatingRevenueAssembler():
    """Assemble an ``OperatingRevenueDao`` from an operating revenue HTML page.

    The data table is located with ``base_xpath``.  Inside that table the row
    at index 5 is read as the column names and every row from index 6 onwards
    is read as a data row of exactly seven cells.
    """

    def __init__(self):
        self.base_xpath = (
            '//html/body/div[@id="SysJustIFRAMEDIV"]'
            '/table/tr/td/form/table/tr/td/table'
        )
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def assemble(self, param):
        """Screen ``param``, parse its HTML content and build the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``.  The DAO
        is created with the period code ``'M'``.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        self.content_screener.screen(param)
        table = self.__traverse_to_relative_html_object(
            self.__get_html_object(content))
        column_name_list = self.__assemble_column_name_list(table)
        row_list = self.__assemble_row_list(table)
        return OperatingRevenueDao(column_name_list, row_list, stock_symbol, 'M')

    def __get_html_object(self, content):
        """Normalize the content, drop ``<br>`` tags and parse it with lxml."""
        normalized = self.string_utils.normalize_string(content)
        return lxml.html.fromstring(normalized.replace(u'<br>', u''))

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath (table_tags)'
        return matches[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped cell texts of the row at index 5."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 5, 'invalid tr_tags'
        header_texts = tr_tags[5].xpath('./td/text()')
        assert len(header_texts) == 7, 'invalid td_texts size, should be 7'
        column_name_list = []
        for text in header_texts:
            column_name_list.append(text.strip())
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row for every ``<tr>`` after index 5."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 5, 'invalid tr_tags'
        row_list = []
        for tr_tag in tr_tags[6:]:
            row_list.append(self.__assemble_row(tr_tag))
        return row_list

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[date, number x 6]``."""
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        utils = self.string_utils
        # First cell is the statement date; the remaining six are numbers.
        row = [utils.from_local_string_to_date(td_texts[0])]
        row.extend(utils.normalize_number(text) for text in td_texts[1:])
        return row
class FinancialStatementAssembler():
    """Assemble column names and rows from a financial statement HTML page.

    ``base_xpath_param`` chooses which of the supported page layouts is used
    to locate the data table (``'basic'`` or ``'form'``).  ``column_name_pos``
    is the index of the ``<tr>`` holding the column names; every ``<tr>``
    after it is treated as a data row.
    """

    # Supported ``base_xpath_param`` values and the XPath each one selects.
    _BASE_XPATHS = {
        'basic': '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table',
        'form': '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/form/table/tr/td/table',
    }

    def __init__(self, base_xpath_param, column_name_pos=0):
        self.base_xpath = self.__init_base_xpath(base_xpath_param)
        self.column_name_pos = column_name_pos
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def __init_base_xpath(self, base_xpath_param):
        """Return the XPath registered for ``base_xpath_param``.

        Raises:
            ValueError: if ``base_xpath_param`` is not a supported layout.
                Previously an unknown value silently produced ``None``, which
                only failed later when the XPath was evaluated.
        """
        try:
            return self._BASE_XPATHS[base_xpath_param]
        except (KeyError, TypeError):
            raise ValueError(
                'unsupported base_xpath_param: %r' % (base_xpath_param,))

    def assemble(self, param):
        """Screen ``param``, parse its HTML content and return the parsed table.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``.

        Returns:
            dict with the keys ``'stock_symbol'``, ``'column_name_list'`` and
            ``'row_list'``.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(
            html_object)
        column_name_list = self.__assemble_column_name_list(
            relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return {
            'stock_symbol': stock_symbol,
            'column_name_list': column_name_list,
            'row_list': row_list,
        }

    def __get_html_object(self, content):
        """Normalize the content, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(
            relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return ``[account_header, date, date, ...]`` from the header row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > self.column_name_pos, 'invalid tr_tags'
        td_texts = tr_tags[self.column_name_pos].xpath('./td/text()')
        # The first entry is the account header and is kept verbatim.
        column_name_list = [td_texts[0]]
        # Every remaining entry is a statement date.
        for text in td_texts[1:]:
            stmt_date = self.string_utils.from_local_string_to_date(text)
            column_name_list.append(stmt_date)
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row for every ``<tr>`` after the header row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > self.column_name_pos + 1, 'invalid tr_tags'
        return [
            self.__assemble_row(tr_tag)
            for tr_tag in tr_tags[self.column_name_pos + 1:]
        ]

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[account, number, number, ...]``.

        A row without any cell text yields an empty list.
        """
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)
        if not td_texts:
            return []
        # The first entry is the account name.
        row = [td_texts[0].strip()]
        # Every remaining entry is a number.
        for number_string in td_texts[1:]:
            row.append(self.string_utils.normalize_number(number_string))
        return row
def __init__(self, base_xpath_param, column_name_pos=0):
    """Store the resolved XPath, the header row index and the helper objects.

    ``base_xpath_param`` is resolved to an XPath by ``__init_base_xpath``;
    ``column_name_pos`` is stored unchanged (default ``0``).
    """
    self.base_xpath = self.__init_base_xpath(base_xpath_param)
    self.column_name_pos = column_name_pos
    # Collaborators used while assembling.
    self.content_screener = ContentScreener()
    self.string_utils = StringUtils()
    self.lxml_utils = LxmlUtils()
class DividendPolicyAssembler():
    """Assemble a ``DividendPolicyDao`` from a dividend policy HTML page.

    The data table is located with ``base_xpath``.  Inside that table the row
    at index 2 supplies the seven column names and every row from index 3
    onwards is a data row of seven cells.
    """

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Screen ``param``, parse its HTML content and build the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``.  The DAO
        is created with the period code ``'Y'``.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(
            html_object)
        column_name_list = self.__assemble_column_name_list(
            relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol, 'Y')

    def __get_html_object(self, content):
        """Normalize the content, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(
            relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped cell texts of the row at index 2."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'
        # Rows 0 and 1 are skipped; row 2 carries the column names.
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row for every ``<tr>`` from index 3 onwards."""
        # Skipped rows: 0 (header), 1 (empty lines), 2 (column names).
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[date, number x 5, ratio]``.

        The last cell is a percentage and is scaled by ``0.01``.  When
        ``normalize_number`` returns ``None`` for that cell (as it does for
        placeholders such as ``u'-'``), ``None`` is stored instead of
        attempting ``None * 0.01``, which would raise ``TypeError``.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        utils = self.string_utils
        # First cell is the statement date.
        row = [utils.from_local_string_to_date(td_texts[0])]
        # Middle cells are plain numbers.
        row.extend(utils.normalize_number(text) for text in td_texts[1:-1])
        # Last cell is a number expressed in percent.
        percentage = utils.normalize_number(td_texts[-1])
        row.append(None if percentage is None else percentage * 0.01)
        return row
def setUp(self):
    """Create the ``StringUtils`` instance and store it on ``self.string_utils``."""
    self.string_utils = StringUtils()
class DividendPolicyAssembler():
    """Assemble a ``DividendPolicyDao`` from a dividend policy HTML page.

    The table found through ``base_xpath`` is expected to hold the seven
    column names in its row at index 2 and seven-cell data rows from index 3
    onwards.
    """

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Screen ``param``, parse its HTML content and build the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``.  The DAO
        is created with the period code ``'Y'``.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol, 'Y')

    def __get_html_object(self, content):
        """Normalize the content, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped cell texts of the row at index 2."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'
        # Rows 0 and 1 are skipped; row 2 carries the column names.
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row for every ``<tr>`` from index 3 onwards."""
        # Skipped rows: 0 (header), 1 (empty lines), 2 (column names).
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[date, number x 5, ratio]``.

        The final cell is a percentage and is multiplied by ``0.01``.  If
        ``normalize_number`` yields ``None`` for it (for example for the
        placeholder ``u'-'``), ``None`` is kept; multiplying ``None`` would
        raise ``TypeError`` and abort the whole page.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        # First cell: statement date.
        row = [self.string_utils.from_local_string_to_date(td_texts[0])]
        # Middle cells: plain numbers.
        for number_string in td_texts[1:-1]:
            row.append(self.string_utils.normalize_number(number_string))
        # Last cell: number expressed in percent.
        percentage = self.string_utils.normalize_number(td_texts[-1])
        if percentage is not None:
            percentage = percentage * 0.01
        row.append(percentage)
        return row
def __init__(self):
    """Set the table XPath and create the helper objects used for assembling."""
    self.base_xpath = (
        '//html/body/div[@id="SysJustIFRAMEDIV"]'
        '/table/tr/td/form/table/tr/td/table'
    )
    # Collaborators used while assembling.
    self.content_screener = ContentScreener()
    self.string_utils = StringUtils()
    self.lxml_utils = LxmlUtils()
class StringUtilsTest(unittest.TestCase):
    """Unit tests for the number, date and regex helpers of ``StringUtils``."""

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def _assert_converts(self, convert, cases):
        """Assert ``convert(source) == expected`` for each pair in ``cases``."""
        for source, expected in cases:
            actual = convert(source)
            self.assertEqual(
                actual, expected,
                '%r -> %r, expected %r' % (source, actual, expected))

    def test_normalize_arabic_number(self):
        self._assert_converts(self.string_utils.normalize_number, [
            ('33,825,315', 33825315),
            ('0', 0),
            ('-115,859,592', -115859592),
            ('(27,540)', -27540),
            ('2.85', 2.85),
            ('170,270,395.00', 170270395),
            ('( 10,117,111)', -10117111),
        ])

    def test_normalize_none_number(self):
        # Placeholders that carry no numeric value normalize to None.
        self._assert_converts(self.string_utils.normalize_number, [
            (u'-', None),
            (u'', None),
            (u'不適用', None),
            (u'N/A', None),
        ])

    def test_normalize_percentage(self):
        actual = self.string_utils.normalize_number(u'20.92%')
        self.assertAlmostEqual(actual, 0.2092)

    def test_from_local_string_to_date(self):
        self._assert_converts(self.string_utils.from_local_string_to_date, [
            (u'2013年12月31日', datetime.date(2013, 12, 31)),
            (u'2012年01月01日', datetime.date(2012, 1, 1)),
            ('1962/02/09', datetime.date(1962, 2, 9)),
            (u'2015/08/13', datetime.date(2015, 8, 13)),
            (u'民國103年09月', datetime.date(2014, 9, 30)),
            (u'104', datetime.date(2015, 12, 31)),
            (u'104.2Q', datetime.date(2015, 6, 30)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        self._assert_converts(self.string_utils.from_local_string_to_date, [
            (u'99年09月30日', datetime.date(2010, 9, 30)),
            (u'102/05/07', datetime.date(2013, 5, 7)),
        ])

    def test_from_date_to_roc_era_string(self):
        self._assert_converts(self.string_utils.from_date_to_roc_era_string, [
            (datetime.date(2001, 1, 1), '90'),
        ])

    def test_from_date_to_2_digit_month_string(self):
        self._assert_converts(
            self.string_utils.from_date_to_2_digit_month_string, [
                (datetime.date(2001, 1, 1), '01'),
                (datetime.date(2001, 10, 31), '10'),
            ])

    def test_from_date_to_2_digit_quarter_string(self):
        # First and last day of each quarter.
        self._assert_converts(
            self.string_utils.from_date_to_2_digit_quarter_string, [
                (datetime.date(2001, 1, 1), '01'),
                (datetime.date(2001, 3, 31), '01'),
                (datetime.date(2001, 4, 1), '02'),
                (datetime.date(2001, 6, 30), '02'),
                (datetime.date(2001, 7, 1), '03'),
                (datetime.date(2001, 9, 30), '03'),
                (datetime.date(2001, 10, 1), '04'),
                (datetime.date(2001, 12, 31), '04'),
            ])

    def test_from_date_to_1_digit_quarter_string(self):
        # First and last day of each quarter.
        self._assert_converts(
            self.string_utils.from_date_to_1_digit_quarter_string, [
                (datetime.date(2001, 1, 1), '1'),
                (datetime.date(2001, 3, 31), '1'),
                (datetime.date(2001, 4, 1), '2'),
                (datetime.date(2001, 6, 30), '2'),
                (datetime.date(2001, 7, 1), '3'),
                (datetime.date(2001, 9, 30), '3'),
                (datetime.date(2001, 10, 1), '4'),
                (datetime.date(2001, 12, 31), '4'),
            ])

    def test_match_account(self):
        # The backslash is doubled so the literal contains no invalid escape
        # sequence; the resulting pattern text is unchanged.
        pattern = u'^([^\\s]*):$'
        actual = self.string_utils.match(pattern, u'營業活動之現金流量:')
        expected = [u'營業活動之現金流量']
        self.assertEqual(actual, expected)
class BalanceSheetSummaryAssembler():
    """Assemble a ``BalanceSheetSummaryDao`` from a balance sheet summary page.

    ``base_xpath`` must match at least two tables.  ``period`` selects which
    one is parsed: ``'quarterly'`` uses the first match and ``'yearly'`` the
    second.
    """

    # period -> (period code passed to the DAO, index of the matched table)
    _PERIOD_SETTINGS = {
        'quarterly': ('Q', 0),
        'yearly': ('Y', 1),
    }

    def __init__(self, period):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()
        self.period = period

    def __period_setting(self):
        """Return ``(short_period, table_index)`` for ``self.period``.

        Raises:
            ValueError: if ``self.period`` is neither ``'quarterly'`` nor
                ``'yearly'``.
        """
        try:
            return self._PERIOD_SETTINGS[self.period]
        except (KeyError, TypeError):
            raise ValueError(
                "unsupported period: %r (expected 'quarterly' or 'yearly')"
                % (self.period,))

    def assemble(self, param):
        """Screen ``param``, parse its HTML content and build the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``.

        Raises:
            ValueError: if the assembler was created with an unsupported
                period.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(
            html_object)
        column_name_list = self.__assemble_column_name_list(
            relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        short_period = self.__period_setting()[0]
        return BalanceSheetSummaryDao(column_name_list, row_list,
                                      stock_symbol, short_period)

    def __get_html_object(self, content):
        """Normalize the content, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the table matching ``self.period``.

        An unsupported period used to fall through and return ``None``, which
        surfaced later as an unrelated ``AttributeError``; it now raises
        ``ValueError`` here.
        """
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(
            relative_html_object_list) > 1, 'invalid base_xpath (table_tags)'
        table_index = self.__period_setting()[1]
        return relative_html_object_list[table_index]

    def __assemble_column_name_list(self, relative_html_object):
        """Return ``[account_header, date, date, ...]`` from the first row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'
        td_texts = tr_tags[0].xpath('./td/text()')
        # The first entry is the account header and is kept verbatim.
        column_name_list = [td_texts[0]]
        # Every remaining entry is a statement date.
        for text in td_texts[1:]:
            stmt_date = self.string_utils.from_local_string_to_date(text)
            column_name_list.append(stmt_date)
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row for every ``<tr>`` after the first."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 1, 'invalid tr_tags'
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[1:]]

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[account, number, number, ...]``."""
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)
        # The first entry is the account name.
        row = [td_texts[0]]
        # Every remaining entry is a number.
        for number_string in td_texts[1:]:
            row.append(self.string_utils.normalize_number(number_string))
        return row
def __init__(self):
    """Create the storage and string helper objects kept on the instance."""
    self.storage = SpiderStorage()
    self.string_utils = StringUtils()
def __init__(self):
    """Create the ``StringUtils`` helper and store it on ``self.string_utils``."""
    self.string_utils = StringUtils()
class OperatingRevenueAssembler():
    """Build an ``OperatingRevenueDao`` out of an operating revenue HTML page.

    ``base_xpath`` points at the data table.  Its row at index 5 holds the
    seven column names; rows from index 6 onwards are seven-cell data rows.
    """

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/form/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def assemble(self, param):
        """Screen ``param``, parse ``param['content']`` and return the DAO.

        The DAO receives ``param['stock_symbol']`` and the period code ``'M'``.
        """
        content = param['content']
        stock_symbol = param['stock_symbol']
        self.content_screener.screen(param)
        document = self.__get_html_object(content)
        table = self.__traverse_to_relative_html_object(document)
        names = self.__assemble_column_name_list(table)
        rows = self.__assemble_row_list(table)
        return OperatingRevenueDao(names, rows, stock_symbol, 'M')

    def __get_html_object(self, content):
        """Parse the normalized content after removing ``<br>`` tags."""
        cleaned = self.string_utils.normalize_string(content).replace(u'<br>', u'')
        return lxml.html.fromstring(cleaned)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node that ``base_xpath`` selects."""
        candidates = html_object.xpath(self.base_xpath)
        assert len(candidates) > 0, 'invalid base_xpath (table_tags)'
        first = candidates[0]
        return first

    def __assemble_column_name_list(self, relative_html_object):
        """Return the stripped texts of the seven cells in the row at index 5."""
        table_rows = relative_html_object.xpath('./tr')
        assert len(table_rows) > 5, 'invalid tr_tags'
        name_cells = table_rows[5].xpath('./td/text()')
        assert len(name_cells) == 7, 'invalid td_texts size, should be 7'
        names = []
        for cell in name_cells:
            names.append(cell.strip())
        return names

    def __assemble_row_list(self, relative_html_object):
        """Return the assembled data rows (every ``<tr>`` from index 6)."""
        table_rows = relative_html_object.xpath('./tr')
        assert len(table_rows) > 5, 'invalid tr_tags'
        data_rows = table_rows[6:]
        return [self.__assemble_row(data_row) for data_row in data_rows]

    def __assemble_row(self, relative_html_object):
        """Turn one ``<tr>`` into a date followed by six numbers."""
        cells = relative_html_object.xpath('./td')
        texts = self.lxml_utils.get_text_list(cells)
        assert len(texts) == 7, 'invalid td_texts size, should be 7'
        # Leading cell: statement date.
        assembled = [self.string_utils.from_local_string_to_date(texts[0])]
        # Remaining cells: numbers.
        for text in texts[1:]:
            assembled.append(self.string_utils.normalize_number(text))
        return assembled
class StockSymbolAssembler():
    """Assemble a ``StockSymbolDao`` from a stock symbol listing HTML page.

    The page body provides a release date (read from the second ``<h2>`` of
    the first table) and a ``table[@class="h4"]`` whose first row holds the
    column names and whose remaining rows hold the listed securities.
    """

    def __init__(self):
        self.base_xpath = '//html/body'
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()

    def assemble(self, param):
        """Parse ``param['content']`` and return the ``StockSymbolDao``."""
        content = self.string_utils.normalize_string(param['content'])
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        release_date = self.__assemble_release_date(relative_html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return StockSymbolDao(column_name_list, row_list, release_date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the single node matched by ``base_xpath``."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_release_date(self, relative_html_object):
        """Return the release date parsed from the page headline.

        Raises:
            AssertionError: if the expected tables, headlines or headline
                text are missing, or the headline does not match the pattern.
        """
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 0, 'invalid table_tags'
        headline_tags = table_tags[0].xpath('./h2')
        # The second <h2> is read below, so at least two must be present
        # (the previous ``> 0`` check let a single headline through and
        # failed with IndexError instead).
        assert len(headline_tags) > 1, 'invalid headline_tags'
        headline_texts = headline_tags[1].xpath('./strong/center')
        assert len(headline_texts) > 0, 'invalid headline_texts'
        groups = self.string_utils.match(u'^最近更新日期:(.*)$', headline_texts[0].text.strip())
        assert len(groups) > 0, 'could not match ^最近更新日期:(.*)$'
        return self.string_utils.from_local_string_to_date(groups[0])

    def __assemble_column_name_list(self, relative_html_object):
        """Return the eight column names.

        The first header cell combines two names joined by the character
        meaning "and"; it is split into two separate column names.
        """
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'
        original_column_name_list = tr_tags[0].xpath('./td/text()')
        # The first header cell must be the combined "symbol and name" title.
        combined_column_name = original_column_name_list[0].strip()
        assert combined_column_name == u'有價證券代號及名稱', 'should be 有價證券代號及名稱 in unicode'
        # Split the combined title on the "and" character.
        seperated_column_name_list = combined_column_name.split(u'及')
        assert len(seperated_column_name_list) == 2
        column_name_list = seperated_column_name_list + original_column_name_list[1:]
        assert len(column_name_list) == 8, 'invalid column_name_list size, should be 8'
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return the assembled rows, leaving out single-cell rows."""
        # The first <tr> is the column name row and is skipped.
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')[1:]
        row_list = []
        for tr_tag in tr_tags:
            row = self.__assemble_row(tr_tag)
            # Single-cell rows come back as None and are dropped.
            if row:
                row_list.append(row)
        return row_list

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into an eight-entry row.

        Returns ``None`` for a row that has only one cell.  Otherwise the
        first cell is split on whitespace into symbol and name, and the
        third cell is converted to a date.
        """
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)
        # A single-cell row carries no security data.
        if len(td_texts) == 1:
            return None
        assert len(td_texts) == 7
        # The first cell holds the stock symbol and the stock name.
        combined_cell = td_texts[0].strip()
        seperated_cell_list = combined_cell.split()
        assert len(seperated_cell_list) == 2
        # The third cell is the listing date.
        listing_date = self.string_utils.from_local_string_to_date(td_texts[2])
        row = seperated_cell_list + [td_texts[1]] + [listing_date] + td_texts[3:]
        return row
def __init__(self):
    """Set the body XPath and create the string and lxml helper objects."""
    self.base_xpath = '//html/body'
    # Collaborators used while assembling.
    self.string_utils = StringUtils()
    self.lxml_utils = LxmlUtils()
class BalanceSheetSummaryAssembler():
    """Assemble a ``BalanceSheetSummaryDao`` from a balance sheet summary page.

    ``base_xpath`` must match at least two tables; ``period`` decides which
    one is parsed (``'quarterly'``: first match, ``'yearly'``: second match).
    """

    # period -> (period code passed to the DAO, index of the matched table)
    _PERIOD_SETTINGS = {
        'quarterly': ('Q', 0),
        'yearly': ('Y', 1),
    }

    def __init__(self, period):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()
        self.lxml_utils = LxmlUtils()
        self.period = period

    def __period_setting(self):
        """Return ``(short_period, table_index)`` for ``self.period``.

        Raises:
            ValueError: if ``self.period`` is not ``'quarterly'`` or
                ``'yearly'``.
        """
        try:
            return self._PERIOD_SETTINGS[self.period]
        except (KeyError, TypeError):
            raise ValueError(
                "unsupported period: %r (expected 'quarterly' or 'yearly')"
                % (self.period,))

    def assemble(self, param):
        """Screen ``param``, parse its HTML content and build the DAO.

        ``param`` must provide ``'content'`` and ``'stock_symbol'``.

        Raises:
            ValueError: if the assembler was created with an unsupported
                period.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        short_period, _ = self.__period_setting()
        return BalanceSheetSummaryDao(column_name_list, row_list, stock_symbol, short_period)

    def __get_html_object(self, content):
        """Normalize the content, drop ``<br>`` tags and parse it with lxml."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the matched table that belongs to ``self.period``.

        Raises ``ValueError`` for an unsupported period instead of silently
        returning ``None`` as before.
        """
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 1, 'invalid base_xpath (table_tags)'
        _, table_index = self.__period_setting()
        return relative_html_object_list[table_index]

    def __assemble_column_name_list(self, relative_html_object):
        """Return ``[account_header, date, date, ...]`` from the first row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'
        td_texts = tr_tags[0].xpath('./td/text()')
        # The first entry is the account header and is kept verbatim.
        column_name_list = [td_texts[0]]
        # Every remaining entry is a statement date.
        for text in td_texts[1:]:
            column_name_list.append(
                self.string_utils.from_local_string_to_date(text))
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return one assembled row for every ``<tr>`` after the first."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 1, 'invalid tr_tags'
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[1:]]

    def __assemble_row(self, relative_html_object):
        """Convert one ``<tr>`` into ``[account, number, number, ...]``."""
        td_tags = relative_html_object.xpath('./td')
        td_texts = self.lxml_utils.get_text_list(td_tags)
        # The first entry is the account name.
        row = [td_texts[0]]
        # Every remaining entry is a number.
        for number_string in td_texts[1:]:
            row.append(self.string_utils.normalize_number(number_string))
        return row
def __init__(self):
    """Set the table XPath and create the screener and string helper objects."""
    self.base_xpath = (
        '//html/body/div[@id="SysJustIFRAMEDIV"]'
        '/table/tr/td/table/tr/td/table'
    )
    # Collaborators used while assembling.
    self.content_screener = ContentScreener()
    self.string_utils = StringUtils()
class StringUtilsTest(unittest.TestCase):
    """Unit tests for the number, date and regex helpers of ``StringUtils``."""

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def _assert_converts(self, convert, cases):
        """Assert ``convert(source) == expected`` for each pair in ``cases``."""
        for source, expected in cases:
            actual = convert(source)
            self.assertEqual(
                actual, expected,
                '%r -> %r, expected %r' % (source, actual, expected))

    def test_normalize_arabic_number(self):
        self._assert_converts(self.string_utils.normalize_number, [
            ('33,825,315', 33825315),
            ('0', 0),
            ('-115,859,592', -115859592),
            ('(27,540)', -27540),
            ('2.85', 2.85),
            ('170,270,395.00', 170270395),
            ('( 10,117,111)', -10117111),
        ])

    def test_normalize_none_number(self):
        # Placeholders that carry no numeric value normalize to None.
        self._assert_converts(self.string_utils.normalize_number, [
            (u'-', None),
            (u'', None),
            (u'不適用', None),
            (u'N/A', None),
        ])

    def test_normalize_percentage(self):
        actual = self.string_utils.normalize_number(u'20.92%')
        self.assertAlmostEqual(actual, 0.2092)

    def test_from_local_string_to_date(self):
        self._assert_converts(self.string_utils.from_local_string_to_date, [
            (u'2013年12月31日', datetime.date(2013, 12, 31)),
            (u'2012年01月01日', datetime.date(2012, 1, 1)),
            ('1962/02/09', datetime.date(1962, 2, 9)),
            (u'2015/08/13', datetime.date(2015, 8, 13)),
            (u'民國103年09月', datetime.date(2014, 9, 30)),
            (u'104', datetime.date(2015, 12, 31)),
            (u'104.2Q', datetime.date(2015, 6, 30)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        self._assert_converts(self.string_utils.from_local_string_to_date, [
            (u'99年09月30日', datetime.date(2010, 9, 30)),
            (u'102/05/07', datetime.date(2013, 5, 7)),
        ])

    def test_from_date_to_roc_era_string(self):
        self._assert_converts(self.string_utils.from_date_to_roc_era_string, [
            (datetime.date(2001, 1, 1), '90'),
        ])

    def test_from_date_to_2_digit_month_string(self):
        self._assert_converts(
            self.string_utils.from_date_to_2_digit_month_string, [
                (datetime.date(2001, 1, 1), '01'),
                (datetime.date(2001, 10, 31), '10'),
            ])

    def test_from_date_to_2_digit_quarter_string(self):
        # First and last day of each quarter.
        self._assert_converts(
            self.string_utils.from_date_to_2_digit_quarter_string, [
                (datetime.date(2001, 1, 1), '01'),
                (datetime.date(2001, 3, 31), '01'),
                (datetime.date(2001, 4, 1), '02'),
                (datetime.date(2001, 6, 30), '02'),
                (datetime.date(2001, 7, 1), '03'),
                (datetime.date(2001, 9, 30), '03'),
                (datetime.date(2001, 10, 1), '04'),
                (datetime.date(2001, 12, 31), '04'),
            ])

    def test_from_date_to_1_digit_quarter_string(self):
        # First and last day of each quarter.
        self._assert_converts(
            self.string_utils.from_date_to_1_digit_quarter_string, [
                (datetime.date(2001, 1, 1), '1'),
                (datetime.date(2001, 3, 31), '1'),
                (datetime.date(2001, 4, 1), '2'),
                (datetime.date(2001, 6, 30), '2'),
                (datetime.date(2001, 7, 1), '3'),
                (datetime.date(2001, 9, 30), '3'),
                (datetime.date(2001, 10, 1), '4'),
                (datetime.date(2001, 12, 31), '4'),
            ])

    def test_match_account(self):
        # The backslash is doubled so the literal contains no invalid escape
        # sequence; the resulting pattern text is unchanged.
        pattern = u'^([^\\s]*):$'
        actual = self.string_utils.match(pattern, u'營業活動之現金流量:')
        expected = [u'營業活動之現金流量']
        self.assertEqual(actual, expected)