class IfrsOperatingRevenueAssembler():
    """Build an OperatingRevenueDao from an IFRS operating revenue HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse one page and return its OperatingRevenueDao.

        Args:
            param: dict providing the keys 'content' (the HTML text),
                'stock_symbol' and 'date'.

        Returns:
            OperatingRevenueDao built from the parsed column names, the
            parsed rows, the stock symbol and the date.

        Raises:
            AssertionError: the page does not have the expected layout.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        # NOTE(review): screen() is defined elsewhere; presumably it rejects
        # pages that carry no usable data -- confirm against ContentScreener.
        self.content_screener.screen(param)
        content = self.string_utils.normalize_string(content)
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return OperatingRevenueDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matching base_xpath."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return [account column name, snapshot date]."""
        # The account name is the first of the two header cells of the
        # bordered table.
        tr_tags = relative_html_object.xpath('./table[@class="hasBorder"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'
        th_texts = tr_tags[0].xpath('./th/text()')
        assert len(th_texts) == 2, 'invalid th_texts size, should be 2'
        account = th_texts[0]

        # The snapshot date is the second cell of the third border-less
        # table, so the sanity checks must cover the indexes actually read.
        table_tags = relative_html_object.xpath('./table[@class="noBorder"]')
        assert len(table_tags) > 2, 'invalid table_tags'
        td_tags = table_tags[2].xpath('./td')
        assert len(td_tags) > 1, 'invalid td_tags'
        snapdate = self.string_utils.from_local_string_to_date(td_tags[1].text)
        return [account, snapdate]

    def __assemble_row_list(self, relative_html_object):
        """Return one [item, number] row per table row after the header."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="hasBorder"]/tr')[1:]
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags]

    def __assemble_row(self, relative_html_object):
        """Return [item, number] for a single table row."""
        # should be item
        th_texts = relative_html_object.xpath('./th/text()')
        assert len(th_texts) == 1, 'invalid th_texts size, should be 1'
        item = th_texts[0]
        # should be number (operating revenue); check td_texts itself so a
        # row with missing or extra value cells is rejected explicitly
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 1, 'invalid td_texts size, should be 1'
        number = self.string_utils.normalize_number(td_texts[0])
        return [item, number]
class Spider():
    """Base spider that derives request parameters and records built URLs.

    Subclasses supply build_url and build_key; both receive the extended
    parameter dict produced from the caller's param.
    """

    # market_type -> (market_code, market_mode)
    __MARKETS = {
        'stock_exchange_market': ('sii', '2'),
        'otc_market': ('otc', '4'),
    }

    def __init__(self):
        self.storage = SpiderStorage()
        self.string_utils = StringUtils()

    def crawl(self, param):
        """Store the URL built for param under the key built for param."""
        extended = self.__extend_param(param)
        url = self.build_url(extended)
        key = self.build_key(extended)
        self.storage.set(key, url)

    def is_crawled(self, param):
        """Return whether storage contains the key built for param."""
        return self.storage.contains(self.build_key(self.__extend_param(param)))

    def get_crawled(self, param):
        """Return what storage holds under the key built for param."""
        return self.storage.get(self.build_key(self.__extend_param(param)))

    def __extend_param(self, param):
        """Return a new dict with the fields derived from param.

        Only the keys 'stock_symbol', 'date' and 'market_type' are read;
        each one is optional and contributes its own derived entries.
        """
        extended = {}
        if 'stock_symbol' in param:
            extended['stock_symbol'] = param['stock_symbol']
        if 'date' in param:
            date = param['date']
            utils = self.string_utils
            extended.update({
                'roc_era': utils.from_date_to_roc_era_string(date),
                'year': str(date.year),
                'quarter': utils.from_date_to_2_digit_quarter_string(date),
                'quarter_xbrl': utils.from_date_to_1_digit_quarter_string(date),
                'month': utils.from_date_to_2_digit_month_string(date),
            })
        if 'market_type' in param:
            market_type = param['market_type']
            extended['market_type'] = market_type
            extended['market_code'] = self.__extend_market_code(market_type)
            extended['market_mode'] = self.__extend_market_mode(market_type)
        return extended

    def __extend_market_code(self, market_type):
        """Return the market code; unknown market types raise KeyError."""
        code, _ = self.__MARKETS[market_type]
        return code

    def __extend_market_mode(self, market_type):
        """Return the market mode; unknown market types raise KeyError."""
        _, mode = self.__MARKETS[market_type]
        return mode

    def build_url(self, param):
        """Build the URL for the extended param (subclass hook)."""
        raise NotImplementedError

    def build_key(self, param):
        """Build the storage key for the extended param (subclass hook)."""
        raise NotImplementedError
class LegacyCashFlowAssembler():
    """Build a CashFlowDao from a legacy page whose statement sits in a <pre> tag."""

    def __init__(self):
        self.base_xpath = '//html/body/table[@class="hasBorder"]/tr/td/pre'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse param['content'] and return the resulting CashFlowDao.

        param must provide the keys 'content', 'stock_symbol' and 'date'.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        normalized = self.string_utils.normalize_string(content)
        pre_tag = self.__traverse_to_relative_html_object(lxml.html.fromstring(normalized))
        # The statement is plain text; AriesParser turns it into columns and rows.
        column_name_list, row_list = self.__assemble_summary(pre_tag.text)
        return CashFlowDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the single node matching base_xpath."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) == 1, 'invalid base_xpath'
        return matches[0]

    def __assemble_summary(self, text):
        """Return the (column_name_list, row_list) parsed from text."""
        parser = AriesParser(text)
        return parser.parse()
def __init__(self):
    """Set the XPath root and create the screener and string helpers."""
    # XPath of the node that all later relative queries start from.
    self.base_xpath = '//html/body'
    self.content_screener = ContentScreener()
    self.string_utils = StringUtils()
def __init__(self):
    """Set the XPath root and create the string helper."""
    # XPath of the table that all later relative queries start from.
    self.base_xpath = '//html/body/center/table'
    self.string_utils = StringUtils()
class LegacyIncomeStatementAssembler():
    """Build an IncomeStatementDao from a legacy income statement HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body/center/table'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse one page and return its IncomeStatementDao.

        Args:
            param: dict providing the keys 'content' (the HTML text),
                'stock_symbol' and 'date'.

        Raises:
            AssertionError: the page does not have the expected layout.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        content = self.string_utils.normalize_string(content)
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return IncomeStatementDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the single node matching base_xpath."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return [account type, snapshot date, snapshot date, ...]."""
        tr_tags = relative_html_object.xpath('./tr')
        # The column names are read from the fourth row, so the sanity check
        # has to cover that index rather than just "not empty".
        assert len(tr_tags) > 3, 'invalid tr_tags'
        column_th_texts = tr_tags[3].xpath('./th/b/text()')
        assert len(column_th_texts) > 0, 'invalid column_th_texts'
        # First header is the account type; the rest are converted by
        # from_local_string_to_date.
        to_date = self.string_utils.from_local_string_to_date
        return [column_th_texts[0]] + [to_date(text) for text in column_th_texts[1:]]

    def __assemble_row_list(self, relative_html_object):
        """Return one row per table row after the column part."""
        # skip column part (5 rows)
        tr_tags = relative_html_object.xpath('./tr')[5:]
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags]

    def __assemble_row(self, relative_html_object):
        """Return [account type, number, number, ...] for one table row."""
        td_texts = relative_html_object.xpath('./td/text()')
        td_texts = self.__remove_empty_string_from_string_list(td_texts)
        # NOTE(review): a row whose cells are all blank leaves td_texts empty
        # and raises IndexError here -- confirm whether such rows can occur.
        row = [td_texts[0].strip()]
        for number_string in td_texts[1:]:
            row.append(self.string_utils.normalize_number(number_string))
        return row

    def __remove_empty_string_from_string_list(self, string_list):
        """Drop the entries that are empty or whitespace only."""
        return [string for string in string_list if string.strip()]
def __init__(self):
    """Set the XPath root and create the screener and string helpers."""
    # XPath of the table that all later relative queries start from.
    self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="result_table hasBorder"]'
    self.content_screener = ContentScreener()
    self.string_utils = StringUtils()
def __init__(self):
    """Set the XPath root and create the screener and string helpers."""
    # XPath of the nested table that all later relative queries start from.
    self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
    self.content_screener = ContentScreener()
    self.string_utils = StringUtils()
class DividendPolicyAssembler():
    """Build a DividendPolicyDao from a dividend policy HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body/div[@id="SysJustIFRAMEDIV"]/table/tr/td/table/tr/td/table'
        self.content_screener = ContentScreener()
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse one page and return its DividendPolicyDao.

        Args:
            param: dict providing the keys 'content' (the HTML text) and
                'stock_symbol'.

        Raises:
            AssertionError: the page does not have the expected layout.
        """
        content, stock_symbol = param['content'], param['stock_symbol']
        # NOTE(review): screen() is defined elsewhere; presumably it rejects
        # pages that carry no usable data -- confirm against ContentScreener.
        self.content_screener.screen(param)
        html_object = self.__get_html_object(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return DividendPolicyDao(column_name_list, row_list, stock_symbol)

    def __get_html_object(self, content):
        """Normalize content, drop every '<br>' and parse it as HTML."""
        content = self.string_utils.normalize_string(content)
        content = content.replace(u'<br>', u'')
        return lxml.html.fromstring(content)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matching base_xpath."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) > 0, 'invalid base_xpath (table_tags)'
        return relative_html_object_list[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return the seven stripped column names found in the third row."""
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'
        # the first two rows are skipped; the third holds the column names
        td_texts = tr_tags[2].xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        return [text.strip() for text in td_texts]

    def __assemble_row_list(self, relative_html_object):
        """Return one parsed row per table row after the first three."""
        # skip the first row of header
        # skip the second row of empty lines
        # skip the third row of column name list
        tr_tags = relative_html_object.xpath('./tr')
        assert len(tr_tags) > 2, 'invalid tr_tags'
        return [self.__assemble_row(tr_tag) for tr_tag in tr_tags[3:]]

    def __assemble_row(self, relative_html_object):
        """Return [stmt_date, five numbers, ratio] for one table row.

        The last cell is a percentage and is scaled by 0.01.  When
        normalize_number yields None for that cell the ratio stays None
        instead of failing on ``None * 0.01``.
        """
        td_texts = relative_html_object.xpath('./td/text()')
        assert len(td_texts) == 7, 'invalid td_texts size, should be 7'
        normalize = self.string_utils.normalize_number
        # should be stmt_date
        row = [self.string_utils.from_local_string_to_date(td_texts[0])]
        # should be number
        for number_string in td_texts[1:-1]:
            row.append(normalize(number_string))
        # should be number in percentage
        percentage = normalize(td_texts[-1])
        row.append(None if percentage is None else percentage * 0.01)
        return row
class StringUtilsTest(unittest.TestCase):
    """Unit tests for the number, date and pattern helpers of StringUtils."""

    def setUp(self):
        self.string_utils = StringUtils()

    def tearDown(self):
        self.string_utils = None

    def __check_each(self, convert, cases):
        """Assert convert(raw) == expected for every (raw, expected) pair."""
        for raw, expected in cases:
            self.assertEqual(convert(raw), expected)

    def __quarter_cases(self, template):
        """Return (date, formatted quarter) pairs for both ends of each quarter."""
        boundaries = [
            ((1, 1), 1), ((3, 31), 1),      # spring
            ((4, 1), 2), ((6, 30), 2),      # summer
            ((7, 1), 3), ((9, 30), 3),      # fall
            ((10, 1), 4), ((12, 31), 4),    # winter
        ]
        return [
            (datetime.date(2001, month, day), template % quarter)
            for (month, day), quarter in boundaries
        ]

    def test_normalize_arabic_number(self):
        self.__check_each(self.string_utils.normalize_number, [
            ("33,825,315", 33825315),
            ("0", 0),
            ("-115,859,592", -115859592),
            ("(27,540)", -27540),
            ("2.85", 2.85),
            ("170,270,395.00", 170270395),
            ("( 10,117,111)", -10117111),
        ])

    def test_normalize_none_number(self):
        self.__check_each(self.string_utils.normalize_number, [
            (u"-", None),
            (u"", None),
            (u"不適用", None),
        ])

    def test_normalize_chinese_number(self):
        self.__check_each(self.string_utils.normalize_number, [
            (u"九十九", 99),
            (u"九十", 90),
            (u"三", 3),
        ])

    def test_normalize_percentage(self):
        self.assertAlmostEqual(self.string_utils.normalize_number(u"20.92%"), 0.2092)

    def test_from_local_string_to_date(self):
        self.__check_each(self.string_utils.from_local_string_to_date, [
            (u"2013年12月31日", datetime.date(2013, 12, 31)),
            (u"2012年01月01日", datetime.date(2012, 1, 1)),
            ("1962/02/09", datetime.date(1962, 2, 9)),
            (u"2015/08/13", datetime.date(2015, 8, 13)),
            (u"民國103年09月", datetime.date(2014, 9, 30)),
            (u"104", datetime.date(2015, 12, 31)),
        ])

    def test_roc_era_from_local_string_to_date(self):
        self.__check_each(self.string_utils.from_local_string_to_date, [
            (u"99年09月30日", datetime.date(2010, 9, 30)),
            (u"102/05/07", datetime.date(2013, 5, 7)),
        ])

    def test_from_local_string_to_date_interval(self):
        self.__check_each(self.string_utils.from_local_string_to_date_period, [
            (u"2013年01月01日至2013年12月31日",
             (datetime.date(2013, 1, 1), datetime.date(2013, 12, 31))),
        ])

    def test_roc_era_from_local_string_to_date_period(self):
        self.__check_each(self.string_utils.from_local_string_to_date_period, [
            (u"九十八年前三季", (datetime.date(2009, 1, 1), datetime.date(2009, 9, 30))),
            (u"九十八年第一季", (datetime.date(2009, 1, 1), datetime.date(2009, 3, 31))),
            (u"100年第一季", (datetime.date(2011, 1, 1), datetime.date(2011, 3, 31))),
            (u"100年上半年度", (datetime.date(2011, 1, 1), datetime.date(2011, 6, 30))),
            (u"99年上半年度", (datetime.date(2010, 1, 1), datetime.date(2010, 6, 30))),
            (u"100年前三季", (datetime.date(2011, 1, 1), datetime.date(2011, 9, 30))),
            (u"100年度", (datetime.date(2011, 1, 1), datetime.date(2011, 12, 31))),
        ])

    def test_from_date_to_roc_era_string(self):
        self.__check_each(self.string_utils.from_date_to_roc_era_string, [
            (datetime.date(2001, 1, 1), "90"),
        ])

    def test_from_date_to_2_digit_month_string(self):
        self.__check_each(self.string_utils.from_date_to_2_digit_month_string, [
            (datetime.date(2001, 1, 1), "01"),
            (datetime.date(2001, 10, 31), "10"),
        ])

    def test_from_date_to_2_digit_quarter_string(self):
        self.__check_each(
            self.string_utils.from_date_to_2_digit_quarter_string,
            self.__quarter_cases("%02d"),
        )

    def test_from_date_to_1_digit_quarter_string(self):
        self.__check_each(
            self.string_utils.from_date_to_1_digit_quarter_string,
            self.__quarter_cases("%d"),
        )

    def test_is_match_seperation(self):
        pattern = u"^(-| |=)*$"
        for separator in (u"====== ======", u"------ ------"):
            self.assertTrue(self.string_utils.is_match(pattern, separator))
        self.assertFalse(self.string_utils.is_match(pattern, u"同時影響現金及非現金項目之投資活動:"))

    def test_match_account(self):
        pattern = u"^([^\s]*):$"
        self.assertEqual(
            self.string_utils.match(pattern, u"營業活動之現金流量:"),
            [u"營業活動之現金流量"],
        )
class XbrlIncomeStatementAssembler():
    """Build an IncomeStatementDao from an XBRL income statement HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="main_table hasBorder"]'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse param['content'] and return the resulting IncomeStatementDao.

        param must provide the keys 'content', 'stock_symbol' and 'date'.
        """
        content, stock_symbol, date = param['content'], param['stock_symbol'], param['date']
        normalized = self.string_utils.normalize_string(content)
        table = self.__traverse_to_relative_html_object(lxml.html.fromstring(normalized))
        column_name_list = self.__assemble_column_name_list(table)
        row_list = self.__assemble_row_list(table)
        return IncomeStatementDao(column_name_list, row_list, stock_symbol, date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matching base_xpath."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __assemble_column_name_list(self, relative_html_object):
        """Return [account type, date period, date period, ...]."""
        head_rows = relative_html_object.xpath('./tr[@class="tblHead"]')
        assert len(head_rows) == 2, 'invalid tr_tags'
        column_row, statement_row = head_rows

        # The second head row must name the expected statement.
        statement_th_texts = statement_row.xpath('./th/text()')
        assert len(statement_th_texts) == 1, 'invalid statement_th_texts'
        assert unicode(statement_th_texts[0]) == u'綜合損益表', 'invalid statement_th_texts[0]'

        # The first head row holds the account type followed by the periods.
        header_texts = column_row.xpath('./th/text()')
        to_period = self.string_utils.from_local_string_to_date_period
        return [header_texts[0]] + [to_period(text) for text in header_texts[1:]]

    def __assemble_row_list(self, relative_html_object):
        """Return one row per table row after the two leading rows."""
        all_rows = relative_html_object.xpath('./tr')
        # the statement name row and the column name row are not data
        return [self.__assemble_row(tr_tag) for tr_tag in all_rows[2:]]

    def __assemble_row(self, relative_html_object):
        """Return [account type, number, number, ...] for one table row."""
        cells = relative_html_object.xpath('./td/text()')
        normalize = self.string_utils.normalize_number
        return [cells[0].strip()] + [normalize(cell) for cell in cells[1:]]
class TaurusParser():
    """Parse a page of per-industry tables into columns, rows and a release date.

    Each data row is expected to hold 11 cells, the last one being a comment
    that is dropped.
    """

    def __init__(self):
        self.base_xpath = '//html/body/center'
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Return (column_name_list, row_list, release_date) for content."""
        root = self.__traverse_to_relative_html_object(self.__get_html_object(content))
        column_name_list = self.__parse_column_name_list(root)
        row_list = self.__parse_row_list(root)
        release_date = self.__parse_release_date(root)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize content, drop every '<br>' and parse it as HTML."""
        normalized = self.string_utils.normalize_string(content)
        return lxml.html.fromstring(normalized.replace(u'<br>', u''))

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matching base_xpath."""
        matches = html_object.xpath(self.base_xpath)
        assert len(matches) > 0, 'invalid base_xpath'
        return matches[0]

    def __find_inner_table_tags(self, relative_html_object):
        """Return the nested tables found inside the second top-level table."""
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 1, 'invalid table_tags'
        # skip first table of description about IFRS
        inner_table_tags = table_tags[1].xpath('./tr/td/table/tr/td/table')
        assert len(inner_table_tags) > 0, 'invalid inner_table_tags'
        return inner_table_tags

    def __parse_column_name_list(self, relative_html_object):
        """Return the header texts of the second row of the first inner table."""
        first_inner_table = self.__find_inner_table_tags(relative_html_object)[0]
        tr_tags = first_inner_table.xpath('./tr')
        assert len(tr_tags) > 1, 'invalid tr_tags'
        return tr_tags[1].xpath('./th/text()')

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every inner table."""
        data_tr_tags = []
        # every inner_table represents an industry
        for inner_table_tag in self.__find_inner_table_tags(relative_html_object):
            tr_tags = inner_table_tag.xpath('./tr')
            assert len(tr_tags) > 2, 'invalid tr_tags'
            # first two rows are headers, last row is u'合計'
            data_tr_tags.extend(tr_tags[2:-1])
        return [self.__parse_row(tr_tag) for tr_tag in data_tr_tags]

    def __parse_row(self, relative_html_object):
        """Return the two leading items followed by the normalized numbers."""
        td_texts = relative_html_object.xpath('./td/text()')
        # record contains extra entry about comment
        assert len(td_texts) == 11, 'invalid td_texts size, should be 11'
        normalize = self.string_utils.normalize_number
        # skip the last entry about comment
        return td_texts[:2] + [normalize(td_text) for td_text in td_texts[2:-1]]

    def __parse_release_date(self, relative_html_object):
        """Return the date matched in the text of the last <div>."""
        div_tags = relative_html_object.xpath('./div')
        assert len(div_tags) > 0, 'invalid div_tags'
        last_div_text = div_tags[-1].text.strip()
        groups = self.string_utils.match(u'^出表日期:(.*)$', last_div_text)
        assert len(groups) > 0, 'could not match ^出表日期:(.*)$'
        return self.string_utils.from_local_string_to_date(groups[0])
def __init__(self):
    """Set the XPath root and create the string helper."""
    # XPath of the node that all later relative queries start from.
    self.base_xpath = '//html/body'
    self.string_utils = StringUtils()
class StockSymbolAssembler():
    """Build a StockSymbolDao from a stock symbol listing HTML page."""

    def __init__(self):
        self.base_xpath = '//html/body'
        self.string_utils = StringUtils()

    def assemble(self, param):
        """Parse param['content'] and return the resulting StockSymbolDao.

        Raises:
            AssertionError: the page does not have the expected layout.
        """
        content = self.string_utils.normalize_string(param['content'])
        html_object = lxml.html.fromstring(content)
        relative_html_object = self.__traverse_to_relative_html_object(html_object)
        release_date = self.__assemble_release_date(relative_html_object)
        column_name_list = self.__assemble_column_name_list(relative_html_object)
        row_list = self.__assemble_row_list(relative_html_object)
        return StockSymbolDao(column_name_list, row_list, release_date)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the single node matching base_xpath."""
        relative_html_object_list = html_object.xpath(self.base_xpath)
        assert len(relative_html_object_list) == 1, 'invalid base_xpath'
        return relative_html_object_list[0]

    def __assemble_release_date(self, relative_html_object):
        """Return the date matched in the second headline of the first table."""
        table_tags = relative_html_object.xpath('./table')
        assert len(table_tags) > 0, 'invalid table_tags'
        headline_tags = table_tags[0].xpath('./h2')
        # The second <h2> is the one read below, so a single headline is
        # not enough to pass the sanity check.
        assert len(headline_tags) > 1, 'invalid headline_tags'
        headline_texts = headline_tags[1].xpath('./strong/center')
        assert len(headline_texts) > 0, 'invalid headline_texts'
        groups = self.string_utils.match(u'^最近更新日期:(.*)$', headline_texts[0].text.strip())
        assert len(groups) > 0, 'could not match ^最近更新日期:(.*)$'
        return self.string_utils.from_local_string_to_date(groups[0])

    def __assemble_column_name_list(self, relative_html_object):
        """Return the eight column names, with the combined first one split."""
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')
        assert len(tr_tags) > 0, 'invalid tr_tags'
        original_column_name_list = tr_tags[0].xpath('./td/text()')
        assert len(original_column_name_list) > 0, 'invalid original_column_name_list'

        # handle the first column name: '有價證券代號及名稱'
        combined_column_name = original_column_name_list[0].strip()
        assert combined_column_name == u'有價證券代號及名稱', 'should be 有價證券代號及名稱 in unicode'

        # the chinese character '及' means 'and' so we need to separate this column name
        seperated_column_name_list = combined_column_name.split(u'及')
        assert len(seperated_column_name_list) == 2

        column_name_list = seperated_column_name_list + original_column_name_list[1:]
        assert len(column_name_list) == 8, 'invalid column_name_list size, should be 8'
        return column_name_list

    def __assemble_row_list(self, relative_html_object):
        """Return the parsed rows, leaving out the single-cell rows."""
        # skip one row of column name list
        tr_tags = relative_html_object.xpath('./table[@class="h4"]/tr')[1:]
        rows = (self.__assemble_row(tr_tag) for tr_tag in tr_tags)
        # if there is only one cell '股票' in row, __assemble_row yields None
        return [row for row in rows if row]

    def __assemble_row(self, relative_html_object):
        """Return the eight-field row, or None for a single-cell row."""
        td_tags = relative_html_object.xpath('./td')
        # we could not handle empty string between td tag if we use xpath './td/text()'
        # so we need to check each td.text one by one.
        td_texts = self.__get_lxml_text_list(td_tags)

        # if there is only one cell '股票', return None
        if len(td_texts) == 1:
            return None
        assert len(td_texts) == 7

        # handle the first cell: '有價證券代號及名稱'
        # it should be separated as stock symbol and stock name
        seperated_cell_list = td_texts[0].strip().split()
        assert len(seperated_cell_list) == 2

        # convert to datetime.date type
        listing_date = self.string_utils.from_local_string_to_date(td_texts[2])
        return seperated_cell_list + [td_texts[1]] + [listing_date] + td_texts[3:]

    def __get_lxml_text_list(self, tag_list):
        """Return each tag's text, with '' standing in for a missing text."""
        return ['' if tag.text is None else tag.text for tag in tag_list]
def __init__(self):
    """Set the XPath root and create the string helper."""
    # XPath of the <pre> tag that later processing reads from.
    self.base_xpath = '//html/body/table[@class="hasBorder"]/tr/td/pre'
    self.string_utils = StringUtils()
class AriesParser():
    """Parse a plain-text statement into a column name list and a row list.

    The text is tokenized by Scanner; every line of tokens is then matched
    against a fixed set of token-type sequences.
    """

    def __init__(self, text):
        self.text = text
        self.head_splitted_account = None
        self.account_utils = AccountUtils()
        self.string_utils = StringUtils()

    def parse(self):
        """Return (column_name_list, row_list) parsed from the text.

        Raises:
            ValueError: a line after the column names has an unsupported
                token sequence.
            AssertionError: no column names or no rows were found.
        """
        text = self.__preprocess_text(self.text)
        lines = self.__scan_lines(text)
        return self.__parse_lines(lines)

    def __preprocess_text(self, text):
        """Run the AccountUtils clean-up passes over text."""
        text = self.account_utils.concat_account(text)
        text = self.account_utils.remove_eten_separation(text)
        return text

    def __scan_lines(self, text):
        """Tokenize text and group the tokens into lines ending with TK_EOL."""
        scanner = Scanner(Source(text))
        scanner.scan()
        lines = []
        tokens_in_line = []
        for token in scanner.get_tokens():
            tokens_in_line.append(token)
            if token.get_token_type() == 'TK_EOL':
                lines.append(tokens_in_line)
                tokens_in_line = []
        # tokens after the last TK_EOL, if any, are not returned
        return lines

    def __parse_lines(self, lines):
        """Return (column_name_list, row_list) for the tokenized lines."""
        column_name_list = None
        visited_column_name_list = False
        row_list = []
        for line in lines:
            type_list = [token.get_token_type() for token in line]

            # pass useless line
            if type_list in (['TK_EOL'], ['TK_SEPERATION', 'TK_EOL']):
                continue

            # Until the column names are found, every other line is only a
            # candidate header.
            if not visited_column_name_list:
                if type_list == ['TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL']:
                    stmt_date_list = line[:2]
                elif type_list == ['TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL']:
                    stmt_date_list = line[1:3]
                else:
                    continue
                try:
                    column_name_list = self.__parse_column_name_list(stmt_date_list)
                    visited_column_name_list = True
                except Exception:
                    # Deliberate best effort: a line that merely looks like
                    # the header but cannot be converted is skipped so a
                    # later line can still provide the column names.
                    pass
                continue

            row = self.__parse_row(line, type_list)
            if row:
                row_list.append(row)

        assert visited_column_name_list, 'We should parse column name list'
        assert len(row_list) > 0, 'We should parse some rows'
        return column_name_list, row_list

    def __parse_row(self, line, type_list):
        """Return the row for a line, or None when the line is ignored.

        Raises ValueError naming the token sequence when it is unsupported.
        """
        if type_list == ['TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL']:
            return self.__parse_account_number_number_line(line)
        if type_list == ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_NUMBER', 'TK_EOL']:
            return self.__parse_account_paren_number_number_line(line)
        if type_list == ['TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']:
            return self.__parse_account_number_paren_number_line(line)
        if type_list == ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN',
                         'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']:
            return self.__parse_account_paren_number_paren_number_line(line)
        if type_list == ['TK_ACCOUNT', 'TK_EOL']:
            return self.__parse_account_line(line)
        if type_list == ['TK_ACCOUNT', 'TK_ACCOUNT', 'TK_ACCOUNT', 'TK_EOL']:
            # three accounts after the header carry no row data
            return None
        raise ValueError('unexpected token sequence: %s' % ' '.join(type_list))

    def __parse_column_name_list(self, stmt_date_list):
        """Return [u'會計科目', end date, end date] for the two period tokens."""
        column_name_list = [u'會計科目']
        assert len(stmt_date_list) == 2, 'There shouble be 2 statement dates'
        for stmt_date_token in stmt_date_list:
            date_period = self.string_utils.from_local_string_to_date_period(stmt_date_token.get_value())
            # only the second element of each period is kept
            column_name_list.append(date_period[1])
        return column_name_list

    # ['TK_ACCOUNT', 'TK_EOL']
    def __parse_account_line(self, line):
        return [line[0].get_value()]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_number_number_line(self, line):
        return [
            line[0].get_value(),
            self.string_utils.normalize_number(line[1].get_value()),
            self.string_utils.normalize_number(line[2].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_NUMBER', 'TK_EOL']
    def __parse_account_paren_number_number_line(self, line):
        return [
            line[0].get_value(),
            -self.string_utils.normalize_number(line[2].get_value()),
            self.string_utils.normalize_number(line[4].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_NUMBER', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_number_paren_number_line(self, line):
        return [
            line[0].get_value(),
            self.string_utils.normalize_number(line[1].get_value()),
            -self.string_utils.normalize_number(line[3].get_value())
        ]

    # ['TK_ACCOUNT', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_LEFT_PAREN', 'TK_NUMBER', 'TK_RIGHT_PAREN', 'TK_EOL']
    def __parse_account_paren_number_paren_number_line(self, line):
        return [
            line[0].get_value(),
            -self.string_utils.normalize_number(line[2].get_value()),
            -self.string_utils.normalize_number(line[5].get_value())
        ]
def __init__(self, text):
    """Keep the text to work on and create the account and string helpers."""
    self.text = text
    # Starts out unset; nothing in this method assigns it a value.
    self.head_splitted_account = None
    self.account_utils = AccountUtils()
    self.string_utils = StringUtils()
def setUp(self):
    """Create a StringUtils instance and keep it on self.string_utils."""
    self.string_utils = StringUtils()
def __init__(self):
    """Set the XPath root and create the date and string helpers."""
    # XPath of the node that all later relative queries start from.
    self.base_xpath = '//html/body/center'
    self.date_utils = DateUtils()
    self.string_utils = StringUtils()
def __init__(self):
    """Create the storage backend and the string helper."""
    self.storage = SpiderStorage()
    self.string_utils = StringUtils()
def __init__(self):
    """Set the XPath root and create the string helper."""
    # XPath of the table that all later relative queries start from.
    self.base_xpath = '//html/body[@id="content_d"]/center/table[@class="main_table hasBorder"]'
    self.string_utils = StringUtils()
class GeminiParser:
    """Parse a page of per-industry tables into columns, rows and a release date.

    Each data row is expected to hold exactly 10 cells: two items followed
    by eight numbers.
    """

    def __init__(self):
        self.base_xpath = "//html/body/center"
        self.date_utils = DateUtils()
        self.string_utils = StringUtils()

    def parse(self, content):
        """Return (column_name_list, row_list, release_date) for content."""
        center_tag = self.__traverse_to_relative_html_object(
            self.__get_html_object(content)
        )
        column_name_list = self.__parse_column_name_list(center_tag)
        row_list = self.__parse_row_list(center_tag)
        release_date = self.__parse_release_date(center_tag)
        return column_name_list, row_list, release_date

    def __get_html_object(self, content):
        """Normalize content, drop every '<br>' and parse it as HTML."""
        without_breaks = self.string_utils.normalize_string(content).replace(u"<br>", u"")
        return lxml.html.fromstring(without_breaks)

    def __traverse_to_relative_html_object(self, html_object):
        """Return the first node matching base_xpath."""
        candidates = html_object.xpath(self.base_xpath)
        assert len(candidates) > 0, "invalid base_xpath"
        return candidates[0]

    def __industry_table_tags(self, relative_html_object):
        """Return the nested tables found inside the second top-level table."""
        table_tags = relative_html_object.xpath("./table")
        assert len(table_tags) > 1, "invalid table_tags"
        # skip first table of description about IFRS
        industry_tables = table_tags[1].xpath("./tr/td/table/tr/td/table")
        assert len(industry_tables) > 0, "invalid inner_table_tags"
        return industry_tables

    def __parse_column_name_list(self, relative_html_object):
        """Return the header texts of the second row of the first inner table."""
        industry_tables = self.__industry_table_tags(relative_html_object)
        tr_tags = industry_tables[0].xpath("./tr")
        assert len(tr_tags) > 1, "invalid tr_tags"
        return tr_tags[1].xpath("./th/text()")

    def __parse_row_list(self, relative_html_object):
        """Return the parsed data rows of every inner table."""
        collected = []
        # every inner_table represents an industry
        for industry_table in self.__industry_table_tags(relative_html_object):
            tr_tags = industry_table.xpath("./tr")
            assert len(tr_tags) > 2, "invalid tr_tags"
            # first two rows are headers, last row is u'合計'
            collected += tr_tags[2:-1]
        return [self.__parse_row(tr_tag) for tr_tag in collected]

    def __parse_row(self, relative_html_object):
        """Return the two leading items followed by the normalized numbers."""
        td_texts = relative_html_object.xpath("./td/text()")
        assert len(td_texts) == 10, "invalid td_texts size, should be 10"
        items, number_texts = td_texts[:2], td_texts[2:]
        return items + [self.string_utils.normalize_number(text) for text in number_texts]

    def __parse_release_date(self, relative_html_object):
        """Return the date matched in the text of the last <div>."""
        div_tags = relative_html_object.xpath("./div")
        assert len(div_tags) > 0, "invalid div_tags"
        groups = self.string_utils.match(u"^出表日期:(.*)$", div_tags[-1].text.strip())
        assert len(groups) > 0, "could not match ^出表日期:(.*)$"
        return self.string_utils.from_local_string_to_date(groups[0])