def __init__(self, stock_id):
    """Prepare the processor for a single stock.

    Stores a log tag, binds the SHARE_HOLDER repository and records the two
    lookup tuples (``items_to_get`` and ``fields_to_get``) as attributes.
    """
    super().__init__(stock_id)
    tag = "ShareholderEquityProcessor"
    repository = MongoDBRepository(MongoDBMeta.SHARE_HOLDER)
    self.__tag = tag
    self.__repository = repository
    self.items_to_get = ('期初餘額', '期末餘額')
    self.fields_to_get = ('權益總額', )
def test_sync_performance(self):
    """Sync performance data for stock 2841, then print what was stored."""
    stock_id = 2841
    _sync_performance(stock_id)
    performance_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_PERFORMANCE)
    stored = performance_repository.get_data(stock_id)
    print(stored)
    print('index = ', stored.index)
    print('columns = ', stored.columns)
class StockCountProcessor:
    """Look up the share count of a stock, one year at a time.

    Raw pages are read from the ``STOCK_COUNT`` repository. When a page is
    missing it is requested through ``fetch_stock_count_raw_data`` and the
    repository is read again.
    """

    def __init__(self):
        self.__repository = MongoDBRepository(MongoDBMeta.STOCK_COUNT)

    def get_stock_count(self, stock_id, year):
        """Return the share count stored for ``stock_id`` in ``year``.

        Returns:
            ``None`` when no raw page or no matching table is available;
            the integer found in the fourth cell of the first row whose
            second cell is '合計'; ``0`` when the table has no such row.
        """
        raw_data = self.__repository.get_data(stock_id, {'year': year})
        if raw_data is None:
            # Not cached yet: fetch it, then read it back from the repository.
            fetch_stock_count_raw_data(stock_id, year, year)
            raw_data = self.__repository.get_data(stock_id, {'year': year})
        if raw_data is None:
            return None

        bs = BeautifulSoup(raw_data, 'html.parser')
        table = bs.find_all(has_table_width_no_class)
        if len(table) == 0:
            return None

        for row in table[0].find_all('tr'):
            r = [
                x.get_text().strip().replace(" ", "").replace(",", "")
                for x in row.find_all('td')
            ]
            print(r)
            if len(r) > 3 and r[1] == '合計':
                return int(r[3])
        return 0

    def get_data_frame(self, stock_id, since, to=None):
        """Return a DataFrame of share counts for consecutive years.

        Years at the start of the range that have no data are skipped; the
        first missing year after data has been collected ends the range.

        Args:
            stock_id: Stock identifier passed through to ``get_stock_count``.
            since: First year to look at.
            to: Last year to look at. ``None`` or a value smaller than
                ``since`` means the current year.

        Returns:
            A DataFrame with one column '股數' and a yearly ``PeriodIndex``.
            It is empty when no year in the range has data.
        """
        if to is None or to < since:
            to = datetime.now().year

        stocks = []
        start_year = since
        end_year = since
        for year in range(since, to + 1):
            stock_count = self.get_stock_count(stock_id, year)
            print("StockCountProcessor year = ", year, " stocks = ", stock_count)
            if stock_count is None:
                if stocks:
                    # A gap after real data: stop so the index stays contiguous.
                    break
                # Still in the leading gap: move the start past this year.
                start_year = year + 1
                continue
            stocks.append(stock_count)
            end_year = year

        if stocks:
            # PeriodIndex(start=..., end=...) no longer exists in pandas >= 1.0;
            # period_range is the supported spelling in old and new versions.
            period_index = pd.period_range(start=str(start_year),
                                           end=str(end_year),
                                           freq='Y')
        else:
            period_index = pd.PeriodIndex([], freq='Y')
        return pd.DataFrame(data={'股數': stocks}, index=period_index)
def test_fetch_data_utils(self): ''' stock_code_list = get_stock_codes(stock_type='上市') fetch_twse_price_measurement_raw_datas(stock_code_list[0: 1]) tpex_stock_code_list = get_stock_codes(stock_type='上櫃') fetch_tpex_price_measurement_raw_datas(tpex_stock_code_list[0:1]) result = MongoDBRepository(MongoDBMeta.TPEX_PRICE_MEASUREMENT).get_data(stock_code_list[0]) self.assertIsNotNone(result) ''' ''' fetch_dividend_policy_raw_datas(2884) result = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY).get_data(2884) self.assertIsNotNone(result) ''' # ''' fetch_shareholder_equity_raw_data(2884, 2020, 3) result = MongoDBRepository(MongoDBMeta.SHARE_HOLDER).get_data(2809, {'year': 2020, 'season': 3}) self.assertIsNotNone(result) # ''' ''' fetch_simple_balance_sheet_raw_data(2884, 2020, 3) result = MongoDBRepository(MongoDBMeta.SIMPLE_BALANCE_SHEET).get_data(2884, {'year': 2020, 'season': 3}) self.assertIsNotNone(result) ''' ''' fetch_balance_sheet_raw_data(2884, 2020, 3) result = MongoDBRepository(MongoDBMeta.FULL_BALANCE_SHEET).get_data(2884, {'year': 2020, 'season': 3}) self.assertIsNotNone(result) ''' '''
class TWSEPriceMeasurementTransformer:
    """Convert the cached TWSE price-measurement payload into a DataFrame.

    Input comes from the ``TWSE_PRICE_MEASUREMENT`` repository; the resulting
    frame is written to the module-level ``_data_frame_repository``.
    """

    def __init__(self):
        in_repository = MongoDBRepository(MongoDBMeta.TWSE_PRICE_MEASUREMENT)
        self.__in_repository = in_repository
        self.__out_repository = _data_frame_repository

    def transform_to_dataframe(self, stock_id):
        """Build, store and return the price frame for ``stock_id``.

        Returns ``None`` when nothing is cached or the payload's ``stat``
        field is not ``'OK'``.
        """
        payload = self.__in_repository.get_data(stock_id)
        print('content = ', payload)
        if payload is None:
            return None
        if payload['stat'] != 'OK':
            return None

        _logger.info("TWSEPriceMeasurementTransformer transform " + str(stock_id))
        column_names = ['成交股數', '成交金額', '成交筆數', '最高價', '日期',
                        '最低價', '日期', '收盤平均價']
        period_labels = []
        table_rows = []
        for raw_cells in payload['data']:
            cells = [str(cell).replace(',', '') for cell in raw_cells]
            # Cells 1-3 become ints, cells 4/6/8 floats; 5 and 7 stay strings.
            for position in (1, 2, 3):
                cells[position] = int(cells[position])
            for position in (4, 6, 8):
                cells[position] = float(cells[position])
            # The first cell is a year offset by 1911 (presumably a Minguo
            # calendar year); it becomes the row's period label.
            period_labels.append(pd.Period(value=str(int(cells[0]) + 1911)))
            table_rows.append(cells[1:])

        frame = pd.DataFrame(table_rows, index=period_labels, columns=column_names)
        print(frame)
        self.__out_repository.put_data(stock_id, frame)
        return frame
def test_sync_statements(self):
    """Re-sync the three statement frames of stock 2841, printing before/after."""
    stock_id = 2841

    def load(meta):
        # Read the currently stored frame for the stock.
        return MongoDBRepository(meta).get_data(stock_id)

    def report(before, after):
        print('before = ', before)
        print('after = ', after)

    cash_flow_before = load(MongoDBMeta.DATAFRAME_CASH_FLOW)
    cash_flow_after = _sync_cash_flow_statement(
        stock_id, 2013, to_year=2021, df_cash_flow_statement=cash_flow_before)
    report(cash_flow_before, cash_flow_after)

    profit_before = load(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT)
    profit_after = _sync_profit_statement(
        stock_id, 2013, df_profit_statement=profit_before)
    report(profit_before, profit_after)

    balance_before = load(MongoDBMeta.DATAFRAME_BALANCE_SHEET)
    balance_after = _sync_balance_sheet(
        stock_id, 2013, 2019, df_balance_sheet=balance_before)
    report(balance_before, balance_after)
class TPEXPriceMeasurementTransformer:
    """Convert the cached TPEX price-measurement page into a DataFrame.

    Input comes from the ``TPEX_PRICE_MEASUREMENT`` repository; the resulting
    frame is written to the module-level ``_data_frame_repository``.
    """

    def __init__(self):
        self.__in_repository = MongoDBRepository(MongoDBMeta.TPEX_PRICE_MEASUREMENT)
        self.__out_repository = _data_frame_repository

    def transform_to_dataframe(self, stock_id):
        """Build, store and return the price frame for ``stock_id``.

        Returns:
            The stored DataFrame, or ``None`` when nothing is cached, the
            expected table is absent, or parsing fails (the failure is
            logged and its traceback printed).
        """
        record = self.__in_repository.get_data(stock_id)
        # The log tag used to name the TWSE transformer (copy-paste slip).
        _logger.info("TPEXPriceMeasurementTransformer transform " + str(stock_id))
        if record is None:
            return None

        try:
            soup = BeautifulSoup(record, 'html.parser')
            table = soup.find('table', attrs={"class": "page-table-board"})
            if table is None:
                # Report the real cause instead of an AttributeError on None.
                _logger.error("get exception in " + str(stock_id)
                              + ":table 'page-table-board' not found")
                return None

            rows = []
            indexes = []
            for tr in table.find_all('tr'):
                # Only rows that contain a centered body cell are data rows.
                if tr.find('td', attrs={"class": "page-table-body-center"}) is None:
                    continue
                row = [td.string.replace(',', '') for td in tr.find_all('td')]
                row[0] = int(row[0]) + 1911
                # Cells 1-3 are multiplied by 1000 before storing.
                row[1] = int(row[1]) * 1000
                row[2] = int(row[2]) * 1000
                row[3] = int(row[3]) * 1000
                row[4] = float(row[4])
                row[6] = float(row[6])
                row[8] = float(row[8])
                indexes.append(pd.Period(str(row[0])))
                rows.append(row[1:])

            data_frame = pd.DataFrame(rows, index=indexes,
                                      columns=['成交股數', '成交金額', '成交筆數', '最高價',
                                               '日期', '最低價', '日期', '收盤平均價'])
            print(data_frame)
            self.__out_repository.put_data(stock_id, data_frame)
            return data_frame
        except Exception as inst:
            _logger.error("get exception in " + str(stock_id) + ":" + str(inst))
            traceback.print_tb(inst.__traceback__)
            return None
def test_store_data_frames(self):
    """Round-trip each synced frame of stock 4564 through its repository."""
    stock_id = 4564

    def store_and_reload(meta, frame):
        # Write the frame, then read it back from the same repository.
        repository = MongoDBRepository(meta)
        repository.put_data(stock_id, frame)
        return repository.get_data(stock_id)

    cash_flow = _sync_cash_flow_statement(stock_id, 2013, to_year=2019)
    reloaded = store_and_reload(MongoDBMeta.DATAFRAME_CASH_FLOW, cash_flow)
    print(reloaded)
    print(reloaded.index)

    profit_statement = _sync_profit_statement(stock_id, 2013, to_year=2019)
    reloaded = store_and_reload(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT, profit_statement)
    print(reloaded)
    print(reloaded.index)

    balance_sheet = _sync_balance_sheet(stock_id, 2013, to_year=2020)
    print(balance_sheet)
    reloaded = store_and_reload(MongoDBMeta.DATAFRAME_BALANCE_SHEET, balance_sheet)
    print(reloaded)
    print(reloaded.columns)

    dividend = _sync_dividend_policy(stock_id, 2013)
    print(dividend)
    print(dividend.index)
    reloaded = store_and_reload(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY, dividend)
    print(reloaded)
    print(reloaded.index)

    # Same round trip again, this time checking the index type on both sides.
    dividend = _sync_dividend_policy(stock_id, 2013)
    print(dividend)
    print(type(dividend.index))
    self.assertIsInstance(dividend.index, pd.PeriodIndex)
    reloaded = store_and_reload(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY, dividend)
    print(reloaded)
    print(reloaded.index)
    self.assertIsInstance(reloaded.index, pd.PeriodIndex)
class DividendPolicyProcessor2(StatementProcessor):
    """Build yearly cash and stock dividend totals from the dividend-policy page.

    The raw page is read from the ``DIVIDEND_POLICY`` repository and is
    re-fetched when the cached copy yields no values for the requested year.
    """

    def __init__(self):
        super().__init__(None)
        self.dividend_policy_fetcher = _DividendPolicyFetcher2()
        self.__repository = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY)

    def get_data_frame(self, year, season):
        """Not implemented for this processor; always returns ``None``."""
        pass

    def get_data_frames(self, stock_id, start_year=None, to_year=None):
        """Return the yearly dividend frame for ``stock_id``.

        Args:
            stock_id: Stock identifier.
            start_year: First year to request when a fetch is needed.
                ``None`` means the current year, resolved at call time.
            to_year: Last year to request, and the year the cached frame must
                have data for. ``None`` means the current year.

        Returns:
            The frame parsed from the cache when it has non-null values for
            ``to_year``; otherwise the frame parsed from freshly fetched data
            (``None`` if that cannot be parsed).
        """
        # The defaults used to be datetime.now().year evaluated once at import
        # time, which goes stale in a long-running process.
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        cache_df = self._parse_raw_data(stock_id, self.__repository.get_data(stock_id))
        if cache_df is not None and self._has_data_for_year(cache_df, to_year):
            return cache_df

        latest_raw_data = self._get_raw_data(stock_id, start_year=start_year, to_year=to_year)
        print('latest_raw_data = ', latest_raw_data)
        return self._parse_raw_data(stock_id=stock_id, raw_data=latest_raw_data)

    @staticmethod
    def _has_data_for_year(data_frame, year):
        """Return True when the rows for ``year`` exist and are not all null."""
        # Explicit mask instead of data_frame[str(year)]: selecting rows with a
        # bare string key is no longer supported by pandas >= 2.0.
        mask = [getattr(period, 'year', None) == int(year) for period in data_frame.index]
        rows = data_frame.loc[mask]
        return not rows.empty and not rows.isnull().values.all()

    def _parse_raw_data(self, stock_id, raw_data):
        """Parse the raw HTML page into a frame with one row per year.

        Returns ``None`` when the page cannot be read as a table. The result
        has the columns '現金股利' and '配股' and covers 2013 up to the current
        year; a year with no data at all holds NaN in both columns.
        """
        try:
            soup = BeautifulSoup(raw_data, 'html.parser')
            table = soup.find('table', attrs={"class": "hasBorder", "width": "99%"})
            data_frame = pd.read_html(str(table))[0]
            print('dividend data_frame = ', data_frame)
        except Exception as e:
            print('get', e, ' when get dividend policy')
            return None

        # Rows before position 3 are skipped; columns are addressed by position.
        data_frame = data_frame.iloc[3:, :]
        period_list = [pd.Period(self.__parse_period(x)) for x in data_frame.iloc[:, 1].tolist()]
        dividend_cash_list = [float(x) for x in data_frame.iloc[:, 10].tolist()]
        dividend_cash_stock_list = [float(x) for x in data_frame.iloc[:, 13].tolist()]
        dividend_record_version = [int(x) for x in data_frame.iloc[:, 3].tolist()]
        meeting_progress = [str(x) for x in data_frame.iloc[:, 0].tolist()]

        # Keep one record per period. A later duplicate replaces the stored one
        # only if its progress text contains '股東會確認' and its version is higher.
        parse_dict = {}
        for index, period in enumerate(period_list):
            candidate = [dividend_cash_list[index],
                         dividend_cash_stock_list[index],
                         dividend_record_version[index]]
            if parse_dict.get(period) is None:
                parse_dict[period] = candidate
            else:
                print('duplicate ', period)
                if meeting_progress[index].find('股東會確認') != -1 \
                        and parse_dict[period][2] < dividend_record_version[index]:
                    parse_dict[period] = candidate

        period_list = list(parse_dict)
        dividend_cash_list = [value[0] for value in parse_dict.values()]
        dividend_cash_stock_list = [value[1] for value in parse_dict.values()]
        dict_dividend = {'現金股利': dividend_cash_list, '配股': dividend_cash_stock_list}
        print(dict_dividend)

        now = datetime.now()

        def get_default_time_line_periods():
            # Every quarter from 2013Q1 through Q4 of the current year.
            return [pd.Period(str(year) + 'Q' + str(quarter))
                    for year in range(2013, now.year + 1)
                    for quarter in range(1, 5)]

        # Both columns are floats, so reindex alone fills the gaps with NaN.
        # The former applymap over pd.np.nan was an identity step, and pd.np no
        # longer exists in pandas >= 2.0.
        df_dividend = pd.DataFrame(dict_dividend, index=period_list).reindex(
            get_default_time_line_periods())
        print('df_dividend = ', df_dividend)

        dic_dividend = {}
        for year in range(2013, now.year + 1):
            df_extract = df_dividend.loc[pd.Period(str(year) + 'Q1'):pd.Period(str(year) + 'Q4'), :]
            df_extract_sum = df_extract.sum()
            if df_extract.isnull().values.all():
                # sum() of an all-NaN year is 0; keep such a year as NaN. A
                # labelled Series keeps the column names even when every year
                # is empty.
                yearly = pd.Series(float('nan'), index=df_extract_sum.index)
            else:
                yearly = df_extract_sum
            dic_dividend[pd.Period(str(year))] = yearly
        print('df_dividend 2 = ', dic_dividend)
        print("\n")

        df_dividend = pd.DataFrame(dic_dividend)
        df_dividend = df_dividend.T
        print('df_dividend 3 = ', df_dividend)
        return df_dividend

    def _get_raw_data(self, stock_id, start_year=None, to_year=None):
        """Fetch the raw page, store it in the repository and return it.

        ``None`` for either year means the current year. Years are sent to the
        fetcher reduced by 1911. Returns ``None`` when the request fails.
        """
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        result = self.dividend_policy_fetcher.fetch(
            {'stock_id': stock_id, 'start_year': start_year - 1911, 'to_year': to_year - 1911})
        if result.ok is False:
            print('get content fail')
            return None
        print('result content = ', result.content)
        self.__repository.put_data(stock_id, result.content)
        return result.content

    def __parse_period(self, period_string):
        """Translate a period label from the page into a 'YYYYQn' string.

        Full-year and second-half labels map to Q4, first-half labels to Q2,
        and quarterly labels to their own quarter; 1911 is added to the year.
        """
        if period_string.find("年年度") > -1:
            return str(int(period_string.replace("年年度", "")) + 1911) + "Q4"
        elif period_string.find("年上半年") > -1:
            return str(int(period_string.replace("年上半年", "")) + 1911) + "Q2"
        elif period_string.find("年下半年") > -1:
            return str(int(period_string.replace("年下半年", "")) + 1911) + "Q4"
        else:
            period_strings = period_string.replace("季", "").split("年第")
            return str((int(period_strings[0]) + 1911)) + "Q" + period_strings[1]
class ShareholderEquityProcessor(StatementProcessor):
    """Extract opening and closing total-equity balances per quarter.

    Raw pages are read from the ``SHARE_HOLDER`` repository and fetched via
    ``fetch_shareholder_equity_raw_data`` when missing.
    """

    def __init__(self, stock_id):
        super().__init__(stock_id)
        self.__tag = "ShareholderEquityProcessor"
        self.__repository = MongoDBRepository(MongoDBMeta.SHARE_HOLDER)
        # Table rows to keep (matched on the first cell) ...
        self.items_to_get = ('期初餘額', '期末餘額')
        # ... and header columns whose values are read from those rows.
        self.fields_to_get = ('權益總額', )

    def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
        """Return one row per quarter between ``since`` and ``to``.

        Each quarter's '期初餘額' is overwritten with the previous available
        quarter's '期末餘額'. Quarters whose data cannot be read are skipped.

        Args:
            since: Mapping with 'year' and 'season' of the first quarter.
            to: Mapping with 'year' and 'season' of the last quarter, passed
                through to ``get_time_lines``.
            source_policy: Accepted for interface compatibility; not used.

        Returns:
            The concatenated DataFrame, or ``None`` when no quarter was read.
        """
        time_lines = get_time_lines(since=since, to=to)
        dfs = []
        column_index = pd.MultiIndex.from_product(
            [self.fields_to_get, self.items_to_get], names=['first', 'second'])
        print(column_index)

        # A range starting after Q1 needs the quarter *before* it as baseline.
        # This used to read the first quarter itself, so that quarter's opening
        # balance was set to its own closing balance.
        last_result = None
        if len(time_lines) > 0 and time_lines[0].get('season') > 1:
            last_result = self._get_data_dict(time_lines[0].get('year'),
                                              time_lines[0].get('season') - 1)

        for time_line in time_lines:
            result = self._get_data_dict(time_line.get('year'), time_line.get('season'))
            if result is None:
                continue
            if last_result is not None:
                for key in result.keys():
                    result[key]['期初餘額'] = last_result[key]['期末餘額']
            last_result = result
            print(result)

            str_period = "{}Q{}".format(time_line.get('year'), time_line.get('season'))
            # PeriodIndex(start=..., end=...) no longer exists in pandas >= 1.0.
            period_index = pd.period_range(start=str_period, end=str_period, freq='Q')
            data_list = []
            for inner in result.values():
                data_list.extend(inner.values())
            print(data_list)
            dfs.append(
                pd.DataFrame([data_list], columns=column_index, index=period_index))

        print(self.__tag, "dfs = ", dfs)
        return pd.concat(dfs) if len(dfs) > 0 else None

    def get_data_frame(self, year, season):
        """Return the frame for the single quarter ``year``/``season``."""
        quarter = {'year': year, 'season': season}
        return self.get_data_frames(since=dict(quarter), to=dict(quarter))

    def _get_data_dict(self, year, season):
        """Load (fetching if needed) and parse one quarter; ``None`` if absent."""
        raw_data = self.__repository.get_data(self._stock_id, {
            'year': year,
            'season': season
        })
        if raw_data is None:
            fetch_shareholder_equity_raw_data(self._stock_id, year, season)
            raw_data = self.__repository.get_data(self._stock_id, {
                'year': year,
                'season': season
            })
        if raw_data is None:
            return None
        return self._parse_data(raw_data)

    def _parse_data(self, content):
        """Parse the HTML page into ``{field: {item: value}}``.

        Returns ``None`` when no matching table exists, when the header row
        lacks one of ``fields_to_get``, or when parsing raises. Items missing
        from the table keep the value 0.
        """
        try:
            bs = BeautifulSoup(content, 'html.parser')
            tables = bs.find_all('table', attrs={
                "class": "hasBorder",
                "align": "center"
            })
            if len(tables) < 1:
                print('ShareholderEquityProcessor - error 1')
                return None

            headers = []
            rows_data = []
            for row in tables[0].find_all('tr'):
                columns_raw = [column for column in row.contents if column != '\n']
                columns = [column.get_text().strip() for column in columns_raw]
                if len(columns) <= 1:
                    continue
                if columns_raw[0].name == 'th' and len(headers) == 0:
                    # The first multi-cell <th> row is taken as the header row.
                    headers = columns
                    if not all(field in headers for field in self.fields_to_get):
                        print('ShareholderEquityProcessor - error 2')
                        return None
                else:
                    rows_data.append(columns)

            rows_data = [
                row_data for row_data in rows_data
                if row_data[0] in self.items_to_get
            ]
            # item -> field -> value, read at the field's header position.
            result = {
                row_data[0]: {
                    k: int(row_data[headers.index(k)].replace(',', ''))
                    for k in self.fields_to_get
                }
                for row_data in rows_data
            }
            # Invert to field -> item -> value with every item present.
            result2 = {
                key: {item: 0 for item in self.items_to_get}
                for key in self.fields_to_get
            }
            for key in result.keys():
                for key2 in result[key]:
                    result2[key2][key] = result[key][key2]
            print("result = ", result2)
            return result2
        except Exception as inst:
            print("get exception", inst)
            traceback.print_tb(inst.__traceback__)
            return None
def __init__(self, stock_id):
    """Bind the CASH_FLOW repository and record the labels to extract."""
    super().__init__(stock_id)
    repository = MongoDBRepository(MongoDBMeta.CASH_FLOW)
    fetch_fields = (
        '營業活動之淨現金流入',
        '取得不動產、廠房及設備',
        '其他投資活動',
        '投資活動之淨現金流入',
    )
    self.__repository = repository
    self._fetch_fields = fetch_fields
class CashFlowStatementProcessor(StatementProcessor):
    """Derive per-quarter cash-flow figures from cumulative statements.

    Owner's-earnings cash flow = net cash inflow from operating activities
        + acquisition of property, plant and equipment
        + other investing activities
    Free cash flow = net cash inflow from operating activities
        + net cash inflow from investing activities
    """

    def __init__(self, stock_id):
        super().__init__(stock_id)
        self.__repository = MongoDBRepository(MongoDBMeta.CASH_FLOW)
        self._fetch_fields = ('營業活動之淨現金流入', '取得不動產、廠房及設備', '其他投資活動',
                              '投資活動之淨現金流入')

    def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
        """Return one row per quarter between ``since`` and ``to``.

        Quarters are processed newest first. For seasons after the first, the
        previous season's values are subtracted from the fetched fields; if
        the previous season is unavailable the values are left as read.

        Args:
            since: Mapping with 'year' and 'season' of the first quarter.
            to: Mapping with 'year' and 'season' of the last quarter, passed
                through to ``get_time_lines``.
            source_policy: Accepted for interface compatibility; not used.

        Returns:
            The concatenated DataFrame (newest quarter first), or ``None``
            when no quarter could be read.
        """
        time_lines = get_time_lines(since=since, to=to)
        time_lines.reverse()
        dfs = []
        cache_data_dict = None
        for time_line in time_lines:
            print('In ', time_line)
            year = time_line.get('year')
            season = time_line.get('season')

            # Reuse the dict loaded as "previous season" in the last iteration
            # rather than loading the same quarter twice.
            if cache_data_dict is None:
                data_dict = self._get_data_dict(self._fetch_fields, year, season)
            else:
                data_dict = cache_data_dict
            if data_dict is None:
                continue

            if season > 1:
                cache_data_dict = self._get_data_dict(self._fetch_fields, year, season - 1)
                if cache_data_dict is None:
                    print('get None value in year ', year, ' season ', season,
                          " data_dict = ", data_dict, " cache_data_dic = ",
                          cache_data_dict)
                else:
                    for key in self._fetch_fields:
                        data_dict[key] = data_dict.get(key, 0) - cache_data_dict.get(key, 0)
            else:
                cache_data_dict = None

            data_dict['業主盈餘現金流'] = data_dict.get('營業活動之淨現金流入', 0) \
                + data_dict.get('取得不動產、廠房及設備', 0) \
                + data_dict.get('其他投資活動', 0)
            data_dict['自由現金流'] = data_dict.get('營業活動之淨現金流入', 0) \
                + data_dict.get('投資活動之淨現金流入', 0)
            print(data_dict)

            str_period = "{}Q{}".format(year, season)
            # PeriodIndex(start=..., end=...) no longer exists in pandas >= 1.0.
            period_index = pd.period_range(start=str_period, end=str_period, freq='Q')
            dfs.append(
                pd.DataFrame([list(data_dict.values())],
                             columns=list(data_dict.keys()),
                             index=period_index))
        return None if len(dfs) == 0 else pd.concat(dfs, sort=False)

    def get_data_frame(self, year, season, source_policy=Source.CACHE_ONLY):
        """Return the frame for the single quarter ``year``/``season``."""
        return self.get_data_frames(since={
            'year': year,
            'season': season
        }, to={
            'year': year,
            'season': season
        }, source_policy=source_policy)

    def _get_data_dict(self, fields, year, season):
        """Load (fetching if needed) one quarter and pick out ``fields``.

        For each table row, the first entry of ``fields`` contained in the
        row's first cell takes the integer in the second cell.

        Returns:
            A dict of the fields that were found, or ``None`` when the raw
            data is unavailable or cannot be parsed.
        """
        data_dict = {}
        try:
            raw_data = self.__repository.get_data(str(self._stock_id), {
                'year': year,
                'season': season
            })
            if raw_data is None:
                fetch_cash_flow_raw_data(self._stock_id, year, season)
                raw_data = self.__repository.get_data(str(self._stock_id), {
                    'year': year,
                    'season': season
                })
            if raw_data is None:
                # Nothing to parse; same result as before without going
                # through an exception raised on a None document.
                return None

            bs = BeautifulSoup(raw_data, 'html.parser')
            table = bs.find_all('table', attrs={
                "class": "hasBorder",
                "align": "center"
            })
            for row in table[0].find_all('tr'):
                r = [x.get_text() for x in row.find_all('td')]
                if len(r) == 0:
                    continue
                for field in fields:
                    if field in r[0]:
                        data_dict[field] = int(r[1].replace(',', ''))
                        break
        except Exception as inst:
            print("get exception", inst)
            traceback.print_tb(inst.__traceback__)
            return None
        return data_dict
import pandas as pd
from evaluation_utils import get_stock_list
from rdss.balance_sheet import SimpleBalanceSheetProcessor
from rdss.cashflow_statment import CashFlowStatementProcessor
from rdss.dividend_policy2 import DividendPolicyProcessor2
from rdss.shareholder_equity import ShareholderEquityProcessor
from rdss.statement_fetchers import SimpleIncomeStatementProcessor
from rdss.stock_count import StockCountProcessor
from repository.mongodb_repository import MongoDBRepository, MongoDBMeta
from stock_data import store_df, read_dfs
from twse_crawler import gen_output_path
from utils import get_time_lines
from value_measurement import PriceMeasurementProcessor2

# One repository per stored DataFrame collection; created once at import time.
_cash_flow_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_CASH_FLOW)
_profit_statement_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT)
_balance_sheet_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_BALANCE_SHEET)
_dividend_policy_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY)
_performance_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_PERFORMANCE)


# NOTE(review): `enum` is not among the imports visible here; confirm it is
# imported elsewhere in the file.
class Option(enum.IntEnum):
    # Bit flags selecting which statements to sync. The values are 1, 4, 8
    # and 16 (bit 1 << 1 is not used); ALL is the bitwise OR of the four.
    BALANCE_SHEET = 1
    PROFIT_STATEMENT = 1 << 2
    CASH_FLOW_STATEMENT = 1 << 3
    DIVIDEND_POLICY = 1 << 4
    ALL = BALANCE_SHEET | PROFIT_STATEMENT | CASH_FLOW_STATEMENT | DIVIDEND_POLICY


# The body of this function lies outside the visible part of the file.
def sync_statements(stock_codes, times_to_retry=10, break_after_retry=True, option=Option.ALL, isSync=True):
# Output locations for raw data, one per data kind.
# NOTE(review): the dividend_policy and tpex_price_measurement paths have no
# trailing slash while the others do -- confirm how callers join them.
PATH_DIR_RAW_DATA_SHAREHOLDER_EQUITY = "out/raw_datas/shareholder_equity/"
PATH_DIR_RAW_DATA_DIVIDEND_POLICY = "out/raw_datas/dividend_policy"
PATH_DIR_RAW_DATA_STOCK_COUNT = "out/raw_datas/stock_count/"
PATH_DIR_RAW_DATA_CASH_FLOW = "out/raw_datas/cash_flow/"
PATH_DIR_RAW_DATA_PRICE_MEASUREMENT = "out/raw_datas/price_measurement/"
PATH_DIR_RAW_DATA_TPEX_PRICE_MEASUREMENT = "out/raw_datas/tpex_price_measurement"

# One DataFetcher per MOPS ajax endpoint.
__balance_sheet_data_fetcher = DataFetcher('https://mops.twse.com.tw/mops/web/ajax_t164sb03')
__simple_balance_sheet_data_fetcher = DataFetcher('https://mops.twse.com.tw/mops/web/ajax_t163sb01')
__shareholder_equity_fetcher = DataFetcher('https://mops.twse.com.tw/mops/web/ajax_t164sb06')
__dividend_policy_fetcher = DataFetcher('https://mops.twse.com.tw/mops/web/ajax_t05st09_2')
__stock_count_fetcher = DataFetcher('https://mops.twse.com.tw/mops/web/ajax_t16sn02')
__cash_flow_fetcher = DataFetcher('https://mops.twse.com.tw/mops/web/ajax_t164sb05')

# One repository per raw-data collection; created once at import time.
__stock_count_repository = MongoDBRepository(MongoDBMeta.STOCK_COUNT)
__twse_price_measurement_repository = MongoDBRepository(MongoDBMeta.TWSE_PRICE_MEASUREMENT)
__tpex_price_measurement_repository = MongoDBRepository(MongoDBMeta.TPEX_PRICE_MEASUREMENT)
__dividend_policy_repository = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY)
__shareholder_repository = MongoDBRepository(MongoDBMeta.SHARE_HOLDER)
__simple_balance_sheet_repository = MongoDBRepository(MongoDBMeta.SIMPLE_BALANCE_SHEET)
__full_balance_sheet_repository = MongoDBRepository(MongoDBMeta.FULL_BALANCE_SHEET)
__cash_flow_repository = MongoDBRepository(MongoDBMeta.CASH_FLOW)

__logger = logging.getLogger("twse.DataFetcher")

# The MongoDB host is hard-coded to a LAN address; the localhost variant is
# kept below for switching by hand.
# mongo_client = MongoClient('localhost', 27017)
mongo_client = MongoClient('192.168.1.109', 27017)
DB_TWSE = "TWSE"
TABLE_TWSE_PRICE_MEASUREMENT = "twse_price_measurement"
TABLE_TPEX_PRICE_MEASUREMENT = "tpex_price_measurement"
def __init__(self):
    """Bind the TPEX raw-data input and the shared DataFrame output repository."""
    in_repository = MongoDBRepository(MongoDBMeta.TPEX_PRICE_MEASUREMENT)
    out_repository = _data_frame_repository
    self.__in_repository = in_repository
    self.__out_repository = out_repository
class SimpleIncomeStatementProcessor:
    """Derive per-quarter EPS and net profit from cumulative statements.

    Raw pages are read from the ``SIMPLE_BALANCE_SHEET`` repository and
    fetched via ``fetch_simple_balance_sheet_raw_data`` when missing.
    """

    def __init__(self):
        self.__repository = MongoDBRepository(MongoDBMeta.SIMPLE_BALANCE_SHEET)

    def get_data_frames(self, stock_id, since, to=None, source_policy=Source.CACHE_ONLY):
        """Return one row per quarter between ``since`` and ``to``.

        Within a year each quarter's values are reduced by the previous
        available quarter's values; the baseline is dropped after season 4.

        Args:
            stock_id: Stock identifier.
            since: Mapping with 'year' and 'season' of the first quarter.
            to: Mapping with 'year' and 'season' of the last quarter, passed
                through to ``get_time_lines``.
            source_policy: Accepted for interface compatibility; not used.

        Returns:
            The concatenated DataFrame, or ``None`` when the range is empty
            or no quarter could be read.
        """
        time_lines = get_time_lines(since=since, to=to)
        if len(time_lines) == 0:
            # Nothing requested; previously this raised IndexError below.
            return None

        year = time_lines[0].get('year')
        season = time_lines[0].get('season')
        # A range starting after Q1 needs the preceding quarter as baseline.
        last_result = self._get_data_dict(stock_id, year, season - 1) if season > 1 else None

        dfs = []
        for time_line in time_lines:
            data_dict = self._get_data_dict(stock_id, time_line.get('year'), time_line.get('season'))
            if data_dict is None:
                continue
            if last_result is not None:
                result = {k: (v - last_result[k]) for (k, v) in data_dict.items()}
            else:
                result = data_dict
            print('result = ', result, ' last_result', last_result)
            last_result = None if time_line.get('season') == 4 else data_dict

            str_period = "{}Q{}".format(time_line.get('year'), time_line.get('season'))
            # PeriodIndex(start=..., end=...) no longer exists in pandas >= 1.0.
            period_index = pd.period_range(start=str_period, end=str_period, freq='Q')
            dfs.append(
                pd.DataFrame([list(result.values())],
                             columns=list(result.keys()),
                             index=period_index))
        return pd.concat(dfs) if len(dfs) > 0 else None

    def get_data_frame(self, stock_id, year, season, source_policy=Source.CACHE_ONLY):
        """Return the frame for the single quarter ``year``/``season``."""
        return self.get_data_frames(stock_id=stock_id,
                                    since={
                                        'year': year,
                                        'season': season
                                    },
                                    to={
                                        'year': year,
                                        'season': season
                                    },
                                    source_policy=source_policy)

    def _get_data_dict(self, stock_id, year, season):
        """Load (fetching if needed) one quarter and extract EPS and net profit.

        Returns:
            A dict that may contain 'EPS' (float) and '稅後淨利' (int), or
            ``None`` when the raw data is unavailable or cannot be parsed.
        """
        try:
            dict_datas = {}
            raw_data = self.__repository.get_data(stock_id, {
                'year': year,
                'season': season
            })
            if raw_data is None:
                fetch_simple_balance_sheet_raw_data(stock_id, year, season)
                raw_data = self.__repository.get_data(stock_id, {
                    'year': year,
                    'season': season
                })
            if raw_data is None:
                return None

            bs = BeautifulSoup(raw_data, 'html.parser')
            print(' get ', bs.text)
            tables = bs.find_all('table', attrs={
                "class": "hasBorder",
                "align": "center",
                "width": "70%"
            })
            # The values are read from the third matching table.
            table = tables[2]
            for row in table.find_all('tr'):
                r = [x.get_text() for x in row.find_all('td')]
                if len(r) < 2:
                    # Rows without a label and a value cell carry no data.
                    # Indexing them used to raise and discard the statement.
                    continue
                if '每股盈餘' in r[0]:
                    dict_datas['EPS'] = float(r[1])
                if '本期綜合損益總額' in r[0]:
                    dict_datas['稅後淨利'] = int(r[1].replace(',', ''))
            return dict_datas
        except Exception as inst:
            print("get exception", inst, " when get data in year ", year,
                  ' and season ', season)
            traceback.print_tb(inst.__traceback__)
            return None
def __init__(self):
    """Bind the repository for the SIMPLE_BALANCE_SHEET collection."""
    repository = MongoDBRepository(MongoDBMeta.SIMPLE_BALANCE_SHEET)
    self.__repository = repository
def __init__(self):
    """Bind the repository for the STOCK_COUNT collection."""
    repository = MongoDBRepository(MongoDBMeta.STOCK_COUNT)
    self.__repository = repository
def __init__(self):
    """Initialise the base with no stock id, then create fetcher and repository."""
    super().__init__(None)
    fetcher = _DividendPolicyFetcher2()
    repository = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY)
    self.dividend_policy_fetcher = fetcher
    self.__repository = repository
print(revamp_list) return revamp_list def __data_frame_in_transform(content): data_frame = pd.read_json(content, orient='split', typ='frame') print(data_frame.index.values) index_dict = {item: pd.Period(value=str(item)) for item in data_frame.index.values} new_data_frame = data_frame.rename(index_dict) return new_data_frame __data_frame_repository_transformer = Transformer(in_transform=yearly_period_data_frame_in_transform, out_transform=default_data_frame_out_transform) _data_frame_repository = MongoDBRepository(MongoDBMeta.DATAFRAME_PRICE_MEASUREMENT, transformer=__data_frame_repository_transformer) class PriceMeasurementProcessor2: def __init__(self): self.__twsePriceTransformer = TWSEPriceMeasurementTransformer() self.__tpexPriceTransformer = TPEXPriceMeasurementTransformer() from evaluation_utils import get_stock_codes self.list_twse = get_stock_codes(stock_type='上市') self.list_tpex = get_stock_codes(stock_type='上櫃') def get_data_frame(self, stock_id): data_frame = _data_frame_repository.get_data(stock_id) current_years = None if data_frame is not None: current_years = list(map(lambda year_index: int(year_index.year), data_frame.index.values)) current_years.sort()