def test_store_data_frames(self):
    """Round-trip every synced statement frame through its MongoDB repository.

    For stock 4564 each statement is synced, written with ``put_data``, read
    back with ``get_data`` and printed.  Only the final dividend-policy pass
    asserts anything: the index must be a ``pd.PeriodIndex`` both before the
    frame is stored and after it is loaded again.
    """
    stock_id = 4564
    first_year = 2013

    def store_and_reload(meta, frame):
        # Persist ``frame`` for ``stock_id`` in the repository selected by
        # ``meta``, read it back, print it and hand the reloaded frame back.
        repository = MongoDBRepository(meta)
        repository.put_data(stock_id, frame)
        reloaded = repository.get_data(stock_id)
        print(reloaded)
        return reloaded

    # Cash-flow statement.
    cash_flow = _sync_cash_flow_statement(stock_id, first_year, to_year=2019)
    restored = store_and_reload(MongoDBMeta.DATAFRAME_CASH_FLOW, cash_flow)
    print(restored.index)

    # Profit statement.
    profit_statement = _sync_profit_statement(stock_id, first_year, to_year=2019)
    restored = store_and_reload(MongoDBMeta.DATAFRAME_PROFIT_STATEMENT, profit_statement)
    print(restored.index)

    # Balance sheet (the columns, not the index, are printed after reload).
    balance_sheet = _sync_balance_sheet(stock_id, first_year, to_year=2020)
    print(balance_sheet)
    restored = store_and_reload(MongoDBMeta.DATAFRAME_BALANCE_SHEET, balance_sheet)
    print(restored.columns)

    # Dividend policy, first pass: print only.
    dividend = _sync_dividend_policy(stock_id, first_year)
    print(dividend)
    print(dividend.index)
    restored = store_and_reload(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY, dividend)
    print(restored.index)

    # Dividend policy, second pass: the index type must survive the round trip.
    dividend = _sync_dividend_policy(stock_id, first_year)
    print(dividend)
    print(type(dividend.index))
    self.assertIsInstance(dividend.index, pd.PeriodIndex)
    restored = store_and_reload(MongoDBMeta.DATAFRAME_DIVIDEND_POLICY, dividend)
    print(restored.index)
    self.assertIsInstance(restored.index, pd.PeriodIndex)
class DividendPolicyProcessor2(StatementProcessor):
    """Fetch, cache and parse a stock's dividend policy into a yearly frame.

    Raw HTML obtained from ``_DividendPolicyFetcher2`` is stored through a
    ``MongoDBRepository`` keyed by stock id.  Parsing turns that HTML into a
    DataFrame with one row per year (``pd.Period``) and the two columns
    ``'現金股利'`` and ``'配股'``.
    """

    # First year of the generated quarterly / yearly time line.
    _FIRST_YEAR = 2013
    # Offset added to / subtracted from year numbers exchanged with the remote
    # site (presumably the ROC/Minguo calendar used by the source pages).
    _ROC_YEAR_OFFSET = 1911

    def __init__(self):
        super().__init__(None)
        self.dividend_policy_fetcher = _DividendPolicyFetcher2()
        self.__repository = MongoDBRepository(MongoDBMeta.DIVIDEND_POLICY)

    def get_data_frame(self, year, season):
        """Not implemented for this processor; always returns ``None``."""
        pass

    def get_data_frames(self, stock_id, start_year=None, to_year=None):
        """Return the yearly dividend frame for ``stock_id``.

        The cached raw data is parsed first.  If it is missing, or holds no
        non-null values for ``to_year``, fresh raw data is fetched for
        ``start_year``..``to_year`` and parsed instead.

        Args:
            stock_id: Identifier passed to the repository and the fetcher.
            start_year: First Gregorian year to fetch; defaults to the current
                year, resolved at call time.
            to_year: Last Gregorian year to fetch and the year the cache is
                checked for; defaults to the current year, resolved at call
                time.

        Returns:
            The parsed DataFrame, or ``None`` when nothing could be parsed.
        """
        # Resolve defaults per call: a ``datetime.now().year`` default in the
        # signature is frozen at import time and goes stale across New Year.
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        cache_df = self._parse_raw_data(stock_id, self.__repository.get_data(stock_id))
        if self._has_data_for_year(cache_df, to_year):
            return cache_df

        latest_raw_data = self._get_raw_data(stock_id, start_year=start_year, to_year=to_year)
        print('latest_raw_data = ', latest_raw_data)
        return self._parse_raw_data(stock_id=stock_id, raw_data=latest_raw_data)

    @staticmethod
    def _has_data_for_year(frame, year):
        """Tell whether ``frame`` has a row for ``year`` with any non-null value."""
        if frame is None:
            return False
        # Select rows by label text instead of ``frame[str(year)]``: string
        # row-slicing through ``DataFrame.__getitem__`` was removed in pandas 2.
        mask = [str(label) == str(year) for label in frame.index]
        year_rows = frame.loc[mask]
        return not year_rows.empty and not year_rows.isnull().values.all()

    def _parse_raw_data(self, stock_id, raw_data):
        """Parse raw dividend HTML into a DataFrame indexed by yearly periods.

        Args:
            stock_id: Unused by the parsing itself; kept for interface
                compatibility.
            raw_data: HTML markup containing the dividend table, or ``None``.

        Returns:
            A DataFrame with one row per year from ``_FIRST_YEAR`` to the
            current year and the columns ``'現金股利'`` / ``'配股'`` (all-NaN
            for years without any record), or ``None`` when ``raw_data`` is
            ``None`` or the table cannot be read.
        """
        if raw_data is None:
            return None
        data_frame = self._read_dividend_table(raw_data)
        if data_frame is None:
            return None
        parse_dict = self._select_records(data_frame.iloc[3:, :])
        return self._to_yearly_frame(parse_dict)

    @staticmethod
    def _read_dividend_table(raw_data):
        """Extract the bordered dividend table from ``raw_data``; ``None`` on failure."""
        # Local import keeps this block self-contained; passing literal HTML
        # straight to ``pd.read_html`` is deprecated in recent pandas.
        from io import StringIO

        try:
            soup = BeautifulSoup(raw_data, 'html.parser')
            table = soup.find('table', attrs={"class": "hasBorder", "width": "99%"})
            data_frame = pd.read_html(StringIO(str(table)))[0]
            print('dividend data_frame = ', data_frame)
        except Exception as e:
            # Best effort: any parsing problem is reported and treated as
            # "no data" so the caller can fall back to fetching.
            print('get', e, ' when get dividend policy')
            return None
        return data_frame

    def _select_records(self, rows):
        """Map each period to ``[cash, stock, version]`` read from ``rows``.

        Columns are addressed by position: 0 meeting progress, 1 period label,
        3 record version, 10 and 13 the two dividend amounts.  When a period
        appears more than once, the first record wins unless a later one is
        marked '股東會確認' and carries a higher version.
        """
        periods = [pd.Period(self.__parse_period(label)) for label in rows.iloc[:, 1].tolist()]
        cash_values = [float(value) for value in rows.iloc[:, 10].tolist()]
        stock_values = [float(value) for value in rows.iloc[:, 13].tolist()]
        versions = [int(value) for value in rows.iloc[:, 3].tolist()]
        progresses = [str(value) for value in rows.iloc[:, 0].tolist()]

        parse_dict = {}
        for period, cash, stock, version, progress in zip(
                periods, cash_values, stock_values, versions, progresses):
            current = parse_dict.get(period)
            if current is None:
                parse_dict[period] = [cash, stock, version]
                continue
            print('duplicate ', period)
            if '股東會確認' in progress and current[2] < version:
                parse_dict[period] = [cash, stock, version]
        return parse_dict

    def _to_yearly_frame(self, parse_dict):
        """Aggregate per-period records into one row per year."""
        dict_dividend = {
            '現金股利': [record[0] for record in parse_dict.values()],
            '配股': [record[1] for record in parse_dict.values()],
        }
        print(dict_dividend)

        years = range(self._FIRST_YEAR, datetime.now().year + 1)
        quarter_periods = [
            pd.Period(str(year) + 'Q' + str(quarter))
            for year in years
            for quarter in range(1, 5)
        ]
        # ``reindex`` already fills quarters without a record with NaN, so no
        # extra element-wise normalisation pass is needed.
        df_dividend = pd.DataFrame(dict_dividend, index=list(parse_dict)).reindex(quarter_periods)
        print('df_dividend = ', df_dividend)

        dic_dividend = {}
        for year in years:
            df_extract = df_dividend.loc[pd.Period(str(year) + 'Q1'):pd.Period(str(year) + 'Q4'), :]
            if df_extract.isnull().values.all():
                # Keep the year as "unknown" rather than a misleading 0.0 sum;
                # a labelled Series preserves the column names even when
                # every year is empty.
                dic_dividend[pd.Period(year)] = pd.Series(float('nan'), index=df_extract.columns)
            else:
                dic_dividend[pd.Period(year)] = df_extract.sum()
        print('df_dividend 2 = ', dic_dividend)
        print("\n")

        df_dividend = pd.DataFrame(dic_dividend).T
        print('df_dividend 3 = ', df_dividend)
        return df_dividend

    def _get_raw_data(self, stock_id, start_year=None, to_year=None):
        """Fetch raw dividend data and store it in the repository.

        Args:
            stock_id: Identifier sent to the fetcher and used as storage key.
            start_year: First Gregorian year; defaults to the current year.
            to_year: Last Gregorian year; defaults to the current year.

        Returns:
            ``result.content`` of the fetch, or ``None`` when ``result.ok`` is
            ``False`` (nothing is stored in that case).
        """
        current_year = datetime.now().year
        if start_year is None:
            start_year = current_year
        if to_year is None:
            to_year = current_year

        result = self.dividend_policy_fetcher.fetch({
            'stock_id': stock_id,
            'start_year': start_year - self._ROC_YEAR_OFFSET,
            'to_year': to_year - self._ROC_YEAR_OFFSET,
        })
        if result.ok is False:
            print('get content fail')
            return None
        print('result content = ', result.content)
        self.__repository.put_data(stock_id, result.content)
        return result.content

    def __parse_period(self, period_string):
        """Convert a period label from the table into a ``'<year>Q<n>'`` string.

        Labels containing '年年度' or '年下半年' map to Q4, '年上半年' maps to
        Q2; anything else is treated as '<year>年第<n>季'.  The year number in
        the label is shifted by ``_ROC_YEAR_OFFSET``.
        """
        offset = self._ROC_YEAR_OFFSET
        suffix_to_quarter = (("年年度", "Q4"), ("年上半年", "Q2"), ("年下半年", "Q4"))
        for suffix, quarter in suffix_to_quarter:
            if suffix in period_string:
                return str(int(period_string.replace(suffix, "")) + offset) + quarter
        period_strings = period_string.replace("季", "").split("年第")
        return str(int(period_strings[0]) + offset) + "Q" + period_strings[1]