def move_quarterly_raw_datas_to_db(raw_data_path, table_name):
    """Upsert quarterly raw data for every known stock into MongoDB.

    Stock codes come from ``get_stock_codes`` for the '上市' and '上櫃' stock
    types; periods come from ``get_time_lines(since={'year': 2013})``. For each
    (stock, period) pair the raw data is read with ``get_raw_data`` and, when
    it is not ``None``, stored under the ``content`` field of the document
    matching that ``stock_id`` and ``time_line``.

    Args:
        raw_data_path: Path prefix; ``<year>Q<season>`` is appended to it.
        table_name: Name of the target collection inside ``DB_TWSE``.
    """
    stock_codes = get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃')
    periods = get_time_lines(since={'year': 2013})
    database = mongo_client[DB_TWSE]

    for stock_id in stock_codes:
        for time_line in periods:
            print('put ', stock_id, ' time_line = ', time_line)
            quarter_path = (
                raw_data_path + str(time_line['year']) + "Q" + str(time_line.get('season'))
            )
            content = get_raw_data(quarter_path, str(stock_id))
            if content is None:
                # Nothing on disk for this stock/quarter.
                continue

            selector = {
                '$and': [{'stock_id': str(stock_id)}, {'time_line': time_line}]
            }
            database[table_name].find_one_and_update(
                selector,
                {'$set': {"content": content}},
                upsert=True,
            )
def move_stock_count_raw_datas_to_db():
    """Upsert yearly stock-count raw data for every known stock into MongoDB.

    Stock codes come from ``get_stock_codes`` for the '上市' and '上櫃' stock
    types; periods come from ``get_time_lines`` with ``since`` year 2013,
    ``to`` year 2020 and ``Offset.YEAR``. Raw data that is not ``None`` is
    stored under ``content`` in ``TABLE_STOCK_COUNT``, matched by ``stock_id``
    and ``time_line``.
    """
    stock_codes = get_stock_codes(stock_type='上市') + get_stock_codes(stock_type='上櫃')
    yearly_periods = get_time_lines(
        since={'year': 2013}, to={'year': 2020}, offset=Offset.YEAR)
    database = mongo_client[DB_TWSE]

    for stock_id in stock_codes:
        for time_line in yearly_periods:
            year_path = PATH_DIR_RAW_DATA_STOCK_COUNT + str(time_line['year'])
            content = get_raw_data(year_path, str(stock_id))
            if content is None:
                # Nothing on disk for this stock/year.
                continue

            selector = {
                '$and': [{'stock_id': str(stock_id)}, {'time_line': time_line}]
            }
            database[TABLE_STOCK_COUNT].find_one_and_update(
                selector,
                {'$set': {"content": content}},
                upsert=True,
            )
def _sync_balance_sheet(stock_id, start_year, to_year=None, df_balance_sheet=None):
    """Build, or top up, the combined balance-sheet / shareholder-equity frame.

    Args:
        stock_id: Stock identifier handed to both processors.
        start_year: First year to synchronise.
        to_year: Optional last year; ``None`` leaves the upper bound to
            ``get_time_lines`` / the processors.
        df_balance_sheet: Previously built frame, or ``None`` to build one
            from scratch.

    Returns:
        When ``df_balance_sheet`` is ``None``: the outer join of both
        processors' frames (queried from ``start_year - 1``) with rows lacking
        '每股淨值' removed, or ``None`` if either processor returned ``None``.
        Otherwise: ``df_balance_sheet`` itself, or a new sorted frame that
        additionally contains the periods that were missing from it.
    """
    to = {'year': to_year} if to_year is not None else None
    balance_sheet_processor = SimpleBalanceSheetProcessor(stock_id)
    shareholder_equity_processor = ShareholderEquityProcessor(stock_id)

    if df_balance_sheet is None:
        df_balance_statement = balance_sheet_processor.get_data_frames(
            {'year': start_year - 1}, to=to)
        df_shareholder_equity = shareholder_equity_processor.get_data_frames(
            {'year': start_year - 1}, to=to)
        if df_balance_statement is None or df_shareholder_equity is None:
            return None
        df_combine = df_balance_statement.join(df_shareholder_equity, how='outer')
        # Drop the periods for which '每股淨值' is missing.
        indexes = df_combine[df_combine['每股淨值'].isna()].index
        df_combine.drop(indexes, inplace=True)
        print('合併 = ', df_combine)
        return df_combine

    # The frame is keyed by period on its row index. DataFrame.get() looks up
    # *columns*, so it can never find a period label; compare against the
    # stringified index instead (same check as _sync_cash_flow_statement).
    known_periods = set(df_balance_sheet.index.map(str))

    dfs_get = []
    for time_line in get_time_lines(since={'year': start_year}, to=to):
        year, season = time_line['year'], time_line['season']
        if "{}Q{}".format(year, season) in known_periods:
            continue
        df_balance = balance_sheet_processor.get_data_frame(year, season)
        df_shareholder = shareholder_equity_processor.get_data_frame(year, season)
        if df_balance is None or df_shareholder is None:
            # Both halves are required to form a combined row.
            continue
        dfs_get.append(df_balance.join(df_shareholder, how='outer'))

    if dfs_get:
        dfs_get.append(df_balance_sheet)
        df_balance_sheet = pd.concat(dfs_get, sort=False).sort_index()
    return df_balance_sheet
def fetch_simple_balance_sheet_raw_datas(stock_ids, time_lines=None):
    """Fetch simple balance-sheet pages and hand them to the repository.

    Args:
        stock_ids: Stock identifiers to fetch.
        time_lines: Periods to fetch. Defaults to
            ``get_time_lines(since={'year': 2013})``, evaluated on each call
            rather than once at import time, so the default is never stale
            and is not a list shared between calls.
    """
    if time_lines is None:
        time_lines = get_time_lines(since={'year': 2013})

    def fetcher(stock_id, year, season):
        """Return the page content, or ``None`` when the page reports no data."""
        result = __simple_balance_sheet_data_fetcher.fetch(
            {"encodeURIComponent": 1, "step": 1, "firstin": 1, "off": 1,
             "queryName": "co_id", "inpuType": "co_id", "TYPEK": "all",
             "isnew": "false", "co_id": stock_id,
             # presumably the ROC calendar year expected by the site -- TODO confirm
             "year": year - 1911, "season": season})
        fonts = BeautifulSoup(result.content, 'html.parser').find_all('font')
        if any(font.get_text() == "查詢無資料" for font in fonts):
            return None
        return result.content

    __fetch_datas_and_store2(stock_ids, time_lines, fetcher, __simple_balance_sheet_repository)
def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
    """Concatenate the per-period frames between ``since`` and ``to``.

    Args:
        since: Start bound passed to ``get_time_lines``.
        to: Optional end bound passed to ``get_time_lines``.
        source_policy: Forwarded unchanged to ``self.get_data_frame``.

    Returns:
        ``pd.concat(..., sort=True)`` of every non-``None`` frame returned by
        ``self.get_data_frame``, or ``None`` when there is none.
    """
    fetched = [
        self.get_data_frame(period.get('year'), period.get('season'), source_policy)
        for period in get_time_lines(since=since, to=to)
    ]
    available = [frame for frame in fetched if frame is not None]
    if not available:
        return None
    return pd.concat(available, sort=True)
def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
    """Build one cash-flow row per period between ``since`` and ``to``.

    Periods are processed in the reverse of the order returned by
    ``get_time_lines``. For a season greater than 1 the previous season's
    values are subtracted from the current ones for every key in
    ``self._fetch_fields``; the previous season's dict is then reused as the
    data of the next iteration so it is not fetched twice. Two derived
    columns, '業主盈餘現金流' and '自由現金流', are added to every row.

    Args:
        since: Start bound passed to ``get_time_lines``.
        to: Optional end bound passed to ``get_time_lines``.
        source_policy: Accepted for interface compatibility; not used here.

    Returns:
        The concatenated frame (rows in processing order), or ``None`` when
        no period produced data.
    """
    time_lines = get_time_lines(since=since, to=to)

    dfs = []
    cache_data_dict = None
    # Iterate backwards without mutating the list returned by get_time_lines.
    for time_line in reversed(time_lines):
        print('In ', time_line)
        year = time_line.get('year')
        season = time_line.get('season')

        if cache_data_dict is None:
            data_dict = self._get_data_dict(self._fetch_fields, year, season)
        else:
            # Already fetched as "previous season" during the last iteration.
            data_dict = cache_data_dict
        if data_dict is None:
            continue

        if season > 1:
            cache_data_dict = self._get_data_dict(self._fetch_fields, year, season - 1)
            if cache_data_dict is None:
                # No previous season to subtract; the row keeps the fetched values.
                print('get None value in year ', year, ' season ', season,
                      " data_dict = ", data_dict,
                      " cache_data_dic = ", cache_data_dict)
            else:
                for key in self._fetch_fields:
                    data_dict[key] = data_dict.get(key, 0) - cache_data_dict.get(key, 0)
        else:
            cache_data_dict = None

        data_dict['業主盈餘現金流'] = (
            data_dict.get('營業活動之淨現金流入', 0)
            + data_dict.get('取得不動產、廠房及設備', 0)
            + data_dict.get('其他投資活動', 0)
        )
        data_dict['自由現金流'] = (
            data_dict.get('營業活動之淨現金流入', 0)
            + data_dict.get('投資活動之淨現金流入', 0)
        )
        print(data_dict)

        str_period = "{}Q{}".format(year, season)
        # PeriodIndex(start=..., end=...) was removed in pandas 1.0;
        # period_range is the supported way to build the same one-element index.
        period_index = pd.period_range(start=str_period, end=str_period, freq='Q')
        dfs.append(
            pd.DataFrame([list(data_dict.values())],
                         columns=list(data_dict.keys()),
                         index=period_index))

    return None if len(dfs) == 0 else pd.concat(dfs, sort=False)
def fetch_stock_count_raw_datas(stock_ids, since_year=None, to_year=None):
    """Fetch yearly stock-count pages and hand them to the repository.

    Args:
        stock_ids: Stock identifiers to fetch.
        since_year: First year to fetch; defaults to the current year.
        to_year: Last year to fetch; defaults to the current year.

    The year defaults are resolved on each call. Evaluating
    ``datetime.now().year`` in the signature would freeze the value at import
    time and go stale in a process that runs across a year boundary.
    """
    if since_year is None or to_year is None:
        current_year = datetime.now().year
        if since_year is None:
            since_year = current_year
        if to_year is None:
            to_year = current_year

    time_lines = get_time_lines(
        since={'year': since_year}, to={'year': to_year}, offset=Offset.YEAR)

    def fetcher(stock_id, year):
        """Return the raw page content for one stock and year."""
        result = __stock_count_fetcher.fetch(
            {'encodeURIComponent': 1, 'step': 1, 'firstin': 1, 'off': 1,
             'queryName': 'co_id', 't05st29_c_ifrs': 'N', 't05st30_c_ifrs': 'N',
             'inpuType': 'co_id', 'TYPEK': 'all', 'isnew': 'false',
             'co_id': stock_id,
             # presumably the ROC calendar year expected by the site -- TODO confirm
             'year': (year - 1911)}
        )
        return result.content

    __fetch_datas_and_store2(stock_ids, time_lines, fetcher, __stock_count_repository)
def get_predict_roe_by_relative(stock_id):
    """Estimate this year's ROE by scaling last year's ROE.

    The latest period of the current year for which ``_get_for_times``
    returns a value is located first. The ROE up to that period is compared
    with the ROE over the same seasons of the previous year, and the ratio is
    applied to the previous year's full-year ROE.

    Returns:
        ``roe_last_year * (roe_current / roe_last_year_relative)``, or
        ``None`` when no period is available or any of the three inputs is
        ``None``.
    """
    this_year = datetime.now().year
    previous_year = this_year - 1
    time_lines = get_time_lines(since={'year': this_year, 'season': 1})

    # Walk backwards to find the most recent period that has data.
    last_time_available = None
    for candidate in reversed(time_lines):
        if _get_for_times(stock_id, [candidate]) is not None:
            last_time_available = candidate
            break
    print('last_time_available = ', last_time_available)
    if last_time_available is None:
        return None

    cutoff = time_lines.index(last_time_available) + 1
    roe_current = _get_for_times(stock_id, time_lines[0:cutoff])

    comparable_time_lines = get_time_lines(
        since={'year': previous_year, 'season': 1},
        to={'year': previous_year, 'season': last_time_available.get('season')})
    roe_last_year_relative = _get_for_times(stock_id, comparable_time_lines)

    roe_last_year = get_roe_in_year(stock_id, previous_year)
    print('roe_current = ', roe_current, ' roe_last_year_relative = ', roe_last_year_relative,
          ' roe_last_year = ', roe_last_year)

    if roe_current is None:
        return None
    if roe_last_year_relative is None:
        return None
    if roe_last_year is None:
        return None

    roe_relative = roe_last_year * (roe_current / roe_last_year_relative)
    print('roe_relative = ', roe_relative)
    return roe_relative
def fetch_balance_sheet_raw_datas(stock_ids, time_lines=None):
    """Fetch full balance-sheet pages and hand them to the repository.

    Args:
        stock_ids: Stock identifiers to fetch.
        time_lines: Periods to fetch. Defaults to
            ``get_time_lines(since={'year': 2013})``, evaluated on each call
            rather than once at import time, so the default is never stale
            and is not a list shared between calls.
    """
    if time_lines is None:
        time_lines = get_time_lines(since={'year': 2013})

    def fetcher(stock_id, year, season):
        """Return the page content, or ``None`` when the page reports no data."""
        result = __balance_sheet_data_fetcher.fetch(
            {"encodeURIComponent": 1, "step": 1, "firstin": 1, "off": 1,
             "queryName": "co_id", "inpuType": "co_id", "TYPEK": "all",
             "isnew": "false", "co_id": stock_id,
             # presumably the ROC calendar year expected by the site -- TODO confirm
             "year": year - 1911, "season": season})

        inputs = BeautifulSoup(result.content, 'html.parser').find_all('input')
        # A button on the first page triggers the step-2 request below.
        # .get() avoids a KeyError on <input> tags that carry no type attribute.
        need_to_get_next = any(field.get('type') == 'button' for field in inputs)
        if need_to_get_next:
            result = __balance_sheet_data_fetcher.fetch(
                {"encodeURIComponent": 1, "step": 2, "firstin": 1, "TYPEK": "sii",
                 "co_id": stock_id, "year": year - 1911, "season": season})

        fonts = BeautifulSoup(result.content, 'html.parser').find_all('font')
        if any(font.get_text() == "查無所需資料!" for font in fonts):
            return None
        return result.content

    __fetch_datas_and_store2(stock_ids, time_lines, fetcher, __full_balance_sheet_repository)
def fetch_cash_flow_raw_datas(stock_ids, time_lines=None):
    """Fetch cash-flow statement pages and hand them to the repository.

    Args:
        stock_ids: Stock identifiers to fetch.
        time_lines: Periods to fetch. Defaults to
            ``get_time_lines(since={'year': 2013})``, evaluated on each call
            rather than once at import time, so the default is never stale
            and is not a list shared between calls.
    """
    if time_lines is None:
        time_lines = get_time_lines(since={'year': 2013})

    no_data_messages = ("查詢無資料", '查無所需資料!')

    def fetcher(stock_id, year, season):
        """Return the page content, or ``None`` when the page reports no data."""
        result = __cash_flow_fetcher.fetch(
            {'encodeURIComponent': 1, 'step': 1, 'firstin': 1, 'off': 1,
             'queryName': 'co_id', 'inpuType': 'co_id', 'TYPEK': 'all',
             'isnew': 'false', 'co_id': stock_id,
             # presumably the ROC calendar year expected by the site -- TODO confirm
             'year': year - 1911, 'season': season}
        )

        inputs_tag = BeautifulSoup(result.content, 'html.parser').find_all('input')
        # A button on the first page triggers the step-2 request below.
        # .get() avoids a KeyError on <input> tags that carry no type attribute.
        need_to_get_next = any(field.get('type') == 'button' for field in inputs_tag)
        if need_to_get_next:
            result = __cash_flow_fetcher.fetch(
                {"encodeURIComponent": 1, "step": 2, "firstin": 1, "TYPEK": "sii",
                 "co_id": stock_id, "year": year - 1911, "season": season}
            )

        fonts = BeautifulSoup(result.content, 'html.parser').find_all('font')
        if any(font.get_text() in no_data_messages for font in fonts):
            return None
        return result.content

    __fetch_datas_and_store2(stock_ids, time_lines, fetcher, __cash_flow_repository)
def fetch_shareholder_equity_raw_datas(stock_ids, time_lines=None):
    """Fetch shareholder-equity statement pages and hand them to the repository.

    Args:
        stock_ids: Stock identifiers to fetch.
        time_lines: Periods to fetch. Defaults to
            ``get_time_lines(since={'year': 2013})``, evaluated on each call
            rather than once at import time, so the default is never stale
            and is not a list shared between calls.
    """
    if time_lines is None:
        time_lines = get_time_lines(since={'year': 2013})

    def has_data(content):
        """Tell whether the page lacks the "查無資料!" marker."""
        fonts = BeautifulSoup(content, 'html.parser').find_all('font')
        return not any(font.get_text() == "查無資料!" for font in fonts)

    def fetcher(stock_id, year, season):
        """Return the page content, or ``None`` when the page reports no data."""
        result = __shareholder_equity_fetcher.fetch(
            {'encodeURIComponent': 1, 'step': 1, 'firstin': 1, 'off': 1,
             'queryName': 'co_id', 'inpuType': 'co_id', 'TYPEK': 'all',
             'isnew': 'false', 'co_id': stock_id,
             # presumably the ROC calendar year expected by the site -- TODO confirm
             'year': year - 1911, 'season': season})

        has_result = has_data(result.content)
        has_button = len(BeautifulSoup(result.content, 'html.parser').find_all('input')) > 0
        if has_result and has_button:
            # Any <input> on the first page triggers the step-2 request.
            result = __shareholder_equity_fetcher.fetch(
                {'encodeURIComponent': 1, 'TYPEK': 'sii', 'step': 2,
                 'year': year - 1911, 'season': season, 'co_id': stock_id,
                 'firstin': 1})
            has_result = has_data(result.content)

        print('has result = ', has_result)
        return result.content if has_result else None

    __fetch_datas_and_store2(stock_ids, time_lines, fetcher, __shareholder_repository)
def _sync_profit_statement(stock_id, start_year, to_year=None, df_profit_statement=None):
    """Build, or top up, the income-statement frame for ``stock_id``.

    Args:
        stock_id: Stock identifier handed to the processor.
        start_year: First year to synchronise.
        to_year: Optional last year; ``None`` leaves the upper bound to
            ``get_time_lines`` / the processor.
        df_profit_statement: Previously built frame, or ``None`` to build one
            from scratch (queried from ``start_year - 1``).

    Returns:
        The processor's frame when starting from scratch (may be ``None``).
        Otherwise ``df_profit_statement`` itself, or a new sorted frame that
        additionally contains the periods that were missing from it.
    """
    to = {'year': to_year} if to_year is not None else None
    income_statement_processor = SimpleIncomeStatementProcessor()

    if df_profit_statement is None:
        return income_statement_processor.get_data_frames(
            stock_id, {'year': start_year - 1}, to)

    # The frame is keyed by period on its row index. DataFrame.get() looks up
    # *columns*, so it can never find a period label; compare against the
    # stringified index instead (same check as _sync_cash_flow_statement).
    known_periods = set(df_profit_statement.index.map(str))

    dfs_get = []
    for time_line in get_time_lines(since={'year': start_year}, to=to):
        year, season = time_line['year'], time_line['season']
        if "{}Q{}".format(year, season) in known_periods:
            continue
        df_statement = income_statement_processor.get_data_frame(stock_id, year, season)
        if df_statement is not None:
            dfs_get.append(df_statement)

    if dfs_get:
        dfs_get.append(df_profit_statement)
        df_profit_statement = pd.concat(dfs_get, sort=False).sort_index()
    return df_profit_statement
def _sync_cash_flow_statement(stock_id, start_year, to_year=None, df_cash_flow_statement=None):
    """Build, or top up, the cash-flow statement frame for ``stock_id``.

    Without an existing frame, the processor is queried from
    ``start_year - 1`` and its result returned as is. With one, every period
    from ``start_year`` whose "<year>Q<season>" label is absent from the
    frame's stringified index is fetched and merged in.

    Returns:
        The resulting frame; the input frame itself when nothing was added.
    """
    processor = CashFlowStatementProcessor(stock_id)
    to = None if to_year is None else {'year': to_year}

    if df_cash_flow_statement is None:
        return processor.get_data_frames({'year': start_year - 1}, to=to)

    periods = get_time_lines(since={'year': start_year}, to=to)
    existing_labels = df_cash_flow_statement.index.map(str).values.tolist()

    fetched = []
    for period in periods:
        label = "{}Q{}".format(period['year'], period['season'])
        if label in existing_labels:
            continue
        frame = processor.get_data_frame(period['year'], period['season'])
        if frame is not None:
            fetched.append(frame)

    if len(fetched) == 0:
        return df_cash_flow_statement

    fetched.append(df_cash_flow_statement)
    return pd.concat(fetched, sort=False).sort_index()
def get_data_frames(self, stock_id, since, to=None, source_policy=Source.CACHE_ONLY):
    """Build one income-statement row per period between ``since`` and ``to``.

    Each row holds the values of that period minus the values of the
    previously processed period of the same year, so the differencing
    baseline is cleared after season 4. When the range starts after season 1
    the preceding season is fetched once to seed the baseline.

    Args:
        stock_id: Stock identifier passed to ``self._get_data_dict``.
        since: Start bound passed to ``get_time_lines``.
        to: Optional end bound passed to ``get_time_lines``.
        source_policy: Accepted for interface compatibility; not used here.

    Returns:
        The concatenated frame, or ``None`` when there is no period or no
        period produced data.
    """
    time_lines = get_time_lines(since=since, to=to)
    if len(time_lines) == 0:
        # Nothing to process; indexing time_lines[0] below would fail.
        return None

    first_year = time_lines[0].get('year')
    first_season = time_lines[0].get('season')
    last_result = (
        self._get_data_dict(stock_id, first_year, first_season - 1)
        if first_season > 1 else None
    )

    dfs = []
    for time_line in time_lines:
        year = time_line.get('year')
        season = time_line.get('season')
        data_dict = self._get_data_dict(stock_id, year, season)
        if data_dict is None:
            if season == 4:
                # A missing season 4 must not leave this year's baseline in
                # place for next year's season 1.
                last_result = None
            continue

        if last_result is not None:
            result = {k: (v - last_result[k]) for (k, v) in data_dict.items()}
        else:
            result = data_dict
        print('result = ', result, ' last_result', last_result)

        last_result = None if season == 4 else data_dict

        str_period = "{}Q{}".format(year, season)
        # PeriodIndex(start=..., end=...) was removed in pandas 1.0;
        # period_range is the supported way to build the same one-element index.
        period_index = pd.period_range(start=str_period, end=str_period, freq='Q')
        dfs.append(
            pd.DataFrame([list(result.values())],
                         columns=list(result.keys()),
                         index=period_index))

    return pd.concat(dfs) if len(dfs) > 0 else None
def get_data_frames(self, since, to=None, source_policy=Source.CACHE_ONLY):
    """Build one shareholder-equity row per period between ``since`` and ``to``.

    Columns are the product of ``self.fields_to_get`` and
    ``self.items_to_get``. For every period after the first available one,
    each entry's '期初餘額' is overwritten with the '期末餘額' of the
    previously processed period. When the range starts after season 1, the
    preceding season is fetched once to provide that value for the first row.

    Args:
        since: Start bound passed to ``get_time_lines``.
        to: Optional end bound passed to ``get_time_lines``.
        source_policy: Accepted for interface compatibility; not used here.

    Returns:
        The concatenated frame, or ``None`` when no period produced data.
    """
    time_lines = get_time_lines(since=since, to=to)
    dfs = []
    column_index = pd.MultiIndex.from_product(
        [self.fields_to_get, self.items_to_get], names=['first', 'second'])
    print(column_index)

    # Seed with the season *before* the first requested one. Seeding with the
    # first season itself would set its '期初餘額' to its own '期末餘額'.
    last_result = None
    if len(time_lines) > 0 and time_lines[0].get('season') > 1:
        last_result = self._get_data_dict(
            time_lines[0].get('year'), time_lines[0].get('season') - 1)

    for time_line in time_lines:
        year = time_line.get('year')
        season = time_line.get('season')
        result = self._get_data_dict(year, season)
        if result is None:
            continue

        if last_result is not None:
            for key in result.keys():
                result[key]['期初餘額'] = last_result[key]['期末餘額']
        last_result = result
        print(result)

        str_period = "{}Q{}".format(year, season)
        # PeriodIndex(start=..., end=...) was removed in pandas 1.0;
        # period_range is the supported way to build the same one-element index.
        period_index = pd.period_range(start=str_period, end=str_period, freq='Q')

        # Flatten the nested dicts into a single row of values.
        data_list = []
        for inner in result.values():
            data_list.extend(inner.values())
        print(data_list)

        dfs.append(
            pd.DataFrame([data_list], columns=column_index, index=period_index))

    print(self.__tag, "dfs = ", dfs)
    return pd.concat(dfs) if len(dfs) > 0 else None
def get_roe_in_year(stock_id, year):
    """Return the ROE of ``stock_id`` over seasons 1 through 4 of ``year``.

    The value is whatever ``_get_for_times`` yields for the time lines
    spanning that year; it is printed before being returned.
    """
    first_season = {'year': year, 'season': 1}
    last_season = {'year': year, 'season': 4}
    full_year = get_time_lines(since=first_season, to=last_season)
    yearly_roe = _get_for_times(stock_id, time_lines=full_year)
    print("get in year ", year, ":", yearly_roe)
    return yearly_roe