def prepare_data(self, begin_date, end_date):
    """
    Load the balance-sheet and income-statement inputs for this factor.

    Two frames are stored on the instance:

    * ``self.bs`` -- ``ticker`` plus invested capital ``IC``; rows with
      missing values are dropped.
    * ``self.inst`` -- the per-ticker output of ``ttmContinues`` for the
      ``return`` column, concatenated and indexed by ``datetime``.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        TTM frame
    """
    # Reach back roughly two years before begin_date.
    shifted_begin_date = shift_date(begin_date, 500)

    # Invested Capital = total assets (121) - current liabilities (101)
    #   + notes payable (68) + short-term borrowings (109)
    #   + long-term liabilities due within one year (0)
    ic_items = [121, 101, 68, 109, 0]
    bs = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker'] + ic_items]
    bs['IC'] = bs[121] - bs[101] + bs[68] + bs[109] + bs[0]
    bs = bs.drop(ic_items, axis=1)
    self.bs = bs.dropna()

    # return = net profit attributable to the parent (40) + finance expense (56)
    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 40, 56]]
    # Keep rows whose finance expense lies outside [-1, 1]; NaN rows fail
    # both comparisons and are therefore removed as well.
    inst = inst[(inst[56] > 1) | (inst[56] < -1)].copy()
    inst['return'] = inst[40] + inst[56]
    inst = inst.drop([40, 56], axis=1)
    inst.dropna(inplace=True)
    # The index is used as both release date and report date.
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index

    returnTTM_ls = []
    for ticker in inst['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            return_df = ttmContinues(inst[inst['ticker'] == ticker], 'return')
            return_df['ticker'] = ticker
        except Exception:
            # Best effort: skip this ticker, but let KeyboardInterrupt and
            # SystemExit propagate (a bare ``except:`` would swallow them).
            continue
        returnTTM_ls.append(return_df)

    self.inst = pd.concat(returnTTM_ls)
    self.inst.set_index('datetime', inplace=True)
def prepare_data(self, begin_date, end_date):
    """
    Load parent-company equity and net profit and build their TTM frames.

    Stores ``self.equity_mean`` (``ttmDiscrete`` over balance-sheet item 86,
    called with a third argument of 5) and ``self.net_profit``
    (``ttmContinues`` over income-statement item 40). Both frames get
    ``report_date`` rendered as ``YYYY-MM-DD`` text and are sorted by
    ``report_date`` then ``datetime``, both descending.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 800)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    shifted_begin_date = shift_date(begin_date, 800)

    bs = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 86]]
    bs['release_date'] = bs.index
    bs['report_date'] = bs.index
    bs['motherEquity'] = bs[86]  # equity attributable to the parent company

    equity_mean = []
    for ticker in bs['ticker'].unique():
        try:
            tmp_equity = ttmDiscrete(bs[bs['ticker'] == ticker], 'motherEquity', 5)
            tmp_equity['ticker'] = ticker
        except Exception:
            # Best effort: skip tickers the helper cannot process, without
            # swallowing KeyboardInterrupt / SystemExit.
            continue
        equity_mean.append(tmp_equity)
    equity_mean = pd.concat(equity_mean)

    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 40]]
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index
    inst['motherNetProfit'] = inst[40]  # net profit attributable to the parent

    net_profit = []
    for ticker in inst['ticker'].unique():
        try:
            tmp_profit = ttmContinues(inst[inst['ticker'] == ticker], 'motherNetProfit')
            tmp_profit['ticker'] = ticker
        except Exception:
            continue
        net_profit.append(tmp_profit)
    net_profit = pd.concat(net_profit)

    # Normalise report_date to text, then order newest first.
    equity_mean['report_date'] = equity_mean['report_date'].apply(
        lambda x: x.strftime("%Y-%m-%d"))
    net_profit['report_date'] = net_profit['report_date'].apply(
        lambda x: x.strftime("%Y-%m-%d"))
    self.equity_mean = equity_mean.sort_values(
        by=['report_date', 'datetime'], ascending=[False, False])
    self.net_profit = net_profit.sort_values(
        by=['report_date', 'datetime'], ascending=[False, False])
def prepare_data(self, begin_date, end_date):
    """
    Prepare the market data needed to build the factor.

    Reads the ``code`` and ``close`` columns for ``self.tickers`` and stores
    the result of ``cp.hconcat_stock_series`` on ``self.hq``.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, self.factor_param['lagTradeDays'])`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    lag_days = self.factor_param['lagTradeDays']
    window_start = shift_date(begin_date, lag_days)

    all_quotes = cp.concat_stock(self.data_source, self.tickers)
    window_quotes = all_quotes.loc[window_start:end_date, ['code', 'close']]

    self.hq = cp.hconcat_stock_series(window_quotes, self.tickers)
def prepare_data(self, begin_date, end_date):
    """
    Load net assets and revenue and build their per-ticker TTM frames.

    Factor definition (from the original notes):

    * net asset turnover = revenue_TTM / net_assets_TTM
    * net assets = total assets (121) - total liabilities (117)
    * revenue_TTM: sum of revenue over the latest 4 quarterly report periods
    * net_assets_TTM: average over the latest 5 quarterly report periods

    Stores ``self.revenueTTM`` (from ``ttmContinues``) and
    ``self.netAssetsTTM`` (from ``ttmDiscrete``).

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    shifted_begin_date = shift_date(begin_date, 500)

    # Balance sheet: 117 = total liabilities, 121 = total assets.
    netAssets = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 117, 121]]
    netAssets['netAssets'] = netAssets[121] - netAssets[117]
    netAssets.drop([117, 121], axis=1, inplace=True)
    # Keep only rows with positive net assets. The previous expression
    # ``netAssets[netAssets['netAssets'] :0]`` was a slice with a Series as
    # its bound, which is not a valid row filter.
    # NOTE(review): ``> 0`` is the assumed intent -- confirm.
    netAssets = netAssets[netAssets['netAssets'] > 0].copy()
    netAssets['report_date'] = netAssets.index
    netAssets['release_date'] = netAssets.index

    netAssetsTTM_ls = []
    for ticker in netAssets['ticker'].unique():
        try:
            netAssets_df = ttmDiscrete(netAssets[netAssets['ticker'] == ticker], 'netAssets')
            netAssets_df['ticker'] = ticker
        except Exception:
            # Best effort: skip this ticker without swallowing
            # KeyboardInterrupt / SystemExit.
            continue
        netAssetsTTM_ls.append(netAssets_df)

    # Income statement: 0 = revenue.
    revenue = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 0]]
    revenue['revenue'] = revenue[0]
    revenue.drop([0], axis=1, inplace=True)
    revenue['report_date'] = revenue.index
    revenue['release_date'] = revenue.index

    revenueTTM_ls = []
    for ticker in revenue['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            reven_df = ttmContinues(revenue[revenue['ticker'] == ticker], 'revenue')
            reven_df['ticker'] = ticker
        except Exception:
            continue
        revenueTTM_ls.append(reven_df)

    self.revenueTTM = pd.concat(revenueTTM_ls)
    self.netAssetsTTM = pd.concat(netAssetsTTM_ls)
def prepare_data(self, begin_date, end_date):
    """
    Load current assets and revenue and build their per-ticker TTM frames.

    Inputs are current assets (balance-sheet item 103) and revenue
    (income-statement item 0). Stores ``self.revenueTTM`` (from
    ``ttmContinues``) and ``self.currentAssetsTTM`` (from ``ttmDiscrete``).
    Tickers for which a helper raises are reported with ``print`` and
    skipped.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    shifted_begin_date = shift_date(begin_date, 500)

    bs = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 103]]
    bs['release_date'] = bs.index
    bs['report_date'] = bs.index
    bs['currentAssets'] = bs[103]
    bs.drop(103, axis=1, inplace=True)

    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 0]]
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index
    inst['revenue'] = inst[0]
    inst.drop([0], axis=1, inplace=True)

    # Revenue goes through ttmContinues.
    revenueTTM_ls = []
    for ticker in inst['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            reven_df = ttmContinues(inst[inst['ticker'] == ticker], 'revenue')
            reven_df['ticker'] = ticker
        except Exception:
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(ticker + ': revenue error')
            continue
        revenueTTM_ls.append(reven_df)

    # Current assets go through ttmDiscrete (the original note describes
    # this as taking the recent average).
    currentAssetsTTM_ls = []
    for ticker in bs['ticker'].unique():
        try:
            currentAssets_df = ttmDiscrete(bs[bs['ticker'] == ticker], 'currentAssets')
            currentAssets_df['ticker'] = ticker
        except Exception:
            print(ticker + ': current asset error')
            continue
        currentAssetsTTM_ls.append(currentAssets_df)

    self.revenueTTM = pd.concat(revenueTTM_ls)
    self.currentAssetsTTM = pd.concat(currentAssetsTTM_ls)
def prepare_data(self, begin_date, end_date):
    """
    Load parent net profit (TTM) and the market-value table.

    Stores ``self.profitTTM`` (per-ticker ``ttmContinues`` output for
    income-statement item 40, concatenated) and ``self.mkt_value`` (the
    ``market_value`` frame without its ``price`` and ``totals`` columns).

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    import os  # local import: used only to build the CSV path portably

    shifted_begin_date = shift_date(begin_date, 500)

    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 40]]
    inst['motherNetProfit'] = inst[40]
    inst.drop(40, axis=1, inplace=True)
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index

    profitTTM_ls = []
    for ticker in inst['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            reven_df = ttmContinues(inst[inst['ticker'] == ticker], 'motherNetProfit')
            reven_df['ticker'] = ticker
        except Exception:
            # Best effort: skip this ticker without swallowing
            # KeyboardInterrupt / SystemExit.
            continue
        profitTTM_ls.append(reven_df)

    # Net profit TTM.
    self.profitTTM = pd.concat(profitTTM_ls)

    # Total market value. Per the original note, the Tushare market-value
    # data only covers 2017 to the present.
    # os.path.join replaces hard-coded backslashes, which only resolve to
    # sub-directories on Windows.
    otherdata_path = os.path.join(self.data_source, 'other', 'otherdata.csv')
    df = market_value(otherdata_path, self.tickers)
    self.mkt_value = df.drop(['price', 'totals'], axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load parent net profit (TTM) and total market value.

    Stores ``self.profitTTM`` (per-ticker ``ttmContinues`` output for the
    ``motherNetProfit`` column, concatenated) and ``self.mkt_value`` (the
    ``market_value`` frame without its ``price`` and ``totals`` columns).

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    import os  # local import: used only to build the CSV path portably

    # Reach back 500 trading days before begin_date.
    shifted_begin_date = shift_date(begin_date, 500)

    # Income-statement item 40 is "net profit attributable to shareholders
    # of the parent company"; item names and numbers are listed in FundDict.
    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 40]]
    inst['motherNetProfit'] = inst[40]
    inst.drop(40, axis=1, inplace=True)

    # The TTM helper needs both a release date and a report date; the index
    # is used for both.
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index

    # Net profit TTM.
    profitTTM_ls = []
    for ticker in inst['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            reven_df = ttmContinues(inst[inst['ticker'] == ticker], 'motherNetProfit')
            reven_df['ticker'] = ticker
        except Exception:
            # Best effort: skip this ticker without swallowing
            # KeyboardInterrupt / SystemExit.
            continue
        profitTTM_ls.append(reven_df)
    self.profitTTM = pd.concat(profitTTM_ls)

    # Total market value from the "OtherData" CSV. Per the original note,
    # the Tushare market-value data only covers June 2017 to the present.
    # os.path.join replaces hard-coded backslashes, which only resolve to
    # sub-directories on Windows.
    otherdata_path = os.path.join(self.data_source, 'other', 'otherdata.csv')
    df = market_value(otherdata_path, self.tickers)
    self.mkt_value = df.drop(['price', 'totals'], axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load the balance-sheet items and compute the current ratio.

    Current ratio = current assets (103) / current liabilities (101).
    The result is stored on ``self.bs`` with columns ``ticker`` and
    ``CurrentRatio``.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    shifted_begin_date = shift_date(begin_date, 500)
    bs = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 101, 103]]
    # Fixed: the ratio used to be computed upside down (101 / 103, i.e.
    # current liabilities over current assets).
    bs['CurrentRatio'] = bs[103] / bs[101]
    self.bs = bs.drop([101, 103], axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load the balance-sheet items and compute the quick ratio.

    Quick assets = current assets (103) - deferred expenses (88)
    - inventory (52) - prepayments (139); the ``Quick`` column divides that
    by current liabilities (101). The result is stored on
    ``self.balance_sheet`` with the raw item columns removed.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    window_start = shift_date(begin_date, 500)
    item_cols = [101, 52, 139, 88, 103]

    sheet = cp.concat_fund(self.data_source, self.tickers, 'BS')
    sheet = sheet.loc[window_start:end_date, ['ticker'] + item_cols]

    quick_assets = sheet[103] - sheet[88] - sheet[52] - sheet[139]
    sheet['Quick'] = quick_assets / sheet[101]

    self.balance_sheet = sheet.drop(item_cols, axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load the income statement and compute interest coverage.

    Interest coverage = EBIT / interest expense, with
    EBIT = total profit (34) + net interest expense. Per the original note,
    net interest expense is interest paid minus interest received, and the
    finance-expense item (56) is used directly when that breakdown is not
    disclosed. The code always uses item 56.

    The result is stored on ``self.inst`` (column ``interscover``), sorted by
    index in ascending order.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    window_start = shift_date(begin_date, 500)

    statement = cp.concat_fund(self.data_source, self.tickers, 'IS')
    statement = statement.loc[window_start:end_date, ['ticker', 34, 56]]

    # Keep rows whose finance expense lies outside [-1, 1]; this also avoids
    # dividing by values at or near zero.
    finance_expense = statement[56]
    outside_unit_band = (finance_expense > 1) | (finance_expense < -1)
    self.inst = kept = statement[outside_unit_band].copy()

    ebit = kept[34] + kept[56]
    kept['interscover'] = ebit / kept[56]
    kept.sort_index(ascending=True, inplace=True)
def prepare_data(self, begin_date, end_date):
    """
    Load total assets and revenue and build their per-ticker TTM frames.

    Stores ``self.revenueTTM`` (``ttmContinues`` over revenue,
    income-statement item 0) and ``self.totalAssetsTTM`` (``ttmDiscrete``
    over total assets, balance-sheet item 121). Tickers for which a helper
    raises are reported with ``print`` and skipped.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 700)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    shifted_begin_date = shift_date(begin_date, 700)

    # Balance sheet: 121 = total assets.
    bs = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 121]]
    bs['release_date'] = bs.index
    bs['report_date'] = bs.index
    bs['totalAssets'] = bs[121]
    bs.drop(121, axis=1, inplace=True)

    # Income statement: 0 = revenue, 4 = cost. Item 4 is loaded but not
    # renamed or dropped here; it stays in the frame handed to ttmContinues.
    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 0, 4]]
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index
    inst['revenue'] = inst[0]
    inst.drop(0, axis=1, inplace=True)

    revenueTTM_ls = []
    totalAssetsTTM_ls = []

    for ticker in inst['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            reven_df = ttmContinues(inst[inst['ticker'] == ticker], 'revenue')
            reven_df['ticker'] = ticker
        except Exception:
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(ticker + ': revenue error')
            continue
        revenueTTM_ls.append(reven_df)

    for ticker in bs['ticker'].unique():
        try:
            total_asset_df = ttmDiscrete(bs[bs['ticker'] == ticker], 'totalAssets')
            total_asset_df['ticker'] = ticker
        except Exception:
            print(ticker + ': total asset error')
            continue
        totalAssetsTTM_ls.append(total_asset_df)

    self.revenueTTM = pd.concat(revenueTTM_ls)
    self.totalAssetsTTM = pd.concat(totalAssetsTTM_ls)
def prepare_data(self, begin_date, end_date):
    """
    Load stock quotes and the benchmark index close series.

    Stores ``self.hq`` (result of ``cp.hconcat_stock_series`` over the
    ``code``/``close`` quotes) and ``self.b`` (the benchmark ``close``
    column read from ``<data_source>/hq/<benchmark>.csv``, forward-filled).

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, self.factor_param['lagTradeDays'])`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    import os  # local import: used only to build the CSV path portably

    # Load some extra history so that gaps can be filled.
    shifted_begin_date = shift_date(begin_date, self.factor_param['lagTradeDays'])

    # Stock quotes.
    hq = cp.concat_stock(self.data_source, self.tickers).loc[
        shifted_begin_date:end_date, ['code', 'close']]
    self.hq = cp.hconcat_stock_series(hq, self.tickers)

    # Benchmark index. os.path.join replaces hard-coded backslashes, which
    # only resolve to sub-directories on Windows.
    benchmark_path = os.path.join(self.data_source, 'hq', self.benchmark + '.csv')
    b = pd.read_csv(benchmark_path, index_col=0).loc[
        shifted_begin_date:end_date, ['close']]
    # ``fillna(method='ffill')`` is deprecated in recent pandas; ``ffill()``
    # is the equivalent call.
    self.b = b.ffill()
def prepare_data(self, begin_date, end_date):
    """
    Load quotes, the benchmark series and the debt-to-equity ratio.

    Stores:

    * ``self.hq`` -- result of ``cp.hconcat_stock_series`` over the
      ``code``/``close`` quotes.
    * ``self.b`` -- benchmark ``close`` column read from
      ``<data_source>/hq/<benchmark>.csv``, forward-filled.
    * ``self.Dbequrt_df`` -- ``ticker``, ``Dbequrt``, ``report_date`` and
      ``release_date`` for rows with a positive ratio.

    Per the original notes, the factor is the book-value ratio
    ``1 / (1 + total liabilities / shareholders' equity)`` and ``Dbequrt``
    (debt-to-equity ratio) is ``total liabilities / shareholders' equity``.

    :param begin_date: start of the requested window
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    import os  # local import: used only to build the CSV path portably

    # Load some extra history so that gaps can be filled.
    shifted_begin_date = shift_date(begin_date, self.factor_param['lagTradeDays'])

    # Stock quotes.
    hq = cp.concat_stock(self.data_source, self.tickers).loc[
        shifted_begin_date:end_date, ['code', 'close']]
    self.hq = cp.hconcat_stock_series(hq, self.tickers)

    # Benchmark index. os.path.join replaces hard-coded backslashes, which
    # only resolve to sub-directories on Windows.
    benchmark_path = os.path.join(self.data_source, 'hq', self.benchmark + '.csv')
    b = pd.read_csv(benchmark_path, index_col=0).loc[
        shifted_begin_date:end_date, ['close']]
    # ``fillna(method='ffill')`` is deprecated in recent pandas.
    self.b = b.ffill()

    # Financial data uses its own, fixed look-back.
    shifted_begin_date = shift_date(begin_date, 500)

    # Balance sheet: 117 = total liabilities, 121 = total assets.
    Dbequrt_df = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 117, 121]]
    # Fixed: the columns used to be filled with total assets (121) and total
    # liabilities (117), which contradicted both their names and the
    # documented formula. Equity is derived as assets minus liabilities.
    # NOTE(review): this changes the factor values -- confirm against the
    # consumer of ``Dbequrt``.
    Dbequrt_df['totalLiabilities'] = Dbequrt_df[117]
    Dbequrt_df['totalEquity'] = Dbequrt_df[121] - Dbequrt_df[117]
    Dbequrt_df['Dbequrt'] = Dbequrt_df['totalLiabilities'] / Dbequrt_df[
        'totalEquity']
    Dbequrt_df.drop([117, 121], axis=1, inplace=True)
    # Keep only positive ratios. The previous expression
    # ``Dbequrt_df[Dbequrt_df['Dbequrt']:0]`` was a slice with a Series as
    # its bound, which is not a valid row filter.
    # NOTE(review): ``> 0`` is the assumed intent -- confirm.
    Dbequrt_df = Dbequrt_df[Dbequrt_df['Dbequrt'] > 0].copy()
    Dbequrt_df['report_date'] = Dbequrt_df.index
    Dbequrt_df['release_date'] = Dbequrt_df.index
    self.Dbequrt_df = Dbequrt_df.drop(['totalLiabilities', 'totalEquity'], axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load accrual inputs, build the accrual TTM frame and load market value.

    Stores:

    * ``self.accrual_df`` -- cash-flow and income-statement rows merged on
      ``ticker``, ``release_date`` and ``report_date``, with
      ``accr = item 40 - item 133``.
    * ``self.accrual_ttm`` -- per-ticker ``ttmContinues`` output for
      ``accr``, concatenated.
    * ``self.mkt_value`` -- the ``market_value`` frame without its ``price``
      and ``totals`` columns.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    import os  # local import: used only to build the CSV path portably

    shifted_begin_date = shift_date(begin_date, 500)

    # Income statement: 40 = net profit attributable to the parent.
    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 40]]
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index

    # Cash-flow statement: item 133 (labelled "cash_flows_yield" in the
    # original note).
    cf = cp.concat_fund(self.data_source, self.tickers, 'CF').loc[
        shifted_begin_date:end_date, ['ticker', 133]]
    cf['release_date'] = cf.index
    cf['report_date'] = cf.index

    self.accrual_df = cf.merge(
        inst, on=['ticker', 'release_date', 'report_date'])
    self.accrual_df['accr'] = self.accrual_df[40] - self.accrual_df[133]

    cash_flow_ls = []
    for ticker in self.accrual_df['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            reven_df = ttmContinues(
                self.accrual_df[self.accrual_df['ticker'] == ticker], 'accr')
            reven_df['ticker'] = ticker
        except Exception:
            # Best effort: skip this ticker without swallowing
            # KeyboardInterrupt / SystemExit.
            continue
        cash_flow_ls.append(reven_df)
    self.accrual_ttm = pd.concat(cash_flow_ls)

    # Total market value. Per the original note, the Tushare market-value
    # data only covers 2017 to the present.
    # os.path.join replaces hard-coded backslashes, which only resolve to
    # sub-directories on Windows.
    otherdata_path = os.path.join(self.data_source, 'other', 'otherdata.csv')
    df = market_value(otherdata_path, self.tickers)
    self.mkt_value = df.drop(['price', 'totals'], axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load annual parent net profit and the market-value table.

    Stores ``self.earings_df`` (``ticker``, ``motherNetProfit`` and a textual
    ``reportDate``, restricted to rows whose report date ends in ``12-31``)
    and ``self.mkt_value`` (the ``market_value`` frame without its ``price``
    and ``totals`` columns).

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    import os  # local import: used only to build the CSV path portably

    shifted_begin_date = shift_date(begin_date, 500)

    # Income statement: 40 = net profit attributable to the parent.
    earings_df = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 40]]
    earings_df['motherNetProfit'] = earings_df[40]
    earings_df.drop(40, axis=1, inplace=True)

    # Render the index as YYYY-MM-DD text so it can be matched by suffix.
    earings_df['reportDate'] = earings_df.index
    earings_df['reportDate'] = earings_df['reportDate'].apply(
        lambda x: x.strftime("%Y-%m-%d"))

    # Annual reports only: keep rows whose reportDate ends on 31 December.
    is_annual = earings_df['reportDate'].str.endswith('12-31')
    self.earings_df = earings_df[is_annual]

    # Per the original note, the Tushare market-value data only covers 2017
    # to the present.
    # os.path.join replaces hard-coded backslashes, which only resolve to
    # sub-directories on Windows.
    otherdata_path = os.path.join(self.data_source, 'other', 'otherdata.csv')
    df = market_value(otherdata_path, self.tickers)
    self.mkt_value = df.drop(['price', 'totals'], axis=1)
def prepare_data(self, begin_date, end_date):
    """
    Load the balance sheet and compute total assets over total liabilities.

    TA2TL = total assets (121) / total liabilities (117). The result is
    stored on ``self.df`` with columns ``ticker``, ``reportDate`` and
    ``TA2TL``.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 500)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    """
    shifted_begin_date = shift_date(begin_date, 500)

    # Balance sheet: 117 = total liabilities, 121 = total assets.
    ff = cp.concat_fund(self.data_source, self.tickers, 'BS').loc[
        shifted_begin_date:end_date, ['ticker', 117, 121]]

    # The report date is assumed to be the announcement date.
    ff['reportDate'] = ff.index

    # Fixed: the ratio used to be computed upside down (117 / 121, i.e.
    # liabilities over assets), contradicting the column name and the
    # documented formula.
    ff['TA2TL'] = ff[121] / ff[117]
    ff.drop([117, 121], axis=1, inplace=True)
    self.df = ff
def prepare_data(self, begin_date, end_date):
    """
    Load revenue and cost and build their per-ticker TTM frame.

    Stores ``self.revenue_cost_TTM``: the concatenated ``ttmContinues``
    output for the ``revenue`` (income-statement item 0) and ``cost``
    (item 4) columns. Tickers for which the helper raises are reported with
    ``print`` and skipped.

    :param begin_date: start of the requested window; data is loaded from
        ``shift_date(begin_date, 700)`` onwards
    :param end_date: end of the requested window (label-based ``.loc`` slice)
    :return: None
    :raises ValueError: raised by ``pd.concat`` when no ticker produced a
        frame
    """
    shifted_begin_date = shift_date(begin_date, 700)

    inst = cp.concat_fund(self.data_source, self.tickers, 'IS').loc[
        shifted_begin_date:end_date, ['ticker', 0, 4]]
    inst['release_date'] = inst.index
    inst['report_date'] = inst.index
    inst['revenue'] = inst[0]
    inst['cost'] = inst[4]
    inst.drop([0, 4], axis=1, inplace=True)

    revenueTTM_ls = []
    for ticker in inst['ticker'].unique():
        try:
            # Fewer than 4 financial records makes the TTM helper raise.
            # Both columns are requested in one comma-separated argument.
            reven_df = ttmContinues(inst[inst['ticker'] == ticker], 'revenue,cost')
            reven_df['ticker'] = ticker
        except Exception:
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print(ticker + ': revenue and cost error')
            continue
        revenueTTM_ls.append(reven_df)

    self.revenue_cost_TTM = pd.concat(revenueTTM_ls)
if __name__ == '__main__':
    # Start the clock first so the printed duration also covers the imports.
    start = time.time()

    import os
    from factorset.data.OtherData import code_to_symbol
    from factorset.data import CSVParser as cp
    import tushare as ts

    # allAshare = pd.read_csv(os.path.abspath('./allAShare.csv'))
    # allAshare = allAshare['0']

    hs300 = ts.get_hs300s()
    hs300.code = hs300.code.apply(code_to_symbol)

    # Crawl only the CSI 300 constituents that have not been stored yet.
    # NOTE(review): the stored symbols are read from the 'IS' table while the
    # crawl below targets 'BS' -- confirm this is intended.
    wanted_symbols = set(hs300.code.tolist())
    stored_symbols = set(cp.all_fund_symbol(os.path.abspath('.'), 'IS'))
    Ashare = list(wanted_symbols - stored_symbols)

    # Tickers whose BS table contains duplicated dates:
    # Ashare = ['300671.SZ', '002886.SZ', '300696.SZ', '603055.SH', '300670.SZ', '300692.SZ',
    #           '002889.SZ', '603882.SH', '603801.SH', '603938.SH', '300687.SZ', '603535.SH', '603043.SH']
    # BS: duplicated dates with differing values (prospectus vs. filing draft):
    # Ashare = ['002886.SZ', '300696.SZ', '603938.SH', '300692.SZ', '300670.SZ', '603882.SH']
    # IS: duplicated dates with differing values (prospectus vs. filing draft):
    # Ashare = ['002886.SZ', '300696.SZ', '300670.SZ', '300692.SZ', '603055.SH', '603938.SH', '603882.SH']
    # CF: duplicated dates with differing values (prospectus vs. filing draft):
    # Ashare = ['002386.SZ', '603882.SH', '603018.SH', '300671.SZ', '603938.SH', '300537.SZ', '300670.SZ',
    #           '002086.SZ', '000568.SZ', '600612.SH', '300696.SZ', '600552.SH', '300687.SZ', '600983.SH', '002889.SZ',
    #           '603801.SH', '300692.SZ', '603055.SH', '002886.SZ', '002852.SZ', '603505.SH', '300365.SZ', '603535.SH',
    #           '300214.SZ', '300135.SZ', '603043.SH']

    crawler = FundCrawler('BS')
    crawler.main(Ashare, num=20)

    elapsed = time.time() - start
    print(elapsed)