def save_factor_data(self, factor_name, data_source=None):
    """Persist the pre-processed factor data held in ``self.raw_data``.

    :param factor_name: name used as the hdf5 dataset name and as the
        ``factor_name`` field of each mongo document
    :param data_source: iterable of storage targets, any of ``'hdf5'`` and
        ``'mongo'``; defaults to both
    :return: None
    """
    if data_source is None:
        data_source = ['hdf5', 'mongo']
    if 'hdf5' in data_source:
        # Save the pre-processed data to local hdf5 for single-factor testing.
        path = LocalDataPath.path + LocalDataFolderName.FACTOR.value + '/'
        save_data_to_hdf5(path, factor_name, self.raw_data)
    if 'mongo' in data_source:
        # Save the pre-processed data to mongo.
        with MongoConnect(DatabaseName.MULTI_FACTOR_DATA.value):
            # BUG FIX: the rename mapping previously iterated `extreme_data`,
            # a name that is undefined in this method's scope (it only exists
            # in the module-level demo script); iterate this instance's own
            # columns instead.
            raw_data = self.raw_data.rename(columns={
                i: code_market_to_market_code(i)
                for i in self.raw_data.columns
            })
            doc_list = []
            for index, row in raw_data.iterrows():
                doc_list.append(
                    FactorPreProcessingData(
                        factor_name=factor_name,
                        time_tag=index,
                        factor_data=row))
                # Flush in batches of 1000 to bound memory usage.
                if len(doc_list) > 999:
                    FactorPreProcessingData.objects.insert(doc_list)
                    doc_list = []
            # Insert the final partial batch; skip when empty (the original
            # for-else inserted unconditionally, even an empty list).
            if doc_list:
                FactorPreProcessingData.objects.insert(doc_list)
def update_index_class(self, industry_class_name, industry_class_dict):
    """Build a per-day industry classification table and save it to hdf5.

    Produces a DataFrame indexed by the SH trading calendar with one column
    per A-share security, each cell holding the index (industry) code the
    security belonged to on that day.

    :param industry_class_name: hdf5 dataset name for the result
    :param industry_class_dict: mapping whose keys are the industry index
        codes to include
    :return: None
    """
    with MongoConnect(self.database):
        members_qs = AShareIndexMembers.objects(
            index_code__in=industry_class_dict.keys()).as_pymongo()
        wanted_fields = ['index_code', 'security_code', 'in_date', 'out_date']
        self.index_members_df = pd.DataFrame(
            list(members_qs)).reindex(columns=wanted_fields)
        # Open memberships have no out_date: treat them as running until now.
        self.index_members_df = self.index_members_df.fillna(
            datetime.now()).reset_index(drop=True)
        a_share_list = GetCollectionList().get_a_share_list()
        calendar_sh = GetCalendar().get_calendar('SH')
        self.index_class = pd.DataFrame(
            index=calendar_sh, columns=a_share_list)

        def industry_history(col, members):
            # col.name is the security code; stamp the industry code over
            # every [in_date, out_date] membership interval.
            intervals = members[members.security_code == col.name]
            for _, rec in intervals.iterrows():
                col[rec['in_date']:rec['out_date']] = rec['index_code']
            return col

        self.index_class = self.index_class.apply(
            industry_history, args=(self.index_members_df, ), axis=0)
        # Forward-fill then back-fill so every calendar day carries a class.
        self.index_class = self.index_class.fillna(
            method='pad').fillna(method='backfill')
        target_path = (LocalDataPath.path +
                       LocalDataFolderName.INDUSTRY_CLASS.value + '/')
        save_data_to_hdf5(target_path, industry_class_name, self.index_class)
def update_index_data(self, end=None):
    """Cache daily index kline data from mongo into per-field hdf5 files.

    :param end: latest ``time_tag`` to include; defaults to "now" at call
        time when omitted.
    :return: None

    BUG FIX: the original signature used ``end=datetime.now()``, which is
    evaluated once at import time — a long-running process would silently
    keep querying with a stale cutoff. A ``None`` sentinel restores the
    intended call-time default and is backward-compatible for callers that
    pass ``end`` explicitly.
    """
    if end is None:
        end = datetime.now()
    get_collection_list = GetCollectionList()
    index_list = get_collection_list.get_index_list()
    self.end = end
    database = DatabaseName.INDEX_KLINE_DAILY.value
    with MongoConnect(database):
        index_data_dict = {}
        for index_code in index_list:
            # Each index lives in its own mongo collection.
            with switch_collection(Kline, index_code) as KlineDaily_index_code:
                security_code_data = KlineDaily_index_code.objects(
                    time_tag__lte=self.end).as_pymongo()
                security_code_data_df = pd.DataFrame(
                    list(security_code_data)).reindex(columns=self.field)
                security_code_data_df.set_index(["time_tag"], inplace=True)
                index_data_dict[index_code] = security_code_data_df
        field_data_dict = {}
        for i in self.field:
            if i != 'time_tag':
                field_data_pd = pd.DataFrame(
                    {key: value[i] for key, value in index_data_dict.items()})
                # Raw OHLC prices are stored x10000 in mongo; scale back.
                if i in ['open', 'high', 'low', 'close']:
                    field_data_dict[i] = field_data_pd.div(10000)
                else:
                    field_data_dict[i] = field_data_pd
        folder_name = LocalDataFolderName.MARKET_DATA.value
        sub_folder_name = LocalDataFolderName.KLINE_DAILY.value
        sub_sub_folder_name = LocalDataFolderName.INDEX.value
        for field in self.field:
            if field not in ['time_tag', 'interest']:
                path = (LocalDataPath.path + folder_name + '/' +
                        sub_folder_name + '/' + sub_sub_folder_name + '/')
                save_data_to_hdf5(path, field,
                                  pd.DataFrame(field_data_dict[field]))
def update_index_members(self):
    """Refresh the local hdf5 copy of A-share index membership records.

    Pulls every AShareIndexMembers document, keeps the four membership
    columns, and writes the frame to the INDEX_MEMBER folder.

    :return: None
    """
    columns = ['index_code', 'security_code', 'in_date', 'out_date']
    with MongoConnect(self.database):
        member_docs = AShareIndexMembers.objects().as_pymongo()
        self.index_members_df = pd.DataFrame(
            list(member_docs)).reindex(columns=columns)
        folder = LocalDataFolderName.INDEX_MEMBER.value
        save_data_to_hdf5(LocalDataPath.path + folder + '/', folder,
                          self.index_members_df)
def save_a_share_adj_factor_right(self):
    """
    Take the close price on the ex-date as the share price for bonus and
    conversion shares, compute the adjustment factor, and update
    AShareExRightDividend.
    Adjustment factor ratio
        = bonus_share_ratio + conversed_ratio + consolidate_split_ratio
    Single-event adjustment factor
        = record-date close * (1 + ratio + rightsissue_ratio + seo_ratio)
          / (record-date close - cash_dividend_ratio
             + record-date close * ratio
             + rightsissue_price * rightsissue_ratio
             + seo_price * seo_ratio)
    :return: None
    """
    kline_object = GetKlineData()
    all_market_data = kline_object.cache_all_stock_data()
    with MongoConnect(self.database):
        self.data = pd.DataFrame(
            AShareExRightDividend.objects.as_pymongo())
        # Look up each event's ex-date close; rows with no price become NaN
        # and are zero-filled below.
        self.data['close'] = self.data.apply(
            lambda x: self.get_adj_day_close(x['security_code'], x[
                'ex_date'], all_market_data), axis=1)
        self.data = self.data.fillna(0)
        # Combined share-increase ratio (bonus + conversion + split).
        ratio = self.data['bonus_share_ratio'] + self.data[
            'conversed_ratio'] + self.data['consolidate_split_ratio']
        # Vectorized form of the single-event formula in the docstring.
        self.data['adj_factor'] = self.data['close'] * (
            1 + ratio + self.data['rightsissue_ratio'] + self.data['seo_ratio']
        ) / (self.data['close'] - self.data['cash_dividend_ratio'] +
             self.data['close'] * ratio + self.data['rightsissue_price'] *
             self.data['rightsissue_ratio'] + self.data['seo_price'] *
             self.data['seo_ratio'])
        folder_name = LocalDataFolderName.ADJ_FACTOR.value
        path = LocalDataPath.path + folder_name + '/'
        self.data = self.data.reindex(
            columns=['security_code', 'ex_date', 'adj_factor'])
        self.data.set_index(["ex_date"], inplace=True)
        self.data.sort_index(inplace=True)
        calendar_obj = GetCalendar()
        calendar = calendar_obj.get_calendar('SZ')
        # One column per security, indexed by the SZ trading calendar.
        backward_factor = pd.DataFrame(index=calendar)
        adj_factor = pd.DataFrame(index=calendar)
        data_dict = dict(
            list(self.data.groupby(self.data['security_code'])))
        for security_code, adj_data in data_dict.items():
            backward_factor[security_code] = self.cal_backward_factor(
                adj_data['adj_factor'])
            adj_factor[security_code] = adj_data['adj_factor']
        # inf (division blow-ups) and 0 are invalid factors: drop them,
        # carry the last valid factor forward, and default to 1.
        backward_factor.replace([np.inf, 0], np.nan, inplace=True)
        backward_factor.fillna(method='ffill', inplace=True)
        backward_factor.fillna(1, inplace=True)
        # Align to the full market column set; securities with no events
        # get a neutral factor of 1.
        backward_factor = backward_factor.reindex(
            columns=all_market_data['close'].columns, fill_value=1)
        save_data_to_hdf5(path,
                          AdjustmentFactor.BACKWARD_ADJ_FACTOR.value,
                          backward_factor)
        # Forward factor = backward factor normalized by its latest value.
        # NOTE(review): 'FROWARD' looks like a typo for 'FORWARD' in the enum
        # defined elsewhere; not renamed here to avoid breaking lookups.
        save_data_to_hdf5(path, AdjustmentFactor.FROWARD_ADJ_FACTOR.value,
                          backward_factor.div(backward_factor.iloc[-1]))
def update_calendar_hdf5(self):
    """Dump each market's trading calendar from mongo into its own hdf5 file.

    One dataset per market, named ``calendar_<market>``, each holding that
    market's ``trade_days`` list as a single-column DataFrame.

    :return: None
    """
    with MongoConnect(self.database):
        calendar_df = pd.DataFrame(AShareCalendar.objects().as_pymongo())
        calendar_df.set_index('market', inplace=True)
        calendar_df = calendar_df.drop(['_id', 'update_date'], axis=1)
        folder = LocalDataFolderName.CALENDAR.value
        # Destination folder is the same for every market; compute it once.
        target_path = LocalDataPath.path + folder + '/'
        for market, _ in calendar_df.iterrows():
            save_data_to_hdf5(
                target_path, folder + '_' + str(market),
                pd.DataFrame(calendar_df.loc[market, 'trade_days']))
def update_all_market_data(self, end=None):
    """Cache daily kline data for every A-share stock into per-field hdf5.

    :param end: latest ``time_tag`` to include; defaults to "now" at call
        time when omitted.
    :return: None

    BUG FIX: the original signature used ``end=datetime.now()``, evaluated
    once at import time, so repeated calls in a long-lived process used a
    stale cutoff. A ``None`` sentinel restores the call-time default and
    stays backward-compatible for callers passing ``end`` explicitly.
    """
    if end is None:
        end = datetime.now()
    get_collection_list = GetCollectionList()
    a_share_list = get_collection_list.get_a_share_list()
    # Keep only genuine A-share stock codes.
    a_share_list = [
        i for i in a_share_list if is_security_type(i, 'EXTRA_STOCK_A')
    ]
    all_market_data = self.get_all_market_data(
        security_list=a_share_list, end=end)
    folder_name = LocalDataFolderName.MARKET_DATA.value
    sub_folder_name = LocalDataFolderName.KLINE_DAILY.value
    sub_sub_folder_name = LocalDataFolderName.A_SHARE.value
    for field in self.field:
        if field not in ['time_tag', 'interest']:
            path = (LocalDataPath.path + folder_name + '/' +
                    sub_folder_name + '/' + sub_sub_folder_name + '/')
            save_data_to_hdf5(path, field,
                              pd.DataFrame(all_market_data[field]))
def update_a_share_capitalization(self):
    """
    Save four hdf5 datasets: total shares, total market value,
    float A-shares, and float A-share market value, all aligned to the
    market close-price calendar.
    :return: None
    """
    with MongoConnect(self.database):
        a_share_capitalization = AShareCapitalization.objects().as_pymongo()
        field_list = [
            'security_code', 'change_date', 'total_share', 'float_share',
            'float_a_share', 'float_b_share', 'float_h_share'
        ]
        self.a_share_capitalization = pd.DataFrame(
            list(a_share_capitalization)).reindex(columns=field_list)
        kline_object = GetKlineData()
        market_close_data = kline_object.cache_all_stock_data()['close']
        # Union of trading days and capital-change days, sorted ascending,
        # so every change date survives the reindex before forward-filling.
        index = list(
            set(market_close_data.index).union(
                set(self.a_share_capitalization['change_date'])))
        index.sort()
        total_share = pd.DataFrame({})
        float_a_share = pd.DataFrame({})
        for security_code, data in self.a_share_capitalization.groupby(
                'security_code'):
            data = data.sort_values('change_date').set_index('change_date')
            # BUG FIX: a handful of stocks carry duplicated change_date
            # rows, which makes reindex raise ValueError. The original
            # except-branch selected data[data.index.duplicated()], which
            # keeps ONLY the second-and-later duplicate rows and discards
            # every uniquely dated record for that stock (and still fails
            # when a date appears three times). Instead, keep the last
            # record per change_date for every stock — identical behavior
            # for clean stocks, correct for the dirty ones.
            data = data[~data.index.duplicated(keep='last')]
            total_share[security_code] = data['total_share'].reindex(index)
            float_a_share[security_code] = data['float_a_share'].reindex(index)
        # Carry share counts forward between change dates, then align to
        # actual trading days only.
        total_share = total_share.fillna(method='ffill').reindex(
            market_close_data.index)
        float_a_share = float_a_share.fillna(method='ffill').reindex(
            market_close_data.index)
        # Share counts are stored in units of 10k; value = shares * close.
        total_share_value = total_share.multiply(10000) * market_close_data
        float_a_share_value = float_a_share.multiply(
            10000) * market_close_data
        folder_name = LocalDataFolderName.INDICATOR_EVERYDAY.value
        path = LocalDataPath.path + folder_name + '/'
        save_data_to_hdf5(path, 'total_share', total_share)
        save_data_to_hdf5(path, 'float_a_share', float_a_share)
        save_data_to_hdf5(path, 'total_share_value', total_share_value)
        save_data_to_hdf5(path, 'float_a_share_value', float_a_share_value)
def update_a_sws_index(self):
    """Cache SWS industry index kline and valuation data to hdf5.

    Prices (pre_close/open/high/low/close) are stored x10000 at the source
    and are scaled back before saving.

    :return: None
    """
    price_cols = ['pre_close', 'open', 'high', 'low', 'close']
    wanted_cols = [
        'sw_index_code', 'time_tag', 'pre_close', 'open', 'high', 'low',
        'close', 'volume', 'amount', 'index_pe', 'index_pb',
        'index_free_float_market_capitalisation',
        'index_total_market_capitalisation'
    ]
    with MongoConnect(DatabaseName.STOCK_BASE_DATA.value):
        sws_records = ASwsIndex.objects().as_pymongo()
        self.a_sws_index_df = pd.DataFrame(sws_records).reindex(
            columns=wanted_cols)
        self.a_sws_index_df[price_cols] = self.a_sws_index_df[
            price_cols].div(10000)
        folder = LocalDataFolderName.SWS_INDEX.value
        save_data_to_hdf5(LocalDataPath.path + folder + '/', folder,
                          self.a_sws_index_df)
factor_pre_obj = FactorPreProcessing(indicator_data) # 可根据时间和股票list过滤数据 data_filter = factor_pre_obj.data_filter() # 去极值方法,四种 extreme_data = factor_pre_obj.extreme_processing( dict(std={'sigma_multiple': 3})) # extreme_data = factor_pre_obj.extreme_processing(dict(mad={'median_multiple': 1.483})) # extreme_data = factor_pre_obj.extreme_processing(dict(quantile={'quantile_min': 0.025, 'quantile_max': 0.975})) # extreme_data = factor_pre_obj.extreme_processing(dict(box_plot={'median_multiple': 3})) # 中性化方法,可选择行业和流通市值中性 neutralize_data = factor_pre_obj.neutralize_processing( dict(neutralize_method=[ NeutralizeMethod.INDUSTRY.value, NeutralizeMethod.MARKET_VALUE.value ])) # 归一化方法,三种 # scale_data = factor_pre_obj.scale_processing(ScaleMethod.MIN_MAX.value) scale_data = factor_pre_obj.scale_processing(ScaleMethod.Z_SCORE.value) # scale_data = factor_pre_obj.scale_processing(ScaleMethod.RANK.value) # 补充空值的方法,已实现两种 fill_nan_data = factor_pre_obj.fill_nan_processing( FillNanMethod.MEAN.value) # 保存预处理之后的因子数据,单因子检测使用 path = LocalDataPath.path + LocalDataFolderName.FACTOR.value + '/' save_data_to_hdf5(path, 'factor_ma10', fill_nan_data)