def test_groupby(self):
    """Exercise Series.groupby: iteration, aggregation, transform, corner cases.

    The fixture is nine values forming three constant groups (0, 1, 2),
    shuffled so that the grouping cannot rely on the original ordering.
    """
    # Floor division is required: the assertions below (group means of
    # 0/1/2, grouping by value) only hold for integer data.  Plain ``/``
    # yields floats under true division and breaks them.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    # Group index labels 0-2, 3-5 and 6-8 together.
    grouped = data.groupby(lambda x: x // 3)

    # Smoke test only: building the repr of the groups must not raise.
    repr(grouped.groups)

    for k, v in grouped:
        self.assertEqual(len(v), 3)

    agged = grouped.aggregate(np.mean)
    self.assertEqual(agged[1], 1)

    # ``agg`` is the shorthand spelling of ``aggregate``.
    assert_series_equal(agged, grouped.agg(np.mean))

    # Group 2 holds three 2s, so each element becomes 2 * 6 == 12.
    transformed = grouped.transform(lambda x: x * x.sum())
    self.assertEqual(transformed[7], 12)

    # Grouping by the values themselves must give the same three groups.
    value_grouped = data.groupby(data)
    assert_series_equal(value_grouped.aggregate(np.mean), agged)

    # Complex aggregations: only checked for not raising, the results are
    # overwritten below.
    agged = grouped.aggregate([np.mean, np.std])
    agged = grouped.aggregate({'one': np.mean, 'two': np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    # NOTE(review): relies on each group passed to the callable exposing its
    # key as ``groupName`` -- confirm against the pandas version under test.
    agged = grouped.agg(lambda x: group_constants[x.groupName] + x.mean())
    self.assertEqual(agged[1], 21)

    # Corner case: a callable that does not reduce to a scalar must be
    # rejected by the named aggregation path.
    self.assertRaises(Exception, grouped._aggregate_named, lambda x: x * 2)
def test_groupby(self):
    """Exercise Series.groupby: iteration, aggregation shorthands, transform.

    The fixture is nine values forming three constant groups (0, 1, 2),
    shuffled so that the grouping cannot rely on the original ordering.
    """
    # Floor division is required: the assertions below (group means of
    # 0/1/2, grouping by value) only hold for integer data.  Plain ``/``
    # yields floats under true division and breaks them.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    # Group index labels 0-2, 3-5 and 6-8 together.
    grouped = data.groupby(lambda x: x // 3)

    # Smoke test only: building the repr of the groups must not raise.
    repr(grouped.groups)

    for k, v in grouped:
        self.assertEqual(len(v), 3)

    agged = grouped.aggregate(np.mean)
    self.assertEqual(agged[1], 1)

    # Shorthand spellings must agree with the explicit aggregate calls.
    assert_series_equal(agged, grouped.agg(np.mean))
    assert_series_equal(agged, grouped.mean())
    assert_series_equal(grouped.agg(np.sum), grouped.sum())

    # Group 2 holds three 2s, so each element becomes 2 * 6 == 12.
    transformed = grouped.transform(lambda x: x * x.sum())
    self.assertEqual(transformed[7], 12)

    # Grouping by the values themselves must give the same three groups.
    value_grouped = data.groupby(data)
    assert_series_equal(value_grouped.aggregate(np.mean), agged)

    # Complex aggregations: only checked for not raising, the results are
    # overwritten below.
    agged = grouped.aggregate([np.mean, np.std])
    agged = grouped.aggregate({'one': np.mean, 'two': np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    # NOTE(review): relies on each group passed to the callable exposing its
    # key as ``groupName`` -- confirm against the pandas version under test.
    agged = grouped.agg(lambda x: group_constants[x.groupName] + x.mean())
    self.assertEqual(agged[1], 21)

    # Corner case: a callable that does not reduce to a scalar must be
    # rejected by the named aggregation path.
    self.assertRaises(Exception, grouped._aggregate_named, lambda x: x * 2)
def test_groupby_transform(self):
    """Check Series.groupby(...).transform on three shuffled constant groups."""
    # Floor division is required: ``transformed[7] == 12`` only holds when
    # the data are the integers 0, 1, 2.  Plain ``/`` yields floats under
    # true division and breaks the assertion.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    # Group index labels 0-2, 3-5 and 6-8 together.
    grouped = data.groupby(lambda x: x // 3)

    # Group 2 holds three 2s, so each element becomes 2 * 6 == 12.
    transformed = grouped.transform(lambda x: x * x.sum())
    self.assertEqual(transformed[7], 12)

    # Corner case: this revision expects transform to reject a callable
    # that reduces the group to a scalar.
    self.assertRaises(Exception, grouped.transform, lambda x: x.mean())
def test_groupby_transform(self):
    """Check Series.groupby(...).transform, including scalar broadcasting."""
    # Floor division is required: ``transformed[7] == 12`` only holds when
    # the data are the integers 0, 1, 2.  Plain ``/`` yields floats under
    # true division and breaks the assertion.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    # Group index labels 0-2, 3-5 and 6-8 together.
    grouped = data.groupby(lambda x: x // 3)

    # Group 2 holds three 2s, so each element becomes 2 * 6 == 12.
    transformed = grouped.transform(lambda x: x * x.sum())
    self.assertEqual(transformed[7], 12)

    # A reducing function is broadcast: every member of a group receives
    # that group's mean.
    transformed = grouped.transform(np.mean)
    for name, group in grouped:
        mean = group.mean()
        for idx in group.index:
            self.assertEqual(transformed[idx], mean)
class IndicatorAnalyst(object):
    """Statistical analysis of price behaviour after an indicator signal.

    1) Describe the raw indicator.
    2) Split the data into observation windows that start where the
       indicator satisfies a condition, either as non-overlapping groups or
       as rolling (possibly overlapping) groups.
    3) Compute and plot descriptive statistics per group and overall.
    """

    def __init__(self, data_set, indicator=None):
        """Store a copy of the data set and the indicator to analyse.

        Args:
            data_set: DataFrame, or dict mapping symbol name to DataFrame.
                Only ``.copy()`` is called on it here.
            indicator: Series holding the indicator; defaults to an empty
                Series and can be assigned later via the ``indicator``
                property.

        Notes:
            The indicator must have as many rows as the data set; this is
            checked when the analysis starts, not here.
        """
        self.__identify = None   # labelling routine in use (_group_identify or _roll_identify)
        self.__indicator = None  # indicator currently being processed
        self.__data = None       # data set currently being processed
        self.__group = None      # grouping currently being processed
        self.__symbol = None     # instrument object of the current run
        # Analysed per-group series (PnL in profit mode, raw price otherwise),
        # indexed by (group, row label); read by group_density.
        self.__profit = None
        self._data_set = data_set.copy()  # full data set
        self._indicator = Series() if indicator is None else indicator
        self._ind_len = 0  # number of rows of the indicator being processed
        self._group = None

    @property
    def data_set(self):
        """The (copied) data set handed to the constructor."""
        return self._data_set

    @property
    def group(self):
        """Grouping produced by the last ``interval_analyst`` run (or None)."""
        return self._group

    @property
    def indicator(self):
        """Indicator series."""
        return self._indicator

    @indicator.setter
    def indicator(self, ind):
        """Replace the indicator series."""
        self._indicator = ind

    def interval_analyst(
        self,
        condition,
        symbol,
        window=200,
        rolling=False,
        profit_mode=True,
        direction=1,
        group_plot=False,
        applied_price="open",
        fig_save_path=None,
    ):
        """Analyse prices inside the windows that follow an indicator signal.

        Only a DataFrame data set is processed; for any other type this
        method does nothing.

        Args:
            condition: callable taking one indicator value and returning a
                truthy/falsy result.
            symbol: instrument object; in profit mode ``_future_profit``
                reads ``open_cost_rate``, ``close_cost_rate``,
                ``size_value`` and ``slippage`` from it.
            window: int, default 200.  Window size in bars.  With
                ``rolling=True``, -1 means "from the signal to the end of
                the data".  With ``rolling=False``, -1 means every
                consecutive run of rows satisfying the condition forms one
                group, so group lengths differ.
            rolling: bool; True uses ``_roll_identify``, False uses
                ``_group_identify``.
            profit_mode: bool; analyse PnL (True) or the raw price (False).
            direction: 1 or -1, long or short when computing PnL.
            group_plot: bool, default False; draw one curve per group (gets
                very dense when there are many groups).
            applied_price: "open", "low", "high" or "close"; price column
                to analyse.
            fig_save_path: base directory for figures; see
                ``check_fig_path`` for how the final path is built.
        """
        if isinstance(self._data_set, DataFrame):
            self.__data = self._data_set
            # The first "symbol" value names the output sub-directory.
            key = self.__data["symbol"].iat[0]
            self.__indicator = self._indicator
            # NOTE(review): column position 5 is printed as the instrument
            # label -- presumably the "symbol" column; confirm the layout.
            print(u"{}的{}指标描述性统计:".format(self.__data.iat[0, 5], self.__indicator.name))
            print(self.__indicator.describe())
            self.__group = self._interval_analyst(condition, window, rolling)
            self.__symbol = symbol
            self.group_analyst(
                profit_mode,
                direction=direction,
                fig_save_path=self.check_fig_path(fig_save_path, key),
                group_plot=group_plot,
                applied_price=applied_price,
            )
            self._group = self.__group

    def _interval_analyst(self, condition, window, rolling):
        """Validate the indicator and delegate to the chosen labelling routine.

        Args:
            condition: callable returning a truthy/falsy result per value.
            window: window size in bars (-1 has a mode-specific meaning).
            rolling: True selects ``_roll_identify``, False
                ``_group_identify``.

        Returns:
            Whatever the selected routine returns (dict or groupby object).

        Raises:
            ValueError: the indicator is not a Series.
            AssertionError: indicator and data set lengths differ.
        """
        self.__identify = self._roll_identify if rolling else self._group_identify
        if not isinstance(self.__indicator, Series):
            raise ValueError(u"指标类型输入错误!")
        self._ind_len = len(self.__indicator)
        assert self._ind_len == len(self.__data), u"指标的长度应当与数据集长度相同"
        return self.__identify(condition, window=window)

    def _roll_identify(self, condition, window):
        """Map each signal start to the row positions of its window.

        A new group opens on the first row of every run of rows satisfying
        ``condition``; further rows of the same run do not open new groups.

        Args:
            condition: callable returning a truthy/falsy result per value.
            window: number of rows per group, or -1 for "until the end of
                the data".  Windows are clipped at the end of the data.

        Returns:
            dict mapping a 1-based group number to an array of positions.
        """
        groups = {}
        count = 0
        on_state = False
        for i, ind in enumerate(self.__indicator):
            hit = bool(condition(ind))
            if hit and not on_state:
                count += 1
                if window == -1:
                    stop = self._ind_len
                else:
                    # Clip the window so it never runs past the data.
                    stop = min(i + window, self._ind_len)
                groups[count] = np.arange(i, stop)
            on_state = hit
        return groups

    def _group_identify(self, condition, window, less_drop_num=10):
        """Label non-overlapping groups and return the data grouped by label.

        Adds two columns to the current data: one named after the indicator
        (holding the indicator) and ``group_flag`` (group number, NaN for
        rows outside every group).

        Args:
            condition: callable returning a truthy/falsy result per value.
            window: rows per group.  -1 means each consecutive run of rows
                satisfying the condition is one group.
            less_drop_num: with ``window == -1``, runs shorter than this
                are discarded.

        Returns:
            ``groupby("group_flag", as_index=False)`` of the current data.

        Raises:
            ValueError: the current data is not a DataFrame.
        """
        if not isinstance(self.__data, DataFrame):
            raise ValueError("数据集的结构必须是DataFrame")

        total = self._ind_len
        flag_list = [np.nan] * total
        flag = 0
        count = 0
        last_position = 0
        on_state = False
        for i, _ind in enumerate(self.__indicator.values):
            if on_state:
                if window == -1:
                    if condition(_ind):
                        flag_list[i] = flag
                    else:
                        # The run just ended; forget it when it is too short
                        # and give its number back.
                        on_state = False
                        if i - last_position < less_drop_num:
                            flag_list[last_position:i] = [np.nan] * (i - last_position)
                            flag -= 1
                elif count < window:
                    # Still inside the fixed window: signals are ignored.
                    count += 1
                else:
                    count = 0
                    on_state = False
                # Rows handled while a group is active never open a new one.
                continue
            if condition(_ind):
                on_state = True
                flag += 1
                if window == -1:
                    last_position = i
                    flag_list[i] = flag
                elif i + window < total:
                    # Windows that would run past the data are not labelled.
                    flag_list[i:i + window] = [flag] * window
        # A run that is still open at the end of the data is subject to the
        # same minimum length as every other run.
        if window == -1 and on_state and total - last_position < less_drop_num:
            flag_list[last_position:] = [np.nan] * (total - last_position)

        self.__data.loc[:, self.__indicator.name] = self.__indicator
        self.__data.loc[:, "group_flag"] = flag_list
        return self.__data.groupby("group_flag", as_index=False)

    def group_analyst(self, profit_mode, direction=1, fig_save_path=None, group_plot=False, applied_price="open"):
        """Compute per-group statistics, plot them and save the figures.

        Args:
            profit_mode: bool; analyse PnL (True) or the raw price (False).
            direction: 1 or -1, long or short when computing PnL.
            fig_save_path: directory the figures are written to; it is
                passed to ``os.path.join`` as is, so it must be a path.
            group_plot: bool, default False; draw one curve per group.
            applied_price: "open", "low", "high" or "close".

        Raises:
            TypeError: the current grouping is neither a DataFrameGroupBy
                nor a dict.
        """
        print(u"划分的区间数为{}".format(len(self.__group)))
        if isinstance(self.__group, DataFrameGroupBy):
            analyse = self._frame_group_analyst
        elif isinstance(self.__group, dict):
            analyse = self._dict_group_analyst
        else:
            # Fail before any figure is created instead of later on an
            # unbound figure or a None statistics table.
            raise TypeError(
                "unsupported group type: {}".format(type(self.__group).__name__))

        fig, axe = plt.subplots(2, 2)
        g_fig, g_axe = None, None
        if group_plot:
            g_fig, g_axe = plt.subplots()
        stats = analyse(profit_mode, direction, group_plot, applied_price, g_axe)
        fig3, fig4 = self.group_density()

        histograms = (
            ((0, 0), "max", u"最大值分布"),
            ((0, 1), "min", u"最小值分布"),
            ((1, 0), "mean", u"平均值分布"),
            ((1, 1), "std", u"标准差分布"),
        )
        for cell, column, title in histograms:
            stats[column].plot.hist(ax=axe[cell], title=title, bins=60, legend=False)

        if profit_mode:
            fig1, axe1 = plt.subplots(3, 2)
            panels = (
                ("max", u"潜在的最大盈利变动", u"每个样本的最大盈利"),
                ("min", u"潜在的最大亏损变动", u"每个样本的最大亏损"),
                ("mean", u"潜在的平均盈亏变动", u"每个样本的平均盈亏"),
            )
            for row, (column, cum_title, raw_title) in enumerate(panels):
                stats[column].cumsum().plot(ax=axe1[row, 0], title=cum_title, legend=False)
                stats[column].plot(ax=axe1[row, 1], title=raw_title, legend=False)
            fig1.savefig(os.path.join(fig_save_path, u"潜在盈亏分析图.png"))

        fig2, axe2 = plt.subplots(2)
        print(u"达到最大值的所需分钟数的描述统计")
        print(stats["max_arg"].describe())
        stats["max_arg"].plot.hist(ax=axe2[0], title=u"达到最大值的所需时间(minute)的分布", bins=60, legend=False)
        print(u"达到最小值的所需分钟数的描述统计")
        print(stats["min_arg"].describe())
        stats["min_arg"].plot.hist(ax=axe2[1], title=u"达到最小值的所需时间(minute)的分布", bins=60, legend=False)

        self.save_figure(
            fig_obj=[fig, fig2, fig3, fig4],
            save_path=fig_save_path,
            fig_name=[
                u"每一组数据的统计分布.png",
                u"达到极值所需时间分布.png",
                u"概率分布随时间的演化.png",
                u"统计特征随时间的演化",
            ],
        )
        if g_fig is not None:
            g_fig.savefig(os.path.join(fig_save_path, u"窗口盈亏变动图.png"))
        plt.show()

    def _group_apply_func(self, x, _direction=1, my_func=None, arg_func=None, _profit_mode=True,
                          apply_price="open", in_position=1, symbol=None):
        """Per-group callback used with ``DataFrameGroupBy.apply``.

        Args:
            x: DataFrame holding the rows of one group.
            _direction: 1 or -1.
            my_func: reducing function applied to the analysed series
                (used only when ``arg_func`` is None).
            arg_func: function returning the index label of an extreme of
                the analysed series; takes precedence over ``my_func``.
            _profit_mode: bool, default True; analyse PnL or raw price.
            apply_price: "open", "low", "high" or "close".
            in_position: row position of the entry bar inside the group.
            symbol: instrument object, needed in profit mode.

        Returns:
            A one-element Series with ``my_func``'s result, a one-element
            Series with the minutes between the entry bar and the label
            returned by ``arg_func``, or the analysed series itself when
            neither function is given.

        Raises:
            AssertionError: invalid direction, or the group has no row at
                ``in_position``.
        """
        assert _direction in (1, -1), u"direction只能取1和-1"
        assert len(x) > in_position, u"每组的长度不能为{}".format(in_position)

        if _profit_mode:
            group_data = self._future_profit(x, symbol, _direction, apply_price, in_position)
        else:
            group_data = x[apply_price]

        if arg_func is not None:
            elapsed = arg_func(group_data) - x[apply_price].index[in_position]
            # total_seconds() keeps whole days and the sign of the offset;
            # the ``seconds`` attribute holds only the within-day component.
            return Series(elapsed.total_seconds() / 60)
        if my_func is not None:
            return Series(my_func(group_data))
        return group_data

    @staticmethod
    def _future_profit(x, symbol, _direction, apply_price, in_position):
        """Futures PnL of every row relative to the entry row.

        Args:
            x: DataFrame with an ``apply_price`` column.
            symbol: object providing ``open_cost_rate``,
                ``close_cost_rate``, ``size_value`` and ``slippage``.
            _direction: 1 or -1.
            apply_price: name of the price column.
            in_position: row position of the entry bar.

        Returns:
            Series aligned with ``x``: size_value * (direction * (price -
            entry price) - slippage) minus the closing cost at each row's
            price and the opening cost at the entry price.
        """
        prices = x[apply_price]
        entry_price = prices.iat[in_position]
        open_cost = symbol.open_cost_rate * entry_price * symbol.size_value
        # Kept as a Series (all zeros when the rate is zero) so that the
        # subtraction below is always well defined.
        close_cost = symbol.close_cost_rate * prices * symbol.size_value
        gross = symbol.size_value * (_direction * (prices - entry_price) - symbol.slippage)
        return gross - close_cost - open_cost

    def _frame_group_analyst(self, _profit_mode, _direction, _group_plot, _applied_price, _axe):
        """Per-group statistics when the grouping is a DataFrameGroupBy.

        Also stores the analysed per-group series for ``group_density``
        and, in profit mode with ``_group_plot``, draws one curve per group
        on ``_axe``.

        Returns:
            DataFrame built by concatenating the max, max_arg, min,
            min_arg, mean and std results under those keys.
        """
        common = {"_direction": _direction, "apply_price": _applied_price}
        if _profit_mode:
            common["symbol"] = self.__symbol
        else:
            common["_profit_mode"] = False

        def per_group(**extra):
            """Apply ``_group_apply_func`` to every group with shared options."""
            options = dict(common)
            options.update(extra)
            return self.__group.apply(self._group_apply_func, **options)

        # idxmax/idxmin return index labels, which is what the elapsed-time
        # computation in _group_apply_func subtracts the entry label from.
        max_arg = per_group(arg_func=Series.idxmax)
        min_arg = per_group(arg_func=Series.idxmin)
        if _profit_mode:
            max_plot = per_group(my_func=np.max)
            min_plot = per_group(my_func=np.min)
            mean_plot = per_group(my_func=np.mean)
            std_plot = per_group(my_func=np.std)
        else:
            max_plot = self.__group.max()[_applied_price]
            min_plot = self.__group.min()[_applied_price]
            mean_plot = self.__group.mean()[_applied_price]
            std_plot = self.__group.std()[_applied_price]

        # group_density always reads this series, so it is built on every
        # run and not only when the per-group curves are requested.
        self.__profit = per_group()
        if _profit_mode and _group_plot:
            for g in self.__profit.index.levels[0]:
                _axe.plot(self.__profit[g].values)

        return pd.concat(
            [max_plot, max_arg, min_plot, min_arg, mean_plot, std_plot],
            axis=1,
            keys=["max", "max_arg", "min", "min_arg", "mean", "std"])

    def _dict_group_analyst(self, _profit_mode, _direction, _group_plot, _applied_price, _axe):
        """Per-group statistics when the grouping is a dict of row positions.

        Every group must contain a row at the entry position (1).  The
        analysed per-group series is stored for ``group_density``; in
        profit mode with ``_group_plot`` one curve per group is drawn on
        ``_axe``.

        Returns:
            DataFrame with columns max, max_arg, min, min_arg, mean, std
            and one row per group.
        """
        # The price column is selected by position.
        # NOTE(review): assumes the first four columns are open/high/low/close.
        index_map = {"open": 0, "high": 1, "low": 2, "close": 3}
        column = index_map[_applied_price]
        in_position = 1

        stats = {"max": [], "max_arg": [], "min": [], "min_arg": [], "mean": [], "std": []}
        values = []
        top_index = []
        bottom_index = []
        for key in self.__group:
            data_ = self.__data.iloc[self.__group[key], column]
            if _profit_mode:
                # _future_profit looks the price up by column name, so the
                # selected column is handed over as a one-column frame.
                series = self._future_profit(data_.to_frame(_applied_price), self.__symbol,
                                             _direction, _applied_price, in_position)
            else:
                series = data_

            values.extend(series.values)
            top_index.extend([key] * len(series.index))
            bottom_index.extend(series.index.values)

            entry_label = series.index[in_position]
            stats["max"].append(series.max())
            stats["max_arg"].append((series.idxmax() - entry_label).total_seconds() / 60)
            stats["min"].append(series.min())
            stats["min_arg"].append((series.idxmin() - entry_label).total_seconds() / 60)
            stats["mean"].append(series.mean())
            stats["std"].append(series.std())
            if _profit_mode and _group_plot:
                _axe.plot(series.values)

        index = pd.MultiIndex.from_arrays([top_index, bottom_index], names=[None, 'date'])
        self.__profit = Series(values, index=index)
        return DataFrame(stats)

    def group_density(self, bin_num=40, window=200, plot_surface=True):
        """Plot how the distribution of the analysed series evolves over time.

        For each of the first ``window`` row positions inside a group, the
        values of all groups at that position are summarised (max, min,
        mean) and histogrammed over bins shared by all positions.

        Args:
            bin_num: number of bin edges spanning the overall min..max.
            window: number of in-group positions to evaluate.
            plot_surface: draw one 3-D surface (True) or one 3-D line per
                position (False).

        Returns:
            Tuple (3-D density figure, figure with the max/min/mean curves).
        """
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        fig_1, ax_1 = plt.subplots(3)

        max_profit = self.__profit.max()
        min_profit = self.__profit.min()
        bin_edges = np.linspace(min_profit, max_profit, bin_num)
        g_profit = self.__profit.groupby(level=0)

        max_list, min_list, mean_list = [], [], []
        xs, ys, zs = [], [], []
        for i in range(window):
            step = g_profit.nth(i)  # values of every group at position i
            max_list.append(step.max())
            min_list.append(step.min())
            mean_list.append(step.mean())
            hist, bins = np.histogram(step.values, bins=bin_edges, density=True)
            xs.append(bins[:-1])
            ys.append(i * np.ones(bin_num - 1))
            # density * bin width = probability mass of each bin
            zs.append(hist * np.diff(bins))
            if not plot_surface:
                ax.plot(xs[-1], ys[-1], zs=zs[-1])

        ax_1[0].plot(max_list)
        ax_1[0].set_title(u"最大值随时间的演化")
        ax_1[1].plot(min_list)
        ax_1[1].set_title(u"最小值随时间的演化")
        ax_1[2].plot(mean_list)
        ax_1[2].set_title(u"平均值随时间的演化")

        if plot_surface:
            surf = ax.plot_surface(xs, ys, zs, rstride=1, cstride=1, cmap=cm.coolwarm,
                                   linewidth=0, antialiased=False)
            ax.set_zlim(0, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter('% .02f'))
            fig.colorbar(surf, shrink=0.5, aspect=5)
        return fig, fig_1

    def save_group_data(self, file_path, file_patch):
        """Write every entry of the data set to CSV.

        Each key ``k`` of the data set is written to
        ``file_path/<k.lower()><file_patch>``.
        NOTE(review): iterates the data set by key, so this presumably
        targets the dict form -- confirm before using with a DataFrame.
        """
        for k in self._data_set:
            self._data_set[k].to_csv(
                os.path.join(file_path, k.lower() + file_patch))

    @staticmethod
    def save_figure(fig_obj, save_path, fig_name):
        """Save each figure under ``save_path`` using the paired file name."""
        for f, n in zip(fig_obj, fig_name):
            f.savefig(os.path.join(save_path, n))

    @staticmethod
    def check_fig_path(path, dir_name):
        """Build (and create if missing) the directory figures are saved to.

        Args:
            path: base directory, or None to use the current working
                directory.
            dir_name: sub-directory name appended below "analyst_result".

        Returns:
            ``<base>/analyst_result/<dir_name>``.
        """
        if path is None:
            print('dir_name: ', dir_name)
            path = os.path.join(os.getcwd(), "analyst_result", dir_name)
        else:
            path = os.path.join(path, "analyst_result", dir_name)
        if not os.path.exists(path):
            os.makedirs(path)
        return path