Beispiel #1
0
    def test_groupby(self):
        data = Series(np.arange(9) / 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        repr(grouped.groups)  # nothing else here

        for k, v in grouped:
            self.assertEqual(len(v), 3)

        agged = grouped.aggregate(np.mean)
        self.assertEqual(agged[1], 1)

        assert_series_equal(agged, grouped.agg(np.mean))  # shorthand

        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        value_grouped = data.groupby(data)
        assert_series_equal(value_grouped.aggregate(np.mean), agged)

        # complex agg
        agged = grouped.aggregate([np.mean, np.std])
        agged = grouped.aggregate({'one': np.mean, 'two': np.std})

        group_constants = {0: 10, 1: 20, 2: 30}
        agged = grouped.agg(lambda x: group_constants[x.groupName] + x.mean())
        self.assertEqual(agged[1], 21)

        # corner cases
        self.assertRaises(Exception, grouped._aggregate_named, lambda x: x * 2)
Beispiel #2
0
    def test_groupby(self):
        data = Series(np.arange(9) / 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        repr(grouped.groups) # nothing else here

        for k, v in grouped:
            self.assertEqual(len(v), 3)

        agged = grouped.aggregate(np.mean)
        self.assertEqual(agged[1], 1)

        assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
        assert_series_equal(agged, grouped.mean())

        assert_series_equal(grouped.agg(np.sum), grouped.sum())


        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        value_grouped = data.groupby(data)
        assert_series_equal(value_grouped.aggregate(np.mean), agged)

        # complex agg
        agged = grouped.aggregate([np.mean, np.std])
        agged = grouped.aggregate({'one' : np.mean,
                                   'two' : np.std})

        group_constants = {
            0 : 10,
            1 : 20,
            2 : 30
        }
        agged = grouped.agg(lambda x: group_constants[x.groupName] + x.mean())
        self.assertEqual(agged[1], 21)

        # corner cases
        self.assertRaises(Exception, grouped._aggregate_named,
                          lambda x: x * 2)
Beispiel #3
0
    def test_groupby_transform(self):
        data = Series(np.arange(9) / 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        # corner cases
        self.assertRaises(Exception, grouped.transform, lambda x: x.mean())
Beispiel #4
0
    def test_groupby_transform(self):
        data = Series(np.arange(9) / 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        # corner cases
        self.assertRaises(Exception, grouped.transform,
                          lambda x: x.mean())
Beispiel #5
0
    def test_groupby_transform(self):
        data = Series(np.arange(9) / 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        transformed = grouped.transform(np.mean)
        for name, group in grouped:
            mean = group.mean()
            for idx in group.index:
                self.assertEqual(transformed[idx], mean)
Beispiel #6
0
    def test_groupby_transform(self):
        data = Series(np.arange(9) / 3, index=np.arange(9))

        index = np.arange(9)
        np.random.shuffle(index)
        data = data.reindex(index)

        grouped = data.groupby(lambda x: x // 3)

        transformed = grouped.transform(lambda x: x * x.sum())
        self.assertEqual(transformed[7], 12)

        transformed = grouped.transform(np.mean)
        for name, group in grouped:
            mean = group.mean()
            for idx in group.index:
                self.assertEqual(transformed[idx], mean)
Beispiel #7
0
class IndicatorAnalyst(object):
    """
    指标的统计分析类:
    1)原始数据情况分析
    2)添加不同指标后,原始数据被分成了长度为window的数据集,一种是不重复地分组,另一组是移动分组
    3)对组内的数据进行描述统计分析和整个品种的描述统计分析
    """
    def __init__(self, data_set, indicator=None):
        """
        Args:
            data_set: dict(symbol=DataFrame)或DataFrame, 待分析的数据集是一个以品种名为key,value是DataFrame或者是一个DataFrame
            indicator: Series,指标序列,默认是一个空的Series,可以通过直接设置indicator属性设置,或者在类内编写指标获得
        Notes:
            数据集的长度应当与indicator长度相同,否则会报错
        """
        self.__identify = None  # 识别标签函数对象,目前主要是_group_identify 和 _rolling_identify
        self.__indicator = None  # 当前处理的指标对象
        self.__data = None  # 当前处理的数据集对象
        self.__group = None  # 当前处理的分组对象
        self.__symbol = None  # 当前品种对象
        self.__profit = None  # 当前品种的盈亏序列
        self._data_set = data_set.copy()  # 总体数据集
        self._indicator = Series() if indicator is None else indicator
        self._ind_len = 0  # 当前处理的指标数据行数
        self._group = None

    @property
    def data_set(self):
        return self._data_set

    @property
    def group(self):
        """
        按照条件后分组的对象集合,若输入的数据集是dict,则返回dict,若是DataFrame则返回DataFrame
        """
        return self._group

    @property
    def indicator(self):
        """指标序列"""
        return self._indicator

    @indicator.setter
    def indicator(self, ind):
        """设置指标序列"""
        self._indicator = ind

    def interval_analyst(
        self,
        condition,
        symbol,
        window=200,
        rolling=False,
        profit_mode=True,
        direction=1,
        group_plot=False,
        applied_price="open",
        fig_save_path=None,
    ):
        """
         分析指标满足条件下,在之后的窗口内价格的统计信息
        Args:
            condition: [func]返回, True或False的函数对象;
            symbol:[dict or Symbol], 统计的品种对象;
            window:[int, -1, default 200] 观察窗口的大小,默认是200个bar,当rolling为True时,window取-1表示将满足条件开始的
                点直到最后一个数据归为一组;当rolling为False,window取-1时,表示每一次满足条件区间内的数据分为一组,这种情况下
                每一组的长度不相等。
            rolling: [True, False],窗口是采用滚动模式还是截断分组,默认是每组数据重叠的截断分组;
            profit_mode: [True, False],计算盈利模式
            direction: [1, -1],计算盈利时多空的方向
            group_plot: [bool, default False], 绘制每一组数据的价格,当组数很大时将会绘制的很密集
            applied_price: ["open", "low", "high", "close", default "open"],分析采用的价格
            fig_save_path: [list, str, path] ,保存图片的路径,默认存储在
        Returns:

        """
        if isinstance(self._data_set, DataFrame):
            self.__data = self._data_set

            key = self.__data["symbol"].iat[0]
            # print('symbol: {};\n key: {}'.format(self.__data, key))
            self.__indicator = self._indicator
            print(u"{}的{}指标描述性统计:".format(self.__data.iat[0, 5],
                                          self.__indicator.name))
            print(self.__indicator.describe())
            self.__group = self._interval_analyst(condition, window, rolling)
            self.__symbol = symbol
            self.group_analyst(profit_mode,
                               direction=direction,
                               fig_save_path=self.check_fig_path(
                                   fig_save_path, key),
                               group_plot=group_plot,
                               applied_price=applied_price)
            self._group = self.__group

    def _interval_analyst(self, condition, window, rolling):
        """
        分析指标满足条件下,在之后的窗口内价格的统计信息
        Args:
            condition: 返回True或False的函数对象
            window: 观察窗口的大小,默认是200个bar
            rolling: 窗口是采用滚动模式还是截断分组,默认是每组数据重叠的截断分组
        """
        if rolling:
            self.__identify = self._roll_identify
        else:
            self.__identify = self._group_identify

        if isinstance(self.__indicator, Series):
            self._ind_len = len(self.__indicator)
            assert self._ind_len == len(self.__data), u"指标的长度应当与数据集长度相同"
        else:
            raise ValueError(u"指标类型输入错误!")
        return self.__identify(condition, window=window)

    def _roll_identify(self, condition, window):
        """将满足条件的行及随后的window个数据识别成一类,并将其下标存储在groups中"""
        groups = {}
        count = 0
        on_state = False
        for i, ind in enumerate(self.__indicator):
            if on_state:
                if condition(ind):
                    continue
                else:
                    on_state = False

            if condition(ind):
                count += 1
                on_state = True
                # 当窗口为无限长,既到数据末尾
                if window == -1:
                    groups[count] = np.arange(i, self._ind_len)
                else:
                    if i + window < self._ind_len:
                        groups[count] = np.arange(i, i + window)
                    else:
                        groups[count] = np.arange(i, self._ind_len)
        return groups

    def _group_identify(self, condition, window, less_drop_num=10):
        """
        识别满足指标条件的行,并按照1到n的标志分组,原始数据添加一列name为指标name的标志数据,当window=-1时,为取满足
        区间内的数据分为一组,且组内数据小于less_drop_num的剔除
        Args:
            condition: 指标的条件
            window: 窗口的大小
            less_drop_num: 窗口内数据太小需要剔除的临界值

        Returns:

        """
        flag_list = [np.nan] * self._ind_len
        flag = 0
        count = 0
        last_position = 0
        on_state = False
        if isinstance(self.__data, DataFrame):
            for i, _ind in enumerate(self.__indicator.values):
                if on_state:

                    # 当窗口为符合条件的区间时
                    if window == -1:
                        if not condition(_ind):
                            # 当这组数据小于less_drop_num,不考虑这个样本
                            if i - last_position < less_drop_num:
                                flag_list[last_position:i] = [np.nan] * (
                                    i - last_position)
                                flag -= 1
                            on_state = False
                        else:
                            flag_list[i] = flag
                    else:
                        if count < window:
                            count += 1
                        else:
                            count = 0
                            on_state = False
                    continue

                if condition(_ind):
                    on_state = True
                    flag += 1
                    # 当窗口为符合条件的区间时
                    if window == -1:
                        last_position = i
                        flag_list[i] = flag
                    else:
                        if (i + window) < self._ind_len:
                            flag_list[i:(i + window)] = [flag] * window

        else:
            raise ValueError("数据集的结构必须是DataFrame")
        self.__data.loc[:, self.__indicator.name] = self.__indicator
        self.__data.loc[:, "group_flag"] = flag_list
        return self.__data.groupby("group_flag", as_index=False)

    def group_analyst(self,
                      profit_mode,
                      direction=1,
                      fig_save_path=None,
                      group_plot=False,
                      applied_price="open"):
        """
        数据分组分析,默认分析的是开盘价
        Args:
            profit_mode: [bool, default False],分析盈亏或价格
            direction: [1, -1],计算盈利时多空的方向
            fig_save_path:[list, str, path] ,保存图片的路径
            group_plot: [bool, default False], 绘制每一组数据的价格,当组数很大时将会绘制的很密集
            applied_price: ["open", "low", "high", "close", default "open"],分析采用的价格
        """
        # 开始分析绘制图表
        print(u"划分的区间数为{}".format(len(self.__group)))
        group_analyst = None
        fig, axe = plt.subplots(2, 2)

        g_fig, g_axe = None, None
        if group_plot:
            g_fig, g_axe = plt.subplots()

        fig3 = None
        if isinstance(self.__group, DataFrameGroupBy):
            group_analyst = self._frame_group_analyst(profit_mode, direction,
                                                      group_plot,
                                                      applied_price, g_axe)
            fig3, fig4 = self.group_density()
        elif isinstance(self.__group, dict):
            group_analyst = self._dict_group_analyst(profit_mode, direction,
                                                     group_plot, applied_price,
                                                     g_axe)
            fig3, fig4 = self.group_density()
        group_analyst["max"].plot.hist(ax=axe[(0, 0)],
                                       title=u"最大值分布",
                                       bins=60,
                                       legend=False)
        group_analyst["min"].plot.hist(ax=axe[(0, 1)],
                                       title=u"最小值分布",
                                       bins=60,
                                       legend=False)
        group_analyst["mean"].plot.hist(ax=axe[(1, 0)],
                                        title=u"平均值分布",
                                        bins=60,
                                        legend=False)
        group_analyst["std"].plot.hist(ax=axe[(1, 1)],
                                       title=u"标准差分布",
                                       bins=60,
                                       legend=False)

        if profit_mode:
            fig1, axe1 = plt.subplots(3, 2)
            group_analyst["max"].cumsum().plot(ax=axe1[0, 0],
                                               title=u"潜在的最大盈利变动",
                                               legend=False)
            group_analyst["max"].plot(ax=axe1[0, 1],
                                      title=u"每个样本的最大盈利",
                                      legend=False)
            group_analyst["min"].cumsum().plot(ax=axe1[1, 0],
                                               title=u"潜在的最大亏损变动",
                                               legend=False)
            group_analyst["min"].plot(ax=axe1[1, 1],
                                      title=u"每个样本的最大亏损",
                                      legend=False)
            group_analyst["mean"].cumsum().plot(ax=axe1[2, 0],
                                                title=u"潜在的平均盈亏变动",
                                                legend=False)
            group_analyst["mean"].plot(ax=axe1[2, 1],
                                       title=u"每个样本的平均盈亏",
                                       legend=False)
            fig1.savefig(os.path.join(fig_save_path, u"潜在盈亏分析图.png"))

        fig2, axe2 = plt.subplots(2)
        print(u"达到最大值的所需分钟数的描述统计")
        print(group_analyst["max_arg"].describe())
        group_analyst["max_arg"].plot.hist(ax=axe2[0],
                                           title=u"达到最大值的所需时间(minute)的分布",
                                           bins=60,
                                           legend=False)
        print(u"达到最小值的所需分钟数的描述统计")
        print(group_analyst["min_arg"].describe())
        group_analyst["min_arg"].plot.hist(ax=axe2[1],
                                           title=u"达到最小值的所需时间(minute)的分布",
                                           bins=60,
                                           legend=False)

        self.save_figure(fig_obj=[fig, fig2, fig3, fig4],
                         save_path=fig_save_path,
                         fig_name=[
                             u"每一组数据的统计分布.png", u"达到极值所需时间分布.png",
                             u"概率分布随时间的演化.png", u"统计特征随时间的演化"
                         ])
        if g_fig is not None:
            g_fig.savefig(os.path.join(fig_save_path, u"窗口盈亏变动图.png"))
        plt.show()

    def _group_apply_func(self,
                          x,
                          _direction=1,
                          my_func=None,
                          arg_func=None,
                          _profit_mode=True,
                          apply_price="open",
                          in_position=1,
                          symbol=None):
        """
        DataFrameGroupBy的具体的apply函数
        Args:
            x: [Series],每一组数据
            _direction: [1, -1],方向
            my_func: [func],Series自带一些统计函数
            arg_func: [func],numpy中的函数
            _profit_mode: [bool, default True],选择分析盈亏还是价格
            apply_price: ["open", "low", "high", "close", default "open"],分析采用的价格
            in_position: [int],计算盈亏时,进场点的位置
            symbol: [Symbol], 品种对象
        Returns:
           返回一个Series
        """
        assert _direction in (1, -1), u"direction只能取1和-1"
        assert len(x) > in_position, u"每组的长度不能为{}".format(in_position)
        # 分析的数据的选择
        if _profit_mode:
            group_data = self._future_profit(x, symbol, _direction,
                                             apply_price, in_position)
        else:
            group_data = x[apply_price]

        if arg_func is None and my_func is not None:
            return Series(my_func(group_data))
        elif arg_func is not None:
            # print('arg_func: {};\n group_data: {};\nx:{}\n in_position: {}'.format(arg_func,group_data,x,in_position))
            # print((arg_func(group_data) - x[apply_price].index[in_position]))
            return Series((arg_func(group_data) -
                           x[apply_price].index[in_position]).seconds / 60)
        else:
            return group_data

    @staticmethod
    def _future_profit(x, symbol, _direction, apply_price, in_position):
        """期货盈亏计算"""
        open_cost, close_cost = 0.0, []
        if symbol.open_cost_rate != 0.0:
            # print("x: {};\n apply_price: {};\n in_position: {}".format(x, apply_price, in_position))
            open_cost = symbol.open_cost_rate * x[apply_price].iat[
                in_position] * symbol.size_value
        if symbol.close_cost_rate != 0.0:
            close_cost = [
                symbol.close_cost_rate * price * symbol.size_value
                for price in x[apply_price]
            ]
        return symbol.size_value * (
            _direction * (x[apply_price] - x[apply_price].iat[in_position]) -
            symbol.slippage) - close_cost - open_cost

    def _frame_group_analyst(self, _profit_mode, _direction, _group_plot,
                             _applied_price, _axe):
        """使用DataFrameGroupBy类的分组分析"""
        if _profit_mode:
            max_plot = self.__group.apply(self._group_apply_func,
                                          symbol=self.__symbol,
                                          my_func=np.max,
                                          _direction=_direction,
                                          apply_price=_applied_price)
            max_arg = self.__group.apply(self._group_apply_func,
                                         symbol=self.__symbol,
                                         arg_func=np.argmax,
                                         _direction=_direction,
                                         apply_price=_applied_price)
            min_plot = self.__group.apply(self._group_apply_func,
                                          symbol=self.__symbol,
                                          my_func=np.min,
                                          _direction=_direction,
                                          apply_price=_applied_price)
            min_arg = self.__group.apply(self._group_apply_func,
                                         symbol=self.__symbol,
                                         arg_func=np.argmin,
                                         _direction=_direction,
                                         apply_price=_applied_price)
            mean_plot = self.__group.apply(self._group_apply_func,
                                           symbol=self.__symbol,
                                           my_func=np.mean,
                                           _direction=_direction,
                                           apply_price=_applied_price)
            std_plot = self.__group.apply(self._group_apply_func,
                                          symbol=self.__symbol,
                                          my_func=np.std,
                                          _direction=_direction,
                                          apply_price=_applied_price)

            # 每组数据绘制图片
            if _group_plot:
                self.__profit = self.__group.apply(self._group_apply_func,
                                                   symbol=self.__symbol,
                                                   _direction=_direction,
                                                   apply_price=_applied_price)
                for g in self.__profit.index.levels[0]:
                    _axe.plot(self.__profit[g].values)
        else:
            max_plot = self.__group.max()[_applied_price]
            max_arg = self.__group.apply(self._group_apply_func,
                                         arg_func=np.argmax,
                                         _profit_mode=False,
                                         _direction=_direction,
                                         apply_price=_applied_price)
            min_plot = self.__group.min()[_applied_price]
            min_arg = self.__group.apply(self._group_apply_func,
                                         arg_func=np.argmin,
                                         _profit_mode=False,
                                         _direction=_direction,
                                         apply_price=_applied_price)
            mean_plot = self.__group.mean()[_applied_price]
            std_plot = self.__group.std()[_applied_price]

        return pd.concat(
            [max_plot, max_arg, min_plot, min_arg, mean_plot, std_plot],
            axis=1,
            keys=["max", "max_arg", "min", "min_arg", "mean", "std"])

    def _dict_group_analyst(self, _profit_mode, _direction, _group_plot,
                            _applied_price, _axe):
        """字典形式的分组分析"""
        max_list, max_arg_list, min_list, min_arg_list, mean_list, std_list = [], [], [], [], [], []
        index_map = {"open": 0, "high": 1, "low": 2, "close": 3}
        index = index_map[_applied_price]
        in_position = 1
        profit_list = []
        top_index = []
        bottom_index = []
        for key in self.__group:
            data_ = self.__data.iloc[self.__group[key], index]
            if _profit_mode:
                profit = self._future_profit(data_, self.__symbol, _direction,
                                             _applied_price, in_position)
                profit_list.extend(profit.values)
                top_index.extend([key] * len(profit.index))
                bottom_index.extend(profit.index.values)
                max_list.append(profit.max())
                max_arg_list.append(
                    (profit.argmax() -
                     profit.index[in_position]).total_seconds() / 60)
                min_list.append(profit.min())
                min_arg_list.append(
                    (profit.argmin() -
                     profit.index[in_position]).total_seconds() / 60)
                mean_list.append(profit.mean())
                std_list.append(profit.std())
                if _group_plot:
                    _axe.plot(profit.values)
            else:
                max_list.append(data_.max())
                max_arg_list.append(
                    (data_.argmax() -
                     data_.index[in_position]).total_seconds() / 60)
                min_list.append(data_.min())
                min_arg_list.append(
                    (data_.argmax() -
                     data_.index[in_position]).total_seconds() / 60)
                mean_list.append(data_.mean())
                std_list.append(data_.std())
        index = pd.MultiIndex.from_arrays([top_index, bottom_index],
                                          names=[None, 'date'])
        self.__profit = Series(profit_list, index=index)
        return DataFrame({
            "max": max_list,
            "max_arg": max_arg_list,
            "min": min_list,
            "min_arg": min_arg_list,
            "mean": mean_list,
            "std": std_list
        })

    def group_density(self, bin_num=40, window=200, plot_surface=True):
        """绘制每组数据的概率密度随时间的变化图"""
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        fig_1, ax_1 = plt.subplots(3)
        max_profit = self.__profit.max()
        min_profit = self.__profit.min()
        g_profit = self.__profit.groupby(level=0)
        max_list, min_list, mean_list = [], [], []
        xs = []
        ys = []
        zs = []
        for i in range(window):
            max_list.append(g_profit.nth(i).max())
            min_list.append(g_profit.nth(i).min())
            mean_list.append(g_profit.nth(i).mean())
            hist, bins = np.histogram(g_profit.nth(i).values,
                                      bins=np.linspace(min_profit, max_profit,
                                                       bin_num),
                                      density=True)
            xs.append(bins[:-1])
            ys.append(i * np.ones(bin_num - 1))
            zs.append(hist * np.diff(bins))
            if not plot_surface:
                ax.plot(xs[-1], ys[-1], zs=zs[-1])

        ax_1[0].plot(max_list)
        ax_1[0].set_title(u"最大值随时间的演化")
        ax_1[1].plot(min_list)
        ax_1[1].set_title(u"最小值随时间的演化")
        ax_1[2].plot(mean_list)
        ax_1[2].set_title(u"平均值随时间的演化")

        if plot_surface:
            surf = ax.plot_surface(xs,
                                   ys,
                                   zs,
                                   rstride=1,
                                   cstride=1,
                                   cmap=cm.coolwarm,
                                   linewidth=0,
                                   antialiased=False)
            ax.set_zlim(0, 1)
            ax.zaxis.set_major_locator(LinearLocator(10))
            ax.zaxis.set_major_formatter(FormatStrFormatter('% .02f'))
            fig.colorbar(surf, shrink=0.5, aspect=5)
        return fig, fig_1

    def save_group_data(self, file_path, file_patch):
        """保存分组数据"""
        for k in self._data_set:
            self._data_set[k].to_csv(
                os.path.join(file_path,
                             k.lower() + file_patch))

    @staticmethod
    def save_figure(fig_obj, save_path, fig_name):
        """保存图片"""
        for f, n in zip(fig_obj, fig_name):
            f.savefig(os.path.join(save_path, n))

    @staticmethod
    def check_fig_path(path, dir_name):
        """检查存储路径是否合法"""
        if path is None:
            print('dir_name: ', dir_name)
            path = os.path.join(os.getcwd(), "analyst_result", dir_name)
        else:
            path = os.path.join(path, "analyst_result", dir_name)
        if not os.path.exists(path):
            os.makedirs(path)
        return path