Example #1
0
 def test_swapaxes(self):
     """Check swapaxes against the transpose, the identity and a bad axis."""
     frame = DataFrame(np.random.randn(10, 5))
     transposed = frame.T

     # Swapping the two axes in either order must equal the transpose.
     for first, second in ((0, 1), (1, 0)):
         tm.assert_frame_equal(transposed, frame.swapaxes(first, second))

     # Swapping an axis with itself leaves the frame unchanged.
     tm.assert_frame_equal(frame, frame.swapaxes(0, 0))

     expected_error = "No axis named 2 for object type DataFrame"
     with pytest.raises(ValueError, match=expected_error):
         frame.swapaxes(2, 5)
    def MLE_fast_func(self, p0t: list):
        """
        Evaluate the MLE expressions for every time step after substituting p0.

        Can only work if beta is not 0.

        :param p0t: values of p0, one per entry of ``self.N_time`` and in the
            same order
        :return: numpy array holding one value per entry of ``self.N_time``
        :raises KeyError: if ``self.beta`` is 0
        """
        if self.beta == 0:
            raise KeyError("MLE_fast_func cannot be used when beta is 0")

        series_p0 = Series(p0t, self.N_time)

        M = self.Mi(series_p0)

        # After the swap the first index level of K is the feature, so
        # ``fixed_k[i]`` selects the values of feature ``i``.
        fixed_k = self.K.swaplevel(0, 1)

        part_a = DataFrame(index=self.N_time,
                           columns=self.N_feature,
                           dtype=float)
        part_c = DataFrame(index=self.N_time,
                           columns=self.N_feature,
                           dtype=float)
        upper_part_for_first_argument = DataFrame(index=self.N_time,
                                                  columns=self.N_feature)
        for i in self.N_feature:
            upper_part_for_first_argument[i] = fixed_k[i].apply(
                lambda x: (x + self.alpha) * M[i]) - \
                self.fixed_n_without_sigma[i]
            part_a[i] = fixed_k[i].apply(lambda x: self.sigma**2 * (
                x + self.alpha) * M[i]) - self.fixed_n_with_sigma[i]
            part_c[i] = series_p0.apply(lambda x: M[i] - x * self.teta[i])

        # Transpose every frame exactly once so that a time label selects a
        # Series over the features. ``.T`` replaces the deprecated
        # ``DataFrame.swapaxes(0, 1)`` and hoists the transpose of ``part_a``
        # out of the loop below.
        part_a = part_a.T
        part_c = part_c.T
        upper_part_for_first_argument = upper_part_for_first_argument.T

        first_t = self.N_time[0]
        results = [
            sum(upper_part_for_first_argument[first_t] /
                (series_p0[first_t] * part_c[first_t]))
        ]
        # Walk consecutive (previous, current) time labels instead of looking
        # the previous label up with ``list.index`` on every iteration.
        for previous_t, t in zip(self.N_time, self.N_time[1:]):
            part_b = series_p0[t] * (series_p0[t] - series_p0[previous_t])
            results.append(
                sum((part_a[t] - (part_b * part_c[t])) / part_c[t]))

        # Build the array once rather than re-allocating via np.append.
        return np.array(results)
Example #3
0
 def test_swapaxes(self):
     """Check swapaxes against the transpose, the identity and a bad axis."""
     frame = DataFrame(np.random.randn(10, 5))
     flipped = frame.T

     tm.assert_frame_equal(flipped, frame.swapaxes(0, 1))
     tm.assert_frame_equal(flipped, frame.swapaxes(1, 0))
     tm.assert_frame_equal(frame, frame.swapaxes(0, 0))

     # The class part of the message is a regex so that it matches both the
     # dense and the sparse frame class names.
     expected_error = (
         "No axis named 2 for object type "
         r"<class 'pandas.core(.sparse)?.frame.(Sparse)?DataFrame'>"
     )
     with pytest.raises(ValueError, match=expected_error):
         frame.swapaxes(2, 5)
    def calculate_qi_based_on_MLE(self, p0t):
        """
        Compute one ratio per feature for the given p0 values.

        For every feature ``i`` the numerator is the sum over time of
        ``K + alpha`` and the denominator is the sum over time of
        ``(N + alpha + beta) * p0[t]``. ``self.K`` and ``self.N`` are indexed
        here as (time, feature) two-level objects.

        :param p0t: values of p0, one per entry of ``self.N_time`` and in the
            same order
        :return: float Series indexed by ``self.N_feature`` holding
            numerator / denominator
        """
        series_p0 = Series(p0t, self.N_time)

        # After the swap the first index level of K is the feature, so
        # ``fixed_k[i]`` selects the values of feature ``i``.
        fixed_k = self.K.swaplevel(0, 1)

        # The element-wise shift does not depend on the level order, so N is
        # used directly instead of swapping its levels and swapping them back.
        pre_denominator = self.N.apply(
            lambda x: x + self.alpha + self.beta).astype(float)

        almost_denominator = DataFrame(columns=self.N_time,
                                       index=self.N_feature,
                                       dtype=float)
        for t in self.N_time:
            almost_denominator[t] = pre_denominator[t].apply(
                lambda x: x * series_p0[t])

        # ``.T`` replaces the deprecated ``DataFrame.swapaxes(0, 1)``; after
        # it a feature label selects that feature's values over time.
        denominator_by_feature = almost_denominator.T

        # An explicit dtype keeps the result float64 instead of falling back
        # to the object dtype of an empty Series.
        nominator = Series(index=self.N_feature, dtype=float)
        denominator = Series(index=self.N_feature, dtype=float)
        for i in self.N_feature:
            nominator[i] = sum(fixed_k[i].apply(lambda x: x + self.alpha))
            denominator[i] = sum(denominator_by_feature[i])

        return nominator / denominator
Example #5
0
def create_df(preprocessed_markov_chain):
    """
    Build a DataFrame whose rows are the keys of the given mapping.

    The mapping's keys label both axes: each key becomes a row, and the
    values stored under that key are spread over the columns.

    :param preprocessed_markov_chain: mapping from a key to the values
        associated with that key
    :type preprocessed_markov_chain: dict
    :return: frame with one row per key of ``preprocessed_markov_chain``
    :rtype: DataFrame
    """
    labels = list(preprocessed_markov_chain)
    # The constructor places each key in a column; transposing turns the keys
    # into rows. ``.T`` replaces the deprecated ``DataFrame.swapaxes``.
    by_column = DataFrame(preprocessed_markov_chain, index=labels)
    return by_column.T
Example #6
0
 def test_swapaxes(self):
     """Check swapaxes against the transpose, the identity and a bad axis."""
     frame = DataFrame(np.random.randn(10, 5))
     flipped = frame.T

     assert_frame_equal(flipped, frame.swapaxes(0, 1))
     assert_frame_equal(flipped, frame.swapaxes(1, 0))
     assert_frame_equal(frame, frame.swapaxes(0, 0))

     # Axis numbers outside the frame's two axes must be rejected.
     with pytest.raises(ValueError):
         frame.swapaxes(2, 5)
Example #7
0
 def test_swapaxes(self):
     """Swapping the two axes in either order must equal the transpose."""
     frame = DataFrame(np.random.randn(10, 5))
     for first, second in ((0, 1), (1, 0)):
         tm.assert_frame_equal(frame.T, frame.swapaxes(first, second))
Example #8
0
 def test_swapaxes_invalid_axis(self):
     """An axis number that the frame does not have raises ValueError."""
     frame = DataFrame(np.random.randn(10, 5))
     expected_error = "No axis named 2 for object type DataFrame"
     with pytest.raises(ValueError, match=expected_error):
         frame.swapaxes(2, 5)
Example #9
0
    def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True):
        """
        Convert the supported input containers into a 2-level indexed frame.

        :param x: PanelData, DataArray, MultiIndex Series, DataFrame, Panel
            or 2-d/3-d ndarray
        :param var_name: column name (or name prefix) used when the input
            carries no variable names
        :param convert_dummies: when true, pass the frame through
            ``expand_categoricals`` and cast it to float64
        :param drop_first: forwarded to ``expand_categoricals``
        :raises ValueError: for DataArrays/ndarrays of unsupported dimension,
            a Series without a MultiIndex, a DataFrame whose MultiIndex does
            not have 2 levels, or a time level that is neither numeric nor
            date-like
        :raises TypeError: for any other input type
        """
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first

        # Unwrap an existing PanelData so its frame is processed like a
        # plain DataFrame; ``_original`` records the unwrapped input.
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if isinstance(x, DataArray):
            if x.ndim not in (2, 3):
                raise ValueError('Only 2-d or 3-d DataArrays are supported')
            x = x.to_pandas()

        if isinstance(x, Series):
            if not isinstance(x.index, pd.MultiIndex):
                raise ValueError(
                    'Series can only be used with a 2-level MultiIndex')
            x = DataFrame(x)

        if isinstance(x, DataFrame):
            if not isinstance(x.index, pd.MultiIndex):
                # Stack the transposed frame into a single named column.
                stacked = x.T.stack(dropna=False)
                self._frame = DataFrame({var_name: stacked})
            elif len(x.index.levels) == 2:
                self._frame = x.copy()
            else:
                raise ValueError('DataFrame input must have a '
                                 'MultiIndex with 2 levels')
        elif isinstance(x, Panel):
            swapped = x.swapaxes(1, 2)
            self._frame = swapped.to_frame(filter_observations=False)
        elif isinstance(x, ndarray):
            if x.ndim not in (2, 3):
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                # Promote a 2-d array to 3-d with a single leading slice.
                x = x[None, :, :]

            nvar, nobs, nentity = x.shape
            if nvar == 1:
                variables = [var_name]
            else:
                variables = [var_name + '.{0}'.format(i) for i in range(nvar)]
            entities = ['entity.{0}'.format(i) for i in range(nentity)]
            time = list(range(nobs))
            values = x.astype(np.float64)
            panel = Panel(values,
                          items=variables,
                          major_axis=time,
                          minor_axis=entities)
            swapped = panel.swapaxes(1, 2)
            self._frame = swapped.to_frame(filter_observations=False)
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'supported.')

        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64)

        # The second index level is the time dimension; it must be numeric
        # or datetime-like.
        time_dtype = Series(self._frame.index.levels[1]).dtype
        if (not is_numeric_dtype(time_dtype)
                and not is_datetime64_any_dtype(time_dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')

        self._k, self._t, self._n = self.panel.shape
        for position, label in enumerate(('entity', 'time')):
            self._frame.index.levels[position].name = label
Example #10
0
    def __init__(self,
                 x,
                 var_name='x',
                 convert_dummies=True,
                 drop_first=True,
                 copy=True):
        """
        Convert the supported input containers into a 2-level indexed frame.

        :param x: PanelData, xarray DataArray, MultiIndex Series, DataFrame,
            Panel or 2-d/3-d ndarray
        :param var_name: column name (or name prefix) used when the input
            carries no variable names
        :param convert_dummies: when true, pass the frame through
            ``expand_categoricals`` and cast it to float64
        :param drop_first: forwarded to ``expand_categoricals``
        :param copy: when true, a MultiIndex DataFrame input is copied
            instead of being stored directly
        :raises ValueError: for DataArrays/ndarrays of unsupported dimension,
            a Series without a MultiIndex, a DataFrame whose MultiIndex does
            not have 2 levels, or a time level that is neither numeric nor
            date-like
        :raises TypeError: for any other input type
        """
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        self._panel = None
        self._shape = None
        index_names = ['entity', 'time']

        # Unwrap an existing PanelData so its frame is processed like a
        # plain DataFrame; ``_original`` records the unwrapped input.
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        natively_handled = (Series, DataFrame, Panel, np.ndarray)
        if not isinstance(x, natively_handled):
            # xarray is optional: without it the input falls through to the
            # type checks below unchanged.
            try:
                from xarray import DataArray
                if isinstance(x, DataArray):
                    if x.ndim not in (2, 3):
                        raise ValueError(
                            'Only 2-d or 3-d DataArrays are supported')
                    if x.ndim == 3:
                        items, major, minor = [
                            x.coords[dim].values.tolist() for dim in x.dims
                        ]
                        x = panel_to_frame(x.values, items, major, minor,
                                           True)
                    else:
                        x = x.to_pandas()
            except ImportError:
                pass

        if isinstance(x, Series):
            if not isinstance(x.index, MultiIndex):
                raise ValueError(
                    'Series can only be used with a 2-level MultiIndex')
            x = DataFrame(x)

        if isinstance(x, DataFrame):
            if not isinstance(x.index, MultiIndex):
                # Stack the transposed frame into a single named column.
                stacked = x.T.stack(dropna=False)
                self._frame = DataFrame({var_name: stacked})
            else:
                if len(x.index.levels) != 2:
                    raise ValueError('DataFrame input must have a '
                                     'MultiIndex with 2 levels')
                # Keep level names supplied with pandas-style input and
                # fall back to the defaults for unnamed levels.
                if isinstance(self._original, (DataFrame, PanelData, Series)):
                    for position in (0, 1):
                        level_name = x.index.levels[position].name
                        if level_name:
                            index_names[position] = level_name
                self._frame = x.copy() if copy else x
        elif isinstance(x, Panel):
            swapped = x.swapaxes(1, 2)
            self._frame = swapped.to_frame(filter_observations=False)
        elif isinstance(x, np.ndarray):
            if not 2 <= x.ndim <= 3:
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                # Promote a 2-d array to 3-d with a single leading slice.
                x = x[None, :, :]

            nvar, nobs, nentity = x.shape
            # The generated labels use a zero-padded field whose width is
            # derived from log10 of the number of labels.
            var_width = str(int(np.log10(nvar) + .01))
            var_str = var_name + '.{0:0>' + var_width + '}'
            if nvar == 1:
                variables = [var_name]
            else:
                variables = [var_str.format(i) for i in range(nvar)]
            entity_width = str(int(np.log10(nentity) + .01))
            entity_str = 'entity.{0:0>' + entity_width + '}'
            entities = [entity_str.format(i) for i in range(nentity)]
            time = list(range(nobs))
            values = x.astype(np.float64, copy=False)
            panel = _Panel.from_array(values,
                                      items=variables,
                                      major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'are supported')

        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64, copy=False)

        # The second index level is the time dimension; it must be numeric
        # or datetime-like.
        time_dtype = Series(self._frame.index.levels[1]).dtype
        if (not is_numeric_dtype(time_dtype)
                and not is_datetime64_any_dtype(time_dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')

        self._k, self._t, self._n = self.shape
        levels = self._frame.index.levels
        for position, label in enumerate(index_names):
            levels[position].name = label
# Notebook-style walkthrough of index reshaping on a DataFrame `d`.
# NOTE(review): `d` is not defined in this part of the file - presumably a
# frame whose row index has levels named 'row1' and 'row2'; confirm.
# Swap the two row index levels by position
d.swaplevel(0,1)
#%% Swap the two row index levels by name
d.swaplevel('row1','row2')
#%% Swap the order of the column index levels
d.swaplevel(0,1,axis=1)

#%% Move the innermost row index level into the column index
d.unstack()
#%% Move the innermost column index level into the row index
d.stack()
# Open question:
# it makes sense that d.stack().stack() becomes a Series, but why does
# d.unstack().unstack() also become a Series?

#%% Swap rows and columns
d.swapaxes(0,1)

# Turn the index into columns
d1=d.reset_index()
d1

#%% Turn a column into the index
d1.set_index('row1')
#%% Keep the column after turning it into the index
d1.set_index('row1',drop=False)
#%% Turn two columns into the index
d1.set_index(['row1','row2'])
#%% This fails when a column name and the index name are the same
d1.set_index('row1',drop=False).reset_index()
#%% Renaming solves the problem
d2=d1.set_index('row1',drop=False)
# NOTE(review): `d3` is not defined in the visible code, and `rows=`/`cols=`
# look like the old pivot_table keyword names (current pandas uses
# `index=`/`columns=`) - confirm against the pandas version in use.
d3.pivot_table('value1',rows='bar',cols='foo',aggfunc=sum) 
#%%
# pivot cannot be used here because the data contains duplicates
d3.pivot('foo','bar','value1')     # error

#%% Turn row data into column names
d4=d1.set_index([d1.index,'foo']).unstack()
d4
#%% Turn column names back into row data
d4=d4.swaplevel(0,1,axis=1).stack()
d4.set_index(d4.index.droplevel(0)).reset_index()
#%% Swap the axes of the data frame
# DataFrame.swapaxes(axis1, axis2, copy=True)
# axis = 0 means rows and axis = 1 means columns; it is unclear why this
# function needs both arguments, since
# d1.swapaxes(0,1) and d1.swapaxes(1,0) both work
d1.swapaxes(0,1)

#%%
'''
我们通常指的行转列是把一整列的数据(也就是一整列中的所有行)转化到一级列名上,也可以把
一级列名转化到一个数据列上去(列转行),但似乎却无法直接将一级行索引转化成一行数据,这
说明pandas的行列操作也不是完全平衡的。当然在使用swapaxes后就可以了,当然这种操作奇怪得
很,估计是很难用到的。以下演示了这样的转化
'''
#%% Turn an entire column of data into an entire row
# Unstack 'foo' into the columns, then transpose, reset the index, re-index
# on the generated 'level_0' column and transpose back.
d6=d1.set_index([d1.index,'foo']).unstack()
d6.swapaxes(0,1).reset_index().set_index('level_0').swapaxes(0,1)
#%%
'''
这里涉及到一个数据分析处理的惯性思维的问题:我们通常要求一列的数据是同一种类型,而很少
要求一行的数据是同一种类型。所以在看到这种奇怪的数据转换,比较难以理解且说明白其代表的