Exemple #1
0
        def _test_stack_with_multiindex(multiindex):
            """Check ``DataFrame.stack`` on a frame whose columns are ``multiindex``.

            A 3-row integer frame is built and, for several ``level``
            arguments, ``stack(dropna=False)`` is compared against reference
            results.
            """

            def _assert_same(actual, wanted):
                # stack() may return a Series or a DataFrame; choose the
                # comparison helper from the type of the reference value.
                if isinstance(wanted, Series):
                    tm.assert_series_equal(actual, wanted)
                else:
                    tm.assert_frame_equal(actual, wanted)

            width = len(multiindex)
            values = np.arange(3 * width).reshape(3, width)
            df = DataFrame(values, columns=multiindex)

            for level in (-1, 0, 1, [0, 1], [1, 0]):
                result = df.stack(level=level, dropna=False)

                if isinstance(level, int):
                    # Stacking a single level should not make any all-NaN
                    # rows, so dropna=False and dropna=True must agree.
                    _assert_same(result, df.stack(level=level, dropna=True))

                # Rebuild the column index from its own tuples and names, then
                # confirm that stacking again reproduces the earlier result.
                rebuilt = MultiIndex.from_tuples(df.columns.to_numpy(),
                                                 names=df.columns.names)
                df.columns = rebuilt
                _assert_same(result, df.stack(level=level, dropna=False))
Exemple #2
0
    def test_stack_mixed_level(self):
        """GH 18310: stack columns whose labels mix ints and strings."""
        row_labels = range(3)
        mixed_labels = [3, 'a', 'b']
        inner_labels = [1, 2]

        # Flat (single-level) columns stack into a Series.
        flat = DataFrame(1, index=row_labels, columns=mixed_labels)
        flat_expected = Series(
            1, index=MultiIndex.from_product([row_labels, mixed_labels]))
        assert_series_equal(flat.stack(), flat_expected)

        # Two-level columns: stack the inner level.
        two_level = MultiIndex.from_product([mixed_labels, inner_labels])
        df = DataFrame(1, index=row_labels, columns=two_level)
        expected = DataFrame(
            1,
            index=MultiIndex.from_product([row_labels, inner_labels]),
            columns=mixed_labels)
        assert_frame_equal(df.stack(1), expected)

        # Same as above, but the labels actually used in the level are of a
        # homogeneous type (strings only).
        string_only = ['a', 'b']
        assert_frame_equal(df[string_only].stack(1), expected[string_only])
Exemple #3
0
    def test_stack_mixed_levels(self):
        """Stack by level lists that mix string names and ints (GH #8584)."""
        level_names = ['exp', 'animal', 'hair_length']
        column_tuples = [('A', 'cat', 'long'), ('B', 'cat', 'long'),
                         ('A', 'dog', 'short'), ('B', 'dog', 'short')]
        df = DataFrame(
            randn(4, 4),
            columns=MultiIndex.from_tuples(column_tuples, names=level_names))

        # Reference results obtained with purely string level names.
        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        # GH #8584: Need to check that stacking works when a number
        # is passed that is both a level name and in the range of
        # the level numbers
        df2 = df.copy()
        df2.columns.names = ['exp', 'animal', 1]
        for partner, expected in (('animal', animal_hair_stacked),
                                  ('exp', exp_hair_stacked)):
            assert_frame_equal(df2.stack(level=[partner, 1]),
                               expected,
                               check_names=False)

        # When mixed types are passed and the ints are not level
        # names, raise
        with pytest.raises(ValueError):
            df2.stack(level=['animal', 0])

        # GH #8584: Having 0 in the level names could raise a
        # strange error about lexsort depth
        df3 = df.copy()
        df3.columns.names = ['exp', 'animal', 0]
        stacked = df3.stack(level=['animal', 0])
        assert_frame_equal(stacked, animal_hair_stacked, check_names=False)
Exemple #4
0
    def test_stack_int_level_names(self):
        """Stack by integer level names, both in order and out of order."""
        column_tuples = [('A', 'cat', 'long'), ('B', 'cat', 'long'),
                         ('A', 'dog', 'short'), ('B', 'dog', 'short')]
        columns = MultiIndex.from_tuples(
            column_tuples, names=['exp', 'animal', 'hair_length'])
        df = DataFrame(randn(4, 4), columns=columns)

        # Reference results obtained with the string level names.
        exp_animal_stacked = df.stack(level=['exp', 'animal'])
        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        # Integer names that coincide with the level positions.
        df2 = df.copy()
        df2.columns.names = [0, 1, 2]
        in_order_cases = (
            ([1, 2], animal_hair_stacked),
            ([0, 1], exp_animal_stacked),
            ([0, 2], exp_hair_stacked),
        )
        for levels, expected in in_order_cases:
            assert_frame_equal(df2.stack(level=levels), expected,
                               check_names=False)

        # Out-of-order int column names
        df3 = df.copy()
        df3.columns.names = [2, 0, 1]
        out_of_order_cases = (
            ([0, 1], animal_hair_stacked),
            ([2, 0], exp_animal_stacked),
            ([2, 1], exp_hair_stacked),
        )
        for levels, expected in out_of_order_cases:
            assert_frame_equal(df3.stack(level=levels), expected,
                               check_names=False)
Exemple #5
0
    def test_stack_mixed_levels(self):
        """Stack with level lists mixing strings and ints (GH #8584)."""

        def _check(frame, levels, expected):
            # Names are not compared because the level names were replaced.
            assert_frame_equal(frame.stack(level=levels), expected,
                               check_names=False)

        columns = MultiIndex.from_tuples(
            [('A', 'cat', 'long'), ('B', 'cat', 'long'),
             ('A', 'dog', 'short'), ('B', 'dog', 'short')],
            names=['exp', 'animal', 'hair_length'])
        df = DataFrame(randn(4, 4), columns=columns)

        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        # GH #8584: Need to check that stacking works when a number
        # is passed that is both a level name and in the range of
        # the level numbers
        df2 = df.copy()
        df2.columns.names = ['exp', 'animal', 1]
        _check(df2, ['animal', 1], animal_hair_stacked)
        _check(df2, ['exp', 1], exp_hair_stacked)

        # When mixed types are passed and the ints are not level
        # names, raise
        with pytest.raises(ValueError):
            df2.stack(level=['animal', 0])

        # GH #8584: Having 0 in the level names could raise a
        # strange error about lexsort depth
        df3 = df.copy()
        df3.columns.names = ['exp', 'animal', 0]
        _check(df3, ['animal', 0], animal_hair_stacked)
Exemple #6
0
    def test_stack_int_level_names(self):
        """Integer level names must select the same levels as string names."""

        def _check(frame, levels, expected):
            # Level names differ from the reference frame, so skip them.
            assert_frame_equal(frame.stack(level=levels), expected,
                               check_names=False)

        columns = MultiIndex.from_tuples(
            [('A', 'cat', 'long'), ('B', 'cat', 'long'),
             ('A', 'dog', 'short'), ('B', 'dog', 'short')],
            names=['exp', 'animal', 'hair_length'])
        df = DataFrame(randn(4, 4), columns=columns)

        exp_animal_stacked = df.stack(level=['exp', 'animal'])
        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        # Names equal to the level positions.
        df2 = df.copy()
        df2.columns.names = [0, 1, 2]
        _check(df2, [1, 2], animal_hair_stacked)
        _check(df2, [0, 1], exp_animal_stacked)
        _check(df2, [0, 2], exp_hair_stacked)

        # Out-of-order int column names
        df3 = df.copy()
        df3.columns.names = [2, 0, 1]
        _check(df3, [0, 1], animal_hair_stacked)
        _check(df3, [2, 0], exp_animal_stacked)
        _check(df3, [2, 1], exp_hair_stacked)
    def test_stack_mixed_levels(self):
        """Stack with level lists mixing strings and ints (GH #8584)."""
        names = ["exp", "animal", "hair_length"]
        tuples = [
            ("A", "cat", "long"),
            ("B", "cat", "long"),
            ("A", "dog", "short"),
            ("B", "dog", "short"),
        ]
        df = DataFrame(randn(4, 4), columns=MultiIndex.from_tuples(tuples, names=names))

        # Reference results obtained with purely string level names.
        animal_hair_stacked = df.stack(level=["animal", "hair_length"])
        exp_hair_stacked = df.stack(level=["exp", "hair_length"])

        # GH #8584: Need to check that stacking works when a number
        # is passed that is both a level name and in the range of
        # the level numbers
        df2 = df.copy()
        df2.columns.names = ["exp", "animal", 1]
        for partner, expected in (("animal", animal_hair_stacked), ("exp", exp_hair_stacked)):
            assert_frame_equal(df2.stack(level=[partner, 1]), expected, check_names=False)

        # When mixed types are passed and the ints are not level
        # names, raise
        with self.assertRaises(ValueError):
            df2.stack(level=["animal", 0])

        # GH #8584: Having 0 in the level names could raise a
        # strange error about lexsort depth
        df3 = df.copy()
        df3.columns.names = ["exp", "animal", 0]
        stacked = df3.stack(level=["animal", 0])
        assert_frame_equal(stacked, animal_hair_stacked, check_names=False)
Exemple #8
0
def ler_base_fechamento(path, tipo, nome):
    """Read the '01_fechamento' and '09_lpa' CSV tables and reshape them.

    Both files are located by plain string concatenation
    ``path + nome + tipo`` and parsed as ';'-separated CSV whose ``codigo``
    column becomes the index.

    Parameters
    ----------
    path : str
        Prefix prepended to the file name (concatenated as-is).
    tipo : str
        Suffix appended to the file name (concatenated as-is).
    nome : str
        NOTE(review): effectively ignored -- it is overwritten with
        '01_fechamento' and then '09_lpa' before it is ever read.

    Returns
    -------
    None
        NOTE(review): ``dados_ibov`` is computed but not returned in the code
        visible here; confirm whether a ``return`` is missing.
    """
    # The caller-supplied name is discarded in favour of a fixed file stem.
    nome = '01_fechamento'
    file = path + nome + tipo
    fechamento = read_csv(file, sep=';')
    fechamento = fechamento.set_index('codigo')
    # Rebuild the table from its raw labels/values, forcing float64 cells.
    datas = fechamento.columns.values.tolist()
    tikers = fechamento.index.values.tolist()
    values = fechamento.values
    fechamento = DataFrame(values,
                           index=tikers,
                           columns=datas,
                           dtype='float64')
    # Columns move into the row index; with single-level columns the result
    # of stack() is a Series.
    fechamento = fechamento.stack()

    # Same procedure for the second table.
    nome = '09_lpa'
    file = path + nome + tipo
    lpa = read_csv(file, sep=';')
    lpa = lpa.set_index('codigo')

    datas = lpa.columns.values.tolist()
    tikers = lpa.index.values.tolist()
    values = lpa.values

    lpa = DataFrame(values, index=tikers, columns=datas, dtype='float64')
    lpa = lpa.stack()

    # stack() followed by unstack() pivots the table back to wide form.
    lpa = lpa.unstack()

    # NOTE(review): `fechamento` is a Series at this point and pandas Series
    # has no `.join` method, so this line looks like it raises AttributeError
    # -- confirm the intended operation (e.g. converting to a frame first).
    dados_ibov = fechamento.join(lpa, how='outer')
Exemple #9
0
def cal_SMB_HML(ret,
                size,
                BM,
                percentile1=None,
                percentile2=None,
                independent=True,
                exclude_30_small_size=False):
    """Compute two long-short return series from a double sort on size and BM.

    Each row of ``size`` and ``BM`` is cut into quantile groups with
    ``qcut``; returns are averaged inside every (size group, BM group) cell
    per row label, and the spread between the last and the first group is
    returned along each sorting dimension.

    Parameters
    ----------
    ret, size, BM : DataFrame
        Wide tables sharing index and columns after ``IndexAlign``.
        NOTE(review): presumably dates x stocks -- confirm with the caller.
    percentile1 : list of float, optional
        Quantile edges for the ``size`` sort; defaults to ``[0.0, 0.5, 1.0]``.
    percentile2 : list of float, optional
        Quantile edges for the ``BM`` sort; defaults to
        ``[0.0, 0.3, 0.7, 1.0]``.  Each default is now applied independently,
        so supplying only one of the two arguments works.
    independent : bool
        If True, ``BM`` is cut over the whole row; otherwise it is cut
        separately inside each size group.
    exclude_30_small_size : bool
        If True, ``size`` is first passed through
        ``ClipQuantile(size, [0.0, 0.3, 1.0], [-1.0, 1.0])``.

    Returns
    -------
    tuple
        ``(SMB.iloc[:, -1] - SMB.iloc[:, 0], HML.iloc[:, -1] - HML.iloc[:, 0])``
        i.e. last-minus-first group spreads along the size and the BM sort.
    """
    if exclude_30_small_size:
        size = ClipQuantile(size, [0.0, 0.3, 1.0], [-1.0, 1.0])
    ret, size, BM = IndexAlign(ret, size, BM)
    # A "TypeError: bad operand type for unary ~: 'float'" raised here means
    # that the index or the columns of the inputs do not match.
    valid_ = ~pd.isnull(BM + ret + size)
    size = size[valid_]
    BM = BM[valid_]
    ret = ret[valid_]
    # Fill in each default on its own: previously percentile2 was only set
    # when percentile1 was None, so passing just percentile1 crashed below.
    if percentile1 is None:
        percentile1 = [0.0, 0.5, 1.0]  # size
    if percentile2 is None:
        percentile2 = [0.0, 0.3, 0.7, 1.0]  # value
    label_1 = [i + 1 for i in range(len(percentile1) - 1)]
    label_2 = [i + 1 for i in range(len(percentile2) - 1)]

    # The size sort is identical in both modes.  The indicator has already
    # been shift(1)-ed, i.e. its time index is aligned with the holding
    # period of the portfolio.
    mark_1 = DataFrame([
        qcut(size.loc[i], q=percentile1, labels=label_1)
        for i in size.index
    ])
    if independent:
        mark_2 = DataFrame([
            qcut(BM.loc[i], q=percentile2, labels=label_2) for i in BM.index
        ])
    else:
        # Dependent sort: cut BM separately inside every size group and merge
        # the per-group labels into one table.
        # NOTE(review): mark_1.iloc[i] is positional while i is a BM.index
        # label -- confirm this is intended for non-integer indexes.
        mark_2 = DataFrame(index=mark_1.index, columns=mark_1.columns)
        for l_ in label_1:
            tmp = DataFrame([
                qcut(BM.loc[i][mark_1.iloc[i] == l_],
                     q=percentile2,
                     labels=label_2) for i in BM.index
            ])
            mark_2 = mark_2.combine_first(tmp)

    df = DataFrame()
    df['rtn'] = ret.stack()
    df['ref1'] = mark_1.stack()
    df['ref2'] = mark_2.stack()
    tmp = df.groupby(level=0).apply(
        lambda g: g.groupby(['ref1', 'ref2']).mean()).unstack()['rtn']
    tmp.index.names = ('trddt', 'ref1')
    # Equivalent to the removed ``tmp.mean(axis=0, level=0)`` reduction.
    HML = tmp.groupby(level=0, sort=False).mean()
    SMB = tmp.mean(axis=1).unstack()
    return SMB.iloc[:, -1] - SMB.iloc[:, 0], HML.iloc[:, -1] - HML.iloc[:, 0]
    def test_stack_ints(self):
        """Stacking a list of integer levels equals stacking them one by one."""
        tuples = list(itertools.product(range(3), repeat=3))
        df = DataFrame(np.random.randn(30, 27), columns=MultiIndex.from_tuples(tuples))

        # Positive and negative level numbers must address the same levels.
        for levels in ([1, 2], [-2, -1]):
            assert_frame_equal(df.stack(level=levels), df.stack(level=1).stack(level=1))

        # Same check once the levels carry integer names.
        df_named = df.copy()
        df_named.columns.set_names(range(3), inplace=True)

        one_by_one = df_named.stack(level=1).stack(level=1)
        assert_frame_equal(df_named.stack(level=[1, 2]), one_by_one)
Exemple #11
0
def cal_SMB_HML_FF(ret, EndDate, size=None, book=None, weights=None):
    """Compute two long-short return series from a size / book-to-market sort.

    Size groups come from the June rows of the month-end size table and
    book-to-market groups from December values; both label tables are
    forward-filled daily and reindexed to ``EndDate``.  Returns are then
    averaged (equal-weighted or weighted) inside every (size, BM) cell.

    Parameters
    ----------
    ret : DataFrame
        Return table; it is stacked and combined with the group labels.
    EndDate : index-like
        Index the label tables (and weights) are reindexed to.
    size, book : optional
        NOTE(review): both are overwritten by ``import_data(...)`` on the
        first statement that uses them, so caller-supplied values are ignored.
    weights : optional
        Only tested against None; when not None the actual weights are
        recomputed from ``size`` (lagged one period), not taken from the
        argument.

    Returns
    -------
    tuple
        ``(rSMB.iloc[:, -1] - rSMB.iloc[:, 0],
        rHML.iloc[:, -1] - rHML.iloc[:, 0])``.
    """
    # TODO: the frequency of the returns must match that of EndDate
    percentile1 = [0.0, 0.5, 1.0]  # size
    percentile2 = [0.0, 0.3, 0.7, 1.0]  # value
    label_1 = [i + 1 for i in range(len(percentile1) - 1)]
    label_2 = [i + 1 for i in range(len(percentile2) - 1)]
    size, book = import_data(PV_vars=['size_tot'],
                             BS_vars=['tot_shrhldr_eqy_excl_min_int'])[:2]
    # Drop duplicated index entries, pivot to wide form, and keep only
    # December rows with strictly positive book values.
    BE = book.drop(book.index[book.index.duplicated(keep='last')]).unstack()
    BE = BE[BE.index.month == 12]
    BE = BE[BE > 0]
    size = size.unstack()
    ME = size.copy()
    ME = ME.resample('M').last()
    ME6 = ME[ME.index.month == 6]
    ME12 = ME[ME.index.month == 12]
    # Add a 2004-12-31 row taken from the 2005-01-04 size values.
    # NOTE(review): presumably the sample starts in January 2005 -- confirm.
    ME12.loc[parse('20041231')] = size.loc['2005-01-04']
    ME12 = ME12.sort_index()
    BM = BE.reindex(index=ME12.index, columns=ME12.columns) / ME12
    # Size labels: cut each June row, shift the dates by one day, then
    # forward-fill daily and pick the requested dates.
    mark_1 = DataFrame(
        [qcut(ME6.loc[i], q=percentile1, labels=label_1) for i in ME6.index])
    mark_1.index = mark_1.index + Day()
    mark_1 = mark_1.resample('D').ffill().reindex(index=EndDate)
    # BM labels: cut each December row and shift the dates by MonthBegin(7).
    mark_2 = DataFrame(
        [qcut(BM.loc[i], q=percentile2, labels=label_2) for i in BM.index])
    mark_2.index = mark_2.index + MonthBegin(7)
    mark_2 = mark_2.resample('D').ffill().reindex(index=EndDate)

    if weights is None:
        # Equal-weighted mean return of every (ref1, ref2) cell per date.
        df = DataFrame()
        df['ret'] = ret.stack()
        df['ref1'] = mark_1.stack()
        df['ref2'] = mark_2.stack()
        df = df.dropna()
        tmp = df.groupby(level=0).apply(
            lambda g: g.groupby(['ref1', 'ref2']).mean())['ret'].unstack()
    else:
        # Weighted mean: sum(ret * w) / sum(w) per cell, with the weights
        # being the size table lagged by one row of EndDate.
        weights = size.resample('D').ffill().reindex(index=EndDate).shift(1)
        df = DataFrame()
        df['ret'] = (ret * weights).stack()
        df['ref1'] = mark_1.stack()
        df['ref2'] = mark_2.stack()
        df['w'] = weights.stack()
        df = df.dropna()
        tmp1 = df.groupby(
            level=0).apply(lambda g: g.groupby(['ref1', 'ref2']).sum())['ret']
        tmp2 = df.groupby(
            level=0).apply(lambda g: g.groupby(['ref1', 'ref2']).sum())['w']
        tmp = (tmp1 / tmp2).unstack()
    # NOTE(review): the `level` keyword of DataFrame.mean was removed in
    # pandas 2.0; `tmp.groupby(level=0).mean()` is the documented replacement.
    rHML = tmp.mean(axis=0, level=0)
    rSMB = tmp.mean(axis=1).unstack()
    return rSMB.iloc[:, -1] - rSMB.iloc[:, 0], rHML.iloc[:, -1] - rHML.iloc[:,
                                                                            0]
Exemple #12
0
    def test_stack_ints(self):
        """Stacking a list of integer levels equals stacking them one by one."""
        columns = MultiIndex.from_tuples(
            list(itertools.product(range(3), repeat=3)))
        df = DataFrame(np.random.randn(30, 27), columns=columns)

        # Positive and negative level numbers must address the same levels.
        stacked_positive = df.stack(level=[1, 2])
        assert_frame_equal(stacked_positive,
                           df.stack(level=1).stack(level=1))
        stacked_negative = df.stack(level=[-2, -1])
        assert_frame_equal(stacked_negative,
                           df.stack(level=1).stack(level=1))

        # Same check once the levels carry integer names.
        df_named = df.copy()
        df_named.columns.set_names(range(3), inplace=True)
        stacked_named = df_named.stack(level=[1, 2])
        assert_frame_equal(stacked_named,
                           df_named.stack(level=1).stack(level=1))
Exemple #13
0
def get_corrs(z_sc_df: pd.DataFrame, merged_df: pd.DataFrame) -> pd.DataFrame:
    """Attach z-scores and correlation weights to the agent pairs of a frame.

    The square z-score matrix is stacked into long form and left-merged onto
    the (agA_name, agB_name) pairs of ``merged_df``; pairs without a z-score
    get 0.

    :param z_sc_df: z-score correlation matrix; its [0, 0] element is used as
        the self correlation and must be a positive number
    :param merged_df: frame providing the 'agA_name' and 'agB_name' columns
    :return: de-duplicated pairs with 'z_score' and 'corr_weight' columns
    """
    logger.info('Getting available hgnc symbols from correlation matrix')
    corr_symb_set = set(z_sc_df.columns.values)
    logger.info('Stacking the correlation matrix: may take a couple of '
                'minutes and tens of GiB of memory')
    # Long form: one row per (row label, column label) pair with a value.
    long_form = z_sc_df.stack(dropna=True).to_frame(name='z_score')
    long_form = long_form.reset_index()
    name_map = {'level_0': 'agA_name', 'level_1': 'agB_name'}
    stacked_z_sc_df = long_form.rename(columns=name_map)

    # Merge in stacked correlations to the sif df
    logger.info('Getting relevant correlations')
    pair_names = merged_df[['agA_name', 'agB_name']]
    z_corr_pairs = pair_names.merge(right=stacked_z_sc_df, how='left')
    z_corr_pairs = z_corr_pairs.drop_duplicates()

    # z_score: original z-score or 0 if nonexistent
    missing = z_corr_pairs.z_score.isna()
    z_corr_pairs.loc[missing, 'z_score'] = 0

    # Get self correlation
    self_corr = z_sc_df.iloc[0, 0]
    assert isinstance(self_corr, (int, float)) and self_corr > 0

    # Calculate corr weight = (self_corr_z_sc - abs(z_score)) / self_corr
    weights = z_sc_weight_df(z_corr_pairs, self_corr)
    z_corr_pairs['corr_weight'] = weights
    logger.info('Finished setting z-score and z-score weight in sif df')
    return z_corr_pairs
    def chi(self, customattribute):
        """
        Compute the chi-square test of ``customattribute`` against the class
        attribute.

        Distinct ``studentID`` values are collected over all ``self.chunks``
        for every (attribute value, class value) pair and for every class
        value.  Observed counts are the number of distinct students per pair;
        expected counts are the row total of each attribute value multiplied
        by the overall share of each class.

        :param customattribute: name of the column to test
        :return: tuple ``(chisquare result, observed count table)``
        """
        attributeDict = dict()
        classAttributeDict = dict()
        empty = np.array([])
        for piece in self.chunks:
            # Union the student ids per pair across chunks so that a student
            # appearing in several chunks is only counted once.
            pair_ids = piece.groupby(
                [customattribute, self.classAttribute]).studentID.unique()
            for key, arrays in pair_ids.items():
                attributeDict[key] = np.union1d(
                    attributeDict.get(key, empty), arrays)

            class_ids = piece.groupby(self.classAttribute).studentID.unique()
            for classAttribute, arrays in class_ids.items():
                classAttributeDict[classAttribute] = np.union1d(
                    classAttributeDict.get(classAttribute, empty), arrays)

        # Share of each class value among all distinct students.
        classSeries = Series(classAttributeDict).apply(len)
        classSeries = classSeries / classSeries.sum()

        # Observed counts for each attribute value (rows) and class (columns).
        attributeObs = Series(attributeDict).apply(len).unstack(fill_value=0)

        # Initial value of every expected cell: the row total, kept as float
        # so that a numeric (not object-dtype) table reaches chisquare.
        row_totals = attributeObs.sum(axis=1).astype(float)
        attributeExp = DataFrame(
            {column: row_totals for column in attributeObs.columns},
            columns=attributeObs.columns)
        # Scale by the class shares to obtain the expected counts.
        attributeExp = attributeExp.mul(classSeries).fillna(0)
        # Chi-square of observed versus expected (statistic and p-value),
        # returned together with the observed table.
        return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
def pandas_reshape_pivot_part1():
    """Demonstrate reshaping with hierarchical indexes via stack()/unstack().

    Builds a small 2x3 frame, stacks it into a Series and prints a frame
    made of that Series and a shifted copy, with the columns named 'side'.
    Returns nothing.
    """
    # Reshaping with hierarchical indexing.
    # stack: "rotates" columns into rows; unstack: "rotates" rows into columns
    data = DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['Ohio', 'Colorado'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))

    # stack() turns the column index into the innermost row index,
    # converting the DataFrame into a Series.
    result = data.stack()
    # print(result)

    # unstack() turns the innermost row index back into the column index,
    # converting the Series into a DataFrame.
    # print(result.unstack())

    # By default stack()/unstack() act on the innermost level; another level
    # can be chosen by number or by name.
    # print(result.unstack(0))  # the outermost level is 0, counting inwards
    # print(result.unstack('state'))

    s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
    s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
    # data2 = pd.concat([s1, s2], keys=['one', 'two'])
    # Level values missing from a group are filled with NaN:
    # print(data2.unstack())

    # pd.Index takes `name` (singular); `names` is not an accepted keyword
    # for a flat Index and raises TypeError on current pandas.
    df = DataFrame({'left': result, 'right': result + 5},
                   columns=pd.Index(['left', 'right'], name='side'))
    print(df)
def clean_correlations(df_cor: pd.DataFrame) -> pd.DataFrame:
    """
    Function to stack correlation matrix, remove duplicate entries and sort the values.
    :param df_cor: (pd.DataFrame) Correlation matrix
    :return: pd.DataFrame as stacked correlation matrix with the columns
        "Var_1", "Var_2" and "Correlation", sorted by descending correlation
    """

    # stack them to make it easier to go through
    df_cor = df_cor.stack().reset_index()
    df_cor.columns = ["Var_1", "Var_2", "Correlation"]

    # want to throw out duplicates: rows sharing the same key from
    # _return_index are compared through the mean correlation of their group
    df_cor["index"] = df_cor.apply(_return_index, axis=1)
    # Average only the numeric "Correlation" column; averaging the whole
    # frame would also hit the label columns Var_1 / Var_2.
    df_mean = (df_cor.groupby("index")["Correlation"]
               .mean()
               .rename("Cor_mean")
               .reset_index())
    df_cor = pd.merge(df_cor, df_mean, how="left", on="index")

    # Keep a row when it is the Var_1 < Var_2 member of its group, or when
    # its correlation differs from the group mean (i.e. not a true duplicate).
    differs_from_mean = np.abs(df_cor["Correlation"]
                               - df_cor["Cor_mean"]) > 0.000001
    keep = (df_cor["Var_1"] < df_cor["Var_2"]) | differs_from_mean

    # drop the helper columns (only used to find duplicates); working on new
    # objects avoids in-place edits of a filtered slice
    df_cor = df_cor[keep].drop(["index", "Cor_mean"], axis=1)

    return df_cor.sort_values(by="Correlation", ascending=False)
Exemple #17
0
 def _calc_firing_rate(self,
                       num_peaks: pd.DataFrame,
                       epoch: str = "All_cells"):
     """
     Compare the columns of ``num_peaks`` with a Tukey HSD test and print the
     test summary, the p-values and the mean of every column group.

     :param num_peaks: table whose columns are the groups being compared
     :param epoch: label that is only used in the printed p-value header
     :return: None, all results are printed
     """
     # stack() yields one value per (row, column) pair and leaves out
     # missing entries.
     split_data = num_peaks.stack()
     group_labels = split_data.index.get_level_values(1).values
     mc = MultiComparison(split_data.values, group_labels)
     try:
         res = mc.tukeyhsd()
     except ValueError:
         aprint("<yellow>Failed during the p-value calculation.</yellow>")
     else:
         print(res)
         # NOTE(review): the cell count divides by 3, which presumably
         # assumes exactly three column groups -- confirm.
         print(
             f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):",
             psturng(
                 np.abs(res.meandiffs / res.std_pairs),
                 len(res.groupsunique),
                 res.df_total,
             ),
         )
     finally:
         # Group means are printed whether or not the test succeeded.
         # groupby(level=1, sort=False).mean() is the documented replacement
         # for the removed ``Series.mean(level=1)``.
         print(split_data.groupby(level=1, sort=False).mean())
Exemple #18
0
def plot_cond_prob(p_A_given_B: pd.DataFrame):
    """Plot a table of conditional probabilities, choosing the chart by index type.

    A numeric index gives an area plot over the index values; an object or
    categorical index gives a stacked horizontal bar chart.  For any other
    index type nothing is plotted and None is returned.
    """
    index = p_A_given_B.index
    # Get the name of the index column
    B = index.name

    # If the B column is numeric
    if index.is_numeric():
        flat = p_A_given_B.reset_index()
        return flat.plot.area(x=B, figsize=(15, 10))

    # Neither categorical nor string: nothing to draw
    if not (index.is_object() or index.is_categorical()):
        return None

    # What is the name of the A column?  It is whichever column of the long
    # form is neither the index column nor the value column.
    stacked = p_A_given_B.stack().rename("prob")
    long_form = stacked.reset_index()
    A_name = np.setdiff1d(long_form.columns, [B, "prob"])[0]

    # Create the plot
    wide = stacked.unstack(A_name)
    return wide.plot.barh(figsize=(15, 10), stacked=True)
Exemple #19
0
def player_data_classifier(attr, reverse=0):
    """Search the best single-threshold classifier on a summed player stat.

    Reads 'data.csv', sums the columns 'player1_<attr>' .. 'player5_<attr>'
    and tries every integer threshold between the minimum and the maximum of
    that sum, predicting 'team1_win' from ``sum >= threshold``.  The best
    accuracy and threshold are printed, followed by two bar charts (and
    printed counts) of the win/loss split on each side of the threshold.

    Parameters
    ----------
    attr : str
        Suffix of the per-player columns to sum.
    reverse : int
        0 predicts a win at or above the threshold; any other value flips
        the prediction.

    Returns
    -------
    None
    """
    main_data = pd.read_csv('data.csv')
    subtable_kill = main_data['player1_' + attr] + main_data[
        'player2_' + attr] + main_data['player3_' + attr] + main_data[
            'player4_' + attr] + main_data['player5_' + attr]
    subtable_kill.name = 'player_' + attr
    # Two-row frame (one row per Series), transposed via stack/unstack into
    # one row per match with the columns 'team1_win' and 'player_<attr>'.
    subtable = DataFrame([main_data['team1_win'], subtable_kill])
    subtable = subtable.stack().unstack(0)
    subtable.index.name = 'index'
    # NOTE(review): range(m, M + 1) below requires integer-valued sums.
    m = min(subtable_kill)
    M = max(subtable_kill)
    max_accuracy = 0
    max_i = m

    def no(x):
        # Logical negation of a 0/1 label.
        if (x == 0):
            return 1
        else:
            return 0

    for i in range(m, M + 1):
        # Progress output, only for wide threshold ranges.
        if (M - m >= 40):
            if ((i - m) % 100 == 0):
                print('This is the ', i - m, '/', M - m, 'th iteration.')
        # NOTE(review): cross_entropy is assigned but never used.
        cross_entropy = 0
        tmp1 = subtable['player_' + attr] >= i
        tmp2 = subtable['player_' + attr] < i
        u = subtable['team1_win'][tmp1]
        v = subtable['team1_win'][tmp2]
        # Correct predictions: wins on the "win" side of the threshold plus
        # losses on the other side.
        if (reverse == 0):
            accuracy = (u.sum() + v.apply(no).sum()) / len(subtable.index)
        else:
            accuracy = (v.sum() + u.apply(no).sum()) / len(subtable.index)
        # Strict comparison keeps the smallest threshold among ties.
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_i = i
    print('max_accuracy=', max_accuracy)
    print('corresponding classifier i=', max_i)
    # Win/loss counts at or above the best threshold.
    # NOTE(review): the labels below assume value_counts() returns exactly
    # two entries in the expected (descending-count) order -- confirm.
    tmp = subtable[subtable['player_' +
                            attr] >= max_i]['team1_win'].value_counts()
    if (reverse == 0):
        tmp.index = ['team1_win', 'team1_loss']
    else:
        tmp.index = ['team1_loss', 'team1_win']
    plt.title('player_' + attr + '>=' + str(max_i))
    tmp.plot(kind='barh')
    plt.show()
    print('player_' + attr + '>=', max_i, '对应的玩家胜负情况统计')
    print(tmp)
    # Win/loss counts below the best threshold.
    tmp = subtable[subtable['player_' +
                            attr] < max_i]['team1_win'].value_counts()
    if (reverse == 0):
        tmp.index = ['team1_loss', 'team1_win']
    else:
        tmp.index = ['team1_win', 'team1_loss']
    plt.title('player_' + attr + '<' + str(max_i))
    tmp.plot(kind='barh')
    plt.show()
    print('player_' + attr + '<', max_i, '对应的玩家胜负情况统计')
    print(tmp)
Exemple #20
0
def interp_to_obs(var, df, lat, lon, radius=12000.):
    """Sample a gridded field at observation sites by nearest neighbour.

    The field is resampled with pyresample from the source grid onto the
    unique site locations of ``df``; the result is returned in long form,
    one row per (site, time), merged with the site metadata and the
    observations.

    Parameters
    ----------
    var : array-like with named dimensions
        Field with dimensions 'y', 'x' and 'time'; must provide
        ``transpose``, ``values`` and a ``time`` coordinate.
        NOTE(review): presumably an xarray.DataArray -- confirm.
    df : pandas.DataFrame
        Observations with the columns 'Latitude', 'Longitude', 'SCS',
        'datetime', 'datetime_local', 'Obs' and 'utcoffset'.
    lat : array-like
        Latitudes of the source grid, passed to ``GridDefinition``.
    lon : array-like
        Longitudes of the source grid, passed to ``GridDefinition``.
    radius : float
        Radius of influence for the nearest-neighbour search
        (the default is 12000.).

    Returns
    -------
    pandas.DataFrame
        Long table with a 'model' column holding the resampled values,
        the matching 'Obs' values and a recomputed 'datetime_local'.

    """
    # `nan` replaces the `NaN` alias, which NumPy 2.0 removed.
    from numpy import nan, vstack
    from pyresample import geometry, image
    from pandas import to_timedelta, DataFrame
    # define CMAQ pyresample grid (source)
    grid1 = geometry.GridDefinition(lons=lon, lats=lat)
    # get unique sites from df
    dfn = df.drop_duplicates(subset=['Latitude', 'Longitude'])
    # define site grid (target): one column vector entry per site
    lats = dfn.Latitude.values
    lons = dfn.Longitude.values
    grid2 = geometry.GridDefinition(lons=vstack(lons), lats=vstack(lats))
    # Create image container; cells without a neighbour inside `radius`
    # are filled with NaN
    i = image.ImageContainerNearest(var.transpose('y', 'x', 'time').values,
                                    grid1,
                                    radius_of_influence=radius,
                                    fill_value=nan)
    # resample
    ii = i.resample(grid2).image_data.squeeze()
    # recombine data: sites as rows, times as columns, then to long form
    e = DataFrame(ii, index=dfn.SCS, columns=var.time.values)
    w = e.stack().reset_index().rename(columns={
        'level_1': 'datetime',
        0: 'model'
    })
    # attach the per-site metadata, leaving out the per-observation columns
    w = w.merge(dfn.drop(['datetime', 'datetime_local', 'Obs'], axis=1),
                on='SCS',
                how='left')
    # attach the observation matching each (site, time)
    w = w.merge(df[['datetime', 'SCS', 'Obs']],
                on=['SCS', 'datetime'],
                how='left')
    # calculate datetime local; 'h' is the non-deprecated spelling of the
    # hour unit
    w['datetime_local'] = w.datetime + to_timedelta(w.utcoffset, 'h')

    return w
Exemple #21
0
    def _plot_wcorr(Wcorr, L):
        """Show the W-correlation matrix ``Wcorr`` as a bokeh heat map.

        ``L`` is used for the title, the axis ranges and the tick count.
        Component numbers on both axes start at 1.  The figure is displayed
        with ``show``; nothing is returned.
        """
        fig = figure(tools="box_select, pan, reset, save")
        fig.plot_width = 700
        fig.plot_height = 600

        # Background settings
        fig.background_fill_color = '#859dcd'
        fig.background_fill_alpha = 0.05

        # Title settings
        title = fig.title
        title.text = "W-Correlation for L={}".format(L)
        title.text_font = 'Helvetica'
        title.text_font_size = '24px'
        title.align = 'center'
        title.text_font_style = "italic"

        # Axis settings (the y range is reversed so that component 1 is on top)
        fig.xaxis.axis_label = 'Fⱼ'
        fig.yaxis.axis_label = 'Fᵢ'
        fig.axis.axis_label_text_font = 'Helvetica'
        fig.axis.axis_label_text_font_size = '24px'
        fig.axis.major_label_orientation = 0
        fig.x_range = Range1d(start=0.5, end=L + 0.5)
        fig.y_range = Range1d(start=L + 0.5, end=0.5)
        fig.axis[0].ticker.desired_num_ticks = L
        fig.axis[0].ticker.num_minor_ticks = 0

        # Label rows and columns 1..n, then melt to one row per (F_i, F_j).
        component_ids = list(range(1, Wcorr.shape[0] + 1))
        matrix = DataFrame(Wcorr)
        matrix['F_i'] = component_ids
        matrix = matrix.set_index('F_i')
        matrix.columns = component_ids
        matrix.columns.name = 'F_j'
        long_form = DataFrame(matrix.stack(), columns=['corr']).reset_index()
        source = ColumnDataSource(long_form)

        # this is the colormap from the original NYTimes plot
        palette = color.palettes['colors_2']
        mapper = LinearColorMapper(palette=palette, low=0, high=1)

        fig.rect(x="F_i", y="F_j", width=1, height=1, source=source,
                 line_color=None, fill_color=transform('corr', mapper))

        # Colour legend with one tick per palette entry.
        legend_ticker = BasicTicker(desired_num_ticks=len(color.palettes['colors_2']))
        color_bar = ColorBar(color_mapper=mapper, location=(0, 0),
                             ticker=legend_ticker,
                             formatter=PrintfTickFormatter(format="%.2f"))
        fig.add_layout(color_bar, 'right')

        tooltips = [("Components", "(@F_i, @F_j)"),
                    ("Correlations", "@corr")]
        fig.add_tools(HoverTool(tooltips=tooltips))

        show(fig)
    def test_stack_partial_multiIndex(self):
        """GH 8844: stack frames whose columns use only part of a MultiIndex.

        Subsets (and reversed subsets) of a five-entry column MultiIndex are
        stacked with ``dropna=False`` and compared against reference
        results; a final case checks the exact NaN-padded output.
        """

        def _assert_same(actual, wanted):
            # stack() may return a Series or a DataFrame.
            if isinstance(wanted, Series):
                assert_series_equal(actual, wanted)
            else:
                assert_frame_equal(actual, wanted)

        def _test_stack_with_multiindex(multiindex):
            df = DataFrame(np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex)
            for level in (-1, 0, 1, [0, 1], [1, 0]):
                result = df.stack(level=level, dropna=False)

                if isinstance(level, int):
                    # Stacking a single level should not make any all-NaN rows,
                    # so df.stack(level=level, dropna=False) should be the same
                    # as df.stack(level=level, dropna=True).
                    _assert_same(result, df.stack(level=level, dropna=True))

                # Rebuild the columns from their tuples and stack again.
                # to_numpy() replaces the removed Index.get_values().
                df.columns = MultiIndex.from_tuples(df.columns.to_numpy(), names=df.columns.names)
                _assert_same(result, df.stack(level=level, dropna=False))

        full_multiindex = MultiIndex.from_tuples(
            [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"]
        )
        for multiindex_columns in (
            [0, 1, 2, 3, 4],
            [0, 1, 2, 3],
            [0, 1, 2, 4],
            [0, 1, 2],
            [1, 2, 3],
            [2, 3, 4],
            [0, 1],
            [0, 2],
            [0, 3],
            [0],
            [2],
            [4],
        ):
            _test_stack_with_multiindex(full_multiindex[multiindex_columns])
            if len(multiindex_columns) > 1:
                # Also exercise the same subset in reverse column order.
                multiindex_columns.reverse()
                _test_stack_with_multiindex(full_multiindex[multiindex_columns])

        df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]])
        result = df.stack(dropna=False)
        expected = DataFrame(
            [[0, 2], [1, nan], [3, 5], [4, nan]],
            index=MultiIndex(
                # `codes` is the current name of the former `labels` keyword.
                levels=[[0, 1], ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"]
            ),
            columns=Index(["B", "C"], name="Upper"),
            dtype=df.dtypes[0],
        )
        assert_frame_equal(result, expected)
Exemple #23
0
def find_closest_from_another(table: pd.DataFrame) -> pd.Series:
    """
    For every species, locate the smallest entry involving another species.

    Entries whose row and column share the same second-level label are
    blanked out on a copy of ``table`` before searching, so the input is
    left untouched.

    Returns as Series of tuples:
    species | (seqid_of_closest, species_of_closest, seqid_of_self)
    """
    masked = table.copy()
    everything = slice(None)
    for species in masked.index.levels[1]:
        # Same-species block: every first-level key, this second-level label.
        same_species = (everything, species)
        masked.loc[same_species, same_species] = np.nan
    # Move the first column level into the rows, leaving one column per
    # second-level label, then take the row label of each column's minimum.
    stacked = masked.stack(level=0)
    return stacked.idxmin()
Exemple #24
0
def stack_on_colnames(
    dframe: pd.DataFrame,
    sep: str = "@",
    stackcolname: str = "DATE",
    inplace: bool = True,
) -> pd.DataFrame:
    """For a dataframe where some columns are multilevel, but where
    the second level is encoded in the column name, this function
    will stack the dataframe by putting the second level of the column
    multiindex into its own column, best understood by this example:

    A dframe like this

       ===== =============== ==============
       PORV   OWC@2000-01-01 OWC@2020-01-01
       ===== =============== ==============
       100       1000          990
       ===== =============== ==============

    will be stacked to

       ====  ====  ==========
       PORV  OWC   DATE
       ====  ====  ==========
       100   1000  2000-01-01
       100   990   2020-01-01
       ====  ====  ==========

    (for the defaults values for *sep* and *stackcolname*)

    Column order is not guaranteed

    Args:
        dframe: A dataframe to stack
        sep: The separator that is used in dframe.columns to define
            the multilevel column names.
        stackcolname: Used as column name for the second level
            of the column multiindex
        inplace: If False, work on a copy so that the column labels of
            the incoming dataframe are left untouched. If True, the
            columns of the incoming dataframe are overwritten.

    Returns:
        The stacked dataframe, or the incoming dataframe (or its copy)
        unchanged when no column name contains *sep*.
    """
    if not inplace:
        dframe = dframe.copy()
    tuplecolumns = [tuple(colname.split(sep)) for colname in dframe.columns]
    # default=0 keeps a frame without any columns on the "nothing to do" path
    if max(map(len, tuplecolumns), default=0) < 2:
        logger.info("No columns to stack")
        return dframe
    dframe.columns = pd.MultiIndex.from_tuples(
        tuplecolumns, names=["dummy", stackcolname]
    )
    dframe = dframe.stack()
    # Columns without a separator only have a value on the row whose
    # second-level label is NaN; propagate that value down the rows.
    staticcols = [col[0] for col in tuplecolumns if len(col) == 1]
    dframe[staticcols] = dframe[staticcols].ffill()
    dframe.reset_index(inplace=True)
    # Drop rows stemming from the NaNs in the second tuple-element for
    # static columns. The column is named after *stackcolname*, which is
    # not necessarily "DATE".
    dframe.dropna(axis="index", subset=[stackcolname], inplace=True)
    del dframe["level_0"]
    dframe.index.name = ""
    return dframe
Exemple #25
0
def get_factor_data(
    factor: pd.DataFrame,
    price_data: pd.DataFrame,
    periods: Optional[List[int]] = None,
    split: Union[int, Sequence[float]] = 3,
    long_short: bool = False,
    leverage: float = 1,
    name: str = "",
) -> pd.DataFrame:
    """Return merged data: factor values, quantiles, weights and returns.

    Args:
        factor: Factor values with a time index; its columns select which
            price columns are used (presumably one column per asset --
            TODO confirm against callers).
        price_data: Prices with two column levels; the ``"close"`` entry
            of level 1 is used.
        periods: Forward-return horizons in index steps. Period 1 is
            always included.
        split: An int is passed to ``pd.qcut`` as the number of quantiles;
            a list/tuple/set is passed to ``pd.cut`` as bins.
        long_short: If True only the first and last quantile get weights.
        leverage: Multiplier applied to the weighted returns.
        name: Prefix of the weighted-return columns; also used in the
            error message and stored as ``.name`` on the result.

    Returns:
        A frame holding one forward-return column per period label plus
        ``factor``, ``factor_quantile``, ``weights`` and one
        ``f"{name}_{label}"`` column per period label. The index levels
        are renamed to ``date`` and ``asset``.

    Raises:
        ValueError: If the time zones of ``factor`` and the prices differ,
            or if ``split`` is neither an int nor a list/tuple/set.
    """
    prices = price_data.xs("close", axis=1, level=1).filter(factor.columns)
    if factor.index.tz != prices.index.tz:
        raise ValueError("The time zone of `factor` and `prices` don't match.")
    # NOTE(review): the next two statements modify the caller's `factor`
    # object (a NaN row stamped with the current UTC time is added and
    # infinities are replaced in place) -- confirm this is intended.
    factor.loc[datetime.now(timezone.utc)] = float("nan")
    factor.replace([float("-inf"), float("inf")], float("nan"), inplace=True)
    # Align the factor to the price frequency and drop everything that
    # precedes the first price timestamp.
    factor = factor.resample(prices.index.freq).ffill()[prices.index[0] :]
    periods = [1] if not periods else [1] + sorted(periods)
    # Most common step between consecutive price timestamps, scaled by period.
    deltas = [period * prices.index.to_series().diff().mode() for period in periods]
    # Turn each timedelta into a short text label via string replacement of
    # its printed form (exact output format depends on pandas' to_string).
    deltas = [
        (
            delta.to_string(index=False).replace(":", "h", 1).replace(":", "m") + "s"
        ).replace(" dayss", "D")
        for delta in deltas
    ]
    # -diff(-period) / price == (price[t + period] - price[t]) / price[t].
    # Building a dict from zip() collapses repeated periods (e.g. a 1 passed
    # in `periods` on top of the implicit 1) to a single entry.
    forward_returns = {
        delta: -prices.diff(-period) / prices
        for period, delta in dict(zip(periods, deltas)).items()
    }
    index = factor.index.intersection(prices.index)
    factor_data = pd.concat(forward_returns, axis=1).reindex(index).stack()
    factor_data["factor"] = factor.stack()
    # Quantiles are computed per group of index level 0 and shifted to start at 1.
    if isinstance(split, int):
        factor_quantile = 1 + factor_data.groupby(level=0)["factor"].transform(
            lambda x: pd.qcut(x, split, labels=False, duplicates="drop")
        )
    elif isinstance(split, (list, tuple, set)):
        factor_quantile = 1 + factor_data.groupby(level=0)["factor"].transform(
            lambda x: pd.cut(x, split, labels=False, duplicates="drop")
        )
        # From here on `split` holds the number of bins instead of the edges.
        split = len(split) - 1
    else:
        raise ValueError(f"Factor `{name}` split type {type(split)} is not supported.")
    factor_data["factor_quantile"] = factor_quantile
    quantiles = [1, split] if long_short else list(range(1, split + 1))
    # Weights: factor demeaned within each level-0 group and scaled by the
    # sum of absolute demeaned values of that group. Rows outside the
    # selected quantiles are absent here and therefore end up as NaN.
    factor_data["weights"] = (
        factor_data[factor_data["factor_quantile"].isin(quantiles)]
        .groupby(level=0)["factor"]
        .transform(lambda x: (x - x.mean()) / (x - x.mean()).abs().sum())
    )
    # NOTE(review): in-place fillna on a column selection; with pandas
    # Copy-on-Write this may not write back to `factor_data` -- verify.
    factor_data["weights"].fillna(0, inplace=True)
    for period in forward_returns:
        factor_data[f"{name}_{period}"] = (
            factor_data["weights"] * factor_data[period] * leverage
        )
    factor_data.rename_axis(index=["date", "asset"], inplace=True)
    factor_data.name = name
    return factor_data
Exemple #26
0
def _yearly_to_monthly_records(df: pd.DataFrame) -> pd.DataFrame:
    """Converts an EIA 923 record of 12 months of data into 12 monthly records.

    Much of the data reported in EIA 923 is monthly, but all 12 months worth of data is
    reported in a single record, with one field for each of the 12 months.  This
    function converts these annualized composite records into a set of 12 monthly
    records containing the same information, by parsing the field names for months, and
    adding a month field.  Non - time series data is retained in the same format.

    Args:
        df: A pandas DataFrame containing the annual data to be
            converted into monthly records.

    Returns:
        A dataframe containing the same data as was passed in via df,
        but with monthly records as rows instead of as columns.

    """
    month_dict = {
        'january': 1,
        'february': 2,
        'march': 3,
        'april': 4,
        'may': 5,
        'june': 6,
        'july': 7,
        'august': 8,
        'september': 9,
        'october': 10,
        'november': 11,
        'december': 12
    }
    multi_idx = df.columns.str.rsplit("_", n=1, expand=True).set_names(
        [None, 'report_month'])
    ends_with_month_filter = multi_idx.get_level_values('report_month').isin(
        set(month_dict.keys()))
    if not ends_with_month_filter.any():
        return df
    index_cols = df.columns[~ends_with_month_filter]
    # performance note: this was good enough for eia923 data size.
    # Using .set_index() is simple but inefficient due to unecessary index creation.
    # Performance may be improved by separating into two dataframes,
    # .stack()ing the monthly data, then joining back together on the original index.
    df = df.set_index(list(index_cols), append=True)
    # convert month names to numbers (january -> 1)
    col_df = multi_idx[ends_with_month_filter].to_frame(index=False)
    col_df.loc[:, 'report_month'] = col_df.loc[:,
                                               'report_month'].map(month_dict)
    month_idx = pd.MultiIndex.from_frame(col_df).set_names(
        [None, 'report_month'])
    # reshape
    df.columns = month_idx
    df = df.stack()
    # restore original index and columns - reset index except level 0
    df = df.reset_index(level=list(range(1, df.index.nlevels)))
    return df
Exemple #27
0
    def test_stack_mixed_levels(self):
        """Stack by level lists whose entries look like a name/number mix."""
        col_index = MultiIndex.from_tuples(
            [
                ("A", "cat", "long"),
                ("B", "cat", "long"),
                ("A", "dog", "short"),
                ("B", "dog", "short"),
            ],
            names=["exp", "animal", "hair_length"],
        )
        frame = DataFrame(np.random.randn(4, 4), columns=col_index)

        stacked_animal_hair = frame.stack(level=["animal", "hair_length"])
        stacked_exp_hair = frame.stack(level=["exp", "hair_length"])

        # GH #8584: an int that is a level name and at the same time a
        # valid level number must be resolved as the name.
        one_named = frame.copy()
        one_named.columns.names = ["exp", "animal", 1]
        for levels, expected in (
            (["animal", 1], stacked_animal_hair),
            (["exp", 1], stacked_exp_hair),
        ):
            tm.assert_frame_equal(
                one_named.stack(level=levels), expected, check_names=False
            )

        # A str/int mixture where the int is not a level name must raise.
        expected_message = (
            "level should contain all level names or all level numbers, not "
            "a mixture of the two"
        )
        with pytest.raises(ValueError, match=expected_message):
            one_named.stack(level=["animal", 0])

        # GH #8584: 0 used as a level name used to surface an unrelated
        # lexsort-depth error.
        zero_named = frame.copy()
        zero_named.columns.names = ["exp", "animal", 0]
        tm.assert_frame_equal(
            zero_named.stack(level=["animal", 0]),
            stacked_animal_hair,
            check_names=False,
        )
Exemple #28
0
    def test_stack_datetime_column_multiIndex(self):
        # GH 8039
        t = datetime(2014, 1, 1)
        df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
        result = df.stack()

        eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)])
        ecols = MultiIndex.from_tuples([(t, "A")])
        expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
        tm.assert_frame_equal(result, expected)
    def test_stack_datetime_column_multiIndex(self):
        # GH 8039
        t = datetime(2014, 1, 1)
        df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
        result = df.stack()

        eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)])
        ecols = MultiIndex.from_tuples([(t, "A")])
        expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
        assert_frame_equal(result, expected)
Exemple #30
0
 def test_stack_preserve_categorical_dtype_values(self):
     # GH-23077
     cat = pd.Categorical(["a", "a", "b", "c"])
     df = DataFrame({"A": cat, "B": cat})
     result = df.stack()
     index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]])
     expected = Series(pd.Categorical(
         ["a", "a", "a", "a", "b", "b", "c", "c"]),
                       index=index)
     tm.assert_series_equal(result, expected)
Exemple #31
0
    def test_stack_partial_multiIndex(self):
        # GH 8844
        def _test_stack_with_multiindex(multiindex):
            df = DataFrame(np.arange(3 * len(multiindex)).reshape(
                3, len(multiindex)),
                           columns=multiindex)
            for level in (-1, 0, 1, [0, 1], [1, 0]):
                result = df.stack(level=level, dropna=False)

                if isinstance(level, int):
                    # Stacking a single level should not make any all-NaN rows,
                    # so df.stack(level=level, dropna=False) should be the same
                    # as df.stack(level=level, dropna=True).
                    expected = df.stack(level=level, dropna=True)
                    if isinstance(expected, Series):
                        assert_series_equal(result, expected)
                    else:
                        assert_frame_equal(result, expected)

                df.columns = MultiIndex.from_tuples(df.columns.get_values(),
                                                    names=df.columns.names)
                expected = df.stack(level=level, dropna=False)
                if isinstance(expected, Series):
                    assert_series_equal(result, expected)
                else:
                    assert_frame_equal(result, expected)

        full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
                                                  ('A', 'y'), ('C', 'x'),
                                                  ('C', 'u')],
                                                 names=['Upper', 'Lower'])
        for multiindex_columns in ([0, 1, 2, 3,
                                    4], [0, 1, 2, 3], [0, 1, 2,
                                                       4], [0, 1,
                                                            2], [1, 2,
                                                                 3], [2, 3, 4],
                                   [0, 1], [0, 2], [0, 3], [0], [2], [4]):
            _test_stack_with_multiindex(full_multiindex[multiindex_columns])
            if len(multiindex_columns) > 1:
                multiindex_columns.reverse()
                _test_stack_with_multiindex(
                    full_multiindex[multiindex_columns])

        df = DataFrame(np.arange(6).reshape(2, 3),
                       columns=full_multiindex[[0, 1, 3]])
        result = df.stack(dropna=False)
        expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]],
                             index=MultiIndex(levels=[[0, 1],
                                                      ['u', 'x', 'y', 'z']],
                                              labels=[[0, 0, 1, 1],
                                                      [1, 3, 1, 3]],
                                              names=[None, 'Lower']),
                             columns=Index(['B', 'C'], name='Upper'),
                             dtype=df.dtypes[0])
        assert_frame_equal(result, expected)
Exemple #32
0
def plot_probs(probs: DataFrame, weight):
    """Draw a box plot of posterior values and show it.

    The ``likelihood`` and ``prior`` column levels of ``probs`` are
    stacked into rows; the values are plotted per ``likelihood`` with
    ``prior`` as hue. ``str(weight)`` becomes the title.
    """
    axf = AxesFormatter()

    posterior = probs.stack(level=['likelihood', 'prior'])
    long_form = posterior.rename('posterior').reset_index()
    boxplot(data=long_form, x='likelihood', y='posterior', hue='prior')

    axf.rotate_x_tick_labels(90)
    axf.set_y_lim(0, 1.05)
    axf.set_axis_below().grid()
    axf.set_text(title=str(weight))
    axf.show()
def R2(y_df: pd.DataFrame, y_hat_df: pd.DataFrame) -> float:
    '''
    Out-of-sample R-squared over all cells of the two frames.

    Args
    ----------
    y_df: pd.DataFrame
        actual test data OOS
    y_hat_df: pd.DataFrame
        predicted values OOS

    Returns
    ----------
    R2: float
        1 - (sum of squared residuals) / (total sum of squares), where
        the total sum of squares is the sample variance of all actual
        values multiplied by (number of values - 1).
    '''
    errors = y_df - y_hat_df
    squared_error_sum = (errors**2).stack().sum()
    actual_values = y_df.stack()
    total_sum_of_squares = actual_values.var() * (len(actual_values) - 1)
    return 1 - squared_error_sum / total_sum_of_squares
def correct_pvalues(df: pd.DataFrame) -> pd.DataFrame:
    """
    Performs Bonferroni Correction

    The frame is stacked into one Series, passed to ``multipletests`` with
    ``method='bonferroni'`` and element 1 of its result is unstacked back
    onto the stacked index.

    :param df: frame of p-values
    :return: frame of corrected p-values
    """
    stacked = df.stack()
    corrected = multipletests(stacked, method='bonferroni')[1]
    return pd.Series(corrected, index=stacked.index).unstack()
Exemple #35
0
 def test_stack_multi_columns_non_unique_index(self, index, columns):
     # GH-28301
     df = DataFrame(index=index, columns=columns).fillna(1)
     stacked = df.stack()
     new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy())
     expected = DataFrame(stacked.to_numpy(),
                          index=new_index,
                          columns=stacked.columns)
     tm.assert_frame_equal(stacked, expected)
     stacked_codes = np.asarray(stacked.index.codes)
     expected_codes = np.asarray(new_index.codes)
     tm.assert_numpy_array_equal(stacked_codes, expected_codes)
        def _test_stack_with_multiindex(multiindex):
            df = DataFrame(np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex)
            for level in (-1, 0, 1, [0, 1], [1, 0]):
                result = df.stack(level=level, dropna=False)

                if isinstance(level, int):
                    # Stacking a single level should not make any all-NaN rows,
                    # so df.stack(level=level, dropna=False) should be the same
                    # as df.stack(level=level, dropna=True).
                    expected = df.stack(level=level, dropna=True)
                    if isinstance(expected, Series):
                        assert_series_equal(result, expected)
                    else:
                        assert_frame_equal(result, expected)

                df.columns = MultiIndex.from_tuples(df.columns.get_values(), names=df.columns.names)
                expected = df.stack(level=level, dropna=False)
                if isinstance(expected, Series):
                    assert_series_equal(result, expected)
                else:
                    assert_frame_equal(result, expected)
Exemple #37
0
    def test_stack_preserve_categorical_dtype(self, ordered, labels):
        # GH13854
        cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered)
        df = DataFrame([[10, 11, 12]], columns=cidx)
        result = df.stack()

        # `MultiIndex.from_product` preserves categorical dtype -
        # it's tested elsewhere.
        midx = pd.MultiIndex.from_product([df.index, cidx])
        expected = Series([10, 11, 12], index=midx)

        tm.assert_series_equal(result, expected)
Exemple #38
0
    def test_stack_int_level_names(self):
        columns = MultiIndex.from_tuples(
            [
                ("A", "cat", "long"),
                ("B", "cat", "long"),
                ("A", "dog", "short"),
                ("B", "dog", "short"),
            ],
            names=["exp", "animal", "hair_length"],
        )
        df = DataFrame(np.random.randn(4, 4), columns=columns)

        exp_animal_stacked = df.stack(level=["exp", "animal"])
        animal_hair_stacked = df.stack(level=["animal", "hair_length"])
        exp_hair_stacked = df.stack(level=["exp", "hair_length"])

        df2 = df.copy()
        df2.columns.names = [0, 1, 2]
        tm.assert_frame_equal(df2.stack(level=[1, 2]),
                              animal_hair_stacked,
                              check_names=False)
        tm.assert_frame_equal(df2.stack(level=[0, 1]),
                              exp_animal_stacked,
                              check_names=False)
        tm.assert_frame_equal(df2.stack(level=[0, 2]),
                              exp_hair_stacked,
                              check_names=False)

        # Out-of-order int column names
        df3 = df.copy()
        df3.columns.names = [2, 0, 1]
        tm.assert_frame_equal(df3.stack(level=[0, 1]),
                              animal_hair_stacked,
                              check_names=False)
        tm.assert_frame_equal(df3.stack(level=[2, 0]),
                              exp_animal_stacked,
                              check_names=False)
        tm.assert_frame_equal(df3.stack(level=[2, 1]),
                              exp_hair_stacked,
                              check_names=False)
Exemple #39
0
    def test_stack_preserve_categorical_dtype(self, ordered, labels):
        # GH13854
        cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
                                   ordered=ordered)
        df = DataFrame([[10, 11, 12]], columns=cidx)
        result = df.stack()

        # `MutliIndex.from_product` preserves categorical dtype -
        # it's tested elsewhere.
        midx = pd.MultiIndex.from_product([df.index, cidx])
        expected = Series([10, 11, 12], index=midx)

        tm.assert_series_equal(result, expected)
    def test_stack_int_level_names(self):
        columns = MultiIndex.from_tuples(
            [("A", "cat", "long"), ("B", "cat", "long"), ("A", "dog", "short"), ("B", "dog", "short")],
            names=["exp", "animal", "hair_length"],
        )
        df = DataFrame(randn(4, 4), columns=columns)

        exp_animal_stacked = df.stack(level=["exp", "animal"])
        animal_hair_stacked = df.stack(level=["animal", "hair_length"])
        exp_hair_stacked = df.stack(level=["exp", "hair_length"])

        df2 = df.copy()
        df2.columns.names = [0, 1, 2]
        assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False)
        assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False)
        assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False)

        # Out-of-order int column names
        df3 = df.copy()
        df3.columns.names = [2, 0, 1]
        assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False)
        assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False)
        assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False)
Exemple #41
0
    def test_stack_mixed_level(self):
        # GH 18310
        levels = [range(3), [3, 'a', 'b'], [1, 2]]

        # flat columns:
        df = DataFrame(1, index=levels[0], columns=levels[1])
        result = df.stack()
        expected = Series(1, index=MultiIndex.from_product(levels[:2]))
        assert_series_equal(result, expected)

        # MultiIndex columns:
        df = DataFrame(1, index=levels[0],
                       columns=MultiIndex.from_product(levels[1:]))
        result = df.stack(1)
        expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
                                                               levels[2]]),
                             columns=levels[1])
        assert_frame_equal(result, expected)

        # as above, but used labels in level are actually of homogeneous type
        result = df[['a', 'b']].stack(1)
        expected = expected[['a', 'b']]
        assert_frame_equal(result, expected)
Exemple #42
0
    def test_compute_forward_returns(self):
        """Forward returns for periods 1 and 2 on a three-day price frame."""
        dates = date_range(start='2015-1-1', end='2015-1-3')
        assets = ['A', 'B']
        prices = DataFrame(index=dates, columns=assets,
                           data=[[1, 1], [1, 2], [2, 1]])
        # The stacked prices double as the factor input.
        factor = prices.stack()

        result = compute_forward_returns(factor, prices, periods=[1, 2])

        expected_index = MultiIndex.from_product([dates, assets],
                                                 names=['date', 'asset'])
        expected = DataFrame(index=expected_index, columns=['1D', '2D'])
        expected['1D'] = [0., 1., 1., -0.5, nan, nan]
        expected['2D'] = [1., 0., nan, nan, nan, nan]

        assert_frame_equal(result, expected)
 def write(self, arctic_lib, version, symbol, item, previous_version):
     """Validate a panel, flatten it to a DataFrame and delegate the write.

     ``item`` is converted with ``item.to_frame()``. If all resulting
     columns share one dtype the frame is additionally stacked into a
     single-column DataFrame; otherwise the column labels must have
     object dtype.

     Raises:
         ValueError: for a zero-size panel, for a panel with an axis that
             has more than one level name, or for mixed-dtype data whose
             column labels are not of object dtype.
     """
     # np.prod replaces the alias np.product, which NumPy 2.0 removed.
     if np.prod(item.shape) == 0:
         # Currently not supporting zero size panels as they drop indices when converting to dataframes
         # Plan is to find a better solution in due course.
         raise ValueError('Cannot insert a zero size panel into mongo.')
     # Built-in all(): np.all() on a generator expression does not iterate
     # it and is always truthy, so this guard could never fire.
     if not all(len(i.names) == 1 for i in item.axes):
         raise ValueError('Cannot insert panels with multiindexes')
     item = item.to_frame()
     if len(set(item.dtypes)) == 1:
         # If all columns have the same dtype, we support non-string column names.
         # We know from above check that columns is not a multiindex.
         item = DataFrame(item.stack())
     elif item.columns.dtype != np.dtype('object'):
         raise ValueError('Cannot support non-object dtypes for columns')
     super(PandasPanelStore, self).write(arctic_lib, version, symbol, item, previous_version)
Exemple #44
0
def testDf2():
    '''Create a small stacked test object.

    Builds a 2x2 frame (rows ``one``/``two``, columns ``a``/``b``) and
    returns ``frame.stack()``, i.e. a Series indexed by (row, column).
    '''
    # The original also built a second DataFrame that was never used or
    # returned; that dead construction has been removed.
    df2 = DataFrame({'a': [1, 3], 'b': [2, 4]}, index=['one', 'two'])
    return df2.stack()
def slide_9():
    """Demonstrate long <-> wide reshaping with ``pivot`` and ``unstack``.

    Reads the CSV at ``MACRODATAPATH``, keeps the ``realgdp``, ``infl`` and
    ``unemp`` columns on a quarter-end timestamp index, stacks them into
    long format and prints several pivoted views.
    """
    data = pd.read_csv(MACRODATAPATH)
    periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
    data = DataFrame(data.to_records(),
                     columns=pd.Index(['realgdp', 'infl', 'unemp'],
                                      name='item'),
                     index=periods.to_timestamp('D', 'end'))

    # Long format: one row per (date, item) pair.
    ldata = data.stack().reset_index().rename(columns={0: 'value'})
    # print() with a single argument behaves the same on Python 2 and 3.
    print(ldata[:10])
    # Keyword arguments: newer pandas no longer accepts them positionally.
    pivoted = ldata.pivot(index='date', columns='item', values='value')
    print(pivoted.head())

    ldata['value2'] = np.random.randn(len(ldata))
    print(ldata[:10])

    # Without `values`, every remaining column is pivoted, which gives
    # hierarchical columns.
    pivoted = ldata.pivot(index='date', columns='item')
    print(pivoted[:5])
    print(pivoted['value'][:5])

    unstacked = ldata.set_index(['date', 'item']).unstack('item')
    print(unstacked[:7])
def slide_8():
    """Print a walk-through of ``stack`` / ``unstack`` on small examples."""
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    data = DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['Ohio', 'Colorado'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))
    print(data)
    result = data.stack()
    print('***stack()***')
    print(result)
    print('***unstack()***')
    print(result.unstack())

    print('***unstack(0)***')
    print(result.unstack(0))

    print("***unstack('state')***")
    print(result.unstack('state'))

    # Two Series with partly different labels: unstack introduces NaN.
    s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
    s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
    data2 = pd.concat([s1, s2], keys=['one', 'two'])
    print('***unstack***')
    print(data2.unstack())
    print('***unstack->stack***')
    print(data2.unstack().stack())
    print('***unstack->stack(dropna)***')
    print(data2.unstack().stack(dropna=False))

    df = DataFrame({'left': result, 'right': result + 5},
                   columns=pd.Index(['left', 'right'],
                   name='side'))
    print('df')
    print(df)

    print("unstack('state')")
    print(df.unstack('state'))
    print("unstack('state').stack('side')")
    print(df.unstack('state').stack('side'))
Exemple #47
0
    def distance_map(self, within, between, metric='wminkowski', p=2.0):
        r"""Create a distance map from the current electrode configuration.

        This method performs some type checking on its arguments.

        Parameters
        ----------
        within, between : number
            `between` is the distance between shanks and `within` is the
            distance between electrodes on any given shank.

        metric : str or callable, optional
            Metric to use to calculate the distance between electrodes/shanks.
            Defaults to a weighted Minkowski distance

        p : numbers.Real, optional
            The :math:`p` of the norm to use. Defaults to 2.0 for weighted
            Euclidean distance.

        Notes
        -----
        The default `metric` of ``'wminkowski'`` and the default `p` of ``2.0``
        combine to give a weighted Euclidean distance metric. The weighted
        Minkowski distance between two points
        :math:`\mathbf{x},\mathbf{y}\in\mathbb{R}^{n}`, and a weight vector
        :math:`\mathbf{w}\in\mathbb{R}^{n}` is given by

            .. math::
               \left(\sum_{i=1}^{n}w_i\left|x_i-y_i\right|^{p}\right)^{1/p}

        Raises
        ------
        AssertionError
            * If `within` is not an instance of ``numbers.Real``
            * If `between` is not an instance of ``numbers.Real``
            * If `p` is not an instance of ``numbers.Real``
            * If metric is not an instance of ``basestring`` or a callable

        Returns
        -------
        df : DataFrame
            A dataframe with pairwise distances between electrodes, indexed by
            channel, shank.
        """
        # NOTE(review): `basestring` and `xrange` below are Python 2
        # builtins and `reduce` is used unqualified; unless the file
        # provides compatibility aliases this method is Python 2 only.
        # The asserts also require each number to be strictly positive.
        assert isinstance(within, numbers.Real) and within > 0, \
            '"within" must be a positive real number'
        assert isinstance(between, numbers.Real) and between > 0, \
            '"between" must be a positive real number'
        assert isinstance(metric, basestring) or callable(metric), \
            '"metric" must be a callable object or a string'
        assert isinstance(p, numbers.Real) and p > 0, \
            'p must be a real number greater than 0'

        # The bare name resolves to a module-level `distance_map`, not to
        # this method, so this is a delegation and not a recursion.
        dm = distance_map(self.nshanks, self.shank.nunique(), within, between,
                          metric=metric, p=p)
        s = self.sort()
        cols = s.index, s.shank

        # Raw value arrays of the sorted index and of the shank column.
        values_getter = operator.attrgetter('values')
        cols = tuple(map(values_getter, cols))
        names = 'channel', 'shank'

        def _label_maker(i, names):
            # Same label arrays each time; only the level names get a
            # suffix, e.g. 'channel i' / 'shank i'.
            new_names = tuple(map(lambda x: x + ' %s' % i, names))
            return MultiIndex.from_arrays(cols, names=new_names)

        index = _label_maker('i', names)
        columns = _label_maker('j', names)
        df = DataFrame(dm, index=index, columns=columns)

        nnames = len(names)
        ninds = len(index)
        nlevels = nnames * ninds

        # zip() stops at the shorter input, so with the two names above this
        # yields the pairs (0, 2), (1, 3); flattened: (0, 2, 1, 3).
        zipped = zip(xrange(nnames), xrange(nnames, nlevels))
        reordering = tuple(reduce(operator.add, zipped))

        # Move every column level into the row index, one level per stack.
        s = df.stack(0)

        for _ in xrange(nnames - 1):
            s = s.stack(0)

        s.name = r'$d\left(i, j\right)$'

        # NOTE(review): what is returned is the stacked object `s` with its
        # levels reordered, not the frame `df` named in the docstring.
        return s.reorder_levels(reordering)
Exemple #48
0
# Notebook-style script: bare expressions are meant to be echoed by an
# interactive session and have no effect when run as a plain script.

# combine_first: take values from df1 and fall back to df2 where df1 is NaN.
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)


### Reshaping with a hierarchical index
# 1: stack the columns of a frame into rows and back again
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
data

result = data.stack()
result

result.unstack()

# The level to unstack can be given by position ...
result.unstack(0)

# ... or by name.
result.unstack('state')

# 2: unstacking a concatenation of Series with partly different labels
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()

data2.unstack().stack()
###############################################################
###															###
###															###
###			   				PIVOTING						###
###															###
###															###
###############################################################

## stacking and unstacking a data frame

df1 = DataFrame(np.arange(8).reshape(2,4),
               index = pd.Index(['LA','SF'],name='city'),
               columns= pd.Index(['A','B','C','D'], name = 'letters')) # pd.Index enables naming of the columns or index

df_st = df1.stack() # pivots the columns ('letters') into an inner level of the row index

df_st.unstack() # reverses the above operation: the inner index level becomes columns again

df_st.unstack('city') # unstacks the 'city' level instead, so the cities become the columns

# to go from long data frame to wide data frame, we can use the pivot function
# the pivot function is also useful in the excel pivot kind of way
# NOTE(review): `dframe` is not defined in this snippet; it presumably comes
# from an earlier part of the session.
dframe.pivot_table(index=['zone'], columns=['Stories','homebath'], values=['homeprice'], aggfunc='mean')

# cross tab frequency of occurrences
pd.crosstab(dframe.homebath, dframe.homebr, margins = True)

###############################################################
###															###
###															###
Exemple #50
0
class ExcelSheet():
    """Read one worksheet through an ADODB recordset and reshape it.

    The recordset is opened in ``__init__``; ``data()`` turns it into a
    DataFrame (stored on ``self.df``), ``to_db()`` writes that frame to the
    module-level ``CONN`` connection and ``close()`` releases the recordset.
    """

    def __init__(self,
                 connection,
                 sheet,
                 sql,
                 stack=False,
                 icol=False):
        """Open a recordset for ``sql`` on ``connection``.

        :param connection: connection object handed to ``Recordset.Open``.
        :param sheet: sheet name; also used as the table name in ``to_db``.
        :param sql: query text executed against the connection.
        :param stack: when true, the per-day columns are stacked into a
            long (part number, date) -> quantity layout.
        :param icol: when truthy, inserted as a constant '工序' column.
        """
        self.sheet = sheet
        self.connection = connection
        self.recordset = Dispatch('ADODB.Recordset')
        # NOTE(review): 0 and 1 are the cursor-type / lock-type arguments of
        # Recordset.Open - presumably forward-only and read-only; confirm.
        self.recordset.Open(sql, self.connection, 0, 1)
        self.stack = stack
        self.icol = icol
        self.df = None
        print('  |--connect to {0}'.format(self.sheet))

    def column_dates(self):
        """Return the date labels used for the per-day columns.

        The list holds one date per day of the month that follows ``SD``'s
        month, counted forward from ``SD``, followed by padding entries (the
        day before that month's last day) for the remaining recordset
        fields. Three fields are not date columns.
        """
        # Month after SD's month. Roll the year over in December: the former
        # ``SD.month + 1`` produced month 13 and made monthrange() raise.
        if SD.month == 12:
            year, month = SD.year + 1, 1
        else:
            year, month = SD.year, SD.month + 1

        days_in_month = monthrange(year, month)[1]
        month_last_day = date(year, month, days_in_month)

        daily = [SD + timedelta(days=i) for i in range(days_in_month)]
        # NOTE(review): the 3 presumably covers the two leading identifier
        # columns and the trailing total - confirm against the sheet layout.
        padding_count = len(self.recordset.Fields) - days_in_month - 3
        padding = [month_last_day - timedelta(days=1)] * max(padding_count, 0)
        return daily + padding

    def column_names(self):
        """Return the column labels to apply to the raw recordset frame."""
        if self.stack:
            return ['产品图号', '产品类别'] + \
                self.column_dates() + ['合计']
        else:
            return [field.Name for field in self.recordset.Fields]

    def data(self):
        """Load the recordset into ``self.df``, reshape it and return it."""
        # The frame built from GetRows() is transposed before labelling, so
        # the rows returned by GetRows() end up as columns.
        self.df = DataFrame(data=list(self.recordset.GetRows())).T
        self.df.columns = self.column_names()

        if self.stack:
            # NOTE(review): column_names() yields no '工序' label in stack
            # mode, so this looks like it raises KeyError - confirm intent.
            self.df = self.df.set_index(['产品图号', '工序'])
        else:
            self.df = self.df.set_index('产品图号')

        if self.sheet == '东海.外协':
            self._reshape_outsourcing()

        self.df = self.conditions()

        return self.df

    def _reshape_outsourcing(self):
        """Store the '外协盘存' stock column and stack both 31-day blocks."""
        day_labels = [SD + timedelta(days=i) for i in range(31)]
        self.df.columns = (['产品类别', '外协盘存', '本月出库',
                            '本月入库', '本月结存'] +
                           day_labels + ['合计'] +
                           day_labels + ['合计1'])
        del self.df['产品类别']

        # After the delete, '外协盘存' is the first column. Copy before
        # assigning so the write does not target a slice of self.df.
        stock = self.df.iloc[:, 0:1]
        stock = stock[stock['外协盘存'] > 0].copy()
        stock['外协盘存'] = stock['外协盘存'].astype(int)
        stock.to_sql(name='东海.外协盘点', con=CONN, if_exists='replace')

        for label in ['合计', '合计1', '外协盘存',
                      '本月出库', '本月入库', '本月结存']:
            del self.df[label]

        # Only the two 31-day blocks remain. The second block previously
        # re-read columns 0:31, duplicating the first block under 'WOT1'.
        first_block = self._to_long(self.df.iloc[:, 0:31], 'W1TO')
        second_block = self._to_long(self.df.iloc[:, 31:62], 'WOT1')

        self.df = DataFrame(concat([first_block, second_block]))
        self.df['数量'] = self.df['数量'].astype(int)
        self.df = self.df[self.df['数量'] != 0]

    @staticmethod
    def _to_long(block, process):
        """Stack per-day columns into one '数量' column tagged with '工序'."""
        stacked = block.stack()
        stacked.index.names = ['产品图号', '日期']
        stacked = stacked.to_frame()
        stacked.columns = ['数量']
        stacked.insert(loc=1, column='工序', value=process)
        return stacked

    def conditions(self):
        """Apply the stack / icol / sheet-specific clean-ups to ``self.df``."""
        if self.stack:
            # Iterate over a snapshot because columns are deleted in the loop.
            for cn in list(self.df.columns):
                if cn in ['直径', '长度', '成品长度', '产品类别', '合计']:
                    del self.df[cn]
            self.df = self.df.stack()
            self.df.index.names = ['产品图号', '日期']
            self.df = self.df.to_frame()
            self.df.columns = ['数量']
            self.df['数量'] = self.df['数量'].astype(int)
            self.df = self.df[self.df['数量'] != 0]

        if self.icol:
            self.df.insert(loc=1, column='工序', value=self.icol)

        for cn in self.df.columns:
            if cn in ['W3盘存', 'W2盘存', 'W4盘存']:
                self.df[cn] = self.df[cn].astype(int)

        if self.sheet in ['一部.工序监控', 'G加.工序数据']:
            self.df['日期'] = self.df['日期'].apply(convert_data)
            self.df['日期'] = self.df['日期'].dt.date
            self.df['数量'] = self.df['数量'].astype(int)

        if self.sheet == 'G加.工序数据':
            # Map the raw process names to their 'W4'-prefixed codes and keep
            # only the rows that ended up with such a code.
            self.df['工序'] = self.df['工序'].replace(
                ['粗磨', '淬火', '回火', '半中磨', '精车',
                 '镀前磨削', '电镀', '镀后', 'GP12检验'],
                ['W4粗磨', 'W4淬火', 'W4回火', 'W4中磨', 'W4精车',
                 'W4镀前', 'W4电镀', 'W4镀后', 'W4GP12'])
            self.df = self.df[self.df['工序'].str.startswith('W4')]

        return self.df

    def to_db(self):
        """Write ``self.df`` to ``CONN`` as a table named after the sheet."""
        # The ``flavor`` keyword was deprecated and later removed from
        # DataFrame.to_sql, so it is no longer passed.
        self.df.to_sql(name=self.sheet, con=CONN, if_exists='replace')

    def close(self):
        """Close and drop the recordset; safe to call more than once."""
        recordset = self.__dict__.pop('recordset', None)
        if recordset is None:
            return
        try:
            recordset.Close()
        except Exception:
            # Best effort: closing may fail on a recordset that is already
            # closed, and there is nothing useful to do about it here.
            pass

    def __del__(self):
        # Deliberately a no-op: callers are expected to call close().
        pass
Exemple #51
0
class TestMultiLevel(unittest.TestCase):

    def setUp(self):
        """Build the fixtures shared by the tests.

        ``self.frame`` is a 10x3 random frame on a two-level index,
        ``self.single_level`` a one-level MultiIndex, ``self.series`` a
        random two-level series with one NaN and ``self.ymd`` a time frame
        summed by (year, month, day).
        """
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        # Materialise the pairs: zip() is a one-shot iterator on Python 3.
        tuples = list(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        # np.nan is the canonical spelling; the np.NaN alias was removed in
        # NumPy 2.0.
        s[3] = np.nan
        self.series = s

        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        """Appending the two halves of the frame rebuilds the original."""
        head = self.frame[:5]
        tail = self.frame[5:]

        # whole frame
        rebuilt = head.append(tail)
        tm.assert_frame_equal(rebuilt, self.frame)

        # single column
        rebuilt_column = head['A'].append(tail['A'])
        tm.assert_series_equal(rebuilt_column, self.frame['A'])

    def test_reindex_level(self):
        """Reindexing month totals on level 1 equals a groupby transform."""
        full_index = self.ymd.index

        # rows of a DataFrame
        totals = self.ymd.sum(level='month')
        broadcast = totals.reindex(full_index, level=1)
        wanted = self.ymd.groupby(level='month').transform(np.sum)
        assert_frame_equal(broadcast, wanted)

        # a single Series
        broadcast = totals['A'].reindex(full_index, level=1)
        wanted = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(broadcast, wanted)

        # columns of the transposed DataFrame
        totals = self.ymd.T.sum(axis=1, level='month')
        broadcast = totals.reindex(columns=full_index, level=1)
        wanted = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(broadcast, wanted)

    def test_binops_level(self):
        """Binary ops with level= match ops on the broadcast month totals."""
        def _check_op(opname):
            # DataFrame flavour of the operator
            frame_op = getattr(DataFrame, opname)
            totals = self.ymd.sum(level='month')
            got = frame_op(self.ymd, totals, level='month')
            spread = self.ymd.groupby(level='month').transform(np.sum)
            wanted = frame_op(self.ymd, spread)
            assert_frame_equal(got, wanted)

            # Series flavour of the operator
            series_op = getattr(Series, opname)
            got = series_op(self.ymd['A'], totals['A'], level='month')
            spread = self.ymd['A'].groupby(level='month').transform(np.sum)
            wanted = series_op(self.ymd['A'], spread)
            assert_series_equal(got, wanted)

        for opname in ('sub', 'add', 'mul', 'div'):
            _check_op(opname)

    def test_pickle(self):
        import cPickle
        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        """Selecting rows by index tuples equals selecting them by position."""
        keys = [('foo', 'one'), ('bar', 'one')]
        by_label = self.frame.ix[keys]
        by_position = self.frame.ix[[0, 3]]
        assert_frame_equal(by_label, by_position)

    def test_reindex_preserve_levels(self):
        """Reindexing with a MultiIndex reuses that very index object."""
        new_index = self.ymd.index[::10]

        # rows, through reindex and through .ix
        chunk = self.ymd.reindex(new_index)
        self.assertTrue(chunk.index is new_index)

        chunk = self.ymd.ix[new_index]
        self.assertTrue(chunk.index is new_index)

        # columns of the transposed frame, same two access paths
        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assertTrue(chunk.columns is new_index)

        chunk = ymdT.ix[:, new_index]
        self.assertTrue(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        """repr() and to_string() run on the fixtures and their transposes."""
        subjects = (self.frame, self.ymd, self.frame.T, self.ymd.T)

        # smoke test only: nothing is asserted about the rendered text
        for obj in subjects:
            repr(obj)

        sink = StringIO()
        for obj in subjects:
            obj.to_string(buf=sink)

    def test_getitem_simple(self):
        """A full tuple key picks one column; unknown keys raise KeyError."""
        wide = self.frame.T

        first_column = wide['foo', 'one']
        assert_almost_equal(first_column.values, wide.values[:, 0])

        for missing in (('foo', 'four'), 'foobar'):
            self.assertRaises(KeyError, wide.__getitem__, missing)

    def test_series_getitem(self):
        """Partial, full and list-of-tuple keys on a three-level Series."""
        s = self.ymd['A']

        # partial key (year, month): the two outer levels are dropped
        result = s[2000, 3]
        # NOTE(review): result2 is never compared with expected; the lookup
        # is kept so the .ix path is still exercised.
        result2 = s.ix[2000, 3]
        expected = s[42:65]
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        # full key: a scalar
        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEqual(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s[49:51]
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_setitem(self):
        """Assigning through partial and full tuple keys hits only those rows."""
        s = self.ymd['A']

        # partial key: positions 42..64 become NaN, everything else is kept
        s[2000, 3] = np.nan
        self.assertTrue(isnull(s[42:65]).all())
        self.assertTrue(notnull(s[:42]).all())
        self.assertTrue(notnull(s[65:]).all())

        # full key: a single element
        s[2000, 3, 10] = np.nan
        self.assertTrue(isnull(s[49]))

    def test_series_slice_partial(self):
        """Placeholder: no checks are implemented yet."""
        pass

    def test_xs(self):
        """xs() and .ix agree on a full tuple key and return that row."""
        key = ('bar', 'two')
        via_xs = self.frame.xs(key)
        via_ix = self.frame.ix[key]

        assert_series_equal(via_xs, via_ix)
        # ('bar', 'two') is the fifth row of the fixture
        assert_almost_equal(via_xs.values, self.frame.values[4])

    def test_xs_partial(self):
        """A first-level key gives the same sub-frame via xs, .ix and T[...]."""
        via_xs = self.frame.xs('foo')
        via_ix = self.frame.ix['foo']
        via_columns = self.frame.T['foo'].T

        assert_frame_equal(via_xs, via_columns)
        assert_frame_equal(via_xs, via_ix)

    def test_fancy_2d(self):
        """Two-dimensional .ix with a first-level key equals xs() then column."""
        # MultiIndex on the rows
        got = self.frame.ix['foo', 'B']
        wanted = self.frame.xs('foo')['B']
        assert_series_equal(got, wanted)

        # MultiIndex on the columns
        transposed = self.frame.T
        got = transposed.ix['B', 'foo']
        wanted = transposed.xs('B')['foo']
        assert_series_equal(got, wanted)

    def test_get_loc_single_level(self):
        """Every value of a one-level MultiIndex can be looked up."""
        index = self.single_level
        series = Series(np.random.randn(len(index)), index=index)
        # smoke test: the lookups only have to succeed
        for key in index.values:
            series[key]

    def test_getitem_toplevel(self):
        """A first-level column key returns the block with that level dropped."""
        wide = self.frame.T

        # 'foo' covers the first three columns
        got = wide['foo']
        wanted = wide.reindex(columns=wide.columns[:3])
        wanted.columns = wanted.columns.droplevel(0)
        assert_frame_equal(got, wanted)

        # 'bar' covers columns 3 and 4; .ix must agree with plain getitem
        got = wide['bar']
        got_ix = wide.ix[:, 'bar']

        wanted = wide.reindex(columns=wide.columns[3:5])
        wanted.columns = wanted.columns.droplevel(0)
        assert_frame_equal(got, wanted)
        assert_frame_equal(got, got_ix)

    def test_getitem_slice_integers(self):
        """.ix[1:2] on an integer MultiIndex selects the rows from position 2."""
        int_index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                               labels=[[0, 0, 1, 1, 2, 2],
                                       [0, 1, 0, 1, 0, 1]])

        # DataFrame
        frame = DataFrame(np.random.randn(len(int_index), 4),
                          index=int_index,
                          columns=['a', 'b', 'c', 'd'])
        assert_frame_equal(frame.ix[1:2], frame[2:])

        # Series
        series = Series(np.random.randn(len(int_index)), index=int_index)
        assert_series_equal(series.ix[1:2], series[2:])

    def test_getitem_int(self):
        """Integer keys passed to .ix on frames with a MultiIndex."""
        int_index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                               labels=[[0, 0, 0, 1, 1, 1],
                                       [0, 1, 2, 0, 1, 2]])
        int_frame = DataFrame(np.random.randn(6, 2), index=int_index)

        # 1 is a label of the first level: last three rows, level dropped
        got = int_frame.ix[1]
        wanted = int_frame[-3:]
        wanted.index = wanted.index.droplevel(0)
        assert_frame_equal(got, wanted)

        # 3 is not a label of the first level
        self.assertRaises(KeyError, int_frame.ix.__getitem__, 3)

        # on the string-labelled fixture an integer is taken as a position
        got = self.frame.ix[2]
        wanted = self.frame.xs(self.frame.index[2])
        assert_series_equal(got, wanted)

    def test_getitem_partial(self):
        """A (year, month) column key returns the block indexed by day only."""
        wide = self.ymd.T
        got = wide[2000, 2]

        # columns whose month label (second level) is 1
        month_mask = wide.columns.labels[1] == 1
        wanted = wide.reindex(columns=wide.columns[month_mask])
        wanted.columns = wanted.columns.droplevel(0).droplevel(0)
        assert_frame_equal(got, wanted)

    def test_getitem_slice_not_sorted(self):
        """Column slicing accepts a numpy integer as the slice bound."""
        wide = self.frame.sortlevel(1).T

        # buglet with int typechecking
        stop = np.int32(3)
        got = wide.ix[:, :stop]
        wanted = wide.reindex(columns=wide.columns[:3])
        assert_frame_equal(got, wanted)

    def test_setitem_change_dtype(self):
        """Overwriting a float column with booleans keeps the MultiIndex."""
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        # the block holding the new column is labelled by a MultiIndex
        self.assertTrue(isinstance(dft._data.blocks[1].items, MultiIndex))

        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        """A scalar written through .ix with a tuple row key can be read back."""
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEqual(self.frame.ix[('bar', 'two'), 'B'], 5)

        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEqual(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        """Label slices on outer levels include both end points."""
        # slice on the first level only
        got = self.frame.ix['bar':'baz']
        assert_frame_equal(got, self.frame[3:7])

        # slice on (year, month) tuples: months 2..4 carry labels 1..3
        got = self.ymd.ix[(2000,2):(2000,4)]
        month_labels = self.ymd.index.labels[1]
        in_range = (month_labels >= 1) & (month_labels <= 3)
        assert_frame_equal(got, self.ymd[in_range])

    def test_sortlevel(self):
        """sortlevel needs a MultiIndex and keeps the level names."""
        # a frame with a flat index cannot be sorted by level
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)

        # axis=1

        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)

        # preserve names
        self.assertEqual(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        """reset_index gives the former index levels a numeric dtype."""
        # list() replaces a comprehension whose loop variable shadowed the
        # builtin ``tuple``
        tuples = list(cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assertTrue(com.is_integer_dtype(deleveled['prm1']))
        self.assertTrue(com.is_float_dtype(deleveled['prm2']))

    def test_sortlevel_by_name(self):
        """Sorting by level name equals sorting by level number."""
        self.frame.index.names = ['first', 'second']
        by_name = self.frame.sortlevel(level='second')
        by_number = self.frame.sortlevel(level=1)
        assert_frame_equal(by_name, by_number)

    def test_sortlevel_mixed(self):
        """Adding a string column does not change how the others sort."""
        # rows: sort, add an object column, sort again, compare without it
        numeric_sorted = self.frame.sortlevel(1)

        mixed = self.frame.copy()
        mixed['foo'] = 'bar'
        mixed_sorted = mixed.sortlevel(1)
        assert_frame_equal(numeric_sorted, mixed_sorted.drop(['foo'], axis=1))

        # columns: same check on the transposed frame
        wide = self.frame.T
        numeric_sorted = wide.sortlevel(1, axis=1)
        wide['foo', 'three'] = 'bar'

        mixed_sorted = wide.sortlevel(1, axis=1)
        extra = [('foo', 'three')]
        assert_frame_equal(numeric_sorted.drop(extra, axis=1),
                           mixed_sorted.drop(extra, axis=1))

    def test_count_level(self):
        """count(level=...) matches a groupby count on every level and axis."""
        def _check_counts(frame, axis=0):
            axis_index = frame._get_axis(axis)
            for lev in range(axis_index.nlevels):
                got = frame.count(axis=axis, level=lev)
                wanted = frame.groupby(axis=axis, level=lev).count(axis=axis)
                wanted = wanted.reindex_like(got).astype('i8')
                assert_frame_equal(got, wanted)

        # punch a few holes so the counts differ between groups
        for target in (self.frame, self.ymd):
            target.ix[1, [1, 2]] = np.nan
            target.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        flat = tm.makeTimeDataFrame()
        self.assertRaises(Exception, flat.count, level=0)

        # numeric_only leaves out the string column
        self.frame['D'] = 'foo'
        numeric = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(numeric.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        """Series.count(level=...) also reports level values that never occur."""
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])

        series = Series(np.random.randn(len(index)), index=index)

        for level in (0, 1):
            got = series.count(level=level)
            grouped = series.groupby(level=level).count()
            # unused level values are absent from the groupby result, so
            # reindex and fill them with zero before comparing
            assert_series_equal(got.astype('f8'),
                                grouped.reindex(got.index).fillna(0))

    def test_count_level_corner(self):
        """count(level=0) on empty data returns zeros for every level value."""
        # empty Series
        empty_series = self.frame['A'][:0]
        first_level = empty_series.index.levels[0]
        got = empty_series.count(level=0)
        assert_series_equal(got, Series(0, index=first_level))

        # empty DataFrame
        empty_frame = self.frame[:0]
        got = empty_frame.count(level=0)
        wanted = DataFrame({}, index=first_level,
                           columns=empty_frame.columns)
        wanted = wanted.fillna(0).astype(int)
        assert_frame_equal(got, wanted)

    def test_unstack(self):
        """Smoke test: unstack runs on float and on integer data."""
        # just check that it works for now
        once = self.ymd.unstack()
        twice = once.unstack()

        # test that ints work
        as_int = self.ymd.astype(int).unstack()

    def test_stack(self):
        """stack() undoes unstack() for sorted, unsorted and partial inputs."""
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # index sorted by the last level only
        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # reversed row order, unstacking the middle level
        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # swapped index levels and swapped column levels
        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number: with two column levels, -2 addresses
        # the same level as 0
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        # the comparison was missing, so this case never checked anything
        assert_frame_equal(result, expected)

    def test_stack_mixed_dtype(self):
        """Stacking a mixed-dtype frame keeps float blocks as float."""
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        # 'bar' holds no strings, so it must not be upcast
        self.assertTrue(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        """unstack then stack on a four-level group count gives the counts back."""
        records = DataFrame({'state': ['naive','naive','naive',
                                       'activ','activ','activ'],
                             'exp':['a','b','b','b','a','a'],
                             'barcode':[1,2,3,4,1,3],
                             'v':['hi','hi','bye','bye','bye','peace'],
                             'extra': np.arange(6.)})

        counts = records.groupby(['state','exp','barcode','v']).apply(len)
        restacked = counts.unstack().stack()
        # the round trip yields floats, so compare against float counts
        wanted = counts.reindex(restacked.index).astype(float)
        assert_series_equal(restacked, wanted)

    def test_stack_unstack_preserve_names(self):
        """Level names follow the levels through unstack and stack."""
        unstacked = self.frame.unstack()
        self.assertEqual(unstacked.index.name, 'first')
        self.assertEqual(unstacked.columns.names, ['exp', 'second'])

        restacked = unstacked.stack()
        self.assertEqual(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        """Unstacking by level name equals unstacking by level number."""
        by_name = self.frame.unstack('second')
        by_number = self.frame.unstack(level=1)
        assert_frame_equal(by_name, by_number)

    def test_stack_level_name(self):
        """Stacking by column-level name equals the positional form."""
        # two column levels: 'exp' is level 0
        wide = self.frame.unstack('second')
        by_name = wide.stack('exp')
        by_number = self.frame.unstack().stack(0)
        assert_frame_equal(by_name, by_number)

        # one column level: naming it equals the default
        by_name = self.frame.stack('exp')
        default = self.frame.stack()
        assert_series_equal(by_name, default)

    def test_stack_unstack_multiple(self):
        """A list of levels behaves like the same levels applied one by one."""
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEqual(unstacked.columns.names,
                         expected.columns.names)

        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])

        # stacking back leaves the levels in a different order; restore
        # (year, month, day) and the row order before comparing
        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)

        assert_frame_equal(restacked, self.ymd)
        self.assertEqual(restacked.index.names, self.ymd.index.names)

        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)

        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        """apply and transform agree for an element-wise function."""
        column = self.frame['A']
        first_level = column.index.get_level_values(0)
        groups = column.groupby(first_level)

        via_apply = groups.apply(lambda x: x * 2)
        via_transform = groups.transform(lambda x: x * 2)
        # apply may return the rows in a different order
        assert_series_equal(via_apply.reindex(via_transform.index),
                            via_transform)

    def test_join(self):
        """An outer join of overlapping row slices aligns on the MultiIndex."""
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]

        joined = a.join(b, how='outer').reindex(self.frame.index)
        # the expected frame is the fixture with NaN wherever the join has
        # no data
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan

        # guard against the join returning nothing but NaN
        self.assertTrue(not np.isnan(joined.values).all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        """swaplevel takes numbers or names and is its own inverse."""
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assertTrue(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)

        # swapping twice restores the original index
        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assertTrue(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)

        # same operation on the columns of the transposed frame
        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        """Panel.swaplevel on the major axis equals swapping that axis directly."""
        items = {'ItemA' : self.frame,
                 'ItemB' : self.frame * 2}
        panel = Panel(items)

        got = panel.swaplevel(0, 1, axis='major')

        wanted = panel.copy()
        wanted.major_axis = wanted.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(got, wanted)

    def test_reorder_levels(self):
        """reorder_levels equals the matching chain of swaplevel calls."""
        # DataFrame rows
        got = self.ymd.reorder_levels(['month', 'day', 'year'])
        wanted = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(got, wanted)

        # Series
        column = self.ymd['A']
        got = column.reorder_levels(['month', 'day', 'year'])
        wanted = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(got, wanted)

        # DataFrame columns
        got = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
        wanted = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(got, wanted)

        # level numbers outside the index are rejected
        bad_order = [1, 2, 3]
        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          bad_order)

    def test_insert_index(self):
        """Adding a column under a new tuple key keeps the columns a MultiIndex."""
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assertTrue(isinstance(df.columns, MultiIndex))
        self.assertTrue((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1,2,3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B",3)]))

        y = Series(data=[4,5,6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B",3)]))

        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        """is_lexsorted and lexsort_depth reflect the order of the labels."""
        levels = [[0, 1], [0, 1, 2]]

        # both levels in order
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 1, 2]])
        self.assertTrue(index.is_lexsorted())

        # second level out of order inside the last group
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 2, 1]])
        self.assertTrue(not index.is_lexsorted())

        # first level out of order as well
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 1, 0, 1, 1],
                                   [0, 1, 0, 2, 2, 1]])
        self.assertTrue(not index.is_lexsorted())
        self.assertTrue(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        """Writes through df['foo'] reach the frame only if it is one dtype."""
        df = self.frame.T
        # all-float frame: the write through .values shows up in the frame
        df['foo'].values[:] = 0
        self.assertTrue((df['foo'].values == 0).all())

        # but not if it's mixed-type
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        # the frame still holds the zeros written above
        self.assertTrue((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        """First-level lookups work when the MultiIndex is not sorted."""
        wide = self.frame.T
        # the new column lands at the end, away from the other 'foo' columns
        wide['foo', 'four'] = 'foo'

        level_values = [np.array(x)
                        for x in zip(*wide.columns.get_tuple_index())]
        is_foo = level_values[0] == 'foo'

        # lookup on the columns
        got = wide['foo']
        got_ix = wide.ix[:, 'foo']
        wanted = wide.reindex(columns=wide.columns[is_foo])
        wanted.columns = wanted.columns.droplevel(0)
        assert_frame_equal(got, wanted)
        assert_frame_equal(got_ix, wanted)

        # lookup on the rows of the transpose
        tall = wide.T
        got = tall.xs('foo')
        got_ix = tall.ix['foo']
        wanted = tall.reindex(tall.index[level_values[0] == 'foo'])
        wanted.index = wanted.index.droplevel(0)
        assert_frame_equal(got, wanted)
        assert_frame_equal(got_ix, wanted)

    def test_series_getitem_not_sorted(self):
        """First-level lookups on a Series work with an unsorted MultiIndex."""
        outer = ['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo']
        inner = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
        pairs = zip(*[outer, inner])
        index = MultiIndex.from_tuples(pairs)
        series = Series(randn(8), index=index)

        level_values = [np.array(x)
                        for x in zip(*index.get_tuple_index())]

        got = series['qux']
        got_ix = series.ix['qux']
        wanted = series[level_values[0] == 'qux']
        wanted.index = wanted.index.droplevel(0)
        assert_series_equal(got, wanted)
        assert_series_equal(got_ix, wanted)

    # Names of the reductions that the level-aggregation tests look up with
    # getattr on Series / DataFrame objects.
    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        """Series reductions with level= match groupby(level).agg."""
        cases = cart_product(self.AGG_FUNCTIONS,
                             range(2),
                             [False, True])
        for op, level, skipna in cases:
            # the lambda is used right away, inside the same iteration
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            via_groupby = self.series.groupby(level=level).agg(aggf)
            via_level = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(via_groupby, via_level)

    def test_frame_group_ops(self):
        """DataFrame reductions with level= match groupby(level).agg."""
        # punch a few holes so that skipna makes a difference
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                    range(2), range(2),
                                                    [False, True]):
            # the MultiIndex sits on the rows for axis 0 and on the columns
            # of the transpose for axis 1
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T

            grouped = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis,
                                           skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assertTrue(leftside._get_axis(axis).equals(level_index))
            self.assertTrue(rightside._get_axis(axis).equals(level_index))

            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        """sum(level=[...]) equals a groupby over the same levels."""
        # DataFrame
        via_level = self.ymd.sum(level=['year', 'month'])
        via_groupby = self.ymd.groupby(level=['year', 'month']).sum()
        assert_frame_equal(via_level, via_groupby)

        # Series
        via_level = self.ymd['A'].sum(level=['year', 'month'])
        via_groupby = self.ymd['A'].groupby(level=['year', 'month']).sum()
        assert_series_equal(via_level, via_groupby)

    def test_groupby_multilevel(self):
        """Grouping by level numbers, level values and level names agree."""
        result = self.ymd.groupby(level=[0, 1]).mean()

        # the same grouping spelled out with the level values
        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)

        expected = self.ymd.groupby([k1, k2]).mean()

        assert_frame_equal(result, expected)
        self.assertEqual(result.index.names, self.ymd.index.names[:2])

        # and with the level names
        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        """Placeholder: no checks are implemented yet."""

    def test_multilevel_consolidate(self):
        """Smoke test: consolidate after adding a column to MultiIndex columns."""
        labels = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                         ('bar', 'one'), ('bar', 'two')])
        frame = DataFrame(np.random.randn(4, 4), index=labels, columns=labels)
        frame['Totals', ''] = frame.sum(1)
        # only checks that the call does not raise
        frame.consolidate()

    def test_ix_preserve_names(self):
        """Partial ``.ix`` lookups keep the names of the remaining levels."""
        # one level consumed -> two names remain
        frame_part = self.ymd.ix[2000]
        series_part = self.ymd['A'].ix[2000]
        self.assertEquals(frame_part.index.names, self.ymd.index.names[1:])
        self.assertEquals(series_part.index.names, self.ymd.index.names[1:])

        # two levels consumed -> a single named index remains
        frame_part = self.ymd.ix[2000, 2]
        series_part = self.ymd['A'].ix[2000, 2]
        self.assertEquals(frame_part.index.name, self.ymd.index.names[2])
        self.assertEquals(series_part.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        """Assigning through a partial ``.ix`` key updates the selected rows.

        Each ``.ix`` assignment on ``df`` is mirrored on ``exp`` by writing
        into ``.values`` of the same selection; the two frames are then
        compared.
        """
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        # NOTE(review): relies on the selection's .values aliasing exp's data
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        # same check through a single column
        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        # first level only
        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        """Check per-column dtypes after unstacking a mixed frame (GH #403)."""
        # add an object column and an integer column to the float fixture
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        wide = self.ymd.unstack('month')
        expected_dtypes = [('A', np.float64),
                           ('E', np.object_),
                           ('F', np.float64)]
        for column, dtype in expected_dtypes:
            self.assert_(wide[column, 1].dtype == dtype)

    def test_partial_ix_missing(self):
        """``.ix[2000, 0]`` falls back to (row key, column position).

        The lookup is compared with selecting year 2000 and then column 'A';
        a key pair that cannot be resolved must raise.
        """
        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        # NOTE(review): this passes two positional arguments to __getitem__,
        # so any failure here may just be an arity TypeError; the intended
        # key was presumably ((2000, 6), 0) -- confirm before relying on it.
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_to_html(self):
        """Smoke test: ``to_html`` runs on the fixture and on its transpose."""
        ymd = self.ymd
        ymd.columns.name = 'foo'
        ymd.to_html()
        ymd.T.to_html()
mergeL = pd.merge(employee, sales, on = 'ID', how = 'left')       # left (outer) join: keeps every row of `employee`
mergeR = pd.merge(employee, sales, on = 'ID', how = 'right')      # right (outer) join: keeps every row of `sales`
mergeO = pd.merge(employee, sales, on = 'ID', how = 'outer')      # full outer join: keeps the rows of both frames
mergeM = pd.merge(sales, bonus, on = 'ID')                        # default inner join; NOTE(review): many-to-many only if 'ID' repeats in both frames
stack = pd.concat([employee, sales], ignore_index = True)         # vertical stacking with a fresh 0..n-1 index

#############################################################################################################
# 8. Reshaping & Pivoting
#############################################################################################################
# Small demo frame: one row per city, with some missing values.
df1 = DataFrame([['Big','LAX',3,np.nan],['Big','SFO',6,7],['Med','SEA-TAC',9,np.nan],['Small','POR',np.nan,np.nan]],
                 index=pd.Index(['LA', 'SF', 'SEA', 'POR']),
                 columns=pd.Index(['Type', 'Airport', 'Cool Factor','D']))

# .stack() / .unstack(): move the column labels into a hierarchical row index
df2 = df1.stack(dropna = False)                    # columns become the inner (child) index level; NaN rows are kept
df3 = df1.unstack()                                # columns become the outer (parent) index level

# .pivot(index, columns, values) reshapes long data to wide, like dplyr/tidyr in R.
# Keyword arguments are used because pandas >= 2.0 no longer accepts them positionally.
df4 = df1.pivot(index='Airport', columns='Type', values='Cool Factor')

#############################################################################################################
# 9. Outlier Analysis
#############################################################################################################
np.random.seed(12345)                                # fixed seed so the example is reproducible
df = DataFrame(np.random.randn(1000,4))
df.describe()                                        # assume outliers are in the -+3 region

df[0][np.abs(df[0])>3]                               # entries of column 0 whose absolute value exceeds 3
df[(np.abs(df)>3).any(axis=1)]                       # rows containing at least one value beyond +-3 (axis must be a keyword in pandas >= 2.0)
df[np.abs(df)>3] = np.sign(df) * 3                   # cap outliers at -3 / +3, keeping their sign via np.sign()
Exemple #53
0
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import datetime

# 2x3 frame with named row and column axes
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)

# stack() moves the columns into the inner index level; unstack() reverses
# it for the innermost level (-1, the default), level 0, or a named level.
result = data.stack()
print(result)
for level in (-1, 0, 'state'):
    print(result.unstack(level))

s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])

# the two series only partly share labels, so unstacking introduces NaN
data2 = pd.concat([s1, s2], keys=['one', 'two'])

print(data2)
wide = data2.unstack()
print(wide)
print(wide.stack())
print(wide.stack(dropna=False))

# frame built from two series that share the stacked index
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
print(df)
by_state = df.unstack('state')
print(by_state)
print(by_state.stack('side'))
Exemple #54
0
class TestMultiLevel(unittest.TestCase):

    def setUp(self):
        """Build the fixtures shared by the tests in this class.

        * ``self.frame`` -- 10x3 random frame on a two-level MultiIndex
          named ('first', 'second'), with columns named 'exp'.
        * ``self.single_level`` -- a MultiIndex that has a single level.
        * ``self.series`` -- random Series on a two-level index, one NaN.
        * ``self.tdf`` / ``self.ymd`` -- a time frame and its sums grouped
          by (year, month, day).
        """
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))

        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])

        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        # materialise the pairs: zip() is a one-shot iterator on Python 3
        tuples = list(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        # np.nan is the canonical spelling (np.NaN was removed in NumPy 2)
        s[3] = np.nan
        self.series = s

        # module-level knob read by tm.makeTimeDataFrame()
        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        """Appending the two halves of the frame (or of 'A') restores it."""
        head, tail = self.frame[:5], self.frame[5:]

        whole_frame = head.append(tail)
        tm.assert_frame_equal(whole_frame, self.frame)

        whole_column = head['A'].append(tail['A'])
        tm.assert_series_equal(whole_column, self.frame['A'])

    def test_dataframe_constructor(self):
        """A list of arrays/lists passed as an axis becomes a MultiIndex."""
        # two arrays as the row axis
        by_rows = DataFrame(np.random.randn(4, 4),
                            index=[np.array(['a', 'a', 'b', 'b']),
                                   np.array(['x', 'y', 'x', 'y'])])
        self.assert_(isinstance(by_rows.index, MultiIndex))
        self.assert_(not isinstance(by_rows.columns, MultiIndex))

        # two plain lists as the column axis
        column_keys = [['a', 'a', 'b', 'b'],
                       ['x', 'y', 'x', 'y']]
        by_columns = DataFrame(np.random.randn(4, 4), columns=column_keys)
        self.assert_(isinstance(by_columns.columns, MultiIndex))

    def test_series_constructor(self):
        """A list of arrays/lists passed as ``index`` becomes a MultiIndex."""
        cases = [
            # scalar data, index given as arrays
            (1., [np.array(['a', 'a', 'b', 'b']),
                  np.array(['x', 'y', 'x', 'y'])]),
            # scalar data, index given as lists
            (1., [['a', 'a', 'b', 'b'],
                  ['x', 'y', 'x', 'y']]),
            # sequence data, index given as lists
            (range(4), [['a', 'a', 'b', 'b'],
                        ['x', 'y', 'x', 'y']]),
        ]
        for values, keys in cases:
            multi = Series(values, index=keys)
            self.assert_(isinstance(multi.index, MultiIndex))

    def test_reindex_level(self):
        """Reindexing monthly sums on level 1 broadcasts like ``transform``."""
        ymd = self.ymd

        # axis=0
        month_sums = ymd.sum(level='month')
        broadcast = month_sums.reindex(ymd.index, level=1)
        wanted = ymd.groupby(level='month').transform(np.sum)
        assert_frame_equal(broadcast, wanted)

        # Series
        column_broadcast = month_sums['A'].reindex(ymd.index, level=1)
        column_wanted = ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(column_broadcast, column_wanted)

        # axis=1
        month_sums_t = ymd.T.sum(axis=1, level='month')
        broadcast_t = month_sums_t.reindex(columns=ymd.index, level=1)
        wanted_t = ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(broadcast_t, wanted_t)

    def test_binops_level(self):
        """Flex arithmetic with ``level='month'`` equals broadcast arithmetic."""
        def _check_op(opname):
            month_sums = self.ymd.sum(level='month')

            # DataFrame
            frame_op = getattr(DataFrame, opname)
            via_level = frame_op(self.ymd, month_sums, level='month')
            spread = self.ymd.groupby(level='month').transform(np.sum)
            assert_frame_equal(via_level, frame_op(self.ymd, spread))

            # Series
            series_op = getattr(Series, opname)
            via_level = series_op(self.ymd['A'], month_sums['A'],
                                  level='month')
            spread = self.ymd['A'].groupby(level='month').transform(np.sum)
            assert_series_equal(via_level, series_op(self.ymd['A'], spread))

        for opname in ('sub', 'add', 'mul', 'div'):
            _check_op(opname)

    def test_pickle(self):
        """The fixtures and their transposes survive a cPickle round trip."""
        import cPickle

        for frame in (self.frame, self.frame.T, self.ymd, self.ymd.T):
            restored = cPickle.loads(cPickle.dumps(frame))
            assert_frame_equal(frame, restored)

    def test_reindex(self):
        """A list of tuples given to ``.ix`` selects the same rows as positions."""
        by_label = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        by_position = self.frame.ix[[0, 3]]
        assert_frame_equal(by_label, by_position)

    def test_reindex_preserve_levels(self):
        """Reindexing with a MultiIndex reuses that exact index object."""
        target = self.ymd.index[::10]

        # rows: reindex() and .ix
        self.assert_(self.ymd.reindex(target).index is target)
        self.assert_(self.ymd.ix[target].index is target)

        # columns: the same two paths on the transposed frame
        transposed = self.ymd.T
        self.assert_(transposed.reindex(columns=target).columns is target)
        self.assert_(transposed.ix[:, target].columns is target)

    def test_sort_index_preserve_levels(self):
        """``sort_index`` keeps the index level names."""
        ordered = self.frame.sort_index()
        self.assertEquals(ordered.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        """Smoke test: ``repr`` and ``to_string`` run on frames and transposes."""
        frames = [self.frame, self.ymd, self.frame.T, self.ymd.T]
        for frame in frames:
            repr(frame)

        # all renderings are written to one shared buffer
        sink = StringIO()
        for frame in frames:
            frame.to_string(buf=sink)

    def test_repr_name_coincide(self):
        """repr works when level names coincide with first-level labels."""
        keys = [('a', 0, 'foo'), ('b', 1, 'bar')]
        index = MultiIndex.from_tuples(keys, names=['a', 'b', 'c'])
        frame = DataFrame({'value': [0, 1]}, index=index)

        third_line = repr(frame).split('\n')[2]
        self.assert_(third_line.startswith('a 0 foo'))

    def test_getitem_simple(self):
        """Full tuple keys select one column; unknown keys raise KeyError."""
        transposed = self.frame.T

        column = transposed['foo', 'one']
        assert_almost_equal(column.values, transposed.values[:, 0])

        for missing in [('foo', 'four'), 'foobar']:
            self.assertRaises(KeyError, transposed.__getitem__, missing)

    def test_series_getitem(self):
        """Partial, full and list-of-tuple keys on a three-level Series."""
        column = self.ymd['A']

        # partial key: the two matched levels are dropped from the result
        partial = column[2000, 3]
        column.ix[2000, 3]  # same lookup through .ix; result is not compared
        wanted = column.reindex(column.index[42:65])
        wanted.index = wanted.index.droplevel(0).droplevel(0)
        assert_series_equal(partial, wanted)

        # full key gives a scalar
        scalar = column[2000, 3, 10]
        self.assertEquals(scalar, column[49])

        # fancy
        picked = column.ix[[(2000, 3, 10), (2000, 3, 13)]]
        assert_series_equal(picked, column.reindex(column.index[49:51]))

        # key error
        self.assertRaises(KeyError, column.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        """Corner cases: out-of-bounds position and a generator as the key."""
        column = self.ymd['A']

        # don't segfault, GH #495: out of bounds access must raise
        self.assertRaises(IndexError, column.__getitem__, len(self.ymd))

        # a generator of booleans behaves like the boolean mask
        positive = column[(value > 0 for value in column)]
        assert_series_equal(positive, column[column > 0])

    def test_series_setitem(self):
        """Assigning with a partial key touches only the matching positions."""
        column = self.ymd['A']

        column[2000, 3] = np.nan
        self.assert_(isnull(column.values[42:65]).all())
        # everything outside the block is still populated
        self.assert_(notnull(column.values[:42]).all())
        self.assert_(notnull(column.values[65:]).all())

        # a full key sets a single element
        column[2000, 3, 10] = np.nan
        self.assert_(isnull(column[49]))

    def test_series_slice_partial(self):
        """Placeholder: no checks are implemented yet."""

    def test_frame_getitem_setitem_slice(self):
        """Integer slices through ``.ix`` act positionally on this frame."""
        # getitem
        assert_frame_equal(self.frame.ix[:4], self.frame[:4])

        # setitem, on a copy so the fixture is untouched
        zeroed = self.frame.copy()
        zeroed.ix[:4] = 0

        self.assert_((zeroed.values[:4] == 0).all())
        self.assert_((zeroed.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        """Full-slice ``.ix`` get/set on a frame with a MultiIndex.

        The getitem checks compare ``.ix`` with plain column access.  The
        setitem checks compare the ``.ix`` assignment against an independent
        copy updated through ``__setitem__``.
        """
        levels = [['t1', 't2'], ['a','b','c']]
        labels = [[0,0,0,1,1], [0,1,2,0,1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id'])
        df = DataFrame({'value':[1,2,3,7,8]}, index=midx)

        result = df.ix[:,'value']
        assert_series_equal(df['value'], result)

        result = df.ix[1:3,'value']
        assert_series_equal(df['value'][1:3], result)

        result = df.ix[:,:]
        assert_frame_equal(df, result)

        # setitem: work on a copy; with a plain alias (``result = df``) the
        # assertions below would compare the frame with itself
        result = df.copy()
        df.ix[:, 'value'] = 10
        result['value'] = 10
        assert_frame_equal(df, result)

        df.ix[:,:] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        """``.ix[(row tuple), :]`` equals ``.ix[row]`` and ``xs`` (GH #671)."""
        flat = DataFrame({'a' : range(10),
                          'b' : range(10),
                          'c' : np.random.randn(10),
                          'd' : np.random.randn(10)})
        indexed = flat.set_index(['a', 'b'])

        with_slice = indexed.ix[(0, 0), :]
        via_ix = indexed.ix[0, 0]
        via_xs = indexed.xs((0, 0))

        assert_series_equal(with_slice, via_ix)
        assert_series_equal(with_slice, via_xs)

    def test_getitem_setitem_tuple_plus_columns(self):
        """``.ix[(row tuple), columns]`` equals chained selection (GH #1013)."""
        head = self.ymd[:5]

        combined = head.ix[(2000, 1, 6), ['A', 'B', 'C']]
        chained = head.ix[2000, 1, 6][['A', 'B', 'C']]
        assert_series_equal(combined, chained)

    def test_xs(self):
        """``xs`` with a full key matches ``.ix`` and the underlying row."""
        via_xs = self.frame.xs(('bar', 'two'))
        via_ix = self.frame.ix[('bar', 'two')]

        assert_series_equal(via_xs, via_ix)
        # ('bar', 'two') sits at position 4 of the fixture index
        assert_almost_equal(via_xs.values, self.frame.values[4])

    def test_xs_partial(self):
        """``xs`` with a first-level key matches ``.ix`` and column access on T."""
        via_xs = self.frame.xs('foo')
        via_ix = self.frame.ix['foo']
        via_transpose = self.frame.T['foo'].T
        assert_frame_equal(via_xs, via_transpose)
        assert_frame_equal(via_xs, via_ix)

    def test_xs_level(self):
        """``xs(key, level=...)`` selects on an inner level and drops it."""
        # level given by name
        picked = self.frame.xs('two', level='second')
        second = self.frame.index.get_level_values(1)
        wanted = self.frame[second == 'two']
        wanted.index = wanted.index.droplevel(1)
        assert_frame_equal(picked, wanted)

        # level given by position, on an unnamed three-level index
        keys = [('x', 'y', 'z'), ('a', 'b', 'c'), ('p', 'q', 'r')]
        frame = DataFrame(np.random.randn(3, 5),
                          index=MultiIndex.from_tuples(keys))
        picked = frame.xs('c', level=2)
        wanted = frame[1:2]
        wanted.index = wanted.index.droplevel(2)
        assert_frame_equal(picked, wanted)

    def test_xs_level_multiple(self):
        from pandas import read_table
        from StringIO import StringIO
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        from pandas import read_table
        from StringIO import StringIO
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep='\s+')

        result = df.xs('a', level=0)
        expected = df.xs('a')
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        """Tuple keys on a Series select cross-sections like ``xs`` / ``.ix``."""
        # full slice on the first level, label on the second
        column = self.frame['A']
        picked = column[:, 'two']
        assert_series_equal(picked, self.frame.xs('two', level=1)['A'])

        # partial key on the three-level fixture
        column = self.ymd['A']
        picked = column[2000, 5]
        assert_series_equal(picked, self.ymd.ix[2000, 5]['A'])

        # a bounded slice inside the tuple key is not implemented for now,
        # i.e. column[2000, 3:4] raises instead of selecting months 3 and 4
        self.assertRaises(TypeError, column.__getitem__,
                          (2000, slice(3, 4)))

    def test_get_loc_single_level(self):
        """Smoke test: every key of a one-level MultiIndex can be looked up."""
        index = self.single_level
        series = Series(np.random.randn(len(index)), index=index)
        for key in index.values:
            series[key]

    def test_getitem_toplevel(self):
        """A first-level key selects its block of columns and drops the level."""
        transposed = self.frame.T

        def _block(columns):
            # reference result: reindex to the columns, drop the top level
            block = transposed.reindex(columns=columns)
            block.columns = block.columns.droplevel(0)
            return block

        picked = transposed['foo']
        assert_frame_equal(picked, _block(transposed.columns[:3]))

        picked = transposed['bar']
        via_ix = transposed.ix[:, 'bar']
        assert_frame_equal(picked, _block(transposed.columns[3:5]))
        assert_frame_equal(picked, via_ix)

    def test_getitem_setitem_slice_integers(self):
        """``.ix[1:2]`` slices by first-level label on an integer MultiIndex."""
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2],
                                   [0, 1, 0, 1, 0, 1]])

        def _check(obj, assert_equal):
            # labels 1..2 cover everything from position 2 onwards
            assert_equal(obj.ix[1:2], obj.reindex(obj.index[2:]))
            obj.ix[1:2] = 7
            self.assert_((obj.ix[1:2] == 7).values.all())

        _check(DataFrame(np.random.randn(len(index), 4), index=index,
                         columns=['a', 'b', 'c', 'd']),
               assert_frame_equal)
        _check(Series(np.random.randn(len(index)), index=index),
               assert_series_equal)

    def test_getitem_int(self):
        """Integer ``.ix`` keys: labels on an int index, positions otherwise."""
        index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        frame = DataFrame(np.random.randn(6, 2), index=index)

        # 1 is a first-level label: the last three rows, level dropped
        by_label = frame.ix[1]
        wanted = frame[-3:]
        wanted.index = wanted.index.droplevel(0)
        assert_frame_equal(by_label, wanted)

        # 3 is not a label of the first level
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)

        # on the string-labelled fixture the integer is a position
        by_position = self.frame.ix[2]
        assert_series_equal(by_position, self.frame.xs(self.frame.index[2]))

    def test_getitem_partial(self):
        """A two-level column key selects the block and drops both levels."""
        transposed = self.ymd.T
        picked = transposed[2000, 2]

        columns = transposed.columns
        # code 1 on the second level marks the wanted columns
        wanted = transposed.reindex(columns=columns[columns.labels[1] == 1])
        wanted.columns = wanted.columns.droplevel(0).droplevel(0)
        assert_frame_equal(picked, wanted)

    def test_getitem_slice_not_sorted(self):
        """A NumPy integer slice bound works on columns sorted by level 1."""
        frame = self.frame.sortlevel(1).T

        # buglet with int typechecking: the stop is np.int32, not int
        stop = np.int32(3)
        picked = frame.ix[:, :stop]
        assert_frame_equal(picked, frame.reindex(columns=frame.columns[:3]))

    def test_setitem_change_dtype(self):
        """Replacing a float column with booleans keeps MultiIndex columns."""
        transposed = self.frame.T
        column = transposed['foo', 'two']

        transposed['foo', 'two'] = column > column.median()
        assert_series_equal(transposed['foo', 'two'],
                            column > column.median())
        # internal check on the second block's item labels
        second_block = transposed._data.blocks[1]
        self.assert_(isinstance(second_block.items, MultiIndex))

        only_column = transposed.reindex(columns=[('foo', 'two')])
        assert_series_equal(only_column['foo', 'two'],
                            column > column.median())

    def test_frame_setitem_ix(self):
        """Scalar set/get through ``.ix[(row tuple), column]``."""
        row = ('bar', 'two')
        self.frame.ix[row, 'B'] = 5
        self.assertEquals(self.frame.ix[row, 'B'], 5)

        # with integer labels
        relabelled = self.frame.copy()
        relabelled.columns = range(3)
        relabelled.ix[row, 1] = 7
        self.assertEquals(relabelled.ix[row, 1], 7)

    def test_fancy_slice_partial(self):
        """Label slices on partial keys select whole blocks of rows."""
        # first-level labels, both endpoints included
        assert_frame_equal(self.frame.ix['bar':'baz'], self.frame[3:7])

        # two-level tuple endpoints, checked against the month codes
        picked = self.ymd.ix[(2000,2):(2000,4)]
        month_codes = self.ymd.index.labels[1]
        in_range = (month_codes >= 1) & (month_codes <= 3)
        assert_frame_equal(picked, self.ymd[in_range])

    def test_sortlevel(self):
        """``sortlevel`` needs a MultiIndex and keeps the level names."""
        # frame with a flat integer index
        flat = self.frame.copy()
        flat.index = np.arange(len(flat))
        self.assertRaises(Exception, flat.sortlevel, 0)

        # series
        sorted_column = self.frame['A'].sortlevel(0)
        flat_column = self.frame.reset_index()['A']
        self.assertRaises(Exception, flat_column.sortlevel)

        # preserve names
        self.assertEquals(sorted_column.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        """``reset_index`` gives the former index levels a numeric dtype.

        The index is the product of string, integer and float values; after
        the reset the 'prm1' column must be integer and 'prm2' float.
        """
        # list(...) replaces a comprehension whose loop variable shadowed
        # the builtin ``tuple``
        tuples = list(cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_reset_index_with_drop(self):
        """``reset_index`` with and without ``drop`` on a frame and a series."""
        # frame, drop=True: no columns are added
        dropped = self.ymd.reset_index(drop=True)
        self.assertEquals(len(dropped.columns), len(self.ymd.columns))

        # series, default: becomes a frame with one column per level + data
        as_frame = self.series.reset_index()
        self.assert_(isinstance(as_frame, DataFrame))
        level_count = len(self.series.index.levels)
        self.assert_(len(as_frame.columns) == level_count + 1)

        # series, drop=True: stays a series
        as_series = self.series.reset_index(drop=True)
        self.assert_(isinstance(as_series, Series))

    def test_sortlevel_by_name(self):
        """``sortlevel`` accepts a level name as well as a position."""
        self.frame.index.names = ['first', 'second']
        by_name = self.frame.sortlevel(level='second')
        by_position = self.frame.sortlevel(level=1)
        assert_frame_equal(by_name, by_position)

    def test_sortlevel_mixed(self):
        """Adding a string column does not change how ``sortlevel`` orders."""
        # rows
        baseline = self.frame.sortlevel(1)
        mixed = self.frame.copy()
        mixed['foo'] = 'bar'
        resorted = mixed.sortlevel(1)
        assert_frame_equal(baseline, resorted.drop(['foo'], axis=1))

        # columns, on the transposed frame
        transposed = self.frame.T
        baseline = transposed.sortlevel(1, axis=1)
        transposed['foo', 'three'] = 'bar'
        resorted = transposed.sortlevel(1, axis=1)

        extra = [('foo', 'three')]
        assert_frame_equal(baseline.drop(extra, axis=1),
                           resorted.drop(extra, axis=1))

    def test_count_level(self):
        """``count(level=...)`` matches groupby counts on either axis."""
        def _check_counts(frame, axis=0):
            for level in range(frame._get_axis(axis).nlevels):
                counted = frame.count(axis=axis, level=level)
                grouped = frame.groupby(axis=axis, level=level).count(
                    axis=axis)
                wanted = grouped.reindex_like(counted).astype('i8')
                assert_frame_equal(counted, wanted)

        # write NaNs into both fixtures so counts differ from group sizes
        for fixture in (self.frame, self.ymd):
            fixture.ix[1, [1, 2]] = np.nan
            fixture.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        plain = tm.makeTimeDataFrame()
        self.assertRaises(Exception, plain.count, level=0)

        # numeric_only leaves the string column out
        self.frame['D'] = 'foo'
        numeric = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(numeric.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        """Series ``count(level=...)`` matches groupby counts, zero-filled."""
        # some level values ('bar', 'four') never occur in the labels
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])
        series = Series(np.random.randn(len(index)), index=index)

        for level in (0, 1):
            counted = series.count(level=level)
            grouped = series.groupby(level=level).count()
            assert_series_equal(counted.astype('f8'),
                                grouped.reindex(counted.index).fillna(0))

    def test_count_level_corner(self):
        """Counting by level on empty slices gives zeros for every label."""
        # empty Series
        empty_column = self.frame['A'][:0]
        counted = empty_column.count(level=0)
        assert_series_equal(counted,
                            Series(0, index=empty_column.index.levels[0]))

        # empty DataFrame
        empty_frame = self.frame[:0]
        counted = empty_frame.count(level=0)
        zeros = DataFrame({}, index=empty_column.index.levels[0],
                          columns=empty_frame.columns)
        zeros = zeros.fillna(0).astype(int)
        assert_frame_equal(counted, zeros)

    def test_unstack(self):
        """Smoke test: repeated ``unstack`` and ``unstack`` on integer data."""
        # just check that it works for now
        once = self.ymd.unstack()
        once.unstack()

        # test that ints work
        self.ymd.astype(int).unstack()

    def test_unstack_multiple_no_empty_columns(self):
        """Unstacking two levels at once creates no all-NaN columns."""
        keys = [(0, 'foo', 0), (0, 'bar', 0),
                (1, 'baz', 1), (1, 'qux', 1)]
        series = Series(np.random.randn(4),
                        index=MultiIndex.from_tuples(keys))

        wide = series.unstack([1, 2])
        # dropping all-NaN columns must leave the result unchanged
        assert_frame_equal(wide, wide.dropna(axis=1, how='all'))

    def test_stack(self):
        """Round-trip and level-selection checks for ``stack``/``unstack``.

        Covers plain round trips, unsorted row and column indexes, column
        indexes with more than two levels, echelons with missing levels,
        and a negative level number.
        """
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number: with two column levels, -2 is level 0
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        # this comparison was previously computed but never asserted
        assert_frame_equal(result, expected)

    def test_stack_mixed_dtype(self):
        """Stacking mixed-dtype columns keeps float groups as float.

        A string column is added under 'foo'; after stacking, the 'foo'
        part must equal stacking that group alone and the 'bar' part must
        still be float64.
        """
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)

        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        # np.float64 is the portable name (np.float_ was removed in NumPy 2)
        self.assert_(stacked['bar'].dtype == np.float64)

    def test_unstack_bug(self):
        """unstack/stack round trip on group sizes from a four-key groupby."""
        records = DataFrame({'state': ['naive','naive','naive',
                                       'activ','activ','activ'],
                             'exp':['a','b','b','b','a','a'],
                             'barcode':[1,2,3,4,1,3],
                             'v':['hi','hi','bye','bye','bye','peace'],
                             'extra': np.arange(6.)})

        sizes = records.groupby(['state','exp','barcode','v']).apply(len)

        round_trip = sizes.unstack().stack()
        # the round trip is compared against the sizes cast to float
        wanted = sizes.reindex(round_trip.index).astype(float)
        assert_series_equal(round_trip, wanted)

    def test_stack_unstack_preserve_names(self):
        """Axis names follow their levels through ``unstack`` and ``stack``."""
        wide = self.frame.unstack()
        self.assertEquals(wide.index.name, 'first')
        self.assertEquals(wide.columns.names, ['exp', 'second'])

        long_again = wide.stack()
        self.assertEquals(long_again.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        """``unstack`` accepts a level name as well as a position."""
        by_name = self.frame.unstack('second')
        by_position = self.frame.unstack(level=1)
        assert_frame_equal(by_name, by_position)

    def test_stack_level_name(self):
        """``stack`` accepts a column level name as well as a position."""
        # two-level columns: the name 'exp' versus position 0
        wide = self.frame.unstack('second')
        by_name = wide.stack('exp')
        by_position = self.frame.unstack().stack(0)
        assert_frame_equal(by_name, by_position)

        # single-level columns: the name 'exp' versus the default
        by_name = self.frame.stack('exp')
        by_default = self.frame.stack()
        assert_series_equal(by_name, by_default)

    def test_stack_unstack_multiple(self):
        """Stacking/unstacking several levels at once equals doing it stepwise."""
        wide = self.ymd.unstack(['year', 'month'])
        stepwise = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(wide, stepwise)
        self.assertEquals(wide.columns.names,
                          stepwise.columns.names)

        # series
        column = self.ymd['A']
        column_wide = column.unstack(['year', 'month'])
        assert_frame_equal(column_wide, stepwise['A'])

        # stack back, then restore the original level order and sorting
        long_again = wide.stack(['year', 'month'])
        long_again = long_again.swaplevel(0, 1).swaplevel(1, 2)
        long_again = long_again.sortlevel(0)

        assert_frame_equal(long_again, self.ymd)
        self.assertEquals(long_again.index.names, self.ymd.index.names)

        # GH #451: levels given by position
        wide = self.ymd.unstack([1, 2])
        stepwise = self.ymd.unstack(1).unstack(1)
        stepwise = stepwise.dropna(axis=1, how='all')
        assert_frame_equal(wide, stepwise)

        # reversed level order; columns are aligned before comparing
        wide = self.ymd.unstack([2, 1])
        stepwise = self.ymd.unstack(2).unstack(1)
        stepwise = stepwise.dropna(axis=1, how='all')
        assert_frame_equal(wide, stepwise.ix[:, wide.columns])

    def test_groupby_transform(self):
        """``apply`` and ``transform`` agree for an element-wise function."""
        column = self.frame['A']
        # group on the values of the first index level
        groups = column.groupby(column.index.get_level_values(0))

        via_apply = groups.apply(lambda x: x * 2)
        via_transform = groups.transform(lambda x: x * 2)
        assert_series_equal(via_apply.reindex(via_transform.index),
                            via_transform)

    def test_groupby_corner(self):
        # Grouping a one-row frame by the last of three named levels should
        # simply not raise.
        one_row_index = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
                                   labels=[[0], [0], [0]],
                                   names=['one', 'two', 'three'])
        values = [np.random.rand(4)]
        df = DataFrame(values, columns=['a', 'b', 'c', 'd'],
                       index=one_row_index)
        # should work
        df.groupby(level='three')

    def test_join(self):
        # Outer-join two overlapping row/column slices of the frame and check
        # that every non-missing cell equals the original value.
        left = self.frame.ix[:5, ['A']]
        right = self.frame.ix[2:, ['B', 'C']]

        joined = left.join(right, how='outer').reindex(self.frame.index)
        missing = np.isnan(joined.values)

        expected = self.frame.copy()
        expected.values[missing] = np.nan

        # the join must not have produced an all-NaN result
        self.assert_(not missing.all())

        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        # Swapping levels by position and by name must agree, and swapping
        # twice must restore the original index.
        by_position = self.frame['A'].swaplevel(0, 1)
        by_name = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not by_position.index.equals(self.frame.index))
        assert_series_equal(by_position, by_name)

        restored = by_position.swaplevel(0, 1)
        restored_by_name = by_position.swaplevel('second', 'first')
        self.assert_(restored.index.equals(self.frame.index))
        assert_series_equal(restored, restored_by_name)

        # same operation along the columns of the transposed frame
        transposed = self.frame.T
        assert_frame_equal(transposed.swaplevel('first', 'second', axis=1),
                           self.frame.swaplevel('first', 'second').T)

    def test_swaplevel_panel(self):
        # Panel.swaplevel on the major axis must equal swapping the levels of
        # the major_axis index directly.
        items = {'ItemA' : self.frame,
                 'ItemB' : self.frame * 2}
        panel = Panel(items)

        swapped = panel.swaplevel(0, 1, axis='major')

        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(swapped, expected)

    def test_reorder_levels(self):
        # reorder_levels must be equivalent to the corresponding chain of
        # swaplevel calls, for a frame, a series and along the columns.
        reordered = self.ymd.reorder_levels(['month', 'day', 'year'])
        assert_frame_equal(reordered,
                           self.ymd.swaplevel(0, 1).swaplevel(1, 2))

        reordered = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        assert_series_equal(reordered,
                            self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2))

        reordered = self.ymd.T.reorder_levels(['month', 'day', 'year'],
                                              axis=1)
        swapped = self.ymd.T.swaplevel(0, 1, axis=1)
        swapped = swapped.swaplevel(1, 2, axis=1)
        assert_frame_equal(reordered, swapped)

        # [1, 2, 3] is rejected as a level order
        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        # Assigning to a new tuple key must keep the columns a MultiIndex and
        # store the assigned values.
        wide = self.ymd[:5].T
        source_key = (2000, 1, 7)
        wide[2000, 1, 10] = wide[source_key]
        self.assert_(isinstance(wide.columns, MultiIndex))
        self.assert_((wide[2000, 1, 10] == wide[source_key]).all())

    def test_alignment(self):
        # Arithmetic between series with partially overlapping MultiIndexes
        # must align on the union of both indexes.
        left_index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B",3)])
        x = Series(data=[1,2,3], index=left_index)

        right_index = MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B",3)])
        y = Series(data=[4,5,6], index=right_index)

        def aligned_difference():
            # reference result: reindex both operands on the union first
            union = x.index.union(y.index)
            return x.reindex(union) - y.reindex(union)

        assert_series_equal(x - y, aligned_difference())

        # hit non-monotonic code path
        assert_series_equal(x[::-1] - y[::-1], aligned_difference())

    def test_is_lexsorted(self):
        # is_lexsorted() / lexsort_depth for one sorted and two unsorted
        # label layouts over the same levels.
        levels = [[0, 1], [0, 1, 2]]

        def build(outer, inner):
            return MultiIndex(levels=levels, labels=[outer, inner])

        fully_sorted = build([0, 0, 0, 1, 1, 1],
                             [0, 1, 2, 0, 1, 2])
        self.assert_(fully_sorted.is_lexsorted())

        inner_unsorted = build([0, 0, 0, 1, 1, 1],
                               [0, 1, 2, 0, 2, 1])
        self.assert_(not inner_unsorted.is_lexsorted())

        outer_unsorted = build([0, 0, 1, 0, 1, 1],
                               [0, 1, 0, 2, 2, 1])
        self.assert_(not outer_unsorted.is_lexsorted())
        self.assert_(outer_unsorted.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        # Writing through .values of a top-level column selection must be
        # visible when the same selection is read back from the frame.
        df = self.frame.T
        df['foo'].values[:] = 0
        self.assert_((df['foo'].values == 0).all())

        # but not if it's mixed-type
        # After a string column is added, the chained assignment
        # df['foo']['one'] = 2 is expected to leave the frame untouched, so
        # ('foo', 'one') must still hold the zeros written above.
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(0, axis=1)
        df['foo']['one'] = 2
        self.assert_((df['foo', 'one'] == 0).all())

    def test_frame_getitem_not_sorted(self):
        # Partial selection of 'foo' on the first level must work after a
        # column has been appended to the MultiIndex, along both axes.
        wide = self.frame.T
        wide['foo', 'four'] = 'foo'

        # one array of labels per level, in column order
        level_arrays = [np.array(vals)
                        for vals in zip(*wide.columns.get_tuple_index())]

        by_getitem = wide['foo']
        by_ix = wide.ix[:, 'foo']
        expected = wide.reindex(columns=wide.columns[level_arrays[0] == 'foo'])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(by_getitem, expected)
        assert_frame_equal(by_ix, expected)

        # same selection along the rows
        tall = wide.T
        by_xs = tall.xs('foo')
        by_ix = tall.ix['foo']
        expected = tall.reindex(tall.index[level_arrays[0] == 'foo'])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(by_xs, expected)
        assert_frame_equal(by_ix, expected)

    def test_series_getitem_not_sorted(self):
        # Partial selection of 'qux' on a series whose first level is not in
        # sorted order ('foo' comes last).
        first = ['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo']
        second = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
        index = MultiIndex.from_tuples(zip(first, second))
        s = Series(randn(8), index=index)

        # one array of labels per level, in index order
        level_arrays = [np.array(vals)
                        for vals in zip(*index.get_tuple_index())]

        by_getitem = s['qux']
        by_ix = s.ix['qux']
        expected = s[level_arrays[0] == 'qux']
        expected.index = expected.index.droplevel(0)
        assert_series_equal(by_getitem, expected)
        assert_series_equal(by_ix, expected)

    def test_count(self):
        # count(level=<name>) must match count(level=<position>) for frames
        # and series; an unknown level name must raise.
        name_to_position = (('b', 1), ('a', 0))

        frame = self.frame.copy()
        frame.index.names = ['a', 'b']
        for name, position in name_to_position:
            assert_frame_equal(frame.count(level=name),
                               self.frame.count(level=position))

        series = self.series.copy()
        series.index.names = ['a', 'b']
        for name, position in name_to_position:
            assert_series_equal(series.count(level=name),
                                self.series.count(level=position))

        self.assertRaises(Exception, series.count, 'x')
        self.assertRaises(Exception, frame.count, level='x')

    # Names of the reduction methods that the level-wise aggregation tests
    # below look up with getattr() and compare against groupby(...).agg(...).
    AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                     'mad', 'std', 'var']

    def test_series_group_min_max(self):
        # For every reduction, level and skipna setting, Series.<op>(level=)
        # must match the same reduction applied through groupby(level=).agg.
        combos = cart_product(self.AGG_FUNCTIONS, range(2), [False, True])
        for op, level, skipna in combos:
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            via_groupby = self.series.groupby(level=level).agg(aggf)
            direct = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(via_groupby, direct)

    def test_frame_group_ops(self):
        # For every reduction, level, axis and skipna setting,
        # DataFrame.<op>(level=, axis=) must match groupby(level=, axis=).agg.
        # A few cells are set to NaN first so that skipna matters.
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        combos = cart_product(self.AGG_FUNCTIONS, range(2), range(2),
                              [False, True])
        for op, level, axis, skipna in combos:
            frame = self.frame if axis == 0 else self.frame.T

            pieces = []

            def aggf(x):
                # keep a reference to every group handed to the aggregator
                pieces.append(x)
                return getattr(x, op)(skipna=skipna, axis=axis)

            via_groupby = frame.groupby(level=level, axis=axis).agg(aggf)
            direct = getattr(frame, op)(level=level, axis=axis,
                                        skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(via_groupby._get_axis(axis).equals(level_index))
            self.assert_(direct._get_axis(axis).equals(level_index))

            assert_frame_equal(via_groupby, direct)

    def test_std_var_pass_ddof(self):
        # var/std with level= must forward ddof, i.e. match a groupby
        # aggregation that passes the same ddof explicitly.
        outer = np.arange(5).repeat(10)
        inner = np.tile(np.arange(10), 5)
        index = MultiIndex.from_arrays([outer, inner])
        df = DataFrame(np.random.randn(len(index), 5), index=index)

        ddof = 4
        for meth in ['var', 'std']:
            alt = lambda x: getattr(x, meth)(ddof=ddof)

            # series
            by_level = getattr(df[0], meth)(level=0, ddof=ddof)
            assert_series_equal(by_level, df[0].groupby(level=0).agg(alt))

            # frame
            by_level = getattr(df, meth)(level=0, ddof=ddof)
            assert_frame_equal(by_level, df.groupby(level=0).agg(alt))


    def test_frame_series_agg_multiple_levels(self):
        # sum(level=[...]) with several level names must match the
        # corresponding groupby(level=[...]).sum(), for frame and series.
        frame_sum = self.ymd.sum(level=['year', 'month'])
        assert_frame_equal(
            frame_sum, self.ymd.groupby(level=['year', 'month']).sum())

        series_sum = self.ymd['A'].sum(level=['year', 'month'])
        assert_series_equal(
            series_sum, self.ymd['A'].groupby(level=['year', 'month']).sum())

    def test_groupby_multilevel(self):
        # Grouping by level positions must match grouping by the level
        # values themselves and by the level names, and keep the names.
        by_position = self.ymd.groupby(level=[0, 1]).mean()

        keys = [self.ymd.index.get_level_values(0),
                self.ymd.index.get_level_values(1)]
        by_values = self.ymd.groupby(keys).mean()

        assert_frame_equal(by_position, by_values)
        self.assertEquals(by_position.index.names, self.ymd.index.names[:2])

        by_name = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(by_position, by_name)

    def test_groupby_multilevel_with_transform(self):
        # Placeholder: no assertions yet, so this test always passes.
        pass

    def test_multilevel_consolidate(self):
        # Smoke test: adding a column to a frame with MultiIndex rows and
        # columns and then consolidating must not raise.
        tuples = [('foo', 'one'), ('foo', 'two'),
                  ('bar', 'one'), ('bar', 'two')]
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        row_totals = df.sum(1)
        df['Totals', ''] = row_totals
        df = df.consolidate()

    def test_ix_preserve_names(self):
        # Partial .ix selection must keep the names of the remaining levels,
        # for the frame and for one of its columns.
        all_names = self.ymd.index.names

        # one level consumed: two names left
        frame_part = self.ymd.ix[2000]
        series_part = self.ymd['A'].ix[2000]
        self.assertEquals(frame_part.index.names, all_names[1:])
        self.assertEquals(series_part.index.names, all_names[1:])

        # two levels consumed: a single named index left
        frame_part = self.ymd.ix[2000, 2]
        series_part = self.ymd['A'].ix[2000, 2]
        self.assertEquals(frame_part.index.name, all_names[2])
        self.assertEquals(series_part.index.name, all_names[2])

    def test_partial_set(self):
        # GH #397
        # Assigning through a partial .ix key must set every matching row.
        # The expected frame is built by writing into .values of the same
        # partial selection on a second copy.
        df = self.ymd.copy()
        exp = self.ymd.copy()
        # partial key on the first two levels, whole frame
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        # same partial key, through a single column
        df['A'].ix[2000, 4] = 1
        exp['A'].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        # partial key on the first level only
        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df['A'].ix[14] = 5
        self.assertEquals(df['A'][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        # Unstacking a mixed-type frame: the float column stays float64, the
        # string column stays object and the integer column comes out float64.
        self.ymd['E'] = 'foo'
        self.ymd['F'] = 2

        by_month = self.ymd.unstack('month')
        expected_dtypes = [('A', np.float64),
                           ('E', np.object_),
                           ('F', np.float64)]
        for column, dtype in expected_dtypes:
            self.assert_(by_month[column, 1].dtype == dtype)

    def test_getitem_lowerdim_corner(self):
        # Reading or writing the cell addressed by the row key
        # ('bar', 'three') and column 'B' must raise KeyError.
        cell = (('bar', 'three'), 'B')

        self.assertRaises(KeyError, self.frame.ix.__getitem__, cell)

        self.assertRaises(KeyError, self.frame.ix.__setitem__, cell, 0)

    #----------------------------------------------------------------------
    # AMBIGUOUS CASES!

    def test_partial_ix_missing(self):
        # Unconditionally skipped: everything after the raise below is
        # unreachable and only records the intended checks.
        raise nose.SkipTest

        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]['A']
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        # NOTE(review): __getitem__ is called here with an extra positional
        # argument (0); this looks like it was meant to be __setitem__ --
        # confirm before re-enabling the test.
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    #----------------------------------------------------------------------

    def test_to_html(self):
        # Smoke test: HTML rendering must not raise with a MultiIndex on the
        # rows, nor on the columns (transposed frame).
        ymd = self.ymd
        ymd.columns.name = 'foo'
        ymd.to_html()
        transposed = ymd.T
        transposed.to_html()

    def test_level_with_tuples(self):
        # Tuples used as the labels of the first level: selecting by such a
        # tuple must return the first two rows with that level dropped, for
        # series and frames. Run once with 3-tuples and once with 2-tuples.
        first_levels = [
            [('foo', 'bar', 0), ('foo', 'baz', 0), ('foo', 'qux', 0)],
            [('foo', 'bar'), ('foo', 'baz'), ('foo', 'qux')],
        ]

        for case, first_level in enumerate(first_levels):
            key = first_level[0]
            index = MultiIndex(levels=[first_level, [0, 1]],
                               labels=[[0, 0, 1, 1, 2, 2],
                                       [0, 1, 0, 1, 0, 1]])

            series = Series(np.random.randn(6), index=index)
            frame = DataFrame(np.random.randn(6, 4), index=index)

            by_getitem = series[key]
            by_ix = series.ix[key]
            expected = series[:2]
            expected.index = expected.index.droplevel(0)
            assert_series_equal(by_getitem, expected)
            assert_series_equal(by_ix, expected)

            if case == 0:
                # only checked for the 3-tuple labels: 2 is not a label of
                # the second level
                self.assertRaises(KeyError, series.__getitem__, (key, 2))

            by_ix = frame.ix[key]
            by_xs = frame.xs(key)
            expected = frame[:2]
            expected.index = expected.index.droplevel(0)
            assert_frame_equal(by_ix, expected)
            assert_frame_equal(by_xs, expected)

    def test_int_series_slicing(self):
        # Integer slices on a MultiIndex-ed series/frame are positional, for
        # both reading and writing.
        col = self.ymd['A']
        tail_index = col.index[5:]
        assert_series_equal(col[5:], col.reindex(tail_index))

        # slice assignment must touch the same positions as .values[5:]
        reference = self.ymd['A'].copy()
        col[5:] = 0
        reference.values[5:] = 0
        self.assert_(np.array_equal(col.values, reference.values))

        frame_tail = self.ymd[5:]
        assert_frame_equal(frame_tail, self.ymd.reindex(col.index[5:]))

    def test_mixed_depth_get(self):
        # Columns of mixed depth are padded with '' on the lower levels.
        # Selecting with the short key must return the same column as the
        # fully padded tuple, and the result must be named by the short key.
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        # sorted() instead of zip(...).sort(): same sorted list, but it does
        # not rely on zip() returning a list.
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df['a']
        expected = df['a','','']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, 'a')

        result = df['routine1','result1']
        expected = df['routine1','result1','']
        assert_series_equal(result, expected)
        self.assertEquals(result.name, ('routine1', 'result1'))

    def test_mixed_depth_insert(self):
        # Inserting a column with a short key must be equivalent to inserting
        # it with the key padded with '' on the lower levels.
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        # sorted() instead of zip(...).sort(): same sorted list, but it does
        # not rely on zip() returning a list.
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        result = df.copy()
        expected = df.copy()
        result['b'] = [1,2,3,4]
        expected['b','',''] = [1,2,3,4]
        assert_frame_equal(result, expected)

    def test_mixed_depth_drop(self):
        # Dropping columns of a mixed-depth MultiIndex by short key, by full
        # tuple and by label on a given level.
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        # sorted() instead of zip(...).sort(): same sorted list, but it does
        # not rely on zip() returning a list.
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        # short key == padded tuple
        result = df.drop('a',axis=1)
        expected = df.drop([('a','','')],axis=1)
        assert_frame_equal(expected, result)

        # a first-level label drops every column underneath it
        result = df.drop(['top'],axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        expected = expected.drop([('top','OD','wy')], axis=1)
        assert_frame_equal(expected, result)

        # a bare tuple behaves like a one-element list of tuples
        result = df.drop(('top', 'OD', 'wx'), axis=1)
        expected = df.drop([('top','OD','wx')], axis=1)
        assert_frame_equal(expected, result)

        # (two `expected = df.drop(...)` assignments that were overwritten
        # before ever being compared have been removed here)

        # drop by label on the second level
        result = df.drop('result1', level=1, axis=1)
        expected = df.drop([('routine1', 'result1', ''),
                            ('routine2', 'result1', '')], axis=1)
        assert_frame_equal(expected, result)

    def test_mixed_depth_pop(self):
        # pop() with a short key must behave like pop() with the padded
        # tuple, and popping a first-level label must remove the whole
        # sub-frame underneath it.
        arrays = [[  'a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
                  [   '',  'OD',  'OD', 'result1',   'result2',  'result1'],
                  [   '',  'wx',  'wy',        '',          '',         '']]

        # sorted() instead of zip(...).sort(): same sorted list, but it does
        # not rely on zip() returning a list.
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4,6),columns = index)

        df1 = df.copy()
        df2 = df.copy()
        result = df1.pop('a')
        expected = df2.pop(('a','',''))
        assert_series_equal(expected, result)
        assert_frame_equal(df1, df2)
        self.assertEquals(result.name,'a')

        # df1 loses 'top' through drop(), df2 through pop(); both the popped
        # sub-frame and the remaining frames must agree
        expected = df1['top']
        df1 = df1.drop(['top'],axis=1)
        result = df2.pop('top')
        assert_frame_equal(expected, result)
        assert_frame_equal(df1, df2)

    def test_reindex_level_partial_selection(self):
        # Selecting the first-level labels 'foo' and 'qux' through reindex,
        # reindex_axis and .ix must all pick rows 0-2 and 7-9.
        expected = self.frame.ix[[0, 1, 2, 7, 8, 9]]

        by_reindex = self.frame.reindex(['foo', 'qux'], level=0)
        assert_frame_equal(by_reindex, expected)

        by_reindex_axis = self.frame.T.reindex_axis(['foo', 'qux'], axis=1,
                                                    level=0)
        assert_frame_equal(by_reindex_axis, expected.T)

        assert_frame_equal(self.frame.ix[['foo', 'qux']], expected)

        assert_series_equal(self.frame['A'].ix[['foo', 'qux']],
                            expected['A'])

        assert_frame_equal(self.frame.T.ix[:, ['foo', 'qux']], expected.T)

    def test_setitem_multiple_partial(self):
        # Assigning through .ix with a list or a slice of first-level labels
        # must equal assigning to 'foo' and 'bar' one at a time.
        def check(source, key, assert_equal):
            expected = source.copy()
            result = source.copy()
            result.ix[key] = 0
            expected.ix['foo'] = 0
            expected.ix['bar'] = 0
            assert_equal(result, expected)

        # frame: list of labels, then label slice
        check(self.frame, ['foo', 'bar'], assert_frame_equal)
        check(self.frame, slice('foo', 'bar'), assert_frame_equal)

        # series: list of labels, then label slice
        check(self.frame['A'], ['foo', 'bar'], assert_series_equal)
        check(self.frame['A'], slice('foo', 'bar'), assert_series_equal)

    def test_drop_level(self):
        # drop(labels, level=<name>) must leave exactly the listed row
        # positions, along the rows and along the columns of the transpose.
        cases = (
            (('bar', 'qux'), 'first', (0, 1, 2, 5, 6)),
            (('two',), 'second', (0, 2, 3, 6, 7, 9)),
        )

        # along the rows
        for labels, level, kept in cases:
            dropped = self.frame.drop(list(labels), level=level)
            assert_frame_equal(dropped, self.frame.ix[list(kept)])

        # along the columns of the transposed frame
        for labels, level, kept in cases:
            dropped = self.frame.T.drop(list(labels), axis=1, level=level)
            assert_frame_equal(dropped, self.frame.ix[list(kept)].T)

    def test_unicode_repr_issues(self):
        # Smoke test: repr() of the levels must not raise when a level holds
        # non-ASCII unicode labels.
        unicode_level = Index([u'a/\u03c3', u'b/\u03c3',u'c/\u03c3'])
        int_level = Index([0, 1])
        outer_labels = np.arange(3).repeat(2)
        inner_labels = np.tile(np.arange(2), 3)
        index = MultiIndex(levels=[unicode_level, int_level],
                           labels=[outer_labels, inner_labels])

        repr(index.levels)
def create_fip(temporary_store = None, year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'

    fip stands for "fichier d'imposition des personnes". Some individuals are
    declared as 'personne à charge' (pac) on 'tax forms' but are not present
    in the erf or eec tables. They are added to ensure consistency between
    concepts.

    Parameters
    ----------
    temporary_store : mapping-like store of data frames, required
        'indivim_{year}' is read from it; 'pacIndiv_{year}' and
        'fipDat_{year}' are written to it.
    year : int, required
        Survey year, used to select the 'erfs_{year}' survey and to build the
        store keys.

    Notes
    -----
    A 'noidec' column is added to the frame read from 'indivim_{year}'
    (whether this is visible to other users of the store depends on whether
    the store hands out copies -- not visible from here).
    """
    assert temporary_store is not None
    assert year is not None

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1990G1990 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac takes 5 characters (1 letter + 4-digit year)
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # The random values are only placeholders: every column is overwritten below
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    # One row per (foyer, pac_number)
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter to work on F & G only

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: keep those with distinct declar/naia pairs and the twins,
    #       then remove the others (both F and G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that are neither F/G nor H/I were never flagged: keep them
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Matching key: (year of birth, first 29 characters of the declaration)
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # The fip individuals are the declared pac matching neither declaration key
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
            # Both merges are empty: bind pacInd to an empty frame that still
            # has the expected columns, so the code below does not fail on an
            # undefined name.
            pacInd = pac_ind1
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    # Keep a single row per noindiv
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    # NOTE(review): DataFrame.info() writes its report to stdout and returns
    # None, so this logs the string "None" -- confirm whether the report was
    # meant to go to the log instead.
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    # Attach to each fip the household information of the declarant ("vous")
    # of the same declaration, looked up through declar1 then declar2
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    # Report the duplicate counts instead of computing and discarding them
    log.info("{} duplicated fip1".format(fip1.duplicated().sum()))
    log.info("{} duplicated fip2".format(fip2.duplicated().sum()))

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG; no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec problem for fip children between 16 and 20: we do not know whether they are students or employees
    # TODO: problem with the birth months of FIP children: check whether these values can be recovered.
    #       Alexis: clearly not

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    # Decrement noi of the duplicated (noi, ident) rows until every pair is unique
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
Exemple #56
0
class TestMultiLevel(unittest.TestCase):
    def setUp(self):
        """Build the fixtures shared by the tests in this class.

        Creates ``self.frame`` (a 10x3 random frame on a two-level index),
        ``self.single_level`` (a one-level MultiIndex), ``self.series`` (eight
        random values on a two-level index, with a NaN at position 3),
        ``self.tdf`` (the frame returned by ``tm.makeTimeDataFrame``) and
        ``self.ymd`` (``self.tdf`` summed by year, month and day).
        """
        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp"))

        self.single_level = MultiIndex(levels=[["foo", "bar", "baz", "qux"]], labels=[[0, 1, 2, 3]], names=["first"])

        # create test series object
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s

        # NOTE(review): this mutates module-level state on ``tm``; presumably it
        # sets the row count used by makeTimeDataFrame -- confirm.
        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        # Group the time-indexed frame by calendar year, month and day.
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype("i8") for lev in self.ymd.index.levels]
        self.ymd.index.names = ["year", "month", "day"]

    def test_append(self):
        """Appending the two halves of the frame (or of one column) restores the original."""
        head = self.frame[:5]
        tail = self.frame[5:]

        rejoined = head.append(tail)
        tm.assert_frame_equal(rejoined, self.frame)

        rejoined_col = head["A"].append(tail["A"])
        tm.assert_series_equal(rejoined_col, self.frame["A"])

    def test_reindex_level(self):
        """Reindexing month sums on level 1 must equal a groupby-transform sum."""
        # rows of a frame
        by_month = self.ymd.sum(level="month")
        spread = by_month.reindex(self.ymd.index, level=1)
        wanted = self.ymd.groupby(level="month").transform(np.sum)
        assert_frame_equal(spread, wanted)

        # a single column
        spread = by_month["A"].reindex(self.ymd.index, level=1)
        wanted = self.ymd["A"].groupby(level="month").transform(np.sum)
        assert_series_equal(spread, wanted)

        # columns of the transposed frame
        by_month = self.ymd.T.sum(axis=1, level="month")
        spread = by_month.reindex(columns=self.ymd.index, level=1)
        wanted = self.ymd.groupby(level="month").transform(np.sum).T
        assert_frame_equal(spread, wanted)

    def test_binops_level(self):
        """Arithmetic methods called with ``level=`` must match the same op on pre-broadcast operands."""

        def _check_op(opname):
            by_month = self.ymd.sum(level="month")

            # DataFrame
            frame_op = getattr(DataFrame, opname)
            got = frame_op(self.ymd, by_month, level="month")
            spread = self.ymd.groupby(level="month").transform(np.sum)
            wanted = frame_op(self.ymd, spread)
            assert_frame_equal(got, wanted)

            # Series
            series_op = getattr(Series, opname)
            got = series_op(self.ymd["A"], by_month["A"], level="month")
            spread = self.ymd["A"].groupby(level="month").transform(np.sum)
            wanted = series_op(self.ymd["A"], spread)
            assert_series_equal(got, wanted)

        for opname in ("sub", "add", "mul", "div"):
            _check_op(opname)

    def test_pickle(self):
        """The fixture frames and their transposes must survive a pickle round trip."""
        import cPickle

        def _test_roundtrip(frame):
            restored = cPickle.loads(cPickle.dumps(frame))
            assert_frame_equal(frame, restored)

        for candidate in (self.frame, self.frame.T, self.ymd, self.ymd.T):
            _test_roundtrip(candidate)

    def test_reindex(self):
        """Selecting rows by a list of tuples must match selecting them by position."""
        by_label = self.frame.ix[[("foo", "one"), ("bar", "one")]]
        by_position = self.frame.ix[[0, 3]]
        assert_frame_equal(by_label, by_position)

    def test_reindex_preserve_levels(self):
        """Reindexing or ``.ix``-selecting with an index object must keep that very object."""
        every_tenth = self.ymd.index[::10]

        # along the rows
        subset = self.ymd.reindex(every_tenth)
        self.assert_(subset.index is every_tenth)

        subset = self.ymd.ix[every_tenth]
        self.assert_(subset.index is every_tenth)

        # along the columns of the transposed frame
        transposed = self.ymd.T

        subset = transposed.reindex(columns=every_tenth)
        self.assert_(subset.columns is every_tenth)

        subset = transposed.ix[:, every_tenth]
        self.assert_(subset.columns is every_tenth)

    def test_sort_index_preserve_levels(self):
        """``sort_index`` must keep the index level names."""
        ordered = self.frame.sort_index()
        self.assertEquals(ordered.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        """Smoke test: ``repr`` and ``to_string`` must not raise on the fixture frames."""
        subjects = [self.frame, self.ymd, self.frame.T, self.ymd.T]

        for obj in subjects:
            repr(obj)

        buf = StringIO()
        for obj in subjects:
            obj.to_string(buf=buf)

    def test_getitem_simple(self):
        """A full tuple key selects one column; unknown keys raise ``KeyError``."""
        transposed = self.frame.T

        column = transposed["foo", "one"]
        assert_almost_equal(column.values, transposed.values[:, 0])

        for missing in (("foo", "four"), "foobar"):
            self.assertRaises(KeyError, transposed.__getitem__, missing)

    def test_series_getitem(self):
        """Check partial-key, full-key and list-of-tuples lookups on column "A" of ``self.ymd``."""
        s = self.ymd["A"]

        # Partial key (first two levels): expected to match positions 42..64
        # with the two leading index levels dropped.
        result = s[2000, 3]
        # NOTE(review): result2 is computed but never compared with anything.
        result2 = s.ix[2000, 3]
        expected = s.reindex(s.index[42:65])
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)

        # Full key: a single value, compared with the value at position 49.
        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)

        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s.reindex(s.index[49:51])
        assert_series_equal(result, expected)

        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        """Out-of-bounds integer access raises; a generator of booleans acts as a mask."""
        column = self.ymd["A"]

        # don't segfault, GH #495
        # one past the last valid position
        past_the_end = len(self.ymd)
        self.assertRaises(IndexError, column.__getitem__, past_the_end)

        # generator
        picked = column[(value > 0 for value in column)]
        wanted = column[column > 0]
        assert_series_equal(picked, wanted)

    def test_series_setitem(self):
        """Assigning through a partial or full tuple key must touch only the matching rows."""
        column = self.ymd["A"]

        # partial key: only positions 42..64 become NaN
        column[2000, 3] = np.nan
        self.assert_(isnull(column.values[42:65]).all())
        self.assert_(notnull(column.values[:42]).all())
        self.assert_(notnull(column.values[65:]).all())

        # full key: a single position
        column[2000, 3, 10] = np.nan
        self.assert_(isnull(column[49]))

    def test_series_slice_partial(self):
        """Placeholder: no checks are implemented here."""
        pass

    def test_frame_getitem_setitem_slice(self):
        """An integer slice through ``.ix`` must read and write the first four rows."""
        # reading
        via_ix = self.frame.ix[:4]
        via_getitem = self.frame[:4]
        assert_frame_equal(via_ix, via_getitem)

        # writing
        scratch = self.frame.copy()
        scratch.ix[:4] = 0

        self.assert_((scratch.values[:4] == 0).all())
        self.assert_((scratch.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        """Check ``.ix`` reads and writes with slices on a frame with a two-level index."""
        levels = [["t1", "t2"], ["a", "b", "c"]]
        labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, "id"])
        df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx)

        # getitem
        result = df.ix[:, "value"]
        assert_series_equal(df["value"], result)

        result = df.ix[1:3, "value"]
        assert_series_equal(df["value"][1:3], result)

        result = df.ix[:, :]
        assert_frame_equal(df, result)

        # setitem: compare against an independent copy. Binding ``result`` to
        # ``df`` itself would make the assertions below compare the frame with
        # itself, so they could never fail.
        result = df.copy()
        df.ix[:, "value"] = 10
        result["value"] = 10
        assert_frame_equal(df, result)

        df.ix[:, :] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        """``.ix[(0, 0), :]`` must match ``.ix[0, 0]`` and ``.xs((0, 0))`` (GH #671)."""
        flat = DataFrame({"a": range(10), "b": range(10), "c": np.random.randn(10), "d": np.random.randn(10)})
        indexed = flat.set_index(["a", "b"])

        via_tuple_and_slice = indexed.ix[(0, 0), :]
        via_ix = indexed.ix[0, 0]
        via_xs = indexed.xs((0, 0))

        assert_series_equal(via_tuple_and_slice, via_ix)
        assert_series_equal(via_tuple_and_slice, via_xs)

    def test_xs(self):
        """``xs`` with a full key must match ``.ix`` and the row at position 4."""
        via_xs = self.frame.xs(("bar", "two"))
        via_ix = self.frame.ix[("bar", "two")]

        assert_series_equal(via_xs, via_ix)
        assert_almost_equal(via_xs.values, self.frame.values[4])

    def test_xs_partial(self):
        """``xs`` with only the first-level key must match ``.ix`` and column selection on the transpose."""
        via_xs = self.frame.xs("foo")
        via_ix = self.frame.ix["foo"]
        via_transpose = self.frame.T["foo"].T

        assert_frame_equal(via_xs, via_transpose)
        assert_frame_equal(via_xs, via_ix)

    def test_xs_level(self):
        """``xs`` with ``level=`` selects on an inner level and drops that level."""
        # by level name
        picked = self.frame.xs("two", level="second")
        wanted = self.frame[self.frame.index.get_level_values(1) == "two"]
        wanted.index = wanted.index.droplevel(1)

        assert_frame_equal(picked, wanted)

        # by level number, on a three-level index
        three_level = MultiIndex.from_tuples([("x", "y", "z"), ("a", "b", "c"), ("p", "q", "r")])
        df = DataFrame(np.random.randn(3, 5), index=three_level)
        picked = df.xs("c", level=2)
        wanted = df[1:2]
        wanted.index = wanted.index.droplevel(2)
        assert_frame_equal(picked, wanted)

    def test_xs_level_multiple(self):
        from pandas import read_table
        from StringIO import StringIO

        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep="\s+")

        result = df.xs(("a", 4), level=["one", "four"])
        expected = df.xs("a").xs(4, level="four")
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        from pandas import read_table
        from StringIO import StringIO

        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep="\s+")

        result = df.xs("a", level=0)
        expected = df.xs("a")
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        """Check tuple-key selection on a Series against the equivalent frame selection."""
        # Full slice on the first level plus a label on the second level.
        s = self.frame["A"]
        result = s[:, "two"]
        expected = self.frame.xs("two", level=1)["A"]
        assert_series_equal(result, expected)

        # Partial key covering the first two of three levels.
        s = self.ymd["A"]
        result = s[2000, 5]
        expected = self.ymd.ix[2000, 5]["A"]
        assert_series_equal(result, expected)

        # not implementing this for now

        # A label combined with a bounded slice is expected to raise TypeError.
        self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4)))

        # Disabled check, kept for when the lookup above is supported:
        # result = s[2000, 3:4]
        # lv =s.index.get_level_values(1)
        # expected = s[(lv == 3) | (lv == 4)]
        # expected.index = expected.index.droplevel(0)
        # assert_series_equal(result, expected)

        # can do this though

    def test_get_loc_single_level(self):
        """Smoke test: every key of a one-level MultiIndex can be looked up in a Series."""
        values = np.random.randn(len(self.single_level))
        series = Series(values, index=self.single_level)
        for key in self.single_level.values:
            series[key]

    def test_getitem_toplevel(self):
        """A first-level key selects its block of columns and drops that level."""
        transposed = self.frame.T

        # "foo" covers the first three columns
        picked = transposed["foo"]
        wanted = transposed.reindex(columns=transposed.columns[:3])
        wanted.columns = wanted.columns.droplevel(0)
        assert_frame_equal(picked, wanted)

        # "bar" covers columns 3 and 4, through getitem and through .ix
        picked = transposed["bar"]
        picked_ix = transposed.ix[:, "bar"]

        wanted = transposed.reindex(columns=transposed.columns[3:5])
        wanted.columns = wanted.columns.droplevel(0)
        assert_frame_equal(picked, wanted)
        assert_frame_equal(picked, picked_ix)

    def test_getitem_setitem_slice_integers(self):
        """``.ix[1:2]`` on integer levels must read and write the rows from position 2 on."""
        midx = MultiIndex(levels=[[0, 1, 2], [0, 2]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

        # DataFrame
        df = DataFrame(np.random.randn(len(midx), 4), index=midx, columns=["a", "b", "c", "d"])
        picked = df.ix[1:2]
        wanted = df.reindex(df.index[2:])
        assert_frame_equal(picked, wanted)

        df.ix[1:2] = 7
        self.assert_((df.ix[1:2] == 7).values.all())

        # Series
        ser = Series(np.random.randn(len(midx)), index=midx)

        picked = ser.ix[1:2]
        wanted = ser.reindex(ser.index[2:])
        assert_series_equal(picked, wanted)

        ser.ix[1:2] = 7
        self.assert_((ser.ix[1:2] == 7).values.all())

    def test_getitem_int(self):
        """An integer given to ``.ix`` is a label on integer levels and a position otherwise."""
        midx = MultiIndex(levels=[[0, 1], [0, 1, 2]], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        df = DataFrame(np.random.randn(6, 2), index=midx)

        # label 1 on the first level covers the last three rows
        picked = df.ix[1]
        wanted = df[-3:]
        wanted.index = wanted.index.droplevel(0)
        assert_frame_equal(picked, wanted)

        # 3 is not a label on the first level
        self.assertRaises(KeyError, df.ix.__getitem__, 3)

        # on the string-labelled fixture frame the integer is a position
        picked = self.frame.ix[2]
        wanted = self.frame.xs(self.frame.index[2])
        assert_series_equal(picked, wanted)

    def test_getitem_partial(self):
        """A two-level partial column key selects the matching columns and drops both levels."""
        transposed = self.ymd.T
        picked = transposed[2000, 2]

        second_level_mask = transposed.columns.labels[1] == 1
        wanted = transposed.reindex(columns=transposed.columns[second_level_mask])
        wanted.columns = wanted.columns.droplevel(0).droplevel(0)
        assert_frame_equal(picked, wanted)

    def test_getitem_slice_not_sorted(self):
        """A column slice bounded by a NumPy ``int32`` must select the first three columns."""
        resorted = self.frame.sortlevel(1).T

        # buglet with int typechecking
        stop = np.int32(3)
        picked = resorted.ix[:, :stop]
        wanted = resorted.reindex(columns=resorted.columns[:3])
        assert_frame_equal(picked, wanted)

    def test_setitem_change_dtype(self):
        """Replace one float column with a boolean mask and check it reads back unchanged."""
        dft = self.frame.T
        s = dft["foo", "two"]
        # Overwrite the column with a boolean Series derived from its own values.
        dft["foo", "two"] = s > s.median()
        assert_series_equal(dft["foo", "two"], s > s.median())
        # The second internal block's items are expected to still be a MultiIndex.
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))

        # The new column must also survive a reindex on the columns.
        reindexed = dft.reindex(columns=[("foo", "two")])
        assert_series_equal(reindexed["foo", "two"], s > s.median())

    def test_frame_setitem_ix(self):
        """A scalar written through ``.ix[(row key), column]`` must read back unchanged."""
        # string column labels
        self.frame.ix[("bar", "two"), "B"] = 5
        stored = self.frame.ix[("bar", "two"), "B"]
        self.assertEquals(stored, 5)

        # with integer labels
        relabeled = self.frame.copy()
        relabeled.columns = range(3)
        relabeled.ix[("bar", "two"), 1] = 7
        stored = relabeled.ix[("bar", "two"), 1]
        self.assertEquals(stored, 7)

    def test_fancy_slice_partial(self):
        """Label slices on partial keys must be inclusive of both end points."""
        # first-level labels only
        picked = self.frame.ix["bar":"baz"]
        wanted = self.frame[3:7]
        assert_frame_equal(picked, wanted)

        # two-level tuple bounds on the three-level index
        picked = self.ymd.ix[(2000, 2):(2000, 4)]
        second_level = self.ymd.index.labels[1]
        in_range = (second_level >= 1) & (second_level <= 3)
        wanted = self.ymd[in_range]
        assert_frame_equal(picked, wanted)

    def test_sortlevel(self):
        """``sortlevel`` raises without a hierarchical index and keeps level names on a Series."""
        flat = self.frame.copy()
        flat.index = np.arange(len(flat))
        self.assertRaises(Exception, flat.sortlevel, 0)

        # axis=1

        # series
        sorted_column = self.frame["A"].sortlevel(0)
        flat_column = self.frame.reset_index()["A"]
        self.assertRaises(Exception, flat_column.sortlevel)

        # preserve names
        self.assertEquals(sorted_column.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        """``reset_index`` must give integer and float dtypes to the columns made from such levels."""
        # list(...) rather than a comprehension whose loop variable shadowed
        # the builtin ``tuple``.
        tuples = list(cart_product(["foo", "bar"], [10, 20], [1.0, 1.1]))
        index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
        df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled["prm1"]))
        self.assert_(com.is_float_dtype(deleveled["prm2"]))

    def test_reset_index_with_drop(self):
        """``reset_index`` with and without ``drop=True`` on a frame and on a Series."""
        # frame: dropping the index adds no columns
        flat_frame = self.ymd.reset_index(drop=True)
        self.assertEquals(len(flat_frame.columns), len(self.ymd.columns))

        # series, keeping the levels: one column per level plus the values
        as_frame = self.series.reset_index()
        self.assert_(isinstance(as_frame, DataFrame))
        self.assert_(len(as_frame.columns) == len(self.series.index.levels) + 1)

        # series, dropping the levels: still a Series
        flat_series = self.series.reset_index(drop=True)
        self.assert_(isinstance(flat_series, Series))

    def test_sortlevel_by_name(self):
        """Sorting by level name must match sorting by level number."""
        self.frame.index.names = ["first", "second"]
        by_name = self.frame.sortlevel(level="second")
        by_number = self.frame.sortlevel(level=1)
        assert_frame_equal(by_name, by_number)

    def test_sortlevel_mixed(self):
        """Adding a string column must not change how ``sortlevel`` orders the other columns."""
        # rows
        before = self.frame.sortlevel(1)

        mixed = self.frame.copy()
        mixed["foo"] = "bar"
        after = mixed.sortlevel(1)
        assert_frame_equal(before, after.drop(["foo"], axis=1))

        # columns of the transposed frame
        transposed = self.frame.T
        before = transposed.sortlevel(1, axis=1)
        transposed["foo", "three"] = "bar"

        after = transposed.sortlevel(1, axis=1)
        assert_frame_equal(
            before.drop([("foo", "three")], axis=1), after.drop([("foo", "three")], axis=1)
        )

    def test_count_level(self):
        """Compare ``count(level=...)`` with a groupby count on every level and both axes."""

        def _check_counts(frame, axis=0):
            # For each level of the chosen axis, count(level=i) must equal the
            # groupby count aligned to the same labels and cast to int64.
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype("i8")
                assert_frame_equal(result, expected)

        # Put some NaNs into the fixtures so the counts differ between groups.
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan

        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)

        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)

        # With numeric_only=True the string column "D" must be left out.
        self.frame["D"] = "foo"
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ["A", "B", "C"])

    def test_count_level_series(self):
        """``Series.count(level=...)`` must match a groupby count with absent labels filled as 0."""
        midx = MultiIndex(
            levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]
        )

        ser = Series(np.random.randn(len(midx)), index=midx)

        for level in (0, 1):
            counted = ser.count(level=level)
            grouped = ser.groupby(level=level).count()
            assert_series_equal(counted.astype("f8"), grouped.reindex(counted.index).fillna(0))

    def test_count_level_corner(self):
        """Counting by level on empty objects must give zeros for every level value."""
        # empty Series
        empty_series = self.frame["A"][:0]
        counted = empty_series.count(level=0)
        wanted = Series(0, index=empty_series.index.levels[0])
        assert_series_equal(counted, wanted)

        # empty DataFrame
        empty_frame = self.frame[:0]
        counted = empty_frame.count(level=0)
        blank = DataFrame({}, index=empty_series.index.levels[0], columns=empty_frame.columns)
        wanted = blank.fillna(0).astype(int)
        assert_frame_equal(counted, wanted)

    def test_unstack(self):
        """Smoke test: ``unstack`` must run without raising; no result is compared."""
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()

        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        """Check that ``stack`` inverts ``unstack`` across several index and column layouts."""
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # Round trips starting from a frame sorted on its last level.
        unlexsorted = self.ymd.sortlevel(2)

        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)

        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)

        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)

        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)

        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)

        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)

        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))

        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        # This pair was previously computed but never compared, so the
        # negative-level case was not actually checked.
        assert_frame_equal(result, expected)

    def test_stack_mixed_dtype(self):
        """Stacking a frame with one string column must leave the "bar" group as floats."""
        mixed = self.frame.T
        mixed["foo", "four"] = "foo"
        mixed = mixed.sortlevel(1, axis=1)

        tall = mixed.stack()
        assert_series_equal(tall["foo"], mixed["foo"].stack())
        self.assert_(tall["bar"].dtype == np.float_)

    def test_unstack_bug(self):
        """Unstack then stack on a four-key group-size Series must give back the same values."""
        records = DataFrame(
            {
                "state": ["naive", "naive", "naive", "activ", "activ", "activ"],
                "exp": ["a", "b", "b", "b", "a", "a"],
                "barcode": [1, 2, 3, 4, 1, 3],
                "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
                "extra": np.arange(6.0),
            }
        )

        group_sizes = records.groupby(["state", "exp", "barcode", "v"]).apply(len)

        wide = group_sizes.unstack()
        tall = wide.stack()
        assert_series_equal(tall, group_sizes.reindex(tall.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        """Level names must be carried through ``unstack`` and back through ``stack``."""
        wide = self.frame.unstack()
        self.assertEquals(wide.index.name, "first")
        self.assertEquals(wide.columns.names, ["exp", "second"])

        tall = wide.stack()
        self.assertEquals(tall.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        """Unstacking by level name must match unstacking by level number."""
        by_name = self.frame.unstack("second")
        by_number = self.frame.unstack(level=1)
        assert_frame_equal(by_name, by_number)

    def test_stack_level_name(self):
        """Stacking by column level name must match stacking by position or by default."""
        # two column levels
        wide = self.frame.unstack("second")
        by_name = wide.stack("exp")
        by_number = self.frame.unstack().stack(0)
        assert_frame_equal(by_name, by_number)

        # single column level
        by_name = self.frame.stack("exp")
        by_default = self.frame.stack()
        assert_series_equal(by_name, by_default)

    def test_stack_unstack_multiple(self):
        """``stack``/``unstack`` with a list of levels must match the chained single-level calls."""
        wide = self.ymd.unstack(["year", "month"])
        chained = self.ymd.unstack("year").unstack("month")
        assert_frame_equal(wide, chained)
        self.assertEquals(wide.columns.names, chained.columns.names)

        # series
        column = self.ymd["A"]
        wide_column = column.unstack(["year", "month"])
        assert_frame_equal(wide_column, chained["A"])

        # stacking both levels back, then restoring level order and sort order
        tall = wide.stack(["year", "month"])
        tall = tall.swaplevel(0, 1).swaplevel(1, 2)
        tall = tall.sortlevel(0)

        assert_frame_equal(tall, self.ymd)
        self.assertEquals(tall.index.names, self.ymd.index.names)

        # GH #451
        wide = self.ymd.unstack([1, 2])
        chained = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(wide, chained)

        wide = self.ymd.unstack([2, 1])
        chained = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(wide, chained)

    def test_groupby_transform(self):
        """``apply`` and ``transform`` with the same doubling function must agree."""
        column = self.frame["A"]
        first_level_values = column.index.get_level_values(0)

        groups = column.groupby(first_level_values)

        via_apply = groups.apply(lambda x: x * 2)
        via_transform = groups.transform(lambda x: x * 2)
        assert_series_equal(via_apply.reindex(via_transform.index), via_transform)

    def test_groupby_corner(self):
        """Smoke test: grouping a one-row frame by a named level must not raise."""
        one_row_index = MultiIndex(levels=[["foo"], ["bar"], ["baz"]], labels=[[0], [0], [0]], names=["one", "two", "three"])
        one_row = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=one_row_index)
        # should work
        one_row.groupby(level="three")

    def test_join(self):
        """Outer-joining two overlapping slices must rebuild the frame, with NaN where neither covers."""
        left = self.frame.ix[:5, ["A"]]
        right = self.frame.ix[2:, ["B", "C"]]

        merged = left.join(right, how="outer").reindex(self.frame.index)
        wanted = self.frame.copy()
        # blank out in the expected frame whatever the join left missing
        wanted.values[np.isnan(merged.values)] = np.nan

        self.assert_(not np.isnan(merged.values).all())

        assert_frame_equal(merged, wanted)

    def test_swaplevel(self):
        """``swaplevel`` must accept numbers or names, be reversible, and work on columns."""
        # there
        by_number = self.frame["A"].swaplevel(0, 1)
        by_name = self.frame["A"].swaplevel("first", "second")
        self.assert_(not by_number.index.equals(self.frame.index))
        assert_series_equal(by_number, by_name)

        # and back again
        restored = by_number.swaplevel(0, 1)
        restored_by_name = by_number.swaplevel("second", "first")
        self.assert_(restored.index.equals(self.frame.index))
        assert_series_equal(restored, restored_by_name)

        # along the columns of the transposed frame
        transposed = self.frame.T
        swapped_columns = transposed.swaplevel("first", "second", axis=1)
        wanted = self.frame.swaplevel("first", "second").T
        assert_frame_equal(swapped_columns, wanted)

    def test_swaplevel_panel(self):
        """``Panel.swaplevel`` on the major axis must match swapping that axis's levels directly."""
        items = {"ItemA": self.frame, "ItemB": self.frame * 2}
        panel = Panel(items)

        swapped = panel.swaplevel(0, 1, axis="major")
        wanted = panel.copy()
        wanted.major_axis = wanted.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(swapped, wanted)

    def test_reorder_levels(self):
        """``reorder_levels`` must match the equivalent pair of ``swaplevel`` calls."""
        # frame rows
        reordered = self.ymd.reorder_levels(["month", "day", "year"])
        swapped = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(reordered, swapped)

        # series
        reordered = self.ymd["A"].reorder_levels(["month", "day", "year"])
        swapped = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(reordered, swapped)

        # frame columns
        reordered = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1)
        swapped = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(reordered, swapped)

        # [1, 2, 3] is expected to be rejected
        self.assertRaises(Exception, self.ymd.index.reorder_levels, [1, 2, 3])

    def test_insert_index(self):
        """Assigning to a new tuple-keyed column must keep the columns a MultiIndex."""
        wide = self.ymd[:5].T
        wide[2000, 1, 10] = wide[2000, 1, 7]
        self.assert_(isinstance(wide.columns, MultiIndex))
        values_match = wide[2000, 1, 10] == wide[2000, 1, 7]
        self.assert_(values_match.all())

    def test_alignment(self):
        """Subtracting Series with different MultiIndexes must align on the union of the indexes."""
        left = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]))

        right = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]))

        # operands in their original order
        diff = left - right
        union = left.index.union(right.index)
        wanted = left.reindex(union) - right.reindex(union)
        assert_series_equal(diff, wanted)

        # hit non-monotonic code path
        diff = left[::-1] - right[::-1]
        union = left.index.union(right.index)
        wanted = left.reindex(union) - right.reindex(union)
        assert_series_equal(diff, wanted)

    def test_is_lexsorted(self):
        """``is_lexsorted`` and ``lexsort_depth`` on sorted and unsorted label layouts."""
        levels = [[0, 1], [0, 1, 2]]

        # fully sorted
        midx = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        self.assert_(midx.is_lexsorted())

        # second level out of order
        midx = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
        self.assert_(not midx.is_lexsorted())

        # first level out of order
        midx = MultiIndex(levels=levels, labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
        self.assert_(not midx.is_lexsorted())
        self.assert_(midx.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        """Check when a write through a top-level selection reaches the parent frame."""
        df = self.frame.T
        # Writing into the values of the "foo" selection is expected to show up
        # when "foo" is selected again.
        df["foo"].values[:] = 0
        self.assert_((df["foo"].values == 0).all())

        # but not if it's mixed-type
        df["foo", "four"] = "foo"
        df = df.sortlevel(0, axis=1)
        # Chained assignment: the column is expected to keep its earlier zeros.
        df["foo"]["one"] = 2
        self.assert_((df["foo", "one"] == 0).all())

    def test_frame_getitem_not_sorted(self):
        """Top-level selection must still work after a column is appended out of order."""
        wide = self.frame.T
        wide["foo", "four"] = "foo"

        level_arrays = [np.array(x) for x in zip(*wide.columns.get_tuple_index())]

        # columns
        picked = wide["foo"]
        picked_ix = wide.ix[:, "foo"]
        wanted = wide.reindex(columns=wide.columns[level_arrays[0] == "foo"])
        wanted.columns = wanted.columns.droplevel(0)
        assert_frame_equal(picked, wanted)
        assert_frame_equal(picked_ix, wanted)

        # rows of the transpose
        tall = wide.T
        picked = tall.xs("foo")
        picked_ix = tall.ix["foo"]
        wanted = tall.reindex(tall.index[level_arrays[0] == "foo"])
        wanted.index = wanted.index.droplevel(0)
        assert_frame_equal(picked, wanted)
        assert_frame_equal(picked_ix, wanted)

    def test_series_getitem_not_sorted(self):
        """First-level selection on a Series must work when "foo" comes last in the index."""
        label_lists = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        midx = MultiIndex.from_tuples(zip(*label_lists))
        ser = Series(randn(8), index=midx)

        level_arrays = [np.array(x) for x in zip(*midx.get_tuple_index())]

        picked = ser["qux"]
        picked_ix = ser.ix["qux"]
        wanted = ser[level_arrays[0] == "qux"]
        wanted.index = wanted.index.droplevel(0)
        assert_series_equal(picked, wanted)
        assert_series_equal(picked_ix, wanted)

    def test_count(self):
        """``count(level=<name>)`` must match ``count(level=<number>)`` on a frame and a Series."""
        # frame
        renamed_frame = self.frame.copy()
        renamed_frame.index.names = ["a", "b"]

        for name, number in (("b", 1), ("a", 0)):
            by_name = renamed_frame.count(level=name)
            by_number = self.frame.count(level=number)
            assert_frame_equal(by_name, by_number)

        # series
        renamed_series = self.series.copy()
        renamed_series.index.names = ["a", "b"]

        for name, number in (("b", 1), ("a", 0)):
            by_name = renamed_series.count(level=name)
            by_number = self.series.count(level=number)
            assert_series_equal(by_name, by_number)

        # unknown level name
        self.assertRaises(Exception, renamed_series.count, "x")
        self.assertRaises(Exception, renamed_frame.count, level="x")

    # Names of the aggregation methods looped over by test_series_group_min_max
    # and test_frame_group_ops.
    AGG_FUNCTIONS = ["sum", "prod", "min", "max", "median", "mean", "skew", "mad", "std", "var"]

    def test_series_group_min_max(self):
        """Compare each aggregation called with ``level=`` against the same groupby aggregation."""
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS, range(2), [False, True]):
            grouped = self.series.groupby(level=level)
            # The lambda reads op and skipna from the current iteration and is
            # used before the loop advances.
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna takes both False and True over the course of the loop
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        """Each aggregation called with ``level=`` must match the groupby aggregation, on both axes."""
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan

        combos = cart_product(self.AGG_FUNCTIONS, range(2), range(2), [False, True])
        for op, level, axis, skipna in combos:
            # rows for axis 0, columns of the transpose for axis 1
            frame = self.frame if axis == 0 else self.frame.T

            by_level = frame.groupby(level=level, axis=axis)

            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            via_groupby = by_level.agg(aggf)
            direct = getattr(frame, op)(level=level, axis=axis, skipna=skipna)

            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]

            self.assert_(via_groupby._get_axis(axis).equals(level_index))
            self.assert_(direct._get_axis(axis).equals(level_index))

            assert_frame_equal(via_groupby, direct)

    def test_frame_series_agg_multiple_levels(self):
        """``sum`` with a list of levels must match a groupby on the same levels."""
        # frame
        direct = self.ymd.sum(level=["year", "month"])
        grouped = self.ymd.groupby(level=["year", "month"]).sum()
        assert_frame_equal(direct, grouped)

        # series
        direct = self.ymd["A"].sum(level=["year", "month"])
        grouped = self.ymd["A"].groupby(level=["year", "month"]).sum()
        assert_series_equal(direct, grouped)

    def test_groupby_multilevel(self):
        """Grouping by level numbers, level values or level names must give the same means."""
        by_numbers = self.ymd.groupby(level=[0, 1]).mean()

        first_values = self.ymd.index.get_level_values(0)
        second_values = self.ymd.index.get_level_values(1)

        by_values = self.ymd.groupby([first_values, second_values]).mean()

        assert_frame_equal(by_numbers, by_values)
        self.assertEquals(by_numbers.index.names, self.ymd.index.names[:2])

        by_names = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(by_numbers, by_names)

    def test_groupby_multilevel_with_transform(self):
        # Placeholder: this test has no body yet and therefore always passes.
        pass

    def test_multilevel_consolidate(self):
        """Smoke test: consolidate a frame after adding a tuple-keyed column.

        Nothing is asserted; the test only requires that no call raises.
        """
        labels = [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]
        axis_index = MultiIndex.from_tuples(labels)
        square = DataFrame(np.random.randn(4, 4), index=axis_index, columns=axis_index)
        square["Totals", ""] = square.sum(1)
        square.consolidate()

    def test_ix_preserve_names(self):
        """Partial ``.ix`` selections keep the names of the remaining levels."""
        level_names = self.ymd.index.names

        # Selecting on the first level leaves the last two level names.
        for selected in (self.ymd.ix[2000], self.ymd["A"].ix[2000]):
            self.assertEquals(selected.index.names, level_names[1:])

        # Selecting on the first two levels leaves a flat index named after
        # the third level.
        for selected in (self.ymd.ix[2000, 2], self.ymd["A"].ix[2000, 2]):
            self.assertEquals(selected.index.name, level_names[2])

    def test_partial_set(self):
        """Assignment through partial ``.ix`` keys must update the selected rows.

        ``df`` is modified through ``.ix`` assignment while ``exp`` is
        modified by writing into the ``.values`` of the same selection; the
        two frames are compared after every step.
        """
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        # NOTE(review): building ``exp`` this way only works if the selection's
        # ``.values`` is a view on ``exp``'s data -- confirm for the pandas
        # version in use.
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)

        # Same check, going through a single column first.
        df["A"].ix[2000, 4] = 1
        exp["A"].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)

        # Assign on the first level only.
        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)

        # this works...for now
        df["A"].ix[14] = 5
        self.assertEquals(df["A"][14], 5)

    def test_unstack_preserve_types(self):
        """Check the column dtypes obtained after unstacking a mixed-type frame."""
        # GH #403
        self.ymd["E"] = "foo"
        self.ymd["F"] = 2

        unstacked = self.ymd.unstack("month")
        expected_dtypes = (("A", np.float64), ("E", np.object_), ("F", np.float64))
        for column, dtype in expected_dtypes:
            self.assert_(unstacked[column, 1].dtype == dtype)

    def test_getitem_lowerdim_corner(self):
        """Reading or writing ``(("bar", "three"), "B")`` via ``.ix`` must raise KeyError."""
        key = (("bar", "three"), "B")

        self.assertRaises(KeyError, self.frame.ix.__getitem__, key)
        self.assertRaises(KeyError, self.frame.ix.__setitem__, key, 0)

    # ----------------------------------------------------------------------
    # AMBIGUOUS CASES!

    def test_partial_ix_missing(self):
        """Disabled test for ambiguous partial ``.ix`` lookups.

        The unconditional ``raise nose.SkipTest`` below makes the rest of the
        body unreachable; it is kept as a sketch of the intended checks.
        """
        raise nose.SkipTest

        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]["A"]
        assert_series_equal(result, expected)

        # need to put in some work here

        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())

        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        # NOTE(review): ``__getitem__`` is given an extra positional argument
        # here; ``__setitem__`` was presumably intended -- confirm before
        # re-enabling this test.
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_fancy_2d(self):
        """Disabled test for two-key ``.ix`` lookups (row key, column key).

        The unconditional ``raise nose.SkipTest`` below makes the rest of the
        body unreachable.
        """
        raise nose.SkipTest

        # Intended check: frame.ix[row_key, column] equals xs(row_key)[column].
        result = self.frame.ix["foo", "B"]
        expected = self.frame.xs("foo")["B"]
        assert_series_equal(result, expected)

        # Same check on the transposed frame.
        ft = self.frame.T
        result = ft.ix["B", "foo"]
        expected = ft.xs("B")["foo"]
        assert_series_equal(result, expected)

    # ----------------------------------------------------------------------

    def test_to_html(self):
        """Smoke test: ``to_html`` must not raise on ``ymd`` or its transpose."""
        self.ymd.columns.name = "foo"
        for table in (self.ymd, self.ymd.T):
            table.to_html()

    def test_level_with_tuples(self):
        """Index levels whose labels are themselves tuples must be selectable.

        The same checks run twice: first with 3-tuples ``("foo", name, 0)``
        as first-level labels, then with 2-tuples ``("foo", name)``.
        """
        for extra in ((0,), ()):
            first_level = [("foo", name) + extra for name in ("bar", "baz", "qux")]
            key = first_level[0]
            index = MultiIndex(
                levels=[first_level, [0, 1]],
                labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
            )

            series = Series(np.random.randn(6), index=index)
            frame = DataFrame(np.random.randn(6, 4), index=index)

            # Series: plain and .ix lookups return the first two rows with
            # the outer level dropped.
            by_getitem = series[key]
            by_ix = series.ix[key]
            expected = series[:2]
            expected.index = expected.index.droplevel(0)
            assert_series_equal(by_getitem, expected)
            assert_series_equal(by_ix, expected)

            if extra:
                # Only the 3-tuple variant also checks a key whose second
                # element (2) is not in the second level.
                self.assertRaises(KeyError, series.__getitem__, (key, 2))

            # DataFrame: .ix and xs lookups behave the same way.
            by_ix = frame.ix[key]
            by_xs = frame.xs(key)
            expected = frame[:2]
            expected.index = expected.index.droplevel(0)
            assert_frame_equal(by_ix, expected)
            assert_frame_equal(by_xs, expected)

    def test_int_series_slicing(self):
        """Integer slices on a MultiIndex-ed Series/DataFrame act positionally."""
        column = self.ymd["A"]

        # Reading a positional slice.
        tail = column[5:]
        assert_series_equal(tail, column.reindex(column.index[5:]))

        # Writing through a positional slice.
        reference = self.ymd["A"].copy()
        column[5:] = 0
        reference.values[5:] = 0
        self.assert_(np.array_equal(column.values, reference.values))

        # Same read check on the whole frame.
        frame_tail = self.ymd[5:]
        assert_frame_equal(frame_tail, self.ymd.reindex(column.index[5:]))

    def test_mixed_depth_get(self):
        """Columns padded with empty-string levels are reachable by their prefix.

        ``df["a"]`` must equal ``df["a", "", ""]`` and be named ``"a"``;
        ``df["routine1", "result1"]`` must equal the fully spelled-out key and
        be named after the two-element prefix.
        """
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        # sorted() accepts the iterator that zip() returns on Python 3 as well
        # as the list it returns on Python 2 (in-place .sort() needs a list).
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df["a"]
        expected = df["a", "", ""]
        assert_series_equal(result, expected)
        self.assertEquals(result.name, "a")

        result = df["routine1", "result1"]
        expected = df["routine1", "result1", ""]
        assert_series_equal(result, expected)
        self.assertEquals(result.name, ("routine1", "result1"))

    def test_mixed_depth_insert(self):
        """Inserting ``"b"`` must equal inserting the padded key ``("b", "", "")``."""
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        # sorted() accepts the iterator that zip() returns on Python 3 as well
        # as the list it returns on Python 2 (in-place .sort() needs a list).
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df.copy()
        expected = df.copy()
        result["b"] = [1, 2, 3, 4]
        expected["b", "", ""] = [1, 2, 3, 4]
        assert_frame_equal(result, expected)

    def test_mixed_depth_drop(self):
        """Dropping by first-level label must equal dropping the full tuple keys."""
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        # sorted() accepts the iterator that zip() returns on Python 3 as well
        # as the list it returns on Python 2 (in-place .sort() needs a list).
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        result = df.drop("a", axis=1)
        expected = df.drop([("a", "", "")], axis=1)
        assert_frame_equal(expected, result)

        # "top" covers two columns; the expected frame drops them one by one.
        result = df.drop(["top"], axis=1)
        expected = df.drop([("top", "OD", "wx")], axis=1)
        expected = expected.drop([("top", "OD", "wy")], axis=1)
        assert_frame_equal(expected, result)

    def test_mixed_depth_pop(self):
        """``pop`` with a first-level label must match the full-key equivalents."""
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]

        # sorted() accepts the iterator that zip() returns on Python 3 as well
        # as the list it returns on Python 2 (in-place .sort() needs a list).
        tuples = sorted(zip(*arrays))
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)

        # Popping "a" must equal popping ("a", "", "") and leave equal frames.
        df1 = df.copy()
        df2 = df.copy()
        result = df1.pop("a")
        expected = df2.pop(("a", "", ""))
        assert_series_equal(expected, result)
        assert_frame_equal(df1, df2)
        self.assertEquals(result.name, "a")

        # Popping "top" must return the same sub-frame as df["top"] and leave
        # the same remainder as dropping "top".
        expected = df1["top"]
        df1 = df1.drop(["top"], axis=1)
        result = df2.pop("top")
        assert_frame_equal(expected, result)
        assert_frame_equal(df1, df2)




# Reshaping between "cases to variables" and "variables to cases" layouts

d = {'one': [1, 1], 'two': [2, 2]}
i = ['a', 'b']

# Build the frame from the dict, using ``i`` as the row index.
# (The bare ``df`` expressions are notebook-style leftovers and have no effect.)
df = DataFrame(data=d, index=i)
df

# Variables to cases
df.stack()

# Cases to variables
df.unstack()


# Aggregation
d = {'one': [1] * 5, 'two': [2] * 5, 'letter': list('aabbc')}

# Build the frame with the default row index
df = DataFrame(d)
df

one = df.groupby('letter')
def create_fip(year=2006):  # message('03_fip')
    """
    Build and save the 'fipDat' table of 'fip individuals'.

    The dependents ("pac") encoded in the ``anaisenf`` field of the ``foyer``
    table are split into one row per dependent.  Rows that are declared both
    as F and G (or both as H and I) for the same declaration and birth year
    are dropped unless they are flagged as twins.  The remaining dependents
    are then compared with the 'pac' rows of the ``indivim`` temporary table:

    * those that match are linked to their ``noindiv`` and saved as the
      ``pacIndiv`` temporary table;
    * those that do not match are attached to the declarant ("vous") of
      their declaration, completed with placeholder individual variables
      and saved as the ``fipDat`` temporary table.

    Parameters
    ----------
    year : int
        Survey year, forwarded to ``DataCollection``, ``load_temp`` and
        ``save_temp`` and used to compute the age column ``agepf``.

    Returns
    -------
    None
        Results are persisted through ``save_temp``; progress is printed.
    """
    df = DataCollection(year=year)

    print('Démarrer 03_fip')

    # anaisenf is a string containing the letter code of each pac
    # (F,G,H,I,J,N,R) followed by the year of birth (example: 'F1990H1992').
    # When a child is invalid, he appears twice in anaisenf (example:
    # F1990G1990 is a single invalid child born in 1990).
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)

    # ------------------------------------------------------------------
    # Step 1: collect the dependents (pac) of every tax household.
    # Multiple declarations are handled so that no pac is duplicated.
    # ------------------------------------------------------------------
    print("Etape 1 : on recupere les personnes à charge des foyers")
    print("    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac takes 5 characters (1 type letter + 4 characters of year).
    # Floor division keeps the count an int so that it can feed range().
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) // 5
    print("il ya a au maximum %s pac par foyer" % nb_pac_max)

    # Split the string coding the pac of each declaration: one
    # (declaration, type_pac, naia) column triple per pac position.
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        multi_index_columns.extend(
            [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')])

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    # randn only allocates a frame of the right shape; every column is
    # overwritten in the loop below.
    fip = DataFrame(randn(len(foyer), 3 * nb_pac_max), columns=columns)
    for i in range(1, nb_pac_max + 1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)]
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * i]

    # Go from one column triple per pac to one row per (declaration, pac).
    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

    print("    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']

    print("    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    # Work on F & G only; copy so that the helper columns are added to an
    # independent frame rather than to a slice of ``fip``.
    tyFG = fip[fip.type_pac.isin(['F', 'G'])].copy()

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    # Keep rows whose (declaration, naia) pair is not repeated later, and
    # twins; the other rows (declared both as F and G) are dropped.
    tyFG['to_keep'] = ~(tyFG['same_pair']) | (tyFG['is_twin'])
    print('%s / %s' % (len(tyFG), len(tyFG[tyFG['to_keep']])))
    print('longueur fip %s' % len(fip))

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print('enfants F & G traités')

    print("    1.4 : on enlève les H pour lesquels il y a un I")
    tyHI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    # Rows that are neither F/G nor H/I were never flagged: keep them.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    # The column started as NaN and was filled through update(); cast it so
    # that it is a proper boolean mask whatever dtype it ended up with.
    keep_mask = fip['to_keep'].astype(bool)
    print('nb lines to keep/nb initial lines')
    print('%s / %s' % (len(fip[keep_mask]), len(fip)))

    indivifip = fip[keep_mask].copy()
    del indivifip['to_keep']
    # Release the intermediate frames before loading the individual table.
    del fip, tyFG, tyHI, keep_mask

    # ------------------------------------------------------------------
    print('')
    print('Step 2 : matching indivifip with eec file')
    # ------------------------------------------------------------------

    indivi = load_temp(name="indivim", year=year)

    # ``== 'pac'`` is already False for missing values; notnull() is kept to
    # mirror the original selection explicitly.
    pac = indivi[indivi['persfip'].notnull() & (indivi['persfip'] == 'pac')].copy()

    pac['naia'] = pac['naia'].astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    # Matching keys are (birth year, first 29 characters of the declaration).
    # list() materialises the pairs because zip() is lazy on Python 3.
    pac['key1'] = list(zip(pac['naia'], pac['declar1'].str[:29]))
    pac['key2'] = list(zip(pac['naia'], pac['declar2'].str[:29]))
    indivifip['key'] = list(zip(indivifip['naia'], indivifip['declaration'].str[:29]))
    assert pac.naia.dtype == indivifip.naia.dtype, \
        'types %s , %s are different' % (pac.naia.dtype, indivifip.naia.dtype)

    # The fip are the dependents found through neither declaration key.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]

    print("    2.1 new fip created")

    # Build a dataframe linking the pac to their type and noindiv.
    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print('longueur pacInd1 %s' % len(pac_ind1))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print('longueur pacInd2 %s' % len(pac_ind2))
    print("pacInd1&2 créés")

    print(pac_ind1.duplicated().sum())
    print(pac_ind2.duplicated().sum())

    del pac_ind1['key1'], pac_ind2['key2']
    print(pac_ind1.columns)
    print(pac_ind2.columns)

    # Emptiness is tested with len(): comparing an index with ``[]`` is not
    # an emptiness test.  pacInd is defined on every branch.
    if len(pac_ind1) == 0 and len(pac_ind2) == 0:
        print("Warning : no link between pac and noindiv for both pacInd1&2")
        pacInd = pac_ind1  # empty, but carries the expected columns
    elif len(pac_ind1) == 0:
        print("Warning : pacInd1 is an empty data frame")
        pacInd = pac_ind2
    elif len(pac_ind2) == 0:
        print("Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print('%s %s %s' % (len(pac_ind1), len(pac_ind2), len(pacInd)))
    print(pac_ind2.type_pac.isnull().sum())
    print(pacInd.type_pac.value_counts())

    print('    2.2 : pacInd created')

    print('doublons noindiv, type_pac %s' % pacInd.duplicated(['noindiv', 'type_pac']).sum())
    print('doublons noindiv seulement %s' % pacInd.duplicated('noindiv').sum())
    print('nb de NaN %s' % pacInd.type_pac.isnull().sum())

    del pacInd["key"]
    # Keep a single row per noindiv.
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
    print(pacIndiv.columns)

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print(pacIndiv.type_pac.value_counts())
    gc.collect()

    # We keep the fip in the menage of their parents because it is used to
    # build the famille. We should build an individual ident for the fip that
    # are older than 18 since they are not in their parents' menage according
    # to the eec.
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16')  # To be used later to set idfoy
    declarant_columns = ["noidec", "ident", "rga", "ztsai", "ztsao"]

    # Declarants ("vous") whose first declaration carries a fip.
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec1 = individec1.loc[:, ["declar1"] + declarant_columns]
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print('    2.3 : fip1 created')

    # Same thing through the second declaration.
    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2.loc[:, ["declar2"] + declarant_columns]
    individec2 = individec2.rename(columns={'declar2': 'declaration'})
    print(individec2.head())
    # 'declaration' is the only column shared by both frames; naming it
    # makes the join key explicit and consistent with fip1.
    fip2 = fip.merge(individec2, on='declaration')
    print('    2.4 : fip2 created')

    fip = concat([fip1, fip2])

    print(fip.columns)
    # Placeholder individual variables for the fip rows.
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <= 20, 3, 4)  # TODO: not very clean according to Mahdi/Clément
    for unknown in ('stc', 'contra', 'titc', 'mrec', 'forter',
                    'rstg', 'retrai', 'cohab', 'sexe'):
        fip[unknown] = None
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf'] <= 15, 9, 5)

    # TODO: actrec of fip children aged 16 to 20: we do not know whether they
    #       are students or employees.
    # TODO: birth months of fip children cannot be recovered (Alexis: clearly not).

    # Reassign noi for fip children when there is more than one per ident:
    # each row duplicating an earlier (noi, ident) pair gets its noi lowered
    # by one, until all pairs are distinct.
    # TODO: check that the dup vector is correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    dup = fip[['noi', 'ident']].duplicated()
    while dup.any():
        tmp = fip.loc[dup, 'noi']
        print(len(tmp))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
        dup = fip[['noi', 'ident']].duplicated()

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    print(fip.duplicated('noindiv').value_counts())
    save_temp(fip, name="fipDat", year=year)
    print('fip sauvegardé')
Exemple #59
0
import numpy as np
import pandas as pd  # needed for pd.Index / pd.concat used below
from pandas import Series, DataFrame

# Reshaping and pivoting
# The basic operations for rearranging tabular data, also known as
# reshape or pivot operations.

# Reshaping with hierarchical indexing
# stack   rotates the columns of the data into rows
# unstack rotates the rows of the data into columns
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)

# stack() rotates the columns into rows and yields a Series
result = data.stack()
print(result)

# A hierarchically indexed Series can be rearranged into a DataFrame with
# unstack(); the innermost level is unstacked by default
print(result.unstack())
# Another level can be unstacked by passing its number or its name
print(result.unstack(0))
print(result.unstack('state'))

# unstack() introduces missing values when not every level value is found
# in each group
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
print(data2.unstack())
Exemple #60
0
from bokeh.charts import Bar, output_file, show, vplot
from numpy.random import rand
from pandas import DataFrame

N = 10
# One column of N random values per series name.
data = DataFrame({name: rand(N) for name in ('A', 'B', 'C')})

# Stack columns A, B, C and convert the resulting multi-index to columns
sdata = data.stack().reset_index()
sdata.columns = ['labels', 'stack', 'values']

# Both charts are built from the same data with the same options.
_BAR_OPTIONS = dict(values='values', label='labels', stack='stack', legend='top_right')
bar = Bar(sdata, **_BAR_OPTIONS)
bar2 = Bar(sdata, **_BAR_OPTIONS)
bar2.x_range = bar.x_range  # Link the x axes

output_file("stacked_bar.html")
show(vplot(bar, bar2))