Example #1
    def test_drop_names(self):
        df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                       index=['a', 'b', 'c'],
                       columns=['d', 'e', 'f'])
        df.index.name, df.columns.name = 'first', 'second'
        df_dropped_b = df.drop('b')
        df_dropped_e = df.drop('e', axis=1)
        df_inplace_b, df_inplace_e = df.copy(), df.copy()
        df_inplace_b.drop('b', inplace=True)
        df_inplace_e.drop('e', axis=1, inplace=True)
        for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
            self.assertEqual(obj.index.name, 'first')
            self.assertEqual(obj.columns.name, 'second')
        self.assertEqual(list(df.columns), ['d', 'e', 'f'])

        self.assertRaises(ValueError, df.drop, ['g'])
        self.assertRaises(ValueError, df.drop, ['g'], 1)

        # errors = 'ignore'
        dropped = df.drop(['g'], errors='ignore')
        expected = Index(['a', 'b', 'c'], name='first')
        self.assert_index_equal(dropped.index, expected)

        dropped = df.drop(['b', 'g'], errors='ignore')
        expected = Index(['a', 'c'], name='first')
        self.assert_index_equal(dropped.index, expected)

        dropped = df.drop(['g'], axis=1, errors='ignore')
        expected = Index(['d', 'e', 'f'], name='second')
        self.assert_index_equal(dropped.columns, expected)

        dropped = df.drop(['d', 'g'], axis=1, errors='ignore')
        expected = Index(['e', 'f'], name='second')
        self.assert_index_equal(dropped.columns, expected)
Example #2
    def test_drop_names(self):
        df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=["a", "b", "c"], columns=["d", "e", "f"])
        df.index.name, df.columns.name = "first", "second"
        df_dropped_b = df.drop("b")
        df_dropped_e = df.drop("e", axis=1)
        df_inplace_b, df_inplace_e = df.copy(), df.copy()
        df_inplace_b.drop("b", inplace=True)
        df_inplace_e.drop("e", axis=1, inplace=True)
        for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
            self.assertEqual(obj.index.name, "first")
            self.assertEqual(obj.columns.name, "second")
        self.assertEqual(list(df.columns), ["d", "e", "f"])

        self.assertRaises(ValueError, df.drop, ["g"])
        self.assertRaises(ValueError, df.drop, ["g"], 1)

        # errors = 'ignore'
        dropped = df.drop(["g"], errors="ignore")
        expected = Index(["a", "b", "c"], name="first")
        self.assert_index_equal(dropped.index, expected)

        dropped = df.drop(["b", "g"], errors="ignore")
        expected = Index(["a", "c"], name="first")
        self.assert_index_equal(dropped.index, expected)

        dropped = df.drop(["g"], axis=1, errors="ignore")
        expected = Index(["d", "e", "f"], name="second")
        self.assert_index_equal(dropped.columns, expected)

        dropped = df.drop(["d", "g"], axis=1, errors="ignore")
        expected = Index(["e", "f"], name="second")
        self.assert_index_equal(dropped.columns, expected)
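
Note: the assertions above target an older pandas API; in recent releases, dropping a missing label raises KeyError rather than ValueError, while errors='ignore' still suppresses the error. A minimal sketch of the newer behavior (version-dependent):

    import pandas as pd

    df = pd.DataFrame({'d': [1, 2]}, index=['a', 'b'])
    try:
        df.drop(['g'])                   # missing row label
    except KeyError:                     # older versions raised ValueError
        pass
    unchanged = df.drop(['g'], errors='ignore')  # silently returns a copy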
Example #3
    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame([])
        df2 = DataFrame([])
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=['bar', 'foo'])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)
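
Note: DataFrame.append, exercised above, was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement. A minimal sketch, assuming a modern pandas:

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame(np.random.randn(5, 2))
    df2 = pd.DataFrame()
    result = pd.concat([df1, df2])  # equivalent of df1.append(df2)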
Example #4
    def test_assign(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        original = df.copy()
        result = df.assign(C=df.B / df.A)
        expected = df.copy()
        expected['C'] = [4, 2.5, 2]
        assert_frame_equal(result, expected)

        # lambda syntax
        result = df.assign(C=lambda x: x.B / x.A)
        assert_frame_equal(result, expected)

        # original is unmodified
        assert_frame_equal(df, original)

        # Non-Series array-like
        result = df.assign(C=[4, 2.5, 2])
        assert_frame_equal(result, expected)
        # original is unmodified
        assert_frame_equal(df, original)

        result = df.assign(B=df.B / df.A)
        expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
        assert_frame_equal(result, expected)

        # overwrite
        result = df.assign(A=df.A + df.B)
        expected = df.copy()
        expected['A'] = [5, 7, 9]
        assert_frame_equal(result, expected)

        # lambda
        result = df.assign(A=lambda x: x.A + x.B)
        assert_frame_equal(result, expected)
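
One capability the test does not cover: since pandas 0.23 (on Python 3.6+), later keyword arguments to assign may refer to columns created earlier in the same call. A minimal sketch:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})
    # 'C' refers to 'B', which is created earlier in the same assign() call.
    out = df.assign(B=df.A * 2, C=lambda x: x.B + 1)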
Example #5
class Equals(object):

    def setup(self):
        N = 10**3
        self.float_df = DataFrame(np.random.randn(N, N))
        self.float_df_nan = self.float_df.copy()
        self.float_df_nan.iloc[-1, -1] = np.nan

        self.object_df = DataFrame('foo', index=range(N), columns=range(N))
        self.object_df_nan = self.object_df.copy()
        self.object_df_nan.iloc[-1, -1] = np.nan

        self.nonunique_cols = self.object_df.copy()
        self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns)
        self.nonunique_cols_nan = self.nonunique_cols.copy()
        self.nonunique_cols_nan.iloc[-1, -1] = np.nan

    def time_frame_float_equal(self):
        self.float_df.equals(self.float_df)

    def time_frame_float_unequal(self):
        self.float_df.equals(self.float_df_nan)

    def time_frame_nonunique_equal(self):
        self.nonunique_cols.equals(self.nonunique_cols)

    def time_frame_nonunique_unequal(self):
        self.nonunique_cols.equals(self.nonunique_cols_nan)

    def time_frame_object_equal(self):
        self.object_df.equals(self.object_df)

    def time_frame_object_unequal(self):
        self.object_df.equals(self.object_df_nan)
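
For context on why the *_unequal benchmarks differ: DataFrame.equals treats NaNs at matching positions as equal, so only the injected bottom-right mismatch makes the comparison return False. A minimal sketch:

    import numpy as np
    import pandas as pd

    a = pd.DataFrame([[1.0, np.nan]])
    b = a.copy()
    assert a.equals(b)         # NaNs at the same location compare equal
    b.iloc[0, 0] = 2.0
    assert not a.equals(b)     # a genuinely different value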
Example #6
    def test_stack_mixed_levels(self):
        columns = MultiIndex.from_tuples(
            [("A", "cat", "long"), ("B", "cat", "long"), ("A", "dog", "short"), ("B", "dog", "short")],
            names=["exp", "animal", "hair_length"],
        )
        df = DataFrame(randn(4, 4), columns=columns)

        animal_hair_stacked = df.stack(level=["animal", "hair_length"])
        exp_hair_stacked = df.stack(level=["exp", "hair_length"])

        # GH #8584: Need to check that stacking works when a number
        # is passed that is both a level name and in the range of
        # the level numbers
        df2 = df.copy()
        df2.columns.names = ["exp", "animal", 1]
        assert_frame_equal(df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False)
        assert_frame_equal(df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False)

        # When mixed types are passed and the ints are not level
        # names, raise
        self.assertRaises(ValueError, df2.stack, level=["animal", 0])

        # GH #8584: Having 0 in the level names could raise a
        # strange error about lexsort depth
        df3 = df.copy()
        df3.columns.names = ["exp", "animal", 0]
        assert_frame_equal(df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False)
Example #7
    def test_dropna(self):
        df = DataFrame(np.random.randn(6, 4))
        df[2][:2] = np.nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, thresh=5, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, thresh=4, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=1, thresh=4)
        assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        assert_frame_equal(dropped, df)
        assert_frame_equal(inp, df)

        # all
        dropped = df.dropna(axis=1, how='all')
        assert_frame_equal(dropped, df)

        df[2] = np.nan
        dropped = df.dropna(axis=1, how='all')
        expected = df.loc[:, [0, 1, 3]]
        assert_frame_equal(dropped, expected)

        # bad input
        msg = ("No axis named 3 for object type"
               " <class 'pandas.core.frame.DataFrame'>")
        with pytest.raises(ValueError, match=msg):
            df.dropna(axis=3)
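
The thresh semantics exercised above are easy to misread: thresh=N keeps a row or column only if it has at least N non-NA values. A minimal sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1.0, np.nan], 'b': [1.0, 2.0]})
    kept = df.dropna(thresh=2)  # row 1 has only one non-NA value, so it is dropped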
Example #8
    def test_stack_mixed_levels(self):
        columns = MultiIndex.from_tuples(
            [('A', 'cat', 'long'), ('B', 'cat', 'long'),
             ('A', 'dog', 'short'), ('B', 'dog', 'short')],
            names=['exp', 'animal', 'hair_length']
        )
        df = DataFrame(randn(4, 4), columns=columns)

        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        # GH #8584: Need to check that stacking works when a number
        # is passed that is both a level name and in the range of
        # the level numbers
        df2 = df.copy()
        df2.columns.names = ['exp', 'animal', 1]
        assert_frame_equal(df2.stack(level=['animal', 1]),
                           animal_hair_stacked, check_names=False)
        assert_frame_equal(df2.stack(level=['exp', 1]),
                           exp_hair_stacked, check_names=False)

        # When mixed types are passed and the ints are not level
        # names, raise
        pytest.raises(ValueError, df2.stack, level=['animal', 0])

        # GH #8584: Having 0 in the level names could raise a
        # strange error about lexsort depth
        df3 = df.copy()
        df3.columns.names = ['exp', 'animal', 0]
        assert_frame_equal(df3.stack(level=['animal', 0]),
                           animal_hair_stacked, check_names=False)
Example #9
    def test_stack_int_level_names(self):
        columns = MultiIndex.from_tuples(
            [('A', 'cat', 'long'), ('B', 'cat', 'long'),
             ('A', 'dog', 'short'), ('B', 'dog', 'short')],
            names=['exp', 'animal', 'hair_length']
        )
        df = DataFrame(randn(4, 4), columns=columns)

        exp_animal_stacked = df.stack(level=['exp', 'animal'])
        animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
        exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

        df2 = df.copy()
        df2.columns.names = [0, 1, 2]
        assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
                           check_names=False)
        assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
                           check_names=False)
        assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
                           check_names=False)

        # Out-of-order int column names
        df3 = df.copy()
        df3.columns.names = [2, 0, 1]
        assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
                           check_names=False)
        assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
                           check_names=False)
        assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
                           check_names=False)
Example #10
    def test_interp_alt_scipy(self):
        tm._skip_if_no_scipy()
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        result = df.interpolate(method='barycentric')
        expected = df.copy()
        expected.loc[2, 'A'] = 3
        expected.loc[5, 'A'] = 6
        assert_frame_equal(result, expected)

        result = df.interpolate(method='barycentric', downcast='infer')
        assert_frame_equal(result, expected.astype(np.int64))

        result = df.interpolate(method='krogh')
        expectedk = df.copy()
        expectedk['A'] = expected['A']
        assert_frame_equal(result, expectedk)

        _skip_if_no_pchip()
        import scipy
        result = df.interpolate(method='pchip')
        expected.loc[2, 'A'] = 3

        if LooseVersion(scipy.__version__) >= '0.17.0':
            expected.loc[5, 'A'] = 6.0
        else:
            expected.loc[5, 'A'] = 6.125

        assert_frame_equal(result, expected)
Example #11
    def test_interp_alt_scipy(self):
        tm._skip_if_no_scipy()
        df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
                        'C': [1, 2, 3, 5, 8, 13, 21]})
        result = df.interpolate(method='barycentric')
        expected = df.copy()
        expected['A'].iloc[2] = 3
        expected['A'].iloc[5] = 6
        assert_frame_equal(result, expected)

        result = df.interpolate(method='barycentric', downcast='infer')
        assert_frame_equal(result, expected.astype(np.int64))

        result = df.interpolate(method='krogh')
        expectedk = df.copy()
        # expectedk['A'].iloc[2] = 3
        # expectedk['A'].iloc[5] = 6
        expectedk['A'] = expected['A']
        assert_frame_equal(result, expectedk)

        _skip_if_no_pchip()
        result = df.interpolate(method='pchip')
        expected['A'].iloc[2] = 3
        expected['A'].iloc[5] = 6.125
        assert_frame_equal(result, expected)
Example #12
class Append(object):

    goal_time = 0.2

    def setup(self):
        self.df1 = DataFrame(np.random.randn(10000, 4),
                             columns=['A', 'B', 'C', 'D'])
        self.df2 = self.df1.copy()
        self.df2.index = np.arange(10000, 20000)
        self.mdf1 = self.df1.copy()
        self.mdf1['obj1'] = 'bar'
        self.mdf1['obj2'] = 'bar'
        self.mdf1['int1'] = 5
        try:
            with warnings.catch_warnings(record=True):
                self.mdf1.consolidate(inplace=True)
        except Exception:
            pass
        self.mdf2 = self.mdf1.copy()
        self.mdf2.index = self.df2.index

    def time_append_homogenous(self):
        self.df1.append(self.df2)

    def time_append_mixed(self):
        self.mdf1.append(self.mdf2)
Example #13
    def test_frame_to_period(self):
        K = 5
        from pandas.tseries.period import period_range

        dr = date_range('1/1/2000', '1/1/2001')
        pr = period_range('1/1/2000', '1/1/2001')
        df = DataFrame(randn(len(dr), K), index=dr)
        df['mix'] = 'a'

        pts = df.to_period()
        exp = df.copy()
        exp.index = pr
        assert_frame_equal(pts, exp)

        pts = df.to_period('M')
        tm.assert_index_equal(pts.index, exp.index.asfreq('M'))

        df = df.T
        pts = df.to_period(axis=1)
        exp = df.copy()
        exp.columns = pr
        assert_frame_equal(pts, exp)

        pts = df.to_period('M', axis=1)
        tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))

        self.assertRaises(ValueError, df.to_period, axis=2)
Example #14
    def test_to_period(self):
        from pandas.tseries.period import period_range

        ts = _simple_ts('1/1/2000', '1/1/2001')

        pts = ts.to_period()
        exp = ts.copy()
        exp.index = period_range('1/1/2000', '1/1/2001')
        assert_series_equal(pts, exp)

        pts = ts.to_period('M')
        exp.index = exp.index.asfreq('M')
        tm.assert_index_equal(pts.index, exp.index.asfreq('M'))
        assert_series_equal(pts, exp)

        # GH 7606 without freq
        idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                             '2011-01-04'])
        exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                                  '2011-01-04'], freq='D')

        s = Series(np.random.randn(4), index=idx)
        expected = s.copy()
        expected.index = exp_idx
        assert_series_equal(s.to_period(), expected)

        df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx)
        expected = df.copy()
        expected.index = exp_idx
        assert_frame_equal(df.to_period(), expected)

        expected = df.copy()
        expected.columns = exp_idx
        assert_frame_equal(df.to_period(axis=1), expected)
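
For readers unfamiliar with the API under test: to_period converts a DatetimeIndex to a PeriodIndex, inferring the frequency from the index when one is available. A minimal sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(3), index=pd.date_range('2000-01-01', periods=3))
    daily = s.to_period()       # PeriodIndex with freq 'D', taken from the index
    monthly = s.to_period('M')  # coarser, explicit monthly periods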
Example #15
    def test_copy(self):
        """Check inplace/copy behavior of link_df, link_df_iter"""
        # One 1D stepper
        N = 5
        f = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)})
        f_inplace = f.copy()
        expected = f.copy()
        expected['particle'] = np.zeros(N)

        # Should add particle column in-place
        # UNLESS diagnostics are enabled (or input dataframe is not writeable)
        actual = self.link_df(f_inplace, 5)
        assert_frame_equal(actual, expected)
        if self.do_diagnostics:
            assert 'particle' not in f_inplace.columns
        else:
            assert_frame_equal(actual, f_inplace)

        # When DataFrame is actually a view, link_df should produce a warning
        # and then copy the DataFrame. This only happens for pandas >= 0.16.
        if is_pandas_recent:
            with assert_produces_warning(UserWarning):
                actual = self.link_df(f[f['frame'] > 0], 5)
            assert 'particle' not in f.columns

        # Should copy
        actual = self.link_df(f, 5, copy_features=True)
        assert_frame_equal(actual, expected)
        assert 'particle' not in f.columns

        # Should copy
        actual_iter = self.link_df_iter(f, 5, hash_size=(10, 2))
        assert_frame_equal(actual_iter, expected)
        assert 'particle' not in f.columns
Example #16
    def test_copy(self):
        """Check inplace/copy behavior of link_df, link_df_iter"""
        # One 1D stepper
        N = 5
        f = DataFrame({'x': np.arange(N), 'y': np.ones(N), 'frame': np.arange(N)})
        f_inplace = f.copy()
        expected = f.copy()
        expected['particle'] = np.zeros(N)

        # Should add particle column in-place
        # UNLESS diagnostics are enabled (or input dataframe is not writeable)
        actual = self.link_df(f_inplace, 5)
        assert_traj_equal(actual, expected)
        if self.do_diagnostics:
            assert 'particle' not in f_inplace.columns
        else:
            assert_traj_equal(actual, f_inplace)

        # Should copy
        actual = self.link_df(f, 5, copy_features=True)
        assert_traj_equal(actual, expected)
        assert 'particle' not in f.columns

        # Should copy
        actual_iter = self.link_df_iter(f, 5, hash_size=(10, 2))
        assert_traj_equal(actual_iter, expected)
        assert 'particle' not in f.columns
Example #17
    def test_frame_to_period(self):
        K = 5

        dr = date_range('1/1/2000', '1/1/2001')
        pr = period_range('1/1/2000', '1/1/2001')
        df = DataFrame(np.random.randn(len(dr), K), index=dr)
        df['mix'] = 'a'

        pts = df.to_period()
        exp = df.copy()
        exp.index = pr
        assert_frame_equal(pts, exp)

        pts = df.to_period('M')
        tm.assert_index_equal(pts.index, exp.index.asfreq('M'))

        df = df.T
        pts = df.to_period(axis=1)
        exp = df.copy()
        exp.columns = pr
        assert_frame_equal(pts, exp)

        pts = df.to_period('M', axis=1)
        tm.assert_index_equal(pts.columns, exp.columns.asfreq('M'))

        msg = ("No axis named 2 for object type"
               " <class 'pandas.core.frame.DataFrame'>")
        with pytest.raises(ValueError, match=msg):
            df.to_period(axis=2)
Example #18
    def test_setitem_with_datetime_tz(self):
        # 16889
        # support .loc with alignment and tz-aware DatetimeIndex
        mask = np.array([True, False, True, False])

        idx = date_range('20010101', periods=4, tz='UTC')
        df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64')

        result = df.copy()
        result.loc[mask, :] = df.loc[mask, :]
        tm.assert_frame_equal(result, df)

        result = df.copy()
        result.loc[mask] = df.loc[mask]
        tm.assert_frame_equal(result, df)

        idx = date_range('20010101', periods=4)
        df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64')

        result = df.copy()
        result.loc[mask, :] = df.loc[mask, :]
        tm.assert_frame_equal(result, df)

        result = df.copy()
        result.loc[mask] = df.loc[mask]
        tm.assert_frame_equal(result, df)
Example #19
    def test_frame_setitem_multi_column(self):
        df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'],
                                              [0, 1, 0, 1]])

        cp = df.copy()
        cp['a'] = cp['b']
        tm.assert_frame_equal(cp['a'], cp['b'])

        # set with ndarray
        cp = df.copy()
        cp['a'] = cp['b'].values
        tm.assert_frame_equal(cp['a'], cp['b'])

        # ---------------------------------------
        # #1803
        columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')])
        df = DataFrame(index=[1, 3, 5], columns=columns)

        # Works, but adds a column instead of updating the two existing ones
        df['A'] = 0.0  # Doesn't work
        assert (df['A'].values == 0).all()

        # it broadcasts
        df['B', '1'] = [1, 2, 3]
        df['A'] = df['B', '1']

        sliced_a1 = df['A', '1']
        sliced_a2 = df['A', '2']
        sliced_b1 = df['B', '1']
        tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False)
        tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False)
        assert sliced_a1.name == ('A', '1')
        assert sliced_a2.name == ('A', '2')
        assert sliced_b1.name == ('B', '1')
Example #20
    def test_dropna(self):
        df = DataFrame(np.random.randn(6, 4))
        df[2][:2] = nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, thresh=5, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, thresh=4, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=1, thresh=4)
        assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        assert_frame_equal(dropped, df)
        assert_frame_equal(inp, df)

        # all
        dropped = df.dropna(axis=1, how='all')
        assert_frame_equal(dropped, df)

        df[2] = nan
        dropped = df.dropna(axis=1, how='all')
        expected = df.loc[:, [0, 1, 3]]
        assert_frame_equal(dropped, expected)

        # bad input
        pytest.raises(ValueError, df.dropna, axis=3)
Example #21
    def test_inplace_ops_identity(self):

        # GH 5104
        # make sure that we are actually changing the object
        s_orig = Series([1, 2, 3])
        df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

        # no dtype change
        s = s_orig.copy()
        s2 = s
        s += 1
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1, s)
        self.assertIs(s, s2)
        self.assertIs(s._data, s2._data)

        df = df_orig.copy()
        df2 = df
        df += 1
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1, df)
        self.assertIs(df, df2)
        self.assertIs(df._data, df2._data)

        # dtype change
        s = s_orig.copy()
        s2 = s
        s += 1.5
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1.5, s)

        df = df_orig.copy()
        df2 = df
        df += 1.5
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1.5, df)
        self.assertIs(df, df2)
        self.assertIs(df._data, df2._data)

        # mixed dtype
        arr = np.random.randint(0, 10, size=5)
        df_orig = DataFrame({"A": arr.copy(), "B": "foo"})
        df = df_orig.copy()
        df2 = df
        df["A"] += 1
        expected = DataFrame({"A": arr.copy() + 1, "B": "foo"})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        self.assertIs(df._data, df2._data)

        df = df_orig.copy()
        df2 = df
        df["A"] += 1.5
        expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        self.assertIs(df._data, df2._data)
Example #22
    def test_inplace_ops_identity(self):

        # GH 5104
        # make sure that we are actually changing the object
        s_orig = Series([1, 2, 3])
        df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

        # no dtype change
        s = s_orig.copy()
        s2 = s
        s += 1
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1, s)
        assert s is s2
        assert s._data is s2._data

        df = df_orig.copy()
        df2 = df
        df += 1
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1, df)
        assert df is df2
        assert df._data is df2._data

        # dtype change
        s = s_orig.copy()
        s2 = s
        s += 1.5
        assert_series_equal(s, s2)
        assert_series_equal(s_orig + 1.5, s)

        df = df_orig.copy()
        df2 = df
        df += 1.5
        assert_frame_equal(df, df2)
        assert_frame_equal(df_orig + 1.5, df)
        assert df is df2
        assert df._data is df2._data

        # mixed dtype
        arr = np.random.randint(0, 10, size=5)
        df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'})
        df = df_orig.copy()
        df2 = df
        df['A'] += 1
        expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data

        df = df_orig.copy()
        df2 = df
        df['A'] += 1.5
        expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'})
        assert_frame_equal(df, expected)
        assert_frame_equal(df2, expected)
        assert df._data is df2._data
Example #23
    def test_regex_replace_dict_mixed(self):
        mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}
        dfmix = DataFrame(mix)

        # dicts
        # single dict {re1: v1}, search the whole frame
        # need test for this...

        # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
        # frame
        res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan},
                     inplace=True, regex=True)
        expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', np.nan, np.nan], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
        # whole frame
        res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True)
        res2 = dfmix.copy()
        res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True,
                     regex=True)
        expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'})
        res2 = dfmix.copy()
        res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'},
                     inplace=True)
        expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        # scalar -> dict
        # to_replace regex, {value: value}
        expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'], 'c':
                           mix['c']})
        res = dfmix.replace('a', {'b': np.nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace('a', {'b': np.nan}, regex=True, inplace=True)
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)

        res = dfmix.replace('a', {'b': np.nan}, regex=True)
        res2 = dfmix.copy()
        res2.replace(regex='a', value={'b': np.nan}, inplace=True)
        expec = DataFrame({'a': mix['a'], 'b': [np.nan, 'b', '.', '.'], 'c':
                           mix['c']})
        assert_frame_equal(res, expec)
        assert_frame_equal(res2, expec)
Example #24
    def test_interp_inplace(self):
        df = DataFrame({'a': [1., 2., np.nan, 4.]})
        expected = DataFrame({'a': [1., 2., 3., 4.]})
        result = df.copy()
        result['a'].interpolate(inplace=True)
        assert_frame_equal(result, expected)

        result = df.copy()
        result['a'].interpolate(inplace=True, downcast='infer')
        assert_frame_equal(result, expected.astype('int64'))
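
Note: result['a'].interpolate(inplace=True) is chained assignment; it mutates the parent frame on the classic block manager, but with copy-on-write enabled (the default in pandas 3.0) the inplace call would only touch an intermediate object, so this pattern is version-sensitive.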
Example #25
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame({'A': date_range('20130101', periods=3,
                                        tz='US/Eastern'),
                        'B': [0, np.nan, 2]})
        result = df.replace(np.nan, 1)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': Series([0, 1, 2], dtype='float64')})
        assert_frame_equal(result, expected)

        result = df.fillna(1)
        assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': [np.nan, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                            Timestamp('20130104', tz='US/Eastern'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Eastern'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
        assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Pacific'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': np.nan}, Timestamp('20130104'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)
Example #26
    def test_replace_input_formats(self):
        # both dicts
        to_rep = {'A': np.nan, 'B': 0, 'C': ''}
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(to_rep, values)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(to_rep[k], values[k])
        assert_frame_equal(filled, DataFrame(expected))

        result = df.replace([0, 2, 5], [5, 2, 0])
        expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0],
                              'C': ['', 'asdf', 'fd']})
        assert_frame_equal(result, expected)

        # dict to scalar
        filled = df.replace(to_rep, 0)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(to_rep[k], 0)
        assert_frame_equal(filled, DataFrame(expected))

        self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, ''])

        # scalar to dict
        values = {'A': 0, 'B': -1, 'C': 'missing'}
        df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5],
                        'C': ['', 'asdf', 'fd']})
        filled = df.replace(np.nan, values)
        expected = {}
        for k, v in compat.iteritems(df):
            expected[k] = v.replace(np.nan, values[k])
        assert_frame_equal(filled, DataFrame(expected))

        # list to list
        to_rep = [np.nan, 0, '']
        values = [-2, -1, 'missing']
        result = df.replace(to_rep, values)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], values[i], inplace=True)
        assert_frame_equal(result, expected)

        self.assertRaises(ValueError, df.replace, to_rep, values[1:])

        # list to scalar
        to_rep = [np.nan, 0, '']
        result = df.replace(to_rep, -1)
        expected = df.copy()
        for i in range(len(to_rep)):
            expected.replace(to_rep[i], -1, inplace=True)
        assert_frame_equal(result, expected)
Example #27
    def test_multi_assign(self):

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
                        'PF': [0, 0, 0, 0, 1, 1],
                        'col1': list(range(6)),
                        'col2': list(range(6, 12)),
                        })
        df.iloc[1, 0] = np.nan
        df2 = df.copy()

        mask = ~df2.FC.isna()
        cols = ['col1', 'col2']

        dft = df2 * 2
        dft.iloc[3, 3] = np.nan

        expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
                              'PF': [0, 0, 0, 0, 1, 1],
                              'col1': Series([0, 1, 4, 6, 8, 10]),
                              'col2': [12, 7, 16, np.nan, 20, 22]})

        # frame on rhs
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # with an ndarray on rhs
        # coerces to float64 because the values array has float64 dtype
        # GH 14001
        expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
                              'PF': [0, 0, 0, 0, 1, 1],
                              'col1': [0., 1., 4., 6., 8., 10.],
                              'col2': [12, 7, 16, np.nan, 20, 22]})
        df2 = df.copy()
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)

        # broadcasting on the rhs is required
        df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11],
                            C=[0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7]))

        expected = df.copy()
        mask = expected['A'] == 0
        for col in ['A', 'B']:
            expected.loc[mask, col] = df['D']

        df.loc[df['A'] == 0, ['A', 'B']] = df['D']
        tm.assert_frame_equal(df, expected)
Example #28
    def test_ix_loc_setitem_consistency(self):

        # GH 5771
        # loc with slice and series
        s = Series(0, index=[4, 5, 6])
        s.loc[4:5] += 1
        expected = Series([1, 1, 0], index=[4, 5, 6])
        tm.assert_series_equal(s, expected)

        # GH 5928
        # chained indexing assignment
        df = DataFrame({'a': [0, 1, 2]})
        expected = df.copy()
        with catch_warnings(record=True):
            expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a']

        with catch_warnings(record=True):
            df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]]
        tm.assert_frame_equal(df, expected)

        df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]})
        with catch_warnings(record=True):
            df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype(
                'float64') + 0.5
        expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]})
        tm.assert_frame_equal(df, expected)

        # GH 8607
        # ix setitem consistency
        df = DataFrame({'delta': [1174, 904, 161],
                        'elapsed': [7673, 9277, 1470],
                        'timestamp': [1413840976, 1413842580, 1413760580]})
        expected = DataFrame({'delta': [1174, 904, 161],
                              'elapsed': [7673, 9277, 1470],
                              'timestamp': pd.to_datetime(
                                  [1413840976, 1413842580, 1413760580],
                                  unit='s')
                              })

        df2 = df.copy()
        df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
        tm.assert_frame_equal(df2, expected)

        df2 = df.copy()
        df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
        tm.assert_frame_equal(df2, expected)

        df2 = df.copy()
        with catch_warnings(record=True):
            df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s')
        tm.assert_frame_equal(df2, expected)
Example #29
 def test_regex_replace_regex_list_to_numeric(self):
     mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
     df = DataFrame(mix)
     res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True)
     res2 = df.copy()
     res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True)
     res3 = df.copy()
     res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True)
     expec = DataFrame({'a': mix['a'], 'b': ['a', 0, 0, 0], 'c': ['a', 0,
                                                                  nan,
                                                                  'd']})
     assert_frame_equal(res, expec)
     assert_frame_equal(res2, expec)
     assert_frame_equal(res3, expec)
Example #30
 def test_regex_replace_str_to_numeric(self):
     # what happens when you try to replace a numeric value with a regex?
     mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']}
     df = DataFrame(mix)
     res = df.replace(r'\s*\.\s*', 0, regex=True)
     res2 = df.copy()
     res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True)
     res3 = df.copy()
     res3.replace(regex=r'\s*\.\s*', value=0, inplace=True)
     expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', 0, 0], 'c':
                        mix['c']})
     assert_frame_equal(res, expec)
     assert_frame_equal(res2, expec)
     assert_frame_equal(res3, expec)
Example #31
    def test_combine_first_mixed_bug(self):
        idx = Index(['a', 'b', 'c', 'e'])
        ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
        ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3})

        idx = Index(['a', 'b', 'c', 'f'])
        ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
        ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3})

        combined = frame1.combine_first(frame2)
        assert len(combined.columns) == 5

        # gh 3016 (same as in update)
        df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                       columns=['A', 'B', 'bool1', 'bool2'])

        other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
        result = df.combine_first(other)
        assert_frame_equal(result, df)

        df.loc[0, 'A'] = np.nan
        result = df.combine_first(other)
        df.loc[0, 'A'] = 45
        assert_frame_equal(result, df)

        # doc example
        df1 = DataFrame({
            'A': [1., np.nan, 3., 5., np.nan],
            'B': [np.nan, 2., 3., np.nan, 6.]
        })

        df2 = DataFrame({
            'A': [5., 2., 4., np.nan, 3., 7.],
            'B': [np.nan, np.nan, 3., 4., 6., 8.]
        })

        result = df1.combine_first(df2)
        expected = DataFrame({
            'A': [1, 2, 3, 5, 3, 7.],
            'B': [np.nan, 2, 3, 4, 6, 8]
        })
        assert_frame_equal(result, expected)

        # GH3552, return object dtype with bools
        df1 = DataFrame([[np.nan, 3., True], [-4.6, np.nan, True],
                         [np.nan, 7., False]])
        df2 = DataFrame([[-42.6, np.nan, True], [-5., 1.6, False]],
                        index=[1, 2])

        result = df1.combine_first(df2)[2]
        expected = Series([True, True, False], name=2)
        assert_series_equal(result, expected)

        # GH 3593, converting datetime64[ns] incorrectly
        df0 = DataFrame({
            "a":
            [datetime(2000, 1, 1),
             datetime(2000, 1, 2),
             datetime(2000, 1, 3)]
        })
        df1 = DataFrame({"a": [None, None, None]})
        df2 = df1.combine_first(df0)
        assert_frame_equal(df2, df0)

        df2 = df0.combine_first(df1)
        assert_frame_equal(df2, df0)

        df0 = DataFrame({
            "a":
            [datetime(2000, 1, 1),
             datetime(2000, 1, 2),
             datetime(2000, 1, 3)]
        })
        df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
        df2 = df1.combine_first(df0)
        result = df0.copy()
        result.iloc[0, :] = df1.iloc[0, :]
        assert_frame_equal(df2, result)

        df2 = df0.combine_first(df1)
        assert_frame_equal(df2, df0)
Example #32
    def test_partial_setting(self):

        # GH2578, allow ix and friends to partially set

        # series
        s_orig = Series([1, 2, 3])

        s = s_orig.copy()
        s[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5
        expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        s = s_orig.copy()
        s.loc[5] = 5.
        expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
        tm.assert_series_equal(s, expected)

        # iloc/iat raise
        s = s_orig.copy()

        def f():
            s.iloc[3] = 5.

        self.assertRaises(IndexError, f)

        def f():
            s.iat[3] = 5.

        self.assertRaises(IndexError, f)

        # ## frame ##

        df_orig = DataFrame(np.arange(6).reshape(3, 2),
                            columns=['A', 'B'],
                            dtype='int64')

        # iloc/iat raise
        df = df_orig.copy()

        def f():
            df.iloc[4, 2] = 5.

        self.assertRaises(IndexError, f)

        def f():
            df.iat[4, 2] = 5.

        self.assertRaises(IndexError, f)

        # row setting where it exists
        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.iloc[1] = df.iloc[2]
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
        df = df_orig.copy()
        df.loc[1] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # like 2578, partial setting with dtype preservation
        expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
        df = df_orig.copy()
        df.loc[3] = df.loc[2]
        tm.assert_frame_equal(df, expected)

        # single dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed dtype frame, overwrite
        expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
        df = df_orig.copy()
        df['B'] = df['B'].astype(np.float64)
        with catch_warnings(record=True):
            df.ix[:, 'B'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # single dtype frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        # mixed frame, partial setting
        expected = df_orig.copy()
        expected['C'] = df['A']
        df = df_orig.copy()
        with catch_warnings(record=True):
            df.ix[:, 'C'] = df.ix[:, 'A']
        tm.assert_frame_equal(df, expected)

        with catch_warnings(record=True):
            # ## panel ##
            p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                           items=['Item1', 'Item2'],
                           major_axis=pd.date_range('2001/1/12', periods=4),
                           minor_axis=['A', 'B'],
                           dtype='float64')

            # panel setting via item
            p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                           items=['Item1', 'Item2'],
                           major_axis=pd.date_range('2001/1/12', periods=4),
                           minor_axis=['A', 'B'],
                           dtype='float64')
            expected = p_orig.copy()
            expected['Item3'] = expected['Item1']
            p = p_orig.copy()
            p.loc['Item3'] = p['Item1']
            tm.assert_panel_equal(p, expected)

            # panel with aligned series
            expected = p_orig.copy()
            expected = expected.transpose(2, 1, 0)
            expected['C'] = DataFrame(
                {
                    'Item1': [30, 30, 30, 30],
                    'Item2': [32, 32, 32, 32]
                },
                index=p_orig.major_axis)
            expected = expected.transpose(2, 1, 0)
            p = p_orig.copy()
            p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
            tm.assert_panel_equal(p, expected)

        # GH 8473
        dates = date_range('1/1/2000', periods=8)
        df_orig = DataFrame(np.random.randn(8, 4),
                            index=dates,
                            columns=['A', 'B', 'C', 'D'])

        expected = pd.concat(
            [df_orig, DataFrame({'A': 7}, index=[dates[-1] + 1])])
        df = df_orig.copy()
        df.loc[dates[-1] + 1, 'A'] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + 1, 'A'] = 7
        tm.assert_frame_equal(df, expected)

        exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
        expected = pd.concat([df_orig, exp_other], axis=1)

        df = df_orig.copy()
        df.loc[dates[-1] + 1, 0] = 7
        tm.assert_frame_equal(df, expected)
        df = df_orig.copy()
        df.at[dates[-1] + 1, 0] = 7
        tm.assert_frame_equal(df, expected)
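
Note: the Panel branches above are historical; Panel was deprecated in pandas 0.20 and removed in 1.0, so that section only runs on old versions (hence the catch_warnings wrapper).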
Example #33
def write_plink_or_bolt_file(input_df: pd.DataFrame,
                             path_or_buf: Union[Optional[str], os.PathLike,
                                                io.IOBase],
                             binary_column_mapping: Dict[str,
                                                         Dict[int,
                                                              Union[int,
                                                                    float]]],
                             missing_value: Union[str, int, float],
                             cast_ints: bool = True) -> Optional[str]:
    """Writes a PLINK/BOLT formatted file of `input_df` to `path`.

  This is the complementary function to `load_plink_or_bolt_file`. In
  particular, the `binary_column_mapping` input is expected to be created by
  the loading function to ensure that non-DeepNull-predicted columns retain the
  same values as in the input data.

  Args:
    input_df: The DataFrame to write to TSV.
    path_or_buf: The path to write the TSV to.
    binary_column_mapping: The mapping from binary column name to the mapping of
      the binary representation of that column in `input_df` to the original
      binary representation of the data.
    missing_value: The missing value to use when writing out. Typically 'NA' for
      BOLT or Regenie, and possibly -9 for PLINK.
    cast_ints: If True, any fields that contain only integer values are written
      as integers.

  Returns:
    The result as a string if `path_or_buf` is None, otherwise None.
  """
    # Sanity check.
    if list(input_df.columns[:2]) != ['FID', 'IID']:
        raise ValueError('"FID" and "IID" required to start PLINK/BOLT file: '
                         f'{input_df.columns}')

    # Make a copy since we mutate, then transform binary fields to their original
    # representation.
    df = input_df.copy()
    for column, mapping in binary_column_mapping.items():
        df[column] = df[column].replace(mapping)

    if cast_ints:
        for column in df.columns:
            values = df[column]
            mask = ~values.isnull()
            try:
                int_values = values[mask].astype(int)
            except ValueError:
                # This is a non-numeric field, leave it as-is.
                continue
            else:
                if (values[mask] == int_values).all():
                    # All non-null values are integers. Convert to the 'Int64' type that
                    # allows nullable integers. This requires nulls to use the pd.NA value
                    # rather than np.nan.
                    df[column] = values.fillna(pd.NA).astype('Int64')

    return df.to_csv(path_or_buf,
                     sep='\t',
                     index=False,
                     na_rep=str(missing_value))
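
A minimal usage sketch of the function above (hypothetical toy data; per the docstring, passing path_or_buf=None returns the TSV as a string):

    import pandas as pd

    df = pd.DataFrame({'FID': ['f1', 'f2'], 'IID': ['i1', 'i2'],
                       'pheno': [0.0, 1.0]})
    # Hypothetical mapping: restore PLINK's 1/2 case-control coding.
    mapping = {'pheno': {0: 1, 1: 2}}
    tsv = write_plink_or_bolt_file(df, path_or_buf=None,
                                   binary_column_mapping=mapping,
                                   missing_value='NA')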
Example #34
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the transforms to the dataframe."""

        X = X.copy()

        # We could do some NLP here to get sentiment scores on the captions, for example

        def __image_size_check(image_loc) -> float:

            image = mpimg.imread(image_loc)

            try:
                (h, w, n) = image.shape
            except ValueError:
                # Grayscale images have a 2-D shape that cannot be unpacked.
                return np.nan
            if w < 800:
                return np.nan
            else:
                return (h, w, n)

        def __difference_from_mean_likes_per_follower(row) -> float:

            name = row['credits']
            ndays = (self.now - row['postdate']).days

            if ndays < 1:
                ndays = 1

            mean_lpf = self.summary.loc[self.summary['credits'] == name,
                                        'nlikes_per_follower'].values[0]
            return (row['nlikes_per_follower'] - mean_lpf) / (mean_lpf * ndays)

        def __difference_from_mean_comments_per_follower(row) -> float:

            name = row['credits']
            ndays = (self.now - row['postdate']).days

            if ndays < 1:
                ndays = 1

            mean_cpf = self.summary.loc[self.summary['credits'] == name,
                                        'ncomments_per_follower'].values[0]
            return (row['ncomments_per_follower'] - mean_cpf) / (mean_cpf * ndays)

        def __categorize_parks(parkid) -> int:

            return self.names_dir[parkid]

        def __post_rank(row,
                        likes_weight=config.LIKES_WEIGHT,
                        comments_weight=config.COMMENT_WEIGHT) -> float:

            return row['mean_nlikes_diff'] * likes_weight + row[
                'mean_ncomments_diff'] * comments_weight

        #def __previously_posted(floc):

        #    if floc in self.previous_posts:
        #        return np.nan
        #    else:
        #        return 1

        X['postdate'] = pd.to_datetime(X['postdate'])
        X['mean_nlikes_diff'] = X.apply(
            lambda row: __difference_from_mean_likes_per_follower(row), axis=1)
        X['mean_ncomments_diff'] = X.apply(
            lambda row: __difference_from_mean_comments_per_follower(row),
            axis=1)
        X['park_id'] = X['credits'].apply(__categorize_parks)
        X['rank'] = X.apply(lambda row: __post_rank(row), axis=1)
        X['image_size'] = X['Flocation'].apply(__image_size_check)

        return X
Example #35
    def test_inplace_return_self(self):
        # re #1893

        data = DataFrame({
            'a': ['foo', 'bar', 'baz', 'qux'],
            'b': [0, 0, 1, 1],
            'c': [1, 2, 3, 4]
        })

        def _check_f(base, f):
            result = f(base)
            assert result is None

        # -----DataFrame-----

        # set_index
        f = lambda x: x.set_index('a', inplace=True)
        _check_f(data.copy(), f)

        # reset_index
        f = lambda x: x.reset_index(inplace=True)
        _check_f(data.set_index('a'), f)

        # drop_duplicates
        f = lambda x: x.drop_duplicates(inplace=True)
        _check_f(data.copy(), f)

        # sort
        f = lambda x: x.sort_values('b', inplace=True)
        _check_f(data.copy(), f)

        # sort_index
        f = lambda x: x.sort_index(inplace=True)
        _check_f(data.copy(), f)

        # fillna
        f = lambda x: x.fillna(0, inplace=True)
        _check_f(data.copy(), f)

        # replace
        f = lambda x: x.replace(1, 0, inplace=True)
        _check_f(data.copy(), f)

        # rename
        f = lambda x: x.rename({1: 'foo'}, inplace=True)
        _check_f(data.copy(), f)

        # -----Series-----
        d = data.copy()['c']

        # reset_index
        f = lambda x: x.reset_index(inplace=True, drop=True)
        _check_f(data.set_index('a')['c'], f)

        # fillna
        f = lambda x: x.fillna(0, inplace=True)
        _check_f(d.copy(), f)

        # replace
        f = lambda x: x.replace(1, 0, inplace=True)
        _check_f(d.copy(), f)

        # rename
        f = lambda x: x.rename({1: 'foo'}, inplace=True)
        _check_f(d.copy(), f)
Example #36
 def _add_prefixes(self, causality_df: pd.DataFrame) -> pd.DataFrame:
     causes_causality_df = causality_df.copy()
     causes_causality_df[self.child_id_col] = causes_causality_df[self.child_id_col].apply(lambda x: 'causes_' + x)
     causedby_causality_df = causality_df.copy()
     causedby_causality_df[self.parent_id_col] = causedby_causality_df[self.parent_id_col].apply(lambda x: 'causedby_' + x)
     return pd.concat([causes_causality_df, causedby_causality_df]).reset_index(drop=True)
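
A toy illustration of the transformation, assuming hypothetical column names parent_id_col='parent' and child_id_col='child':

    import pandas as pd

    causality_df = pd.DataFrame({'parent': ['p1'], 'child': ['c1']})
    causes = causality_df.copy()
    causes['child'] = causes['child'].apply(lambda x: 'causes_' + x)
    causedby = causality_df.copy()
    causedby['parent'] = causedby['parent'].apply(lambda x: 'causedby_' + x)
    out = pd.concat([causes, causedby]).reset_index(drop=True)
    #         parent      child
    # 0           p1  causes_c1
    # 1  causedby_p1         c1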
Example No. 37
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and \
    eventually drops them based on their correlation with other features and the \
    target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is exceeded, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (data_local[cols_mv].applymap(
        lambda x: 1 if not pd.isnull(x) else x).fillna(0))

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target,
                         colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[
            abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
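
# A hedged usage sketch for mv_col_handling, assuming its helpers
# (_missing_vals, corr_mat, _validate_input_range) are available in scope.
# Column 'a' is 60% missing, exceeding mv_threshold=0.5, so it enters the
# correlation analysis; the other columns are untouched.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [1.0, np.nan, np.nan, np.nan, 5.0],
    'b': [1.0, 2.0, 3.0, 4.0, 5.0],
    'label': [0, 1, 0, 1, 0],
})
cleaned, cols_mv, drop_cols = mv_col_handling(
    df, target='label', mv_threshold=0.5, return_details=True)
print(cols_mv, drop_cols)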
Example No. 38
def fill_nan_mean(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    df = df.copy()
    for column in columns:
        df[column] = df[column].fillna(df[column].mean())
    return df
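
# Quick demonstration of fill_nan_mean on a toy frame: the NaN in 'x' is
# replaced by the column mean (2.0) and the caller's frame is left unchanged.
import numpy as np
import pandas as pd

raw = pd.DataFrame({'x': [1.0, np.nan, 3.0]})
print(fill_nan_mean(raw, ['x']))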
Example No. 39
def _filter_feasible_rows(
    df: pd.DataFrame,
    optimization_config: OptimizationConfig,
    status_quo: Optional[Arm],
) -> pd.DataFrame:
    """Filter out arms that do not satisfy outcome constraints

    Looks at all arm data collected and removes rows corresponding to arms in
    which one or more of their associated metrics' 95% confidence interval
    falls outside of any outcome constraint's bounds (i.e. we are 95% sure the
    bound is not satisfied).
    """
    if len(optimization_config.outcome_constraints) < 1:
        return df

    name = df["metric_name"]

    # When SEM is NaN we should treat it as if it were 0
    sems = not_none(df["sem"].fillna(0))

    # Bounds computed for 95% confidence interval on Normal distribution
    lower_bound = df["mean"] - sems * 1.96
    upper_bound = df["mean"] + sems * 1.96

    # Only compute relativization if some constraints are relative
    rel_df = None
    rel_lower_bound = None
    rel_upper_bound = None
    if status_quo is not None and any(
            oc.relative for oc in optimization_config.outcome_constraints):
        # relativize_data expects all arms to come from the same trial, we need to
        # format the data as if it was.
        to_relativize = df.copy()
        to_relativize["trial_index"] = 0

        rel_df = relativize_data(data=Data(to_relativize),
                                 status_quo_name=status_quo.name).df.append(
                                     {
                                         "arm_name": "status_quo",
                                         "metric_name": status_quo.name,
                                         "mean": 0,
                                         "sem": 0,
                                     },
                                     ignore_index=True,
                                 )
        rel_sems = not_none(rel_df["sem"].fillna(0))
        rel_lower_bound = rel_df["mean"] - rel_sems * 1.96
        rel_upper_bound = rel_df["mean"] + rel_sems * 1.96

    # Nested function from OC -> Mask for consumption in later map/reduce from
    # [OC] -> Mask. Constraint relativity is handled inside so long as relative bounds
    # are set in surrounding closure (which will occur in proper experiment setup).
    def oc_mask(oc: OutcomeConstraint) -> pd.Series:
        name_match_mask = name == oc.metric.name

        if oc.relative:
            if rel_lower_bound is None or rel_upper_bound is None:
                logger.warning(
                    f"No status quo provided; relative constraint {oc} ignored."
                )
                return pd.Series(True, index=df.index)

            observed_lower_bound = rel_lower_bound
            observed_upper_bound = rel_upper_bound
        else:
            observed_lower_bound = lower_bound
            observed_upper_bound = upper_bound

        # Keep a row (True) if its metric doesn't match this constraint, or if
        # the confidence interval is not entirely outside the bound. The
        # comparison must be parenthesized: `|` binds tighter than `>` / `<`.
        if oc.op == ComparisonOp.GEQ:
            return ~name_match_mask | (observed_upper_bound > oc.bound)
        else:
            return ~name_match_mask | (observed_lower_bound < oc.bound)

    mask = reduce(
        lambda left, right: left & right,
        map(oc_mask, optimization_config.outcome_constraints),
    )
    bad_arm_names = (df[~mask]["arm_name"].tolist()
                     if rel_df is None else rel_df[~mask]["arm_name"].tolist())
    feasible = df.loc[df["arm_name"].apply(lambda x: x not in bad_arm_names)]

    if feasible.empty:
        raise ValueError(
            "No points satisfied all outcome constraints within 95 percent" +
            "confidence interval")

    return feasible
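
# The feasibility check above reduces to a per-row interval test. A worked
# example with assumed numbers: mean = 1.0, sem = 0.2, and a GEQ constraint
# with bound = 1.5.
mean, sem, bound = 1.0, 0.2, 1.5
upper = mean + 1.96 * sem  # 1.392, top of the 95% confidence interval
# For a GEQ constraint a row is kept only if the interval's upper bound
# exceeds the constraint bound; 1.392 <= 1.5, so this row would be dropped.
print(upper > bound)  # False -> infeasible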
Example No. 40
class TestRollingTS:

    # rolling time-series friendly
    # xref GH13327

    def setup_method(self, method):

        self.regular = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": range(5),
        }).set_index("A")

        self.ragged = DataFrame({"B": range(5)})
        self.ragged.index = [
            Timestamp("20130101 09:00:00"),
            Timestamp("20130101 09:00:02"),
            Timestamp("20130101 09:00:03"),
            Timestamp("20130101 09:00:05"),
            Timestamp("20130101 09:00:06"),
        ]

    def test_doc_string(self):

        df = DataFrame(
            {"B": [0, 1, 2, np.nan, 4]},
            index=[
                Timestamp("20130101 09:00:00"),
                Timestamp("20130101 09:00:02"),
                Timestamp("20130101 09:00:03"),
                Timestamp("20130101 09:00:05"),
                Timestamp("20130101 09:00:06"),
            ],
        )
        df
        df.rolling("2s").sum()

    def test_valid(self):

        df = self.regular

        # not a valid freq
        with pytest.raises(ValueError):
            df.rolling(window="foobar")

        # not a datetimelike index
        with pytest.raises(ValueError):
            df.reset_index().rolling(window="foobar")

        # non-fixed freqs
        for freq in ["2MS", offsets.MonthBegin(2)]:
            with pytest.raises(ValueError):
                df.rolling(window=freq)

        for freq in ["1D", offsets.Day(2), "2ms"]:
            df.rolling(window=freq)

        # non-integer min_periods
        for minp in [1.0, "foo", np.array([1, 2, 3])]:
            with pytest.raises(ValueError):
                df.rolling(window="1D", min_periods=minp)

        # center is not implemented
        with pytest.raises(NotImplementedError):
            df.rolling(window="1D", center=True)

    def test_on(self):

        df = self.regular

        # not a valid column
        with pytest.raises(ValueError):
            df.rolling(window="2s", on="foobar")

        # column is valid
        df = df.copy()
        df["C"] = date_range("20130101", periods=len(df))
        df.rolling(window="2d", on="C").sum()

        # invalid columns
        with pytest.raises(ValueError):
            df.rolling(window="2d", on="B")

        # ok even though on non-selected
        df.rolling(window="2d", on="C").B.sum()

    def test_monotonic_on(self):

        # on/index must be monotonic
        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": range(5)
        })

        assert df.A.is_monotonic
        df.rolling("2s", on="A").sum()

        df = df.set_index("A")
        assert df.index.is_monotonic
        df.rolling("2s").sum()

    def test_non_monotonic_on(self):
        # GH 19248
        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": range(5)
        })
        df = df.set_index("A")
        non_monotonic_index = df.index.to_list()
        non_monotonic_index[0] = non_monotonic_index[3]
        df.index = non_monotonic_index

        assert not df.index.is_monotonic

        with pytest.raises(ValueError):
            df.rolling("2s").sum()

        df = df.reset_index()
        with pytest.raises(ValueError):
            df.rolling("2s", on="A").sum()

    def test_frame_on(self):

        df = DataFrame({
            "B": range(5),
            "C": date_range("20130101 09:00:00", periods=5, freq="3s"),
        })

        df["A"] = [
            Timestamp("20130101 09:00:00"),
            Timestamp("20130101 09:00:02"),
            Timestamp("20130101 09:00:03"),
            Timestamp("20130101 09:00:05"),
            Timestamp("20130101 09:00:06"),
        ]

        # we are doing simulating using 'on'
        expected = df.set_index("A").rolling("2s").B.sum().reset_index(
            drop=True)

        result = df.rolling("2s", on="A").B.sum()
        tm.assert_series_equal(result, expected)

        # test as a frame
        # we should be ignoring the 'on' as an aggregation column
        # note that the expected is setting, computing, and resetting
        # so the columns need to be switched compared
        # to the actual result where they are ordered as in the
        # original
        expected = (df.set_index("A").rolling("2s")[[
            "B"
        ]].sum().reset_index()[["B", "A"]])

        result = df.rolling("2s", on="A")[["B"]].sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_on2(self):

        # using multiple aggregation columns
        df = DataFrame(
            {
                "A": [0, 1, 2, 3, 4],
                "B": [0, 1, 2, np.nan, 4],
                "C":
                Index([
                    Timestamp("20130101 09:00:00"),
                    Timestamp("20130101 09:00:02"),
                    Timestamp("20130101 09:00:03"),
                    Timestamp("20130101 09:00:05"),
                    Timestamp("20130101 09:00:06"),
                ]),
            },
            columns=["A", "C", "B"],
        )

        expected1 = DataFrame(
            {
                "A": [0.0, 1, 3, 3, 7],
                "B": [0, 1, 3, np.nan, 4],
                "C": df["C"]
            },
            columns=["A", "C", "B"],
        )

        result = df.rolling("2s", on="C").sum()
        expected = expected1
        tm.assert_frame_equal(result, expected)

        expected = Series([0, 1, 3, np.nan, 4], name="B")
        result = df.rolling("2s", on="C").B.sum()
        tm.assert_series_equal(result, expected)

        expected = expected1[["A", "B", "C"]]
        result = df.rolling("2s", on="C")[["A", "B", "C"]].sum()
        tm.assert_frame_equal(result, expected)

    def test_basic_regular(self):

        df = self.regular.copy()

        df.index = date_range("20130101", periods=5, freq="D")
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="1D").sum()
        tm.assert_frame_equal(result, expected)

        df.index = date_range("20130101", periods=5, freq="2D")
        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="2D", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

        expected = df.rolling(window=1, min_periods=1).sum()
        result = df.rolling(window="2D", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

        expected = df.rolling(window=1).sum()
        result = df.rolling(window="2D").sum()
        tm.assert_frame_equal(result, expected)

    def test_min_periods(self):

        # compare for min_periods
        df = self.regular

        # these slightly different
        expected = df.rolling(2, min_periods=1).sum()
        result = df.rolling("2s").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.rolling(2, min_periods=1).sum()
        result = df.rolling("2s", min_periods=1).sum()
        tm.assert_frame_equal(result, expected)

    def test_closed(self):

        # xref GH13965

        df = DataFrame(
            {"A": [1] * 5},
            index=[
                Timestamp("20130101 09:00:01"),
                Timestamp("20130101 09:00:02"),
                Timestamp("20130101 09:00:03"),
                Timestamp("20130101 09:00:04"),
                Timestamp("20130101 09:00:06"),
            ],
        )

        # closed must be 'right', 'left', 'both', 'neither'
        with pytest.raises(ValueError):
            self.regular.rolling(window="2s", closed="blabla")

        expected = df.copy()
        expected["A"] = [1.0, 2, 2, 2, 1]
        result = df.rolling("2s", closed="right").sum()
        tm.assert_frame_equal(result, expected)

        # default should be 'right'
        result = df.rolling("2s").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.copy()
        expected["A"] = [1.0, 2, 3, 3, 2]
        result = df.rolling("2s", closed="both").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.copy()
        expected["A"] = [np.nan, 1.0, 2, 2, 1]
        result = df.rolling("2s", closed="left").sum()
        tm.assert_frame_equal(result, expected)

        expected = df.copy()
        expected["A"] = [np.nan, 1.0, 1, 1, np.nan]
        result = df.rolling("2s", closed="neither").sum()
        tm.assert_frame_equal(result, expected)

    def test_ragged_sum(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 3, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=2).sum()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 3, np.nan, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 5, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s").sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 5, 7]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="4s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 6, 9]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="4s", min_periods=3).sum()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 3, 6, 9]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).sum()
        expected = df.copy()
        expected["B"] = [0.0, 1, 3, 6, 10]
        tm.assert_frame_equal(result, expected)

    def test_ragged_mean(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).mean()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).mean()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_median(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).median()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).median()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_quantile(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).quantile(0.5)
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).quantile(0.5)
        expected = df.copy()
        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
        tm.assert_frame_equal(result, expected)

    def test_ragged_std(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).std(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="1s", min_periods=1).std(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s", min_periods=1).std(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] + [0.5] * 4
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).std(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994]
        tm.assert_frame_equal(result, expected)

    def test_ragged_var(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).var(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="1s", min_periods=1).var(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="3s", min_periods=1).var(ddof=0)
        expected = df.copy()
        expected["B"] = [0.0] + [0.25] * 4
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).var(ddof=1)
        expected = df.copy()
        expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0]
        tm.assert_frame_equal(result, expected)

    def test_ragged_skew(self):

        df = self.ragged
        result = df.rolling(window="3s", min_periods=1).skew()
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).skew()
        expected = df.copy()
        expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0]
        tm.assert_frame_equal(result, expected)

    def test_ragged_kurt(self):

        df = self.ragged
        result = df.rolling(window="3s", min_periods=1).kurt()
        expected = df.copy()
        expected["B"] = [np.nan] * 5
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).kurt()
        expected = df.copy()
        expected["B"] = [np.nan] * 4 + [-1.2]
        tm.assert_frame_equal(result, expected)

    def test_ragged_count(self):

        df = self.ragged
        result = df.rolling(window="1s", min_periods=1).count()
        expected = df.copy()
        expected["B"] = [1.0, 1, 1, 1, 1]
        tm.assert_frame_equal(result, expected)

        df = self.ragged
        result = df.rolling(window="1s").count()
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).count()
        expected = df.copy()
        expected["B"] = [1.0, 1, 2, 1, 2]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=2).count()
        expected = df.copy()
        expected["B"] = [np.nan, np.nan, 2, np.nan, 2]
        tm.assert_frame_equal(result, expected)

    def test_regular_min(self):

        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": [0.0, 1, 2, 3, 4]
        }).set_index("A")
        result = df.rolling("1s").min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        df = DataFrame({
            "A": date_range("20130101", periods=5, freq="s"),
            "B": [5, 4, 3, 4, 5]
        }).set_index("A")

        result = df.rolling("2s").min()
        expected = df.copy()
        expected["B"] = [5.0, 4, 3, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling("5s").min()
        expected = df.copy()
        expected["B"] = [5.0, 4, 3, 3, 3]
        tm.assert_frame_equal(result, expected)

    def test_ragged_min(self):

        df = self.ragged

        result = df.rolling(window="1s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 1, 1, 3, 3]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).min()
        expected = df.copy()
        expected["B"] = [0.0, 0, 0, 1, 1]
        tm.assert_frame_equal(result, expected)

    def test_perf_min(self):

        N = 10000

        dfp = DataFrame({"B": np.random.randn(N)},
                        index=date_range("20130101", periods=N, freq="s"))
        expected = dfp.rolling(2, min_periods=1).min()
        result = dfp.rolling("2s").min()
        assert ((result - expected) < 0.01).all().bool()

        expected = dfp.rolling(200, min_periods=1).min()
        result = dfp.rolling("200s").min()
        assert ((result - expected) < 0.01).all().bool()

    def test_ragged_max(self):

        df = self.ragged

        result = df.rolling(window="1s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).max()
        expected = df.copy()
        expected["B"] = [0.0, 1, 2, 3, 4]
        tm.assert_frame_equal(result, expected)

    def test_ragged_apply(self, raw):

        df = self.ragged

        f = lambda x: 1
        result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw)
        expected = df.copy()
        expected["B"] = 1.0
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw)
        expected = df.copy()
        expected["B"] = 1.0
        tm.assert_frame_equal(result, expected)

        result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw)
        expected = df.copy()
        expected["B"] = 1.0
        tm.assert_frame_equal(result, expected)

    def test_all(self):

        # simple comparison of integer vs time-based windowing
        df = self.regular * 2
        er = df.rolling(window=1)
        r = df.rolling(window="1s")

        for f in [
                "sum",
                "mean",
                "count",
                "median",
                "std",
                "var",
                "kurt",
                "skew",
                "min",
                "max",
        ]:

            result = getattr(r, f)()
            expected = getattr(er, f)()
            tm.assert_frame_equal(result, expected)

        result = r.quantile(0.5)
        expected = er.quantile(0.5)
        tm.assert_frame_equal(result, expected)

    def test_all_apply(self, raw):

        df = self.regular * 2
        er = df.rolling(window=1)
        r = df.rolling(window="1s")

        result = r.apply(lambda x: 1, raw=raw)
        expected = er.apply(lambda x: 1, raw=raw)
        tm.assert_frame_equal(result, expected)

    def test_all2(self):

        # more sophisticated comparison of integer vs.
        # time-based windowing
        df = DataFrame({"B": np.arange(50)},
                       index=date_range("20130101", periods=50, freq="H"))
        # in-range data
        dft = df.between_time("09:00", "16:00")

        r = dft.rolling(window="5H")

        for f in [
                "sum",
                "mean",
                "count",
                "median",
                "std",
                "var",
                "kurt",
                "skew",
                "min",
                "max",
        ]:

            result = getattr(r, f)()

            # we need to roll the days separately
            # to compare with a time-based roll
            # finally groupby-apply will return a multi-index
            # so we need to drop the day
            def agg_by_day(x):
                x = x.between_time("09:00", "16:00")
                return getattr(x.rolling(5, min_periods=1), f)()

            expected = (df.groupby(df.index.day).apply(agg_by_day).reset_index(
                level=0, drop=True))

            tm.assert_frame_equal(result, expected)

    def test_groupby_monotonic(self):

        # GH 15130
        # we don't need to validate monotonicity when grouping

        data = [
            ["David", "1/1/2015", 100],
            ["David", "1/5/2015", 500],
            ["David", "5/30/2015", 50],
            ["David", "7/25/2015", 50],
            ["Ryan", "1/4/2014", 100],
            ["Ryan", "1/19/2015", 500],
            ["Ryan", "3/31/2016", 50],
            ["Joe", "7/1/2015", 100],
            ["Joe", "9/9/2015", 500],
            ["Joe", "10/15/2015", 50],
        ]

        df = DataFrame(data=data, columns=["name", "date", "amount"])
        df["date"] = to_datetime(df["date"])

        expected = (df.set_index("date").groupby("name").apply(
            lambda x: x.rolling("180D")["amount"].sum()))
        result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
        tm.assert_series_equal(result, expected)

    def test_non_monotonic(self):
        # GH 13966 (similar to #15130, closed by #15175)

        dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
        df = DataFrame({
            "A": [1] * 20 + [2] * 12 + [3] * 8,
            "B": np.concatenate((dates, dates)),
            "C": np.arange(40),
        })

        result = df.groupby("A").rolling("4s", on="B").C.mean()
        expected = (df.set_index("B").groupby("A").apply(
            lambda x: x.rolling("4s")["C"].mean()))
        tm.assert_series_equal(result, expected)

        df2 = df.sort_values("B")
        result = df2.groupby("A").rolling("4s", on="B").C.mean()
        tm.assert_series_equal(result, expected)

    def test_rolling_cov_offset(self):
        # GH16058

        idx = date_range("2017-01-01", periods=24, freq="1h")
        ss = Series(np.arange(len(idx)), index=idx)

        result = ss.rolling("2h").cov()
        expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx)
        tm.assert_series_equal(result, expected)

        expected2 = ss.rolling(2, min_periods=1).cov()
        tm.assert_series_equal(result, expected2)

        result = ss.rolling("3h").cov()
        expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx)
        tm.assert_series_equal(result, expected)

        expected2 = ss.rolling(3, min_periods=1).cov()
        tm.assert_series_equal(result, expected2)

    def test_rolling_on_decreasing_index(self):
        # GH-19248
        index = [
            Timestamp("20190101 09:00:00"),
            Timestamp("20190101 09:00:02"),
            Timestamp("20190101 09:00:03"),
            Timestamp("20190101 09:00:05"),
            Timestamp("20190101 09:00:06"),
        ]

        df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index))
        result = df.rolling("2s").min()
        expected = DataFrame({"column": [3.0, 3.0, 3.0, 2.0, 1.0]},
                             index=reversed(index))
        tm.assert_frame_equal(result, expected)

    def test_rolling_on_multi_index_level(self):
        # GH-15584
        df = DataFrame(
            {"column": range(6)},
            index=MultiIndex.from_product(
                [date_range("20190101", periods=3),
                 range(2)],
                names=["date", "seq"]),
        )
        result = df.rolling("10d", on=df.index.get_level_values("date")).sum()
        expected = DataFrame({"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]},
                             index=df.index)
        tm.assert_frame_equal(result, expected)
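
# A self-contained illustration of the time-based windows exercised above: a
# "2s" window aggregates observations whose timestamps fall in the half-open
# interval (t - 2s, t], i.e. right-closed by default.
import numpy as np
import pandas as pd

idx = pd.to_datetime([
    "2013-01-01 09:00:00", "2013-01-01 09:00:02", "2013-01-01 09:00:03",
    "2013-01-01 09:00:05", "2013-01-01 09:00:06",
])
frame = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}, index=idx)
print(frame.rolling("2s").sum())  # B -> [0.0, 1.0, 3.0, NaN, 4.0]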
Example No. 41
    def initialize_predictor(self, ratings: pd.DataFrame):
        self.all_movies_AVG = ratings["rating"].mean()
        self.ratings_data = ratings.copy()
        self.ratings_data['rating'] -= self.all_movies_AVG
        self.user_means = self.ratings_data.groupby('user')['rating'].mean()
        self.movie_means = self.ratings_data.groupby('item')['rating'].mean()
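
# A sketch of the mean-centering performed above, on a hypothetical ratings
# frame with 'user', 'item' and 'rating' columns.
import pandas as pd

ratings = pd.DataFrame({'user': [1, 1, 2], 'item': [10, 20, 10],
                        'rating': [4.0, 2.0, 3.0]})
global_mean = ratings['rating'].mean()        # 3.0
centered = ratings['rating'] - global_mean    # [1.0, -1.0, 0.0]
print(ratings.assign(rating=centered).groupby('user')['rating'].mean())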
Example No. 42
    def from_proj(cls,
                  proj_path: str,
                  dataframe: pd.DataFrame,
                  sub_dataframe_name: str = 'root',
                  dataframe_filter_history: dict = None):
        """
        :param proj_path: root directory of the project
        :param dataframe: Chosen Child DataFrame from the Mesmerize Project
        :param sub_dataframe_name: Name of the sub DataFrame to load
        :param dataframe_filter_history: Filter history of the child dataframe

        """
        df = dataframe.copy()
        logger.info('Collecting image metadata')
        tqdm().pandas()
        df[['meta', 'stim_maps']] = df.progress_apply(
            lambda r: Transmission._load_files(proj_path, r), axis=1)

        Transmission._load_imginfo.cache_clear()

        try:
            logger.info('Collecting curve data')
            df['_RAW_CURVE'] = df['ROI_State'].progress_apply(
                lambda r: r['curve_data'][1])
        except Exception:
            raise IndexError(
                "Curve data missing from one of your samples.\n"
                "See the progress bar to get the row index of "
                "the project dataframe where the curve data are missing")

        try:
            logger.info('Collecting spike data')
            df['_SPIKES'] = df['ROI_State'].progress_apply(lambda r: r[
                'spike_data'][1] if (r['spike_data'] is not None) else None)

        except KeyError:
            warn(
                'spike data not found; the project is probably from '
                'Mesmerize version < 0.2'
            )

        try:
            logger.info('Collecting dfof data')
            df['_DFOF'] = df['ROI_State'].progress_apply(lambda r: r[
                'dfof_data'][1] if (r['dfof_data'] is not None) else None)
        except KeyError:
            warn(
                'dfof data not found; the project is probably from '
                'Mesmerize version < 0.2'
            )

        df.sort_values(by=['SampleID'], inplace=True)
        df = df.reset_index(drop=True)

        h = HistoryTrace()
        df, block_id = h.create_data_block(df)

        params = {
            'sub_dataframe_name': sub_dataframe_name,
            'dataframe_filter_history': dataframe_filter_history
        }
        h.add_operation(data_block_id=block_id,
                        operation='spawn_transmission',
                        parameters=params)

        proj_config = get_proj_config(proj_path)

        try:
            roi_type_defs = proj_config.options('ROI_DEFS')
            stim_type_defs = proj_config.options('STIM_DEFS')
            custom_columns = proj_config.options('CUSTOM_COLUMNS')
        except Exception:
            raise ValueError(
                'Could not read project configuration when creating Transmission'
                '\n' + traceback.format_exc())

        return cls(df,
                   proj_path=proj_path,
                   history_trace=h,
                   last_output=None,
                   last_unit='time',
                   ROI_DEFS=roi_type_defs,
                   STIM_DEFS=stim_type_defs,
                   CUSTOM_COLUMNS=custom_columns)
Example No. 43
def export2loom(ex_mtx: pd.DataFrame, regulons: List[Regulon], cell_annotations: Mapping[str,str],
                out_fname: str, num_cores=cpu_count()):
    """
    Create a loom file for a single cell experiment to be used in SCope.

    :param ex_mtx: The expression matrix (n_cells x n_genes).
    :param regulons: A list of Regulons.
    :param cell_annotations: A dictionary that maps a cell ID to its corresponding cell type annotation.
    :param out_fname: The name of the file to create.
    :param num_cores: The number of cores to use for AUCell regulon enrichment.
    """
    # Information on the general loom file format: http://linnarssonlab.org/loompy/format/index.html
    # Information on the SCope specific alterations: https://github.com/aertslab/SCope/wiki/Data-Format

    # TODO: Not mandatory but adding a section "regulonThresholds" to the general metadata would give
    # TODO: additional information to the SCope tool to preset a threshold on the AUC distribution of a regulon
    # TODO: across cells and help with binarization, i.e. deciding if the regulon is "on" or "off" in a cell.

    # Calculate regulon enrichment per cell using AUCell.
    auc_mtx = aucell(ex_mtx, regulons, num_cores=num_cores) # (n_cells x n_regulons)

    # Create an embedding based on UMAP (similar to tSNE but faster).
    umap_embedding_mtx = pd.DataFrame(data=UMAP().fit_transform(auc_mtx),
                                      index=ex_mtx.index, columns=['UMAP1', 'UMAP2']) # (n_cells, 2)

    # Calculate the number of genes per cell.
    binary_mtx = ex_mtx.copy()
    binary_mtx[binary_mtx != 0] = 1.0
    ngenes = binary_mtx.sum(axis=1).astype(int)

    # Encode genes in regulons as "binary" membership matrix.
    genes = np.array(ex_mtx.columns)
    n_genes = len(genes)
    n_regulons = len(regulons)
    data = np.zeros(shape=(n_genes, n_regulons), dtype=int)
    for idx, regulon in enumerate(regulons):
        data[:, idx] = np.isin(genes, regulon.genes).astype(int)
    regulon_assignment = pd.DataFrame(data=data,
                                      index=ex_mtx.columns,
                                      columns=list(map(attrgetter('name'), regulons)))

    # Encode cell type clusters.
    name2idx = dict(map(reversed, enumerate(sorted(set(cell_annotations.values())))))
    clusterings = pd.DataFrame(data=ex_mtx.index,
                               index=ex_mtx.index,
                               columns=['Cell Type']).replace(cell_annotations).replace(name2idx)

    # Create meta-data structure.
    def create_structure_array(df):
        # Create a NumPy structured array with one named field per DataFrame
        # column (df.as_matrix() was removed in pandas 1.0; to_numpy() is the
        # modern equivalent).
        return np.array([tuple(row) for row in df.to_numpy()],
                        dtype=np.dtype(list(zip(df.columns, df.dtypes))))


    nomenclatures = set(map(attrgetter('nomenclature'), regulons))
    assert len(nomenclatures) == 1

    title = os.path.splitext(os.path.basename(out_fname))[0]

    column_attrs = {
        "CellID": ex_mtx.index.values.astype('str'),
        "nGene": ngenes.values,
        "Embedding": create_structure_array(umap_embedding_mtx),
        "RegulonsAUC": create_structure_array(auc_mtx),
        "Clusterings": create_structure_array(clusterings),
        "ClusterID": clusterings.values
        }
    row_attrs = {
        "Gene": ex_mtx.columns.values.astype('str'),
        "Regulons": create_structure_array(regulon_assignment),
        }
    general_attrs = {
        "title": title,
        "MetaData": json.dumps({
            "embeddings": [{
                "id": 0,
                "name": "UMAP (default)",
            }],
            "annotations": [{
                "name": "",
                "values": []
            }],
            "clusterings": [{
                "id": 0,
                "group": "celltype",
                "name": "Cell Type",
                "clusters": [{"id": idx, "description": name} for name, idx in name2idx.items()]
            }]}),
        "Genome": next(iter(nomenclatures))}

    # Create loom file for use with the SCope tool.
    # The loom file format opted for rows as genes to facilitate growth along the column axis (i.e. add more cells).
    # PySCENIC chose a different orientation because of limitations set by the feather format: selectively reading
    # information from disk can only be achieved via column selection. For the ranking databases this is of utmost
    # importance.
    fh = lp.create(filename=out_fname,
              matrix=ex_mtx.T.values,
              row_attrs=row_attrs,
              col_attrs=column_attrs,
              file_attrs=general_attrs)
    fh.close()
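
# create_structure_array above packs a DataFrame into a NumPy structured
# array so loompy can store named columns in a single attribute. A minimal
# standalone check of that conversion:
import numpy as np
import pandas as pd

emb = pd.DataFrame({'UMAP1': [0.1, 0.2], 'UMAP2': [1.5, 1.7]})
arr = np.array([tuple(row) for row in emb.to_numpy()],
               dtype=np.dtype(list(zip(emb.columns, emb.dtypes))))
print(arr['UMAP1'])  # array([0.1, 0.2])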
Example No. 44
def to_sql(
    df: pd.DataFrame,
    table_name: str,
    creds: SqlCreds,
    sql_type: str = "table",
    schema: str = "dbo",
    index: bool = True,
    if_exists: str = "fail",
    batch_size: int = None,
    debug: bool = False,
    bcp_path: str = None,
):
    """
    Writes the pandas DataFrame to a SQL table or view.

    Will write all columns to the table or view. If the destination table/view doesn't exist, will create it.
    Assumes the SQL table/view has the same number, name, and type of columns.
    To only write parts of the DataFrame, filter it beforehand and pass that to this function.
    Unlike the pandas counterpart, if the DataFrame has no rows, nothing will happen.

    Parameters
    ----------
    df : pandas.DataFrame
    table_name : str
        Name of SQL table or view, without the schema
    creds : bcpandas.SqlCreds
        The credentials used in the SQL database.
    sql_type : {'table'}, can only be 'table'
        The type of SQL object of the destination.
    schema : str, default 'dbo'
        The SQL schema.
    index : bool, default True
        Write DataFrame index as a column. Uses the index name as the column
        name in the table.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.
        * fail: Raise a BCPandasValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table. Matches the dataframe columns to the database columns by name.
            If the database table exists then the dataframe cannot have new columns that aren't in the table, 
            but conversely table columns can be missing from the dataframe.
    batch_size : int, optional
        Rows will be written in batches of this size at a time. By default, BCP sets this to 1000.
    debug : bool, default False
        If True, will not delete the temporary CSV and format files, and will output their location.
    bcp_path : str, default None
        The full path to the BCP utility, useful if it is not in the PATH environment variable
    """
    # validation
    if df.shape[0] == 0 or df.shape[1] == 0:
        return
    assert sql_type == TABLE, "only supporting table, not view, for now"
    assert if_exists in IF_EXISTS_OPTIONS

    if df.columns.has_duplicates:
        raise BCPandasValueError(
            "Columns with duplicate names detected, SQL requires that column names be unique. "
            f"Duplicates: {df.columns[df.columns.duplicated(keep=False)]}")

    # TODO diff way to implement? could be big performance hit with big dataframe
    if index:
        df = df.copy(deep=True).reset_index()

    delim = get_delimiter(df)
    quotechar = get_quotechar(df)

    if batch_size is not None:
        if batch_size == 0:
            raise BCPandasValueError("Param batch_size can't be 0")
        if batch_size > df.shape[0]:
            raise BCPandasValueError(
                "Param batch_size can't be larger than the number of rows in the DataFrame"
            )

    # save to temp path
    csv_file_path = get_temp_file()
    # replace bools with 1 or 0, this is what pandas native does when writing to SQL Server
    df.replace({
        True: 1,
        False: 0
    }).to_csv(
        path_or_buf=csv_file_path,
        sep=delim,
        header=False,
        index=False,  # already set as new col earlier if index=True
        quoting=csv.QUOTE_MINIMAL,  # pandas default
        quotechar=quotechar,
        line_terminator=NEWLINE,
        doublequote=True,
        escapechar=None,  # not needed, as using doublequote
    )
    logger.debug(f"Saved dataframe to temp CSV file at {csv_file_path}")

    # build format file
    fmt_file_path = get_temp_file()

    sql_item_exists = _sql_item_exists(sql_type=sql_type,
                                       schema=schema,
                                       table_name=table_name,
                                       creds=creds)
    cols_dict = None  # for mypy
    if if_exists == "append":
        # get dict of column names -> order of column
        cols_dict = dict(
            pd.read_sql_query(
                """
                SELECT COLUMN_NAME, ORDINAL_POSITION 
                FROM INFORMATION_SCHEMA.COLUMNS 
                WHERE TABLE_SCHEMA = '{_schema}'
                AND TABLE_NAME = '{_tbl}'
            """.format(_schema=schema, _tbl=table_name),
                creds.engine,
            ).values)

        # check that column names match in db and dataframe exactly
        if sql_item_exists:
            # the db cols are always strings, unlike df cols
            extra_cols = [
                str(x) for x in df.columns if str(x) not in cols_dict.keys()
            ]
            if extra_cols:
                raise BCPandasValueError(
                    f"Column(s) detected in the dataframe that are not in the database, "
                    f"cannot have new columns if `if_exists=='append'`, "
                    f"the extra column(s): {extra_cols}")

    fmt_file_txt = build_format_file(df=df,
                                     delimiter=delim,
                                     db_cols_order=cols_dict)
    with open(fmt_file_path, "w") as ff:
        ff.write(fmt_file_txt)
    logger.debug(f"Created BCP format file at {fmt_file_path}")

    try:
        if if_exists == "fail":
            if sql_item_exists:
                raise BCPandasValueError(
                    f"The {sql_type} called {schema}.{table_name} already exists, "
                    f"`if_exists` param was set to `fail`.")
            else:
                _create_table(schema=schema,
                              table_name=table_name,
                              creds=creds,
                              df=df,
                              if_exists=if_exists)
        elif if_exists == "replace":
            _create_table(schema=schema,
                          table_name=table_name,
                          creds=creds,
                          df=df,
                          if_exists=if_exists)
        elif if_exists == "append":
            if not sql_item_exists:
                _create_table(schema=schema,
                              table_name=table_name,
                              creds=creds,
                              df=df,
                              if_exists=if_exists)

        # BCP the data in
        bcp(
            sql_item=table_name,
            direction=IN,
            flat_file=csv_file_path,
            format_file_path=fmt_file_path,
            creds=creds,
            sql_type=sql_type,
            schema=schema,
            batch_size=batch_size,
            bcp_path=bcp_path,
        )
    finally:
        if not debug:
            logger.debug(f"Deleting temp CSV and format files")
            os.remove(csv_file_path)
            os.remove(fmt_file_path)
        else:
            logger.debug(
                f"`to_sql` DEBUG mode, not deleting the files. CSV file is at "
                f"{csv_file_path}, format file is at {fmt_file_path}")
Example No. 45
    def filter(d: pd.DataFrame, ns: SimpleNamespace) -> pd.DataFrame:

        if not hasattr(ns, "prefiltering"):
            return d
        ns = ns.prefiltering

        strategy = getattr(ns, "strategy", None)
        data = d.copy()
        if strategy == "global_threshold":
            threshold = getattr(ns, "threshold", None)
            if threshold is not None:
                if str(threshold).isdigit():
                    data = PreFilter.filter_ratings_by_threshold(data, threshold)
                elif threshold == "average":
                    data = PreFilter.filter_ratings_by_global_average(data)
                else:
                    raise Exception("Threshold value not recognized")
            else:
                raise Exception("Threshold option is missing")

        elif strategy == "user_average":
            data = PreFilter.filter_ratings_by_user_average(data)

        elif strategy == "user_k_core":
            core = getattr(ns, "core", None)
            if core is not None:
                if str(core).isdigit():
                    data = PreFilter.filter_users_by_profile_size(data, core)
                else:
                    raise Exception("Core option is not a digit")
            else:
                raise Exception("Core option is missing")

        elif strategy == "item_k_core":
            core = getattr(ns, "core", None)
            if core is not None:
                if str(core).isdigit():
                    data = PreFilter.filter_items_by_popularity(data, core)
                else:
                    raise Exception("Core option is not a digit")
            else:
                raise Exception("Core option is missing")

        elif strategy == "iterative_k_core":
            core = getattr(ns, "core", None)
            if core is not None:
                if str(core).isdigit():
                    data = PreFilter.filter_iterative_k_core(data, core)
                else:
                    raise Exception("Core option is not a digit")
            else:
                raise Exception("Core option is missing")

        elif strategy == "n_rounds_k_core":
            core = getattr(ns, "core", None)
            n_rounds = getattr(ns, "rounds", None)
            if (core is not None) and (n_rounds is not None):
                if str(core).isdigit() and str(n_rounds).isdigit():
                    data = PreFilter.filter_rounds_k_core(data, core, n_rounds)
                else:
                    raise Exception("Core or rounds options are not digits")
            else:
                raise Exception("Core or rounds options are missing")

        elif strategy == "cold_users":
            threshold = getattr(ns, "threshold", None)
            if threshold is not None:
                if str(threshold).isdigit():
                    data = PreFilter.filter_retain_cold_users(data, threshold)
                else:
                    raise Exception("Threshold option is not a digit")
            else:
                raise Exception("Threshold option is missing")

        else:
            raise Exception("Misssing strategy")

        return data
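
# A sketch of the configuration object this filter expects, built with
# SimpleNamespace; the strategy names mirror the branches above. With this
# config, users with fewer than 5 ratings would be dropped via
# filter_users_by_profile_size (assuming that helper behaves as named).
from types import SimpleNamespace

ns = SimpleNamespace(prefiltering=SimpleNamespace(strategy="user_k_core",
                                                  core=5))
# filtered = PreFilter.filter(ratings_df, ns)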
Example No. 46
    def fit(self, data: pd.DataFrame):
        """Create a dataframe with the results of the Granger causality test with the specified
        statistical test(s).

        Parameters
        ----------
        data : pd.DataFrame, shape (n_samples, n_time_series), required
            The dataframe containing the time series.

        Returns
        -------
        self : object
            Returns the instance itself.

        """

        shifts = data.copy()
        x_columns, y_columns = [], []
        for i in range(1, self.max_shift + 1):
            shifts[f"x_shift_{i}"] = data[self.target_col].shift(i)
            shifts[f"y_shift_{i-1}"] = data[self.x_col].shift(i)
            x_columns.append(f"x_shift_{i}")
            y_columns.append(f"y_shift_{i-1}")
        shifts.drop([self.target_col, self.x_col],
                    axis="columns",
                    inplace=True)
        shifts = shifts.dropna()

        data_single = shifts[x_columns].copy()
        data_joint = shifts[x_columns + y_columns].copy()

        linreg_single = LinearRegression()
        linreg_joint = LinearRegression()
        linreg_single.fit(data_single, data[self.x_col].loc[data_single.index])
        linreg_joint.fit(data_joint, data[self.x_col].loc[data_joint.index])
        if "likelihood_chi2" in self.statistics or "zero_f" in self.statistics:
            y_pred_single = linreg_single.predict(data_single)
            y_pred_joint = linreg_joint.predict(data_joint)
        else:
            y_pred_single = None
            y_pred_joint = None

        # dof_single = float(data_single.shape[0] - data_single.shape[1])
        dof_joint = float(data_joint.shape[0] - data_joint.shape[1]) - 1

        linreg_single_residues = linreg_single._residues
        linreg_joint_residues = linreg_joint._residues

        self.results_ = []

        stat_test_input = {
            "linreg_single_residues": linreg_single_residues,
            "linreg_joint_residues": linreg_joint_residues,
            "dof_joint": dof_joint,
            "max_shift": self.max_shift,
            "data_single": data_single,
            "y_pred_single": y_pred_single,
            "y_pred_joint": y_pred_joint,
            "data": data,
            "x_col": self.x_col,
            "data_joint": data_joint,
            "linreg_joint": linreg_joint,
        }

        for s in self.statistics:
            self.results_.append(STAT_TESTS[s](stat_test_input))

        return self
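
# A hedged usage sketch, assuming the surrounding class takes target_col,
# x_col, max_shift and statistics in its constructor; the class name below
# is illustrative, not taken from the source.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
series = pd.DataFrame({'x': rng.normal(size=100),
                       'y': rng.normal(size=100)})
# tester = GrangerCausalityTest(target_col='y', x_col='x', max_shift=3,
#                               statistics=['likelihood_chi2'])
# tester.fit(series); print(tester.results_)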
Example No. 47
    def test_multi_assign(self):

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({
            "FC": ["a", "b", "a", "b", "a", "b"],
            "PF": [0, 0, 0, 0, 1, 1],
            "col1": list(range(6)),
            "col2": list(range(6, 12)),
        })
        df.iloc[1, 0] = np.nan
        df2 = df.copy()

        mask = ~df2.FC.isna()
        cols = ["col1", "col2"]

        dft = df2 * 2
        dft.iloc[3, 3] = np.nan

        expected = DataFrame({
            "FC": ["a", np.nan, "a", "b", "a", "b"],
            "PF": [0, 0, 0, 0, 1, 1],
            "col1": Series([0, 1, 4, 6, 8, 10]),
            "col2": [12, 7, 16, np.nan, 20, 22],
        })

        # frame on rhs
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # with an ndarray on rhs
        # coerces to float64 because values has float64 dtype
        # GH 14001
        expected = DataFrame({
            "FC": ["a", np.nan, "a", "b", "a", "b"],
            "PF": [0, 0, 0, 0, 1, 1],
            "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0],
            "col2": [12, 7, 16, np.nan, 20, 22],
        })
        df2 = df.copy()
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)

        # broadcasting on the rhs is required
        df = DataFrame(
            dict(
                A=[1, 2, 0, 0, 0],
                B=[0, 0, 0, 10, 11],
                C=[0, 0, 0, 10, 11],
                D=[3, 4, 5, 6, 7],
            ))

        expected = df.copy()
        mask = expected["A"] == 0
        for col in ["A", "B"]:
            expected.loc[mask, col] = df["D"]

        df.loc[df["A"] == 0, ["A", "B"]] = df["D"]
        tm.assert_frame_equal(df, expected)
Example No. 48
def preprocess_states(states_df: pd.DataFrame) -> pd.DataFrame:
    """
        This function applies the preprocessing steps necessary to move from the raw
        observation to a spatial representation.

        The spatial representation is like this:
            - plan 0: Tile type (hexlayout)
            - plan 1: Tile number
            - plan 2: Robber position
            - plan 3: Game phase id
            - plan 4: Development card left
            - plan 5: Last dice result
            - plan 6: Starting player id
            - plan 7: Current player id
            - plan 8: Current player has played a development card during its turn
            3 types of pieces, 6 ways to place them around the hex
            - plan 9-26: Player 1 pieces
            - plan 27-44: Player 2 pieces
            - plan 45-62: Player 3 pieces
            - plan 63-80: Player 4 pieces
            see java_utils.parse_player_infos for more information
            - plan 81-121: Player 1 public info
            - plan 122-162: Player 2 public info
            - plan 163-203: Player 3 public info
            - plan 204-244: Player 4 public info

        State shape: 245x7x7
    """
    states_df = states_df.copy()
    del states_df['touchingnumbers']
    del states_df['name']
    del states_df['id']

    states_df['gameturn'] = states_df['gameturn'].apply(ju.get_replicated_plan) \
                                                 .apply(normalize_gameturn)

    states_df['hexlayout'] = states_df['hexlayout'].apply(ju.parse_layout) \
                                                   .apply(ju.mapping_1d_2d) \
                                                   .apply(normalize_hexlayout)

    states_df['numberlayout'] = states_df['numberlayout'].apply(ju.parse_layout) \
                                                         .apply(ju.mapping_1d_2d) \
                                                         .apply(normalize_numberlayout)

    states_df['robberhex'] = states_df['robberhex'].apply(ju.get_1d_id_from_hex) \
                                                   .apply(ju.get_2d_id) \
                                                   .apply(ju.get_one_hot_plan)

    states_df['piecesonboard'] = states_df['piecesonboard'].apply(
        ju.parse_pieces)

    states_df['gamestate'] = states_df['gamestate'].apply(ju.parse_game_phases)

    states_df['devcardsleft'] = states_df['devcardsleft'].apply(
        ju.parse_devcardsleft)

    states_df['diceresult'] = states_df['diceresult'].apply(
        ju.parse_dice_result)

    states_df['startingplayer'] = states_df['startingplayer'].apply(
        ju.parse_starting_player)

    states_df['currentplayer'] = states_df['currentplayer'].apply(
        ju.parse_current_player)

    states_df['playeddevcard'] = states_df['playeddevcard'].apply(
        ju.get_replicated_plan)

    states_df['playersresources'] = states_df['playersresources'].apply(ju.parse_player_resources) \
                                                                 .apply(normalize_playersresources)

    states_df['players'] = states_df['players'].apply(ju.parse_player_infos)

    return states_df
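
A hedged usage sketch: the raw observation table must expose the columns deleted or transformed above, and the ju (java_utils) helpers plus the normalize_* functions are assumed to be importable from this project; the input file name is hypothetical.

import pandas as pd

raw_states = pd.read_pickle("raw_states.pkl")  # hypothetical dump of raw observations
spatial_states = preprocess_states(raw_states)
# each transformed column now holds the per-plane arrays described in the docstring
print(spatial_states.columns.tolist())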
Example No. 49
0
def splitFramesAndForecast(frame: pd.DataFrame, options: dict) -> pd.DataFrame:
    """
    Automates forecasting for each subframe defined by the splitColumns
    option. Returns the forecast produced by the method with the best MAPE
    for each subframe, and prints the MAPEs of every method for each
    forecast made.

    Parameters
    ----------
    frame : pd.DataFrame
        Data needed for forecast
    options : dict
        Instructions on how to read 'frame'

    Returns
    -------
    outputFrame : pd.DataFrame
        Same as original 'frame' but with all the columns associated with a
        forecast added.

    """
    # Create a list of frames, each of which will correspond to a different
    # forecast.
    frame.sort_values(by=params.getParam('sortColumns', options),
                      ascending=True,
                      inplace=True)

    frames = list(frame.groupby(by=params.getParam('splitColumns', options)))

    outputFrame = None

    for frame in frames:
        frame = frame[1]
        frame.reset_index(drop=True, inplace=True)

        method = params.getParam('method', options)

        # Specify what to do when the forecast method is set to "Auto" in
        # the options dictionary.
        if method == 'Auto':
            opts = options.copy()
            opts['method'] = 'ARIMA'
            arimaFrame = forecastSingleFrame(frame.copy(), opts)
            arimaMAPE = 1E6 if 'X_MAPE' not in arimaFrame else arimaFrame[
                'X_MAPE'][0]

            opts = options.copy()
            opts['method'] = 'Prophet'
            prophetFrame = forecastSingleFrame(frame.copy(), opts)
            prophetMAPE = 1E6 if 'X_MAPE' not in prophetFrame else prophetFrame[
                'X_MAPE'][0]

            opts = options.copy()
            opts['method'] = 'MLR'
            mlrFrame = forecastSingleFrame(frame.copy(), opts)
            mlrMAPE = 1E6 if 'X_MAPE' not in mlrFrame else mlrFrame['X_MAPE'][0]

            if 'X_FORECAST' in mlrFrame and 'X_FORECAST' in prophetFrame and 'X_FORECAST' in arimaFrame:
                ensembleFrame = mlrFrame.copy()

                # we calculate MAPE using original data column
                targetColumn = params.getParam('targetColumn', options)
                if (targetColumn.startswith('X_')):
                    targetColumn = targetColumn[2:]

                # split the data into past/future based on null in target column
                numHoldoutRows = params.getParam('numHoldoutRows', options)
                lastNonNullIdx = Forecast().lastNonNullIndex(
                    ensembleFrame[targetColumn])
                lastNonNullIdx = lastNonNullIdx - numHoldoutRows

                if (numHoldoutRows > 0):
                    evalIdx = list(
                        map(
                            lambda x: x > lastNonNullIdx and x <=
                            (lastNonNullIdx + numHoldoutRows),
                            ensembleFrame['X_INDEX']))
                else:
                    evalIdx = ensembleFrame['X_INDEX'] <= lastNonNullIdx

                ensembleFrame['X_FORECAST'] = list(
                    map(lambda x, y, z: median([x, y, z]),
                        mlrFrame['X_FORECAST'], arimaFrame['X_FORECAST'],
                        prophetFrame['X_FORECAST']))
                ensembleFrame['X_LPI'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_LPI'],
                        arimaFrame['X_LPI'], prophetFrame['X_LPI']))
                ensembleFrame['X_UPI'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_UPI'],
                        arimaFrame['X_UPI'], prophetFrame['X_UPI']))

                evalFrame = ensembleFrame[evalIdx]
                ensembleMAPE = 1E6  # fallback so the comparison below never sees an unbound name
                try:
                    ensembleMAPE = calcMAPE(evalFrame['X_FORECAST'],
                                            evalFrame[targetColumn])
                    ensembleFrame['X_MAPE'] = ensembleMAPE
                    for index, row in ensembleFrame.iterrows():
                        ensembleFrame.loc[index, 'X_APE'] = (
                            abs(row['X_FORECAST'] - row[targetColumn]) /
                            row[targetColumn] *
                            100.0) if row[targetColumn] != 0 else None
                except Exception:
                    # needed when a forecast fails and MAPE/APE cannot be calculated
                    if 'X_MAPE' not in ensembleFrame:
                        ensembleFrame['X_MAPE'] = 1E6
                    if 'X_APE' not in ensembleFrame:
                        ensembleFrame['X_APE'] = 1E6

                mapes = [mlrMAPE, arimaMAPE, prophetMAPE, ensembleMAPE]
            else:
                mapes = [mlrMAPE, arimaMAPE, prophetMAPE]

            print("Auto MAPEs (MLR, ARIMA, Prophet, Ensemble): ", mapes)

            minMAPE = min(mapes)

            if (mlrMAPE <= minMAPE):
                frame = mlrFrame
                frame['X_METHOD'] = 'MLR'
            elif (prophetMAPE <= minMAPE):
                frame = prophetFrame
                frame['X_METHOD'] = 'Prophet'
            elif (arimaMAPE <= minMAPE):
                frame = arimaFrame
                frame['X_METHOD'] = 'ARIMA'
            else:
                frame = ensembleFrame
                frame['X_METHOD'] = 'Ensemble'

        else:
            frame = forecastSingleFrame(frame, options.copy())

        outputFrame = frame if outputFrame is None else outputFrame.append(
            frame, ignore_index=True)

    return outputFrame
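
A hedged usage sketch. The option keys below are the ones this function reads via params.getParam; whether a plain dict is accepted depends on params.getParam, so treat that, and the column names, as assumptions.

options = {
    'sortColumns': ['date'],        # ordering within each subframe
    'splitColumns': ['store_id'],   # one forecast per group
    'method': 'Auto',               # compare ARIMA, Prophet, MLR and the ensemble
    'targetColumn': 'sales',
    'numHoldoutRows': 4,
}
forecasts = splitFramesAndForecast(sales_df, options)  # sales_df is hypothetical
print(forecasts[['X_METHOD', 'X_MAPE']].drop_duplicates())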
Example No. 50
0
    def balance(self, df: pd.DataFrame, target: str):
        '''
            Balance the data frame by oversampling the minority class.
        :param df: pd.DataFrame
            The pandas DataFrame to apply the balancer to.
        :param target: str
            The name of the target column.
        :return: pd.DataFrame
            The original data frame with synthetic minority-class samples appended.
        '''

        # Creating an internal copy of the data frame.
        self.df = df.copy()
        self.target = target

        # Checking if the target column is present in the data frame.
        if target not in self.df.columns:
            raise NoSuchColumn(f"{target} isn't a column of passed data frame")

        # Checking if the target column is a binary one.
        if len(self.df[target].unique()) != 2:
            raise NotBinaryData(f"{target} column isn't a binary column")

        # Getting the column names that are not the target one.
        self.X_columns = [
            column for column in self.df.columns if column != target
        ]

        # Getting the class frequencies.
        classes_frequency = dict(self.df[target].value_counts())

        # Searching for the class with the biggest frequency.
        max_freq = 0
        for cls in classes_frequency:
            if classes_frequency[cls] > max_freq:
                majority_class = cls
                max_freq = classes_frequency[cls]

        # Getting the name of the minority class.
        minority_class = [
            cls for cls in classes_frequency if cls != majority_class
        ][0]

        # Getting the total number of minority samples to generate.
        G = int((classes_frequency[majority_class] -
                 classes_frequency[minority_class]) * self.__beta)

        # Getting the set of the minority samples.
        minority_samples = self.df[self.df[target] == minority_class][
            self.X_columns].values

        # Generating the r matrix - the k indexes of the nearest neighbours.
        r = np.array([])
        self.neighbourhood = []
        for minority_sample in minority_samples:
            predicted_indexes = self.__predict_knn(minority_sample)
            r = np.append(
                r,
                len(self.df[(self.df.index.isin(predicted_indexes) &
                             (self.df[self.target] == majority_class))]) /
                self.__k)
            self.neighbourhood.append(predicted_indexes)

        # Normalizing the r array
        r = r / np.sum(r)

        # Calculating the amount of synthetic examples to generate per neighbourhood.
        G = r * G

        # Generating the synthetic data.
        self.synthetic_data = []
        for i in range(len(G)):
            for _ in range(floor(G[i])):
                choices = self.df.iloc[self.neighbourhood[i], :][self.df[
                    self.target] == minority_class][self.X_columns].values
                if len(choices) < 2:
                    continue
                choices = choices[np.random.randint(len(choices), size=2)]
                s = choices[0] + (choices[1] - choices[0]) * random.uniform(
                    0, 1)
                self.synthetic_data.append(s)

        # Replacing infinity values with minimal and maximal float python values.
        self.synthetic_data = self.__infinity_check(
            np.array(self.synthetic_data).astype(float))

        # Creating the synthetic data frame
        self.synthetic_df = pd.DataFrame(np.array(self.synthetic_data),
                                         columns=self.X_columns)

        # Rounding binary columns if needed.
        if self.__binarize:
            self.__to_binary()

        # Adding the target column
        self.synthetic_df.loc[:, self.target] = minority_class
        new_df = pd.concat([self.df, self.synthetic_df], axis=0)
        return new_df
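
A hypothetical usage sketch. The surrounding class is not shown here; judging by the private __k, __beta and __binarize attributes it is an ADASYN-style oversampler, so the class name and constructor arguments below are assumptions.

balancer = Balancer(k=5, beta=1.0, binarize=False)  # hypothetical constructor
balanced = balancer.balance(df, target='label')     # df needs a binary 'label' column
print(balanced['label'].value_counts())             # classes should now be roughly even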
Example No. 51
0
def fill_fields_and_timeseries_from_column(
    log: BoundLogger,
    existing_df: pd.DataFrame,
    new_df: pd.DataFrame,
    index_fields: List[str],
    date_field: str,
    column_to_fill: str,
) -> pd.DataFrame:
    """
    Return a copy of existing_df with column column_to_fill populated from new_df. Values in existing_df are copied
    to the return value except for column_to_fill of rows with index_fields present in new_df.

    If the data frames represent timeseries then pass the name of the time column in date_field. This will clear
    'column_to_fill' for all times for each index_fields in new_df. This prevents the return value from containing
    timeseries with a blend of values from existing_df and new_df.

    See examples in dataset_utils_test.py

    Args:
        log: a bound structlog logger.
        existing_df: Existing data frame
        new_df: Data used to fill existing df columns
        index_fields: List of columns to use as common index.
        date_field: the time column name if the data frames represent timeseries, otherwise ''
        column_to_fill: column to add into existing_df from data_source

    Returns: Updated DataFrame with requested column filled from data_source data.
    """
    # Here is a nice tutorial on indexing:
    # https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html

    # Copy so this code can work on the data inplace without modifying the inputs.
    existing_df = existing_df.copy()
    new_df = new_df.copy()
    if column_to_fill not in existing_df.columns:
        existing_df[column_to_fill] = None

    if date_field:
        _clear_common_values(log, existing_df, new_df, index_fields,
                             column_to_fill)
        # From here down treat the date as part of the index label for joining rows of existing_df and new_df
        index_fields.append(date_field)

    new_df.set_index(index_fields, inplace=True)
    if not existing_df.empty:
        existing_df.set_index(index_fields, inplace=True)
        common_labels = existing_df.index.intersection(new_df.index)
    else:
        # Treat an empty existing_df the same as one that has no rows in common with new_df
        common_labels = []

    if len(common_labels):
        # existing_df is not empty and contains labels in common with new_df. When date_field is set, the date is
        # included in the compared labels, and dates that are not in existing_df are appended later.

        # Sort suggested by 'PerformanceWarning: indexing past lexsort depth may impact performance'.
        # common_labels is a sparse subset of all labels in both DataFrames, and the values are looked up
        # one by one.
        existing_df.sort_index(inplace=True, sort_remaining=True)
        new_df.sort_index(inplace=True, sort_remaining=True)

        # TODO(tombrown): I have a hunch that this is mostly copying NaN values. Check and consider optimizing by
        # ignoring rows without a real value in column_to_fill.
        existing_df.loc[common_labels.values,
                        column_to_fill] = new_df.loc[common_labels.values,
                                                     column_to_fill]
        diff = new_df.index.difference(common_labels)

        # If there are no missing fields, simply return existing dataframe (by this point all fields
        # have been merged in).
        if not diff.size:
            return existing_df.reset_index()

        missing_new_data = new_df.loc[diff, [column_to_fill]]
    else:
        # There are no labels in common so all rows of new_df are to be appended to existing_df.
        missing_new_data = new_df.loc[:, [column_to_fill]]

    # Revert 'fips', 'state' etc back to regular columns
    existing_df.reset_index(inplace=True)
    missing_new_data.reset_index(inplace=True)
    # Concat the existing data with new rows from new_data, creating a new integer index
    return pd.concat([existing_df, missing_new_data], ignore_index=True)
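
A minimal sketch of the non-timeseries path (date_field=''), assuming structlog provides the BoundLogger:

import pandas as pd
import structlog

existing = pd.DataFrame({"fips": ["01", "02"], "cases": [1.0, 2.0]})
new = pd.DataFrame({"fips": ["02", "03"], "cases": [5.0, 7.0]})
merged = fill_fields_and_timeseries_from_column(
    structlog.get_logger(), existing, new,
    index_fields=["fips"], date_field="", column_to_fill="cases")
print(merged)  # fips 01 keeps 1.0, fips 02 becomes 5.0, fips 03 is appended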
Example No. 52
0
    def test_compute_predictions_3(self, tmp_path):
        # Test with a historical prediction delta > 1.
        # This means historical predictions are not computed from the initial index one step at a time,
        # but are refreshed every $delta time points.
        ing_data = DataFrame({
            "a": pandas.date_range('2000-01-01', periods=30),
            "b": np.arange(30, 60),
            "c": np.arange(60, 90)
        })
        ing_data.set_index("a", inplace=True)
        ing_data = add_freq(ing_data, "D")

        param_config = {
            "input_parameters": {},
            "model_parameters": {
                "test_values": 2,
                "delta_training_percentage": 100,
                "prediction_lags": 10,
                "possible_transformations": "none",
                "models": "fbprophet,mockup",
                "main_accuracy_estimator": "mae",
            },
            "historical_prediction_parameters": {
                "initial_index": "2000-01-20",
                "save_path": os.path.join(tmp_path, "test3.pkl"),
                "delta": 3
            }
        }

        timeseries_containers = compute_historical_predictions(
            ingested_data=ing_data, param_config=param_config)

        assert len(timeseries_containers) == 2
        assert timeseries_containers[0].timeseries_data.columns[0] == "b"
        assert timeseries_containers[1].timeseries_data.columns[0] == "c"

        assert len(timeseries_containers[0].models) == 2
        assert len(timeseries_containers[1].models) == 2

        for s in timeseries_containers:
            scen_name = s.timeseries_data.columns[0]
            for model in s.historical_prediction:
                hist_prediction = s.historical_prediction[model]
                assert len(hist_prediction) == 10
                for pos, i in enumerate(pandas.date_range('2000-01-21', periods=10)):
                    assert hist_prediction.index[pos] == i

            for endpoint in pandas.date_range('2000-01-20', periods=4, freq="3d"):
                tr = ing_data.copy()
                fb_tr = tr.loc[:endpoint]
                fb_tr = fb_tr[[scen_name]]
                fbmodel = Prophet()
                fb_tr.reset_index(inplace=True)
                fb_tr.columns = ['ds', 'y']

                with suppress_stdout_stderr():
                    fbmodel.fit(fb_tr)

                future_df = pd.DataFrame(index=pd.date_range(
                    freq="1d",
                    start=endpoint + pandas.Timedelta(days=1),
                    periods=3),
                                         columns=["yhat"])
                future = future_df.reset_index()
                future.rename(columns={'index': 'ds'}, inplace=True)
                forecast = fbmodel.predict(future)
                forecast.set_index('ds', inplace=True)
                expected_hist_pred = forecast.loc[:, 'yhat']
                expected_hist_pred = expected_hist_pred.astype(object)
                expected_hist_pred.rename(scen_name, inplace=True)
                if endpoint == pd.Timestamp('2000-01-29 00:00:00'):
                    # last endpoint: only one day of data remains, so drop the last 2 points
                    expected_hist_pred = expected_hist_pred.iloc[0:1]

                computed_hist_pred = s.historical_prediction['fbprophet'].loc[
                    endpoint + pandas.Timedelta(days=1):endpoint +
                    pandas.Timedelta(days=3), scen_name]

                assert expected_hist_pred.equals(computed_hist_pred)
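
The endpoint arithmetic the assertions above re-derive: with initial_index 2000-01-20 and delta=3 over 10 historical points, the model is refit at four endpoints and each refit predicts the next three days.

import pandas

for endpoint in pandas.date_range('2000-01-20', periods=4, freq='3d'):
    print(endpoint.date(), '->',
          (endpoint + pandas.Timedelta(days=1)).date(), 'through',
          (endpoint + pandas.Timedelta(days=3)).date())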
Example No. 53
0
AND GL_ACCOUNT_NUMBER NOT IN (\'114000\', \'113000\', \'119800\')\
AND ACCOUNTING_DOCUMENT_TYPE IN (\'ZR\',\'DA\',\'DR\',\'ZM\',\'DG\',\'DZ\',\'RC\',\
\'RD\',\'RV\',\'DS\', \'KN\')\
AND (CLEARING_DOCUMENT_NUMBER IS NOT NULL \
AND CLEARING_DOCUMENT_NUMBER <> \' \'  AND CLEARING_DOCUMENT_NUMBER <> \'\')\
AND (REVERSAL_INDICATOR IS NULL OR REVERSAL_INDICATOR = \'\' OR REVERSAL_INDICATOR = \' \') \
ORDER BY "AMOUNT_IN_GROUP_CONSOLIDATED"\
) as inv \
LEFT JOIN "_SYS_BIC"."HCDW.IT.SHARED/D_CUSTOMER" as cust \
ON inv.CUSTOMER_NUMBER = cust.CUSTOMER_NUMBER \
LEFT JOIN "_SYS_BIC"."HCDW.IT.SHARED/D_ENTERPRISE_DATE" as d on inv.POSTING_DATE = d.full_date_trimmed \
ORDER BY "POSTING_DATE" ')
df = DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
##df.to_csv(r'C:\Users\cthieme\OneDrive - Micron Technology, Inc\Test folder\test_ar_data_export.csv', index = False)
hana_data_2019 = df.copy()

#Pulling 2018 Data from Hana #######################################
connection = pyhdb.connect(host="xx.xxx.com",
                           port=xxx,
                           user="******",
                           password="******")
cursor = connection.cursor()
cursor.execute('SELECT inv."CUSTOMER_NUMBER", cust."CUSTOMER_NAME_1",\
cust."ACCOUNT_GROUP",\
cust."COUNTRY",\
inv."FISCAL_QUARTER",\
inv."FISCAL_PERIOD",\
d."FISCAL_DAY_OF_QUARTER_NUMBER",\
d."FISCAL_DAY_OF_PERIOD_NUMBER",\
d."WORK_WEEK",\
Example No. 54
0
    def test_unstack_nan_index(self):  # GH7466
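        # "val != val" is the NaN check: NaN is the only value unequal to itself,
        # and the cast renders it as an empty string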
        cast = lambda val: "{0:1}".format("" if val != val else val)

        def verify(df):
            mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
            rows, cols = df.notna().values.nonzero()
            for i, j in zip(rows, cols):
                left = sorted(df.iloc[i, j].split("."))
                right = mk_list(df.index[i]) + mk_list(df.columns[j])
                right = sorted(list(map(cast, right)))
                assert left == right

        df = DataFrame({
            "jim": ["a", "b", np.nan, "d"],
            "joe": ["w", "x", "y", "z"],
            "jolie": ["a.w", "b.x", " .y", "d.z"],
        })

        left = df.set_index(["jim", "joe"]).unstack()["jolie"]
        right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
        tm.assert_frame_equal(left, right)

        for idx in itertools.permutations(df.columns[:2]):
            mi = df.set_index(list(idx))
            for lev in range(2):
                udf = mi.unstack(level=lev)
                assert udf.notna().values.sum() == len(df)
                verify(udf["jolie"])

        df = DataFrame({
            "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 +
            ["e"] * 2 + ["b"] * 5,
            "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 +
            [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2,
            "3rd": [
                67,
                39,
                53,
                72,
                57,
                80,
                31,
                18,
                11,
                30,
                59,
                50,
                62,
                59,
                76,
                52,
                14,
                53,
                60,
                51,
            ],
        })

        df["4th"], df["5th"] = (
            df.apply(lambda r: ".".join(map(cast, r)), axis=1),
            df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
        )

        for idx in itertools.permutations(["1st", "2nd", "3rd"]):
            mi = df.set_index(list(idx))
            for lev in range(3):
                udf = mi.unstack(level=lev)
                assert udf.notna().values.sum() == 2 * len(df)
                for col in ["4th", "5th"]:
                    verify(udf[col])

        # GH7403
        df = pd.DataFrame({
            "A": list("aaaabbbb"),
            "B": range(8),
            "C": range(8)
        })
        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [
            [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
        ]
        vals = list(map(list, zip(*vals)))
        idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
        cols = MultiIndex(levels=[["C"], ["a", "b"]],
                          codes=[[0, 0], [0, 1]],
                          names=[None, "A"])

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        df = DataFrame({
            "A": list("aaaabbbb"),
            "B": list(range(4)) * 2,
            "C": range(8)
        })
        df.iloc[2, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
        cols = MultiIndex(levels=[["C"], ["a", "b"]],
                          codes=[[0, 0], [0, 1]],
                          names=[None, "A"])
        idx = Index([np.nan, 0, 1, 2, 3], name="B")
        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        df = pd.DataFrame({
            "A": list("aaaabbbb"),
            "B": list(range(4)) * 2,
            "C": range(8)
        })
        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack(0)

        vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
        cols = MultiIndex(levels=[["C"], ["a", "b"]],
                          codes=[[0, 0], [0, 1]],
                          names=[None, "A"])
        idx = Index([np.nan, 0, 1, 2, 3], name="B")
        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # GH7401
        df = pd.DataFrame({
            "A":
            list("aaaaabbbbb"),
            "B": (date_range("2012-01-01", periods=5).tolist() * 2),
            "C":
            np.arange(10),
        })

        df.iloc[3, 1] = np.NaN
        left = df.set_index(["A", "B"]).unstack()

        vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
        idx = Index(["a", "b"], name="A")
        cols = MultiIndex(
            levels=[["C"], date_range("2012-01-01", periods=5)],
            codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
            names=[None, "B"],
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        # GH4862
        vals = [
            ["Hg", np.nan, np.nan, 680585148],
            ["U", 0.0, np.nan, 680585148],
            ["Pb", 7.07e-06, np.nan, 680585148],
            ["Sn", 2.3614e-05, 0.0133, 680607017],
            ["Ag", 0.0, 0.0133, 680607017],
            ["Hg", -0.00015, 0.0133, 680607017],
        ]
        df = DataFrame(
            vals,
            columns=["agent", "change", "dosage", "s_id"],
            index=[17263, 17264, 17265, 17266, 17267, 17268],
        )

        left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()

        vals = [
            [np.nan, np.nan, 7.07e-06, np.nan, 0.0],
            [0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
        ]

        idx = MultiIndex(
            levels=[[680585148, 680607017], [0.0133]],
            codes=[[0, 1], [-1, 0]],
            names=["s_id", "dosage"],
        )

        cols = MultiIndex(
            levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
            codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
            names=[None, "agent"],
        )

        right = DataFrame(vals, columns=cols, index=idx)
        tm.assert_frame_equal(left, right)

        left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
        tm.assert_frame_equal(left.unstack(), right)

        # GH9497 - multiple unstack with nulls
        df = DataFrame({
            "1st": [1, 2, 1, 2, 1, 2],
            "2nd": pd.date_range("2014-02-01", periods=6, freq="D"),
            "jim": 100 + np.arange(6),
            "joe": (np.random.randn(6) * 10).round(2),
        })

        df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
        df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
        df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan

        left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
        assert left.notna().values.sum() == 2 * len(df)

        for col in ["jim", "joe"]:
            for _, r in df.iterrows():
                key = r["1st"], (col, r["2nd"], r["3rd"])
                assert r[col] == left.loc[key]
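
The behavior under test, in miniature: a NaN in an index level survives the unstack and stays addressable as a NaN label.

df = pd.DataFrame({"A": list("aabb"), "B": [0, np.nan, 0, 1], "C": range(4)})
print(df.set_index(["A", "B"]).unstack(0))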
Example No. 55
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:

        print("Constructing captions")
        t0 = time.time()

        X = X.copy()

        def __extract_hashtags(string):

            try:
                hashtags = [
                    re.sub(r"(\W+)$", "", j) for j in set(
                        [i for i in string.split() if i.startswith("#")])
                ]
            except:
                return np.nan
            if len(hashtags) == 0:
                return np.nan
            else:
                return hashtags

        def __extract_pcredits(row):
            string = row['caption']

            try:
                pcredits = [
                    re.sub(r"(\W+)$", "", j) for j in set(
                        [i for i in string.split() if i.startswith("@")])
                ]
                pcredits.append('@' + row['credits'])
            except:
                return np.nan

            return pcredits

        def __hashtagQC(hashtags):

            hashstring = ''.join(hashtags).lower()
            for word in self.hashtagQClist:
                if word in hashstring:
                    return np.nan
            return [e.lower() for e in hashtags if len(e) > 1]

        def __generate_repost_caption(row):

            caption = row['caption']
            date = row['postdate']
            credit = row['credits']
            a = re.split("[.!]+", caption)[0]

            # Replace a trailing '?' with '!'; otherwise append '!'.
            if a.endswith('?'):
                a = a[:-1] + '!'
            else:
                a = a + '!'

            a = re.sub(r'[@$"]', '', a)

            post_date = f"{date.month:02d}-{date.day:02d}-{date.year}"

            comment = f'Here is a segment from the original post, by @{credit} on {post_date}: "{a}"'
            return comment

        X['hashtags'] = X['caption'].apply(__extract_hashtags)
        X['pcredits'] = X.apply(lambda row: __extract_pcredits(row), axis=1)
        X['postdate'] = pd.to_datetime(X['postdate'])

        X.dropna(inplace=True)

        X['hashtags'] = X['hashtags'].apply(__hashtagQC)
        X['repost_comment'] = X.apply(
            lambda row: __generate_repost_caption(row), axis=1)

        X.dropna(inplace=True)

        X.reset_index(inplace=True, drop=True)
        t1 = time.time()

        print(f'Done in {t1-t0} seconds')

        return X
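
A hedged usage sketch: transform reads 'caption', 'postdate' and 'credits' columns, and self.hashtagQClist is assumed to be populated elsewhere in the class; the instance name is hypothetical.

posts = pd.DataFrame({
    'caption': ['Sunset over the bay! Loving it #sunset #nofilter pic by @jane'],
    'postdate': ['2021-06-01'],
    'credits': ['jane'],
})
reposts = caption_builder.transform(posts)  # caption_builder: hypothetical instance
print(reposts['repost_comment'].iloc[0])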
Example No. 56
0
def get_cubes(
    p: pd.DataFrame,
    grid: float = 1.0,
) -> pd.DataFrame:
    v = correctlyRotateDataFrame(p.copy())

    v["count"] = 0

    counts = pd.DataFrame(data={
        "x": [],
        "y": [],
        "z": [],
        "r": [],
        "g": [],
        "b": [],
        "c": [],
    })

    # Record the bounding box in module-level globals.
    global global_minX, global_maxX, global_minY, global_maxY
    global_minX = min(v['x'])
    global_maxX = max(v['x'])
    global_minY = min(v['y'])
    global_maxY = max(v['y'])

    for x, y, z in itertools.product(
            np.arange(min(v["x"]), max(v["x"]), grid),
            np.arange(min(v["y"]), max(v["y"]), grid),
            np.arange(min(v["z"]), max(v["z"]), grid),
    ):
        overscan = 0.0
        dots = v[(v["x"] >= x - overscan)
                 & (v["x"] < x + grid + overscan)
                 & (v["y"] >= y - overscan)
                 & (v["y"] < y + grid + overscan)
                 & (v["z"] >= z - overscan)
                 & (v["z"] < z + grid + overscan)]
        count = len(dots)
        counts = counts.append(
            {
                "x": x,
                "y": y,
                "z": z,
                "r": dots["r"].mean(),
                "g": dots["g"].mean(),
                "b": dots["b"].mean(),
                "c": count,
            },
            ignore_index=True,
        )
        #        counts += [count]
        v.loc[(v["x"] >= x)
              & (v["x"] < x + grid)
              & (v["y"] >= y)
              & (v["y"] < y + grid)
              & (v["z"] >= z)
              & (v["z"] < z + grid), "count", ] = count
        # print(f"{x}, {y}: {count}")

    v.loc[v["count"] > 20, "count"] = 20
    v = v[v["count"] > 5]

    return counts
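
A hypothetical call: get_cubes expects a point cloud with x/y/z positions and r/g/b colours, and correctlyRotateDataFrame is assumed to be defined alongside it.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
cloud = pd.DataFrame(rng.random((500, 6)), columns=list("xyzrgb"))
cubes = get_cubes(cloud, grid=0.25)
print(cubes.sort_values("c", ascending=False).head())  # densest cells first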
Example No. 57
0
def test_clean_input_format(df_countries: pd.DataFrame) -> None:
    df_clean_name = clean_country(df_countries,
                                  "messy_country",
                                  input_format="name")
    df_clean_official = clean_country(df_countries,
                                      "messy_country",
                                      input_format="official")
    df_clean_alpha2 = clean_country(df_countries,
                                    "messy_country",
                                    input_format="alpha-2")
    df_clean_alpha3 = clean_country(df_countries,
                                    "messy_country",
                                    input_format="alpha-3")
    df_clean_numeric = clean_country(df_countries,
                                     "messy_country",
                                     input_format="numeric")

    df_check_name_and_official = df_countries.copy()
    df_check_name_and_official["messy_country_clean"] = [
        "Canada",
        "Canada",
        np.nan,
        np.nan,
        "Ireland",
        "DR Congo",
        "Congo Republic",
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
    ]
    df_check_alpha2 = df_countries.copy()
    df_check_alpha2["messy_country_clean"] = [
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        "American Samoa",
        "Turkey",
        "Belize",
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
    ]
    df_check_alpha3 = df_countries.copy()
    df_check_alpha3["messy_country_clean"] = [
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        "Argentina",
        "Bouvet Island",
        "New Zealand",
        np.nan,
        np.nan,
        np.nan,
    ]
    df_check_numeric = df_countries.copy()
    df_check_numeric["messy_country_clean"] = [
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        "Greenland",
        "Estonia",
        "Yemen",
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
        np.nan,
    ]

    assert df_clean_name.equals(df_check_name_and_official)
    assert df_clean_official.equals(df_check_name_and_official)
    assert df_clean_alpha2.equals(df_check_alpha2)
    assert df_clean_alpha3.equals(df_check_alpha3)
    assert df_clean_numeric.equals(df_check_numeric)
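
If these tests come from dataprep (the signature matches dataprep.clean.clean_country), a quick interactive check looks like this; treat the import path as an assumption:

import pandas as pd
from dataprep.clean import clean_country

demo = pd.DataFrame({"messy_country": ["canada", "XX", "Ireland"]})
print(clean_country(demo, "messy_country", input_format="name"))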
Example No. 58
0
    def test_indexing_with_datetime_tz(self):

        # GH#8260
        # support datetime64 with tz

        idx = Index(date_range("20130101", periods=3, tz="US/Eastern"),
                    name="foo")
        dr = date_range("20130110", periods=3)
        df = DataFrame({"A": idx, "B": dr})
        df["C"] = idx
        df.iloc[1, 1] = pd.NaT
        df.iloc[1, 2] = pd.NaT

        # indexing
        result = df.iloc[1]
        expected = Series(
            [
                Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), pd.NaT,
                pd.NaT
            ],
            index=list("ABC"),
            dtype="object",
            name=1,
        )
        tm.assert_series_equal(result, expected)
        result = df.loc[1]
        expected = Series(
            [
                Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), pd.NaT,
                pd.NaT
            ],
            index=list("ABC"),
            dtype="object",
            name=1,
        )
        tm.assert_series_equal(result, expected)

        # indexing - fast_xs
        df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")})
        result = df.iloc[5]
        expected = Series([Timestamp("2014-01-06 00:00:00+0000", tz="UTC")],
                          index=["a"],
                          name=5)
        tm.assert_series_equal(result, expected)

        result = df.loc[5]
        tm.assert_series_equal(result, expected)

        # indexing - boolean
        result = df[df.a > df.a[3]]
        expected = df.iloc[4:]
        tm.assert_frame_equal(result, expected)

        # indexing - setting an element
        df = DataFrame(
            data=pd.to_datetime(["2015-03-30 20:12:32",
                                 "2015-03-12 00:11:11"]),
            columns=["time"],
        )
        df["new_col"] = ["new", "old"]
        df.time = df.set_index("time").index.tz_localize("UTC")
        v = df[df.new_col == "new"].set_index("time").index.tz_convert(
            "US/Pacific")

        # trying to set a single element on a part of a different timezone
        # this converts to object
        df2 = df.copy()
        df2.loc[df2.new_col == "new", "time"] = v

        expected = Series([v[0], df.loc[1, "time"]], name="time")
        tm.assert_series_equal(df2.time, expected)

        v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s")
        df.loc[df.new_col == "new", "time"] = v
        tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v)
Example No. 59
0
    def test_frame_iloc_setitem_callable(self):
        # GH#11485
        df = DataFrame({
            "X": [1, 2, 3, 4],
            "Y": list("aabb")
        },
                       index=list("ABCD"))

        # return location
        res = df.copy()
        res.iloc[lambda x: [1, 3]] = 0
        exp = df.copy()
        exp.iloc[[1, 3]] = 0
        tm.assert_frame_equal(res, exp)

        res = df.copy()
        res.iloc[lambda x: [1, 3], :] = -1
        exp = df.copy()
        exp.iloc[[1, 3], :] = -1
        tm.assert_frame_equal(res, exp)

        res = df.copy()
        res.iloc[lambda x: [1, 3], lambda x: 0] = 5
        exp = df.copy()
        exp.iloc[[1, 3], 0] = 5
        tm.assert_frame_equal(res, exp)

        res = df.copy()
        res.iloc[lambda x: [1, 3], lambda x: [0]] = 25
        exp = df.copy()
        exp.iloc[[1, 3], [0]] = 25
        tm.assert_frame_equal(res, exp)

        # mixture
        res = df.copy()
        res.iloc[[1, 3], lambda x: 0] = -3
        exp = df.copy()
        exp.iloc[[1, 3], 0] = -3
        tm.assert_frame_equal(res, exp)

        res = df.copy()
        res.iloc[[1, 3], lambda x: [0]] = -5
        exp = df.copy()
        exp.iloc[[1, 3], [0]] = -5
        tm.assert_frame_equal(res, exp)

        res = df.copy()
        res.iloc[lambda x: [1, 3], 0] = 10
        exp = df.copy()
        exp.iloc[[1, 3], 0] = 10
        tm.assert_frame_equal(res, exp)

        res = df.copy()
        res.iloc[lambda x: [1, 3], [0]] = [-5, -5]
        exp = df.copy()
        exp.iloc[[1, 3], [0]] = [-5, -5]
        tm.assert_frame_equal(res, exp)
Example No. 60
0
    def test_equals(self):
        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
        s2 = s1.copy()
        self.assertTrue(s1.equals(s2))

        s1[1] = 99
        self.assertFalse(s1.equals(s2))

        # NaNs compare as equal
        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
        s2 = s1.copy()
        self.assertTrue(s1.equals(s2))

        s2[0] = 9.9
        self.assertFalse(s1.equals(s2))

        idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
        s1 = Series([1, 2, np.nan], index=idx)
        s2 = s1.copy()
        self.assertTrue(s1.equals(s2))

        # Add object dtype column with nans
        index = np.random.random(10)
        df1 = DataFrame(np.random.random(10),
                        index=index,
                        columns=['floats'])
        df1['text'] = 'the sky is so blue. we could use more chocolate.'.split()
        df1['start'] = date_range('2000-1-1', periods=10, freq='T')
        df1['end'] = date_range('2000-1-1', periods=10, freq='D')
        df1['diff'] = df1['end'] - df1['start']
        df1['bool'] = (np.arange(10) % 3 == 0)
        df1.iloc[::2] = np.nan
        df2 = df1.copy()
        self.assertTrue(df1['text'].equals(df2['text']))
        self.assertTrue(df1['start'].equals(df2['start']))
        self.assertTrue(df1['end'].equals(df2['end']))
        self.assertTrue(df1['diff'].equals(df2['diff']))
        self.assertTrue(df1['bool'].equals(df2['bool']))
        self.assertTrue(df1.equals(df2))
        self.assertFalse(df1.equals(object))

        # different dtype
        different = df1.copy()
        different['floats'] = different['floats'].astype('float32')
        self.assertFalse(df1.equals(different))

        # different index
        different_index = -index
        different = df2.set_index(different_index)
        self.assertFalse(df1.equals(different))

        # different columns
        different = df2.copy()
        different.columns = df2.columns[::-1]
        self.assertFalse(df1.equals(different))

        # DatetimeIndex
        index = pd.date_range('2000-1-1', periods=10, freq='T')
        df1 = df1.set_index(index)
        df2 = df1.copy()
        self.assertTrue(df1.equals(df2))

        # MultiIndex
        df3 = df1.set_index(['text'], append=True)
        df2 = df1.set_index(['text'], append=True)
        self.assertTrue(df3.equals(df2))

        df2 = df1.set_index(['floats'], append=True)
        self.assertFalse(df3.equals(df2))

        # NaN in index
        df3 = df1.set_index(['floats'], append=True)
        df2 = df1.set_index(['floats'], append=True)
        self.assertTrue(df3.equals(df2))
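
The property the NaN assertions rely on, in isolation: equals treats NaNs in matching positions as equal, while elementwise comparison does not.

s = pd.Series([1.0, np.nan])
assert s.equals(s.copy())         # NaNs in the same slots compare equal
assert not (s == s.copy()).all()  # but elementwise, NaN != NaN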