def test_append_concat(self):
    # Appending a Series/DataFrame/DatetimeIndex to itself should simply
    # tile the original datetime index.
    rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
    series = Series(np.random.randn(len(rng)), rng)
    frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

    doubled_index = DatetimeIndex(np.tile(rng.values, 2))
    tm.assert_index_equal(series.append(series).index, doubled_index)
    tm.assert_index_equal(frame.append(frame).index, doubled_index)
    tm.assert_index_equal(rng.append(rng), doubled_index)

    tripled_index = DatetimeIndex(np.tile(rng.values, 3))
    tm.assert_index_equal(rng.append([rng, rng]), tripled_index)

    # different index names: kept only when both sides agree
    rng1 = rng.copy()
    rng2 = rng.copy()
    rng1.name = 'foo'
    rng2.name = 'bar'
    assert rng1.append(rng1).name == 'foo'
    assert rng1.append(rng2).name is None
def test_query_with_nested_special_character(self, parser, engine):
    # An '&' inside a quoted literal must be treated as data, not as the
    # boolean operator.
    skip_if_no_pandas_parser(parser)
    frame = DataFrame({'a': ['a', 'b', 'test & test'],
                       'b': [1, 2, 3]})
    result = frame.query('a == "test & test"', parser=parser, engine=engine)
    expected = frame[frame.a == 'test & test']
    assert_frame_equal(result, expected)
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """Return a confusion matrix as a labelled DataFrame.

    Parameters:
        target (array): the ground-truth values.
        predicted (array): the predicted values.
        normalize (bool): if True, normalize each row to sum to 1.
        sort (bool): if True, order rows and columns by each row's
            maximum value, descending.

    Returns (DataFrame): the (optionally normalized and sorted)
        confusion matrix, labelled by the unique target values.
    """
    # np.unique returns the sorted unique values; these act as the
    # row/column labels.
    labels = np.unique(list(target))

    # Build the raw confusion matrix and wrap it in a labelled frame.
    # `labels` is keyword-only in current scikit-learn releases.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)

    if normalize:
        # Normalize each row so entries read as per-class rates.
        confusion = confusion.apply(lambda row: row / np.sum(row), axis=1)

    if sort:
        # Order both axes by each row's maximum, descending.
        # Series.sort(inplace=...) was removed from pandas; sort_values
        # is the supported equivalent.
        order = confusion.max(axis=1).sort_values(ascending=False).index
        confusion = confusion.loc[order, order]

    return confusion
class TestDataFrameEvalWithFrame(object):
    """DataFrame.eval behavior against a shared random frame."""

    def setup_method(self, method):
        # Fresh random data for every test.
        self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc'))

    def teardown_method(self, method):
        del self.frame

    def test_simple_expr(self, parser, engine):
        actual = self.frame.eval('a + b', engine=engine, parser=parser)
        desired = self.frame.a + self.frame.b
        assert_series_equal(actual, desired)

    def test_bool_arith_expr(self, parser, engine):
        actual = self.frame.eval('a[a < 1] + b', engine=engine,
                                 parser=parser)
        desired = self.frame.a[self.frame.a < 1] + self.frame.b
        assert_series_equal(actual, desired)

    @pytest.mark.parametrize('op', ['+', '-', '*', '/'])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        # Arithmetic between numeric and string columns must raise.
        frame = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
        with pytest.raises(TypeError, match=msg):
            frame.eval('a {0} b'.format(op), engine=engine, parser=parser)
def test_nested_scope(self):
    # Names in query/eval strings must resolve from the enclosing Python
    # scope ('@name' for query, bare names for pd.eval).
    # NOTE: the locals MUST stay named df/df2 — the eval strings below
    # look them up by those names in this frame.
    engine = self.engine
    parser = self.parser
    skip_if_no_pandas_parser(parser)

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))

    expected = df[(df > 0) & (df2 > 0)]
    result = df.query('(@df > 0) & (@df2 > 0)', engine=engine,
                      parser=parser)
    assert_frame_equal(result, expected)

    result = pd.eval('df[df > 0 and df2 > 0]', engine=engine,
                     parser=parser)
    assert_frame_equal(result, expected)

    result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
                     engine=engine, parser=parser)
    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    assert_frame_equal(result, expected)

    result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
    expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
    assert_frame_equal(result, expected)
def test_groupby_groups_datetimeindex(self): # GH#1430 periods = 1000 ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods) df = DataFrame({'high': np.arange(periods), 'low': np.arange(periods)}, index=ind) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) # it works! groups = grouped.groups assert isinstance(list(groups.keys())[0], datetime) # GH#11442 index = pd.date_range('2015/01/01', periods=5, name='date') df = pd.DataFrame({'A': [5, 6, 7, 8, 9], 'B': [1, 2, 3, 4, 5]}, index=index) result = df.groupby(level='date').groups dates = ['2015-01-05', '2015-01-04', '2015-01-03', '2015-01-02', '2015-01-01'] expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date') for date in dates} tm.assert_dict_equal(result, expected) grouped = df.groupby(level='date') for date in dates: result = grouped.get_group(date) data = [[df.loc[date, 'A'], df.loc[date, 'B']]] expected_index = pd.DatetimeIndex([date], name='date') expected = pd.DataFrame(data, columns=list('AB'), index=expected_index) tm.assert_frame_equal(result, expected)
def test_groupby_max_datetime64(self): # GH 5869 # datetimelike dtype conversion from int df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) expected = df.groupby('A')['A'].apply(lambda x: x.max()) result = df.groupby('A')['A'].max() assert_series_equal(result, expected)
def test_to_string_format_na(self):
    # NaN values must render as 'NaN', and the float column keeps a
    # uniform format with or without a fractional part.
    fmt.reset_printoptions()
    frame = DataFrame({'A': [np.nan, -1, -2.1234, 3, 4],
                       'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
    result = frame.to_string()
    expected = (' A B\n'
                '0 NaN NaN\n'
                '1 -1.0000 foo\n'
                '2 -2.1234 foooo\n'
                '3 3.0000 fooooo\n'
                '4 4.0000 bar')
    self.assertEqual(result, expected)

    frame = DataFrame({'A': [np.nan, -1., -2., 3., 4.],
                       'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
    result = frame.to_string()
    expected = (' A B\n'
                '0 NaN NaN\n'
                '1 -1 foo\n'
                '2 -2 foooo\n'
                '3 3 fooooo\n'
                '4 4 bar')
    self.assertEqual(result, expected)
def test_groupby_categorical_index_and_columns(self, observed):
    # GH18432: grouping on a CategoricalIndex works along both axes.
    labels = ['A', 'B', 'A', 'B']
    categories = ['B', 'A']
    data = np.ones((5, 4), int)
    cat_columns = CategoricalIndex(labels, categories=categories,
                                   ordered=True)
    frame = DataFrame(data=data, columns=cat_columns)
    result = frame.groupby(axis=1, level=0, observed=observed).sum()
    expected_data = 2 * np.ones((5, 2), int)

    if observed:
        # if we are not-observed we undergo a reindex
        # so need to adjust the output as our expected sets us up
        # to be non-observed
        expected_columns = CategoricalIndex(['A', 'B'],
                                            categories=categories,
                                            ordered=True)
    else:
        expected_columns = CategoricalIndex(categories,
                                            categories=categories,
                                            ordered=True)
    expected = DataFrame(data=expected_data, columns=expected_columns)
    assert_frame_equal(result, expected)

    # test transposed version
    frame = DataFrame(data.T, index=cat_columns)
    result = frame.groupby(axis=0, level=0, observed=observed).sum()
    expected = DataFrame(data=expected_data.T, index=expected_columns)
    assert_frame_equal(result, expected)
def test_to_string_repr_unicode(self):
    """to_string/repr must cope with unicode data and a missing stdin."""
    buf = StringIO()

    unicode_values = [u'\u03c3'] * 10
    unicode_values = np.array(unicode_values, dtype=object)
    df = DataFrame({'unicode': unicode_values})
    df.to_string(col_space=10, buf=buf)

    # it works!
    repr(df)

    # all repr lines of a unicode-indexed Series should be equally wide
    idx = Index(['abc', u'\u03c3a', 'aegdvg'])
    ser = Series(np.random.randn(len(idx)), idx)
    rs = repr(ser).split('\n')
    line_len = len(rs[0])
    for line in rs[1:]:
        try:
            # Python 2 bytes must be decoded before measuring; on
            # Python 3 str has no .decode, which is fine to skip.
            # (was a bare `except:` that silently swallowed everything)
            line = line.decode('utf-8')
        except (AttributeError, UnicodeError):
            pass
        self.assert_(len(line) == line_len)

    # it works even if sys.stdin in None; restore stdin in a finally so
    # a failing repr cannot poison later tests.
    try:
        sys.stdin = None
        repr(df)
    finally:
        sys.stdin = sys.__stdin__
def test_to_string_with_formatters_unicode(self):
    # A formatter keyed by a unicode column name must be looked up and
    # applied.
    frame = DataFrame({u'c/\u03c3': [1, 2, 3]})
    result = frame.to_string(formatters={u'c/\u03c3': lambda x: '%s' % x})
    expected = (u' c/\u03c3\n'
                '0 1\n'
                '1 2\n'
                '2 3')
    self.assertEqual(result, expected)
def test_eng_float_formatter(self):
    # Engineering notation: exponent style by default, SI prefixes with
    # use_eng_prefix=True, and a configurable number of decimals.
    frame = DataFrame({'A': [1.41, 141., 14100, 1410000.]})

    fmt.set_eng_float_format()
    result = frame.to_string()
    expected = (' A\n'
                '0 1.410E+00\n'
                '1 141.000E+00\n'
                '2 14.100E+03\n'
                '3 1.410E+06')
    self.assertEqual(result, expected)

    fmt.set_eng_float_format(use_eng_prefix=True)
    result = frame.to_string()
    expected = (' A\n'
                '0 1.410\n'
                '1 141.000\n'
                '2 14.100k\n'
                '3 1.410M')
    self.assertEqual(result, expected)

    fmt.set_eng_float_format(accuracy=0)
    result = frame.to_string()
    expected = (' A\n'
                '0 1E+00\n'
                '1 141E+00\n'
                '2 14E+03\n'
                '3 1E+06')
    self.assertEqual(result, expected)

    # restore global print options for later tests
    fmt.reset_printoptions()
def test_resample_anchored_intraday(self):
    # #1471, #1458: monthly/quarterly resampling of intraday data should
    # produce period-end anchored timestamps.
    rng = date_range('1/1/2012', '4/1/2012', freq='100min')
    frame = DataFrame(rng.month, index=rng)

    result = frame.resample('M')
    expected = frame.resample('M', kind='period').to_timestamp(how='end')
    tm.assert_frame_equal(result, expected)

    result = frame.resample('M', closed='left')
    exp = frame.tshift(1, freq='D').resample('M', kind='period')
    exp = exp.to_timestamp(how='end')
    tm.assert_frame_equal(result, exp)

    rng = date_range('1/1/2012', '4/1/2012', freq='100min')
    frame = DataFrame(rng.month, index=rng)

    result = frame.resample('Q')
    expected = frame.resample('Q', kind='period').to_timestamp(how='end')
    tm.assert_frame_equal(result, expected)

    result = frame.resample('Q', closed='left')
    expected = frame.tshift(1, freq='D').resample('Q', kind='period',
                                                  closed='left')
    expected = expected.to_timestamp(how='end')
    tm.assert_frame_equal(result, expected)

    # a span shorter than one bin collapses to a single row
    ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h')
    resampled = ts.resample('M')
    self.assert_(len(resampled) == 1)
def test_group_selection_cache():
    # GH 12839
    # nth, head, and tail should return the same result no matter which
    # is called first on a fresh groupby (no stale selection cache).
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    nth_expected = df.iloc[[0, 2]].set_index('A')

    def check(call_order):
        # run the named methods in order against one groupby object
        grp = df.groupby('A')
        for name in call_order:
            if name == 'nth':
                assert_frame_equal(grp.nth(0), nth_expected)
            elif name == 'head':
                assert_frame_equal(grp.head(n=2), df)
            else:
                assert_frame_equal(grp.tail(n=2), df)

    check(['head', 'nth'])
    check(['tail', 'nth'])
    check(['nth', 'head'])
    check(['nth', 'tail'])
def test_coercion_with_loc(self):
    # Assigning None through .loc must coerce the column dtype to the
    # fixture's expected result.
    for initial, coerced in self.EXPECTED_SINGLE_ROW_RESULTS:
        frame = DataFrame({'foo': initial})
        frame.loc[0, ['foo']] = None
        tm.assert_frame_equal(frame, DataFrame({'foo': coerced}))
def test_loc_setitem_frame_multiples(self): # multiple setting df = DataFrame({'A': ['foo', 'bar', 'baz'], 'B': Series( range(3), dtype=np.int64)}) rhs = df.loc[1:2] rhs.index = df.index[0:2] df.loc[0:1] = rhs expected = DataFrame({'A': ['bar', 'baz', 'baz'], 'B': Series( [1, 2, 2], dtype=np.int64)}) tm.assert_frame_equal(df, expected) # multiple setting with frame on rhs (with M8) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), 'val': Series( range(5), dtype=np.int64)}) expected = DataFrame({'date': [Timestamp('20000101'), Timestamp( '20000102'), Timestamp('20000101'), Timestamp('20000102'), Timestamp('20000103')], 'val': Series( [0, 1, 0, 1, 2], dtype=np.int64)}) rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs tm.assert_frame_equal(df, expected)
def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame({ 'date': pd.to_datetime([ '20121002', '20121007', '20130130', '20130202', '20130305', '20121002', '20121207', '20130130', '20130202', '20130305', '20130202', '20130305' ]), 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, 359, 801], 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] }).set_index('date') expected = ( df.groupby('user_id')['whole_cost'] .resample(freq) .sum(min_count=1) # XXX .dropna() .reorder_levels(['date', 'user_id']) .sort_index() .astype('int64') ) expected.name = 'whole_cost' result1 = df.sort_index().groupby([pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum() assert_series_equal(result1, expected) result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ 'whole_cost'].sum() assert_series_equal(result2, expected)
def test_frame_reset_index(self):
    # A tz-aware index must keep its timezone through a
    # reset_index/set_index round trip.
    idx = date_range("2012-06-02", periods=10,
                     tz=self.tzstr("US/Eastern"))
    frame = DataFrame(np.random.randn(len(idx)), idx)
    roundtripped = frame.reset_index().set_index("index")
    self.assertEqual(frame.index.tz, roundtripped.index.tz)
def test_frame_datetime64_handling_groupby(self): # it works! df = DataFrame([(3, np.datetime64('2012-07-03')), (3, np.datetime64('2012-07-04'))], columns=['a', 'date']) result = df.groupby('a').first() assert result['date'][3] == Timestamp('2012-07-03')
def test_frame_reset_index(self): dr = date_range('2012-06-02', periods=10, tz='US/Eastern') df = DataFrame(np.random.randn(len(dr)), dr) roundtripped = df.reset_index().set_index('index') xp = df.index.tz rs = roundtripped.index.tz self.assertEquals(xp, rs)
def test_stable_descending_sort(self): # GH #6399 df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], columns=['sort_col', 'order']) sorted_df = df.sort_values(by='sort_col', kind='mergesort', ascending=False) assert_frame_equal(df, sorted_df)
def test_join_aware(self):
    rng = date_range('1/1/2011', periods=10, freq='H')
    ts = Series(np.random.randn(len(rng)), index=rng)
    ts_utc = ts.tz_localize('utc')

    # mixing naive and aware must refuse to align, in either direction
    self.assertRaises(Exception, ts.__add__, ts_utc)
    self.assertRaises(Exception, ts_utc.__add__, ts)

    # outer join of two same-zone frames keeps the zone
    left = DataFrame(np.zeros((6, 3)),
                     index=date_range("2012-11-15 00:00:00", periods=6,
                                      freq="100L", tz="US/Central"))
    right = DataFrame(np.zeros((3, 3)),
                      index=date_range("2012-11-15 00:00:00", periods=3,
                                       freq="250L", tz="US/Central"),
                      columns=range(3, 6))
    result = left.join(right, how='outer')
    ex_index = left.index.union(right.index)
    self.assertTrue(result.index.equals(ex_index))
    self.assertTrue(result.index.tz.zone == 'US/Central')

    # non-overlapping
    rng = date_range("2012-11-15 00:00:00", periods=6, freq="H",
                     tz="US/Central")
    rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H",
                      tz="US/Eastern")
    result = rng.union(rng2)
    self.assertTrue(result.tz.zone == 'UTC')
def test_query_with_partially_named_multiindex(self, parser, engine):
    # Named levels are reachable by name in query strings; unnamed ones
    # via the positional ilevel_N spelling.
    skip_if_no_pandas_parser(parser)
    colors = np.random.choice(['red', 'green'], size=10)
    ratings = np.arange(10)
    index = MultiIndex.from_arrays([colors, ratings])
    index.names = [None, 'rating']
    df = DataFrame(np.random.randn(10, 2), index=index)

    rating_values = Series(df.index.get_level_values('rating').values,
                           index=index, name='rating')
    res = df.query('rating == 1', parser=parser, engine=engine)
    assert_frame_equal(res, df[rating_values == 1])

    res = df.query('rating != 1', parser=parser, engine=engine)
    assert_frame_equal(res, df[rating_values != 1])

    color_values = Series(df.index.get_level_values(0).values, index=index)
    res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
    assert_frame_equal(res, df[color_values == "red"])

    res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
    assert_frame_equal(res, df[color_values != "red"])
def test_parse_dates_noconvert_thousands(self):
    # see gh-14066: the thousands separator ('.') must not be applied to
    # fields that are being parsed as dates.
    data = 'a\n04.15.2016'
    expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
    result = self.read_csv(StringIO(data), parse_dates=['a'],
                           thousands='.')
    tm.assert_frame_equal(result, expected)

    exp_index = DatetimeIndex(['2016-04-15'], name='a')
    expected = DataFrame(index=exp_index)
    result = self.read_csv(StringIO(data), index_col=0, parse_dates=True,
                           thousands='.')
    tm.assert_frame_equal(result, expected)

    data = 'a,b\n04.15.2016,09.16.2013'
    expected = DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]],
                         columns=['a', 'b'])
    result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
                           thousands='.')
    tm.assert_frame_equal(result, expected)

    expected = expected.set_index(['a', 'b'])
    result = self.read_csv(StringIO(data), index_col=[0, 1],
                           parse_dates=True, thousands='.')
    tm.assert_frame_equal(result, expected)
def test_nested_scope(self):
    # Without the pandas parser, '@' references are a SyntaxError and
    # unknown bare names are UndefinedVariableError; pd.eval still
    # resolves frame locals by name.
    # NOTE: the locals MUST stay named df/df2 — the eval strings below
    # look them up by those names in this frame.
    from pandas.core.computation.ops import UndefinedVariableError
    engine = self.engine
    parser = self.parser

    # smoke test
    x = 1  # noqa
    result = pd.eval('x + 1', engine=engine, parser=parser)
    assert result == 2

    df = DataFrame(np.random.randn(5, 3))
    df2 = DataFrame(np.random.randn(5, 3))

    # don't have the pandas parser
    with pytest.raises(SyntaxError):
        df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

    with pytest.raises(UndefinedVariableError):
        df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

    expected = df[(df > 0) & (df2 > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                     parser=parser)
    assert_frame_equal(expected, result)

    expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
    result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                     engine=engine, parser=parser)
    assert_frame_equal(expected, result)
def test_groupby_function_tuple_1677(self): df = DataFrame(np.random.rand(100), index=date_range("1/1/2000", periods=100)) monthly_group = df.groupby(lambda x: (x.year, x.month)) result = monthly_group.mean() assert isinstance(result.index[0], tuple)
def test_query_single_element_booleans(self, parser, engine): columns = 'bid', 'bidsize', 'ask', 'asksize' data = np.random.randint(2, size=(1, len(columns))).astype(bool) df = DataFrame(data, columns=columns) res = df.query('bid & ask', engine=engine, parser=parser) expected = df[df.bid & df.ask] assert_frame_equal(res, expected)
def test_crosstab_margins(self):
    """crosstab(margins=True) must add correct 'All' row/column totals."""
    a = np.random.randint(0, 7, size=100)
    b = np.random.randint(0, 3, size=100)
    c = np.random.randint(0, 5, size=100)

    df = DataFrame({'a': a, 'b': b, 'c': c})

    result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
                      margins=True)

    self.assertEqual(result.index.names, ('a',))
    self.assertEqual(result.columns.names, ['b', 'c'])

    # column margin: per-'a' group sizes plus the grand total
    all_cols = result['All', '']
    exp_cols = df.groupby(['a']).size().astype('i8')
    # Series.append was removed from pandas; pd.concat is the
    # supported equivalent.
    exp_cols = pd.concat([exp_cols, Series([len(df)], index=['All'])])
    tm.assert_series_equal(all_cols, exp_cols)

    # row margin: per-('b', 'c') group sizes plus the grand total.
    # .ix was removed from pandas; use label-based .loc.
    all_rows = result.loc['All']
    exp_rows = df.groupby(['b', 'c']).size().astype('i8')
    exp_rows = pd.concat([exp_rows,
                          Series([len(df)], index=[('All', '')])])
    exp_rows = exp_rows.reindex(all_rows.index)
    exp_rows = exp_rows.fillna(0).astype(np.int64)
    tm.assert_series_equal(all_rows, exp_rows)
def components(self):
    """
    Return a dataframe of the components (days, hours, minutes,
    seconds, milliseconds, microseconds, nanoseconds) of the
    Timedeltas.

    Returns
    -------
    a DataFrame
    """
    from pandas import DataFrame

    columns = ['days', 'hours', 'minutes', 'seconds', 'milliseconds',
               'microseconds', 'nanoseconds']

    hasnans = self.hasnans
    if hasnans:
        # NaT has no components; emit a row of NaNs for it.
        def extract(td):
            if isnull(td):
                return [np.nan] * len(columns)
            return td.components
    else:
        def extract(td):
            return td.components

    frame = DataFrame([extract(td) for td in self])
    frame.columns = columns
    if not hasnans:
        # every row is a full component tuple, so ints are safe
        frame = frame.astype('int64')
    return frame
def test_time(self):
    # Plot a frame indexed by datetime.time; tick positions are seconds
    # since midnight and the rendered labels must agree with them.
    start = datetime(1, 1, 1, 3, 30, 0)
    deltas = np.random.randint(1, 20, 3).cumsum()
    ts = np.array([(start + timedelta(minutes=int(x))).time()
                   for x in deltas])
    df = DataFrame({'a': np.random.randn(len(ts)),
                    'b': np.random.randn(len(ts))},
                   index=ts)
    ax = df.plot()

    def check_tick_labels(axis):
        for tick, label in zip(axis.get_xticks(),
                               axis.get_xticklabels()):
            minutes, seconds = divmod(int(tick), 60)
            hours, minutes = divmod(minutes, 60)
            text = label.get_text()
            if len(text) > 0:
                expected = time(hours, minutes,
                                seconds).strftime('%H:%M:%S')
                self.assertEqual(text, expected)

    # verify tick labels
    check_tick_labels(ax)

    # change xlim, then check the tick labels again
    ax.set_xlim('1:30', '5:00')
    check_tick_labels(ax)