def test_categorical_df_concat_value_error(self):
    """Check that mismatched inputs make categorical_df_concat raise.

    Two bad inputs are exercised: a pair of frames whose 'B' columns have
    different dtypes (int64 vs float64), and a pair whose second column is
    named differently ('B' vs 'X'). Both must raise ValueError carrying
    the same message.
    """
    expected_message = "Input DataFrames must have the same columns/dtypes."

    def build_frame(letters, numeric_name, numbers, numeric_dtype):
        # Column 'A' is always categorical; only the second column's name
        # and dtype vary between the scenarios below.
        return pd.DataFrame({
            'A': pd.Series(letters, dtype='category'),
            numeric_name: pd.Series(numbers, dtype=numeric_dtype),
        })

    mismatched_dtypes = [
        build_frame(['a', 'b', 'c'], 'B', [100, 102, 103], 'int64'),
        build_frame(['c', 'b', 'd'], 'B', [103, 102, 104], 'float64'),
    ]
    mismatched_column_names = [
        build_frame(['a', 'b', 'c'], 'B', [100, 102, 103], 'int64'),
        build_frame(['c', 'b', 'd'], 'X', [103, 102, 104], 'int64'),
    ]

    for bad_frames in (mismatched_dtypes, mismatched_column_names):
        with self.assertRaises(ValueError) as cm:
            categorical_df_concat(bad_frames)
        # Checked outside the `with` block: code after the raising call
        # inside the block would never run.
        self.assertEqual(str(cm.exception), expected_message)
def test_categorical_df_concat_value_error(self):
    """categorical_df_concat rejects frames with differing dtypes or columns.

    Each scenario passes two frames that disagree either on the dtype of
    column 'B' or on the name of the second column, and expects a
    ValueError with a fixed message.
    """
    error_text = "Input DataFrames must have the same columns/dtypes."

    # Scenario 1: same column names, but 'B' is int64 in one frame and
    # float64 in the other.
    int_frame = pd.DataFrame(
        {
            'A': pd.Series(['a', 'b', 'c'], dtype='category'),
            'B': pd.Series([100, 102, 103], dtype='int64'),
        }
    )
    float_frame = pd.DataFrame(
        {
            'A': pd.Series(['c', 'b', 'd'], dtype='category'),
            'B': pd.Series([103, 102, 104], dtype='float64'),
        }
    )
    mismatched_dtypes = [int_frame, float_frame]

    # Scenario 2: same dtypes, but the second column is 'B' in one frame
    # and 'X' in the other.
    b_frame = pd.DataFrame(
        {
            'A': pd.Series(['a', 'b', 'c'], dtype='category'),
            'B': pd.Series([100, 102, 103], dtype='int64'),
        }
    )
    x_frame = pd.DataFrame(
        {
            'A': pd.Series(['c', 'b', 'd'], dtype='category'),
            'X': pd.Series([103, 102, 104], dtype='int64'),
        }
    )
    mismatched_column_names = [b_frame, x_frame]

    with self.assertRaises(ValueError) as cm:
        categorical_df_concat(mismatched_dtypes)
    raised_message = str(cm.exception)
    self.assertEqual(raised_message, error_text)

    with self.assertRaises(ValueError) as cm:
        categorical_df_concat(mismatched_column_names)
    raised_message = str(cm.exception)
    self.assertEqual(raised_message, error_text)
def run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize):
    """Run ``pipeline`` one date sub-range at a time and join the results.

    The sub-ranges come from ``compute_date_range_chunks``, which is given
    this object's ``_calendar`` together with ``start_date``, ``end_date``
    and ``chunksize``. ``run_pipeline`` is invoked once per (start, end)
    pair, and the per-chunk results are combined with
    ``categorical_df_concat``.

    Returns whatever ``categorical_df_concat`` returns for the list of
    per-chunk results.
    """
    date_ranges = compute_date_range_chunks(
        self._calendar,
        start_date,
        end_date,
        chunksize,
    )

    chunk_results = []
    for chunk_start, chunk_end in date_ranges:
        chunk_results.append(
            self.run_pipeline(pipeline, chunk_start, chunk_end)
        )

    # NOTE(review): inplace=True presumably lets the helper modify the chunk
    # frames instead of copying them -- confirm against categorical_df_concat.
    return categorical_df_concat(chunk_results, inplace=True)
def test_categorical_df_concat(self):
    """Check categorical_df_concat on three frames with differing categories.

    Every input frame has two categorical columns ('A', 'C') and one int64
    column ('B'). The expected result stacks the rows in input order, keeps
    each frame's own 0..2 row labels (so the labels repeat), and has
    categorical columns whose categories match those of a Series built
    directly from the stacked values.
    """
    inp = [
        pd.DataFrame(
            {
                'A': pd.Series(['a', 'b', 'c'], dtype='category'),
                'B': pd.Series([100, 102, 103], dtype='int64'),
                'C': pd.Series(['x', 'x', 'x'], dtype='category'),
            }
        ),
        pd.DataFrame(
            {
                'A': pd.Series(['c', 'b', 'd'], dtype='category'),
                'B': pd.Series([103, 102, 104], dtype='int64'),
                'C': pd.Series(['y', 'y', 'y'], dtype='category'),
            }
        ),
        pd.DataFrame(
            {
                'A': pd.Series(['a', 'b', 'd'], dtype='category'),
                'B': pd.Series([101, 102, 104], dtype='int64'),
                'C': pd.Series(['z', 'z', 'z'], dtype='category'),
            }
        ),
    ]
    result = categorical_df_concat(inp)

    expected = pd.DataFrame(
        {
            'A': pd.Series(
                ['a', 'b', 'c', 'c', 'b', 'd', 'a', 'b', 'd'],
                dtype='category'
            ),
            'B': pd.Series(
                [100, 102, 103, 103, 102, 104, 101, 102, 104],
                dtype='int64'
            ),
            'C': pd.Series(
                ['x', 'x', 'x', 'y', 'y', 'y', 'z', 'z', 'z'],
                dtype='category'
            ),
        },
    )
    # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
    # pd.Index with an explicit int64 dtype builds the same index on both
    # old and new pandas versions.
    expected.index = pd.Index([0, 1, 2, 0, 1, 2, 0, 1, 2], dtype='int64')

    assert_equal(expected, result)
    # Compare the category sets explicitly as well, column by column.
    assert_equal(
        expected['A'].cat.categories,
        result['A'].cat.categories
    )
    assert_equal(
        expected['C'].cat.categories,
        result['C'].cat.categories
    )