def quick_concat(dfs, dimension_columns, partition_columns):
    """
    Fast version of::

        pd.concat(
            dfs,
            ignore_index=True,
            sort=False,
        ).sort_values(dimension_columns + partition_columns).reset_index(drop=True)

    if inputs are presorted.

    Parameters
    ----------
    dfs: Iterable[pandas.DataFrame]
        DataFrames to concat.
    dimension_columns: Iterable[str]
        Dimension columns in correct order.
    partition_columns: Iterable[str]
        Partition columns in correct order.

    Returns
    -------
    df: pandas.DataFrame
        Concatenated result.
    """
    # Sort order is dimensions first, then partition columns.
    sort_by = list(dimension_columns) + list(partition_columns)
    return sort_dataframe(df=concat_dataframes(dfs), columns=sort_by)
def test_many(self, dummy_default, maybe_iter):
    # Mix of frames with and without custom indices; result must have a
    # fresh RangeIndex and all rows in input order.
    frames = [
        pd.DataFrame(
            data={"a": [0, 1], "b": 1.0}, columns=["a", "b"], index=[10, 11]
        ),
        pd.DataFrame(
            data={"a": [2, 3], "b": 2.0}, columns=["a", "b"], index=[10, 11]
        ),
        pd.DataFrame(data={"a": [4, 5], "b": 3.0}, columns=["a", "b"]),
    ]
    expected = pd.DataFrame(
        {"a": [0, 1, 2, 3, 4, 5], "b": [1.0, 1.0, 2.0, 2.0, 3.0, 3.0]},
        columns=["a", "b"],
    )
    result = concat_dataframes(maybe_iter(frames), dummy_default)
    pdt.assert_frame_equal(result, expected)
def test_default(self, maybe_iter):
    # An empty input falls back to the provided default frame.
    default = pd.DataFrame(
        data={"a": [0, 1], "b": 1.0}, columns=["a", "b"], index=[10, 11]
    )
    result = concat_dataframes(maybe_iter([]), default)
    pdt.assert_frame_equal(result, default)
def _normalize_user_input(data, cube):
    # Treat a single dict or DataFrame as a one-element collection.
    parts = [data] if isinstance(data, (dict, pd.DataFrame)) else list(data)

    # Group the multiplexed frames by their dataset ID.
    grouped = defaultdict(list)
    for part in parts:
        for dataset_id, df in multiplex_user_input(part, cube).items():
            grouped[dataset_id].append(df)

    # Concatenate each group, dropping None placeholders first.
    return {
        dataset_id: concat_dataframes([df for df in dfs if df is not None])
        for dataset_id, dfs in grouped.items()
    }
def _load_all_mps(mps, store, load_columns, predicates, empty):
    """
    Load kartothek_cube-relevant data from all given MetaPartitions.

    The result will be a concatenated Dataframe.

    Parameters
    ----------
    mps: Iterable[MetaPartition]
        MetaPartitions to load.
    store: simplekv.KeyValueStore
        Store to load data from.
    load_columns: List[str]
        Columns to load.
    predicates: Optional[List[List[Tuple[str, str, Any]]]]
        Predicates to apply during load.
    empty: pandas.DataFrame
        Empty Dataframe dummy.

    Returns
    -------
    df: pandas.DataFrame
        Concatenated data.
    """
    loaded_dfs = []
    for mp in mps:
        loaded_mp = mp.load_dataframes(
            store=store,
            predicate_pushdown_to_io=True,
            tables=[SINGLE_TABLE],
            columns={SINGLE_TABLE: sorted(load_columns)},
            predicates=predicates,
        )
        df = loaded_mp.data[SINGLE_TABLE]
        # Normalize column labels to plain strings.
        df.columns = df.columns.map(converter_str)
        loaded_dfs.append(df)
    # Fall back to the empty dummy when no MetaPartition produced data.
    return concat_dataframes(loaded_dfs, empty)
def test_fail_no_default(self, maybe_iter):
    # Empty input without a default frame must raise.
    with pytest.raises(ValueError) as exc_info:
        concat_dataframes(maybe_iter([]), None)
    assert str(exc_info.value) == "Cannot concatenate 0 dataframes."
def test_fail_different_colsets(self, maybe_iter):
    # Frames with mismatching column sets must be rejected.
    frames = [pd.DataFrame({"a": [1]}), pd.DataFrame({"a": [1], "b": [2]})]
    with pytest.raises(
        ValueError, match="Not all DataFrames have the same set of columns!"
    ):
        concat_dataframes(maybe_iter(frames))
def test_no_columns(self, dfs, expected):
    # Concatenation also works for frames without any columns.
    pdt.assert_frame_equal(concat_dataframes(dfs), expected)
def test_whipe_list(self, dfs):
    # concat_dataframes consumes the passed-in list, leaving it empty.
    concat_dataframes(dfs)
    assert dfs == []