def test_concat_csv_empty_inputs(self, tmpdir):
    """
    Concatenate data-less input csvs and compare the result with the reference.

    The inputs are generated by ``base_test_concat`` with ``0`` passed as its
    first argument and the same four-column "int" schema used twice.

    :param tmpdir: tempdir to test in
    """
    schema = dict.fromkeys('ABCD', "int")
    out_path = os.path.join(tmpdir, 'concat.csv.gz')

    # The generated dataframes themselves are not needed, only paths + reference.
    _, in_paths, expected = self.base_test_concat(
        0,
        [schema, schema],
        write=True,
        get_ref=True,
        dir=tmpdir,
    )

    api.concatenate_csv(in_paths, out_path)

    # Argument order (output first, reference second) is kept as in the
    # original test.
    matched = self.dfs_exact_match(out_path, expected)
    assert matched
def test_concat_csv_different_cols(self, tmpdir, n_rows):
    """
    Concatenate two csvs whose column sets only partially overlap.

    The first input has columns A, B, C, D and the second A, B, G, F; all
    columns are declared as "float".

    :param tmpdir: temporary directory to write in
    :param n_rows: row count forwarded to ``base_test_concat``
    """
    first_schema = dict.fromkeys('ABCD', "float")
    second_schema = dict.fromkeys('ABGF', "float")
    out_path = os.path.join(tmpdir, 'concat.csv.gz')

    _, in_paths, expected = self.base_test_concat(
        n_rows,
        [first_schema, second_schema],
        write=True,
        get_ref=True,
        dir=tmpdir,
    )

    api.concatenate_csv(in_paths, out_path)

    matched = self.dfs_exact_match(expected, out_path)
    assert matched
def test_concat_csv_multiple_files_to_concat(self, tmpdir, n_rows, n_frames):
    """
    Concatenate ``n_frames`` csvs that all share the same columns.

    :param tmpdir: temporary directory to write in
    :param n_rows: row count forwarded to ``base_test_concat``
    :param n_frames: number of input schemas (one per input) to generate
    """
    # One independent schema dict per input frame.
    schemas = [dict.fromkeys('ABCD', "int") for _ in range(n_frames)]
    out_path = os.path.join(tmpdir, 'concat.csv.gz')

    _, in_paths, expected = self.base_test_concat(
        n_rows,
        schemas,
        write=True,
        get_ref=True,
        dir=tmpdir,
    )

    api.concatenate_csv(in_paths, out_path)

    matched = self.dfs_exact_match(expected, out_path)
    assert matched
def test_concat_csv(self, tmpdir, n_rows):
    """
    Basic sanity check: concatenate two csvs that have identical columns.

    :param tmpdir: temporary directory to write in
    :param n_rows: number of rows in test csvs
    """
    schema = dict.fromkeys('ABCD', "int")
    out_path = os.path.join(tmpdir, 'concat.csv.gz')

    _, in_paths, expected = self.base_test_concat(
        n_rows,
        [schema, schema],
        write=True,
        get_ref=True,
        dir=tmpdir,
    )

    api.concatenate_csv(in_paths, out_path)

    matched = self.dfs_exact_match(expected, out_path)
    assert matched
def test_concat_csv_one_file_to_concat(self, tmpdir, n_rows):
    """
    Run the concatenation with a single input schema.

    :param tmpdir: temp dir to test in
    :param n_rows: length of test dfs
    :return:
    """
    schema = dict.fromkeys('ABCD', "int")
    out_path = os.path.join(tmpdir, 'concat.csv.gz')

    # Only one schema is supplied; whatever ``base_test_concat`` returns as its
    # second element is handed to ``concatenate_csv`` untouched.
    _, in_paths, expected = self.base_test_concat(
        n_rows,
        [schema],
        write=True,
        get_ref=True,
        dir=tmpdir,
    )

    api.concatenate_csv(in_paths, out_path)

    matched = self.dfs_exact_match(expected, out_path)
    assert matched
def test_concat_csv_with_nans(self, tmpdir, n_rows):
    """
    Concatenate two csvs that each contain a NaN in column "A".

    Both inputs use the same four-column "float" schema. The reference is
    built with ``pd.concat(..., ignore_index=True)`` from the in-memory
    dataframes after the NaNs have been injected.

    :param tmpdir: temporary directory to write in
    :param n_rows: row count forwarded to ``make_test_dfs``
    """
    dtypes = {v: "float" for v in 'ABCD'}
    concatenated = os.path.join(tmpdir, 'concat.csv.gz')

    dfs = self.make_test_dfs([dtypes, dtypes], n_rows)
    csvs = [
        os.path.join(tmpdir, "0.csv.gz"),
        os.path.join(tmpdir, "1.csv.gz"),
    ]

    # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0 and raises
    # AttributeError there, while ``np.nan`` exists in every NumPy version.
    # NOTE(review): positional row 2 must exist - presumably the n_rows
    # fixture always yields at least 3 rows; confirm against the fixture.
    dfs[0].iloc[2, dfs[0].columns.get_loc("A")] = np.nan
    dfs[1].iloc[2, dfs[1].columns.get_loc("A")] = np.nan

    api.write_dataframe_to_csv_and_yaml(dfs[0], csvs[0], dtypes)
    api.write_dataframe_to_csv_and_yaml(dfs[1], csvs[1], dtypes)

    ref = pd.concat(dfs, ignore_index=True)

    api.concatenate_csv(csvs, concatenated)

    assert self.dfs_exact_match(ref, concatenated)
def test_concat_csv_no_header(self, tmpdir, n_rows):
    """
    Concatenate csvs written without headers and check none is emitted.

    :param tmpdir: temporary directory to write in
    :param n_rows: number of rows in test csvs
    """
    schema = dict.fromkeys('ABCD', "int")
    out_path = os.path.join(tmpdir, 'concat.csv.gz')

    _, in_paths, expected = self.base_test_concat(
        n_rows,
        [schema, schema],
        write=True,
        get_ref=True,
        dir=tmpdir,
        write_head=False,
    )

    api.concatenate_csv(in_paths, out_path, write_header=False)

    matched = self.dfs_exact_match(expected, out_path)
    assert matched

    # Read the raw file with plain pandas (ignore separate yaml): none of the
    # schema column names may show up as parsed column labels.
    raw_columns = pd.read_csv(out_path).columns.tolist()
    assert not any(col in raw_columns for col in schema)
def concat(in_f, out_f, write_header, drop_duplicates):
    """
    Forward a concatenation request to ``api.concatenate_csv``.

    :param in_f: iterable of inputs; converted to a list before forwarding
    :param out_f: output target, passed through unchanged
    :param write_header: passed through unchanged as the third positional argument
    :param drop_duplicates: passed through unchanged as the fourth positional argument
    """
    input_files = list(in_f)
    api.concatenate_csv(input_files, out_f, write_header, drop_duplicates)