Esempio n. 1
0
    def test_concat_csv_empty_inputs(self, tmpdir):
        """
        concatenate two csvs that are generated with zero rows
        :param tmpdir: temporary directory the test files live in
        """
        schema = dict.fromkeys('ABCD', "int")
        out_path = os.path.join(tmpdir, 'concat.csv.gz')

        # n_rows == 0: both inputs share the same schema and hold no data
        _, in_paths, expected = self.base_test_concat(
            0, [schema, schema], write=True, get_ref=True, dir=tmpdir
        )

        api.concatenate_csv(in_paths, out_path)

        # NOTE(review): the other concat tests pass the reference first and
        # the output path second - confirm dfs_exact_match is order-agnostic
        matched = self.dfs_exact_match(out_path, expected)
        assert matched
Esempio n. 2
0
    def test_concat_csv_different_cols(self, tmpdir, n_rows):
        """
        concatenate two csvs whose column sets only partly overlap
        :param tmpdir: temporary directory the test files live in
        :param n_rows: number of rows in each test csv
        """
        # columns A and B are common to both inputs, the other two differ
        schemas = [
            dict.fromkeys('ABCD', "float"),
            dict.fromkeys('ABGF', "float"),
        ]
        out_path = os.path.join(tmpdir, 'concat.csv.gz')

        _, in_paths, expected = self.base_test_concat(
            n_rows, schemas, write=True, get_ref=True, dir=tmpdir
        )

        api.concatenate_csv(in_paths, out_path)

        matched = self.dfs_exact_match(expected, out_path)
        assert matched
Esempio n. 3
0
    def test_concat_csv_multiple_files_to_concat(self, tmpdir, n_rows,
                                                 n_frames):
        """
        concatenate n_frames csvs that all share the same columns
        :param tmpdir: temporary directory the test files live in
        :param n_rows: number of rows in each test csv
        :param n_frames: number of input csvs to concatenate
        """
        # one independent (but identical) schema dict per input frame
        schemas = [dict.fromkeys('ABCD', "int") for _ in range(n_frames)]
        out_path = os.path.join(tmpdir, 'concat.csv.gz')

        _, in_paths, expected = self.base_test_concat(
            n_rows, schemas, write=True, get_ref=True, dir=tmpdir
        )

        api.concatenate_csv(in_paths, out_path)

        matched = self.dfs_exact_match(expected, out_path)
        assert matched
Esempio n. 4
0
    def test_concat_csv(self, tmpdir, n_rows):
        """
        baseline check - concatenate two csvs with identical columns
        :param tmpdir: temporary directory the test files live in
        :param n_rows: number of rows in each test csv
        """
        schema = dict.fromkeys('ABCD', "int")
        out_path = os.path.join(tmpdir, 'concat.csv.gz')

        # the same schema is used for both inputs
        _, in_paths, expected = self.base_test_concat(
            n_rows, [schema, schema], write=True, get_ref=True, dir=tmpdir
        )

        api.concatenate_csv(in_paths, out_path)

        matched = self.dfs_exact_match(expected, out_path)
        assert matched
Esempio n. 5
0
    def test_concat_csv_one_file_to_concat(self, tmpdir, n_rows):
        """
        concatenate when only a single input schema/csv is supplied
        :param tmpdir: temporary directory the test files live in
        :param n_rows: number of rows in the test csv
        """
        schema = dict.fromkeys('ABCD', "int")
        out_path = os.path.join(tmpdir, 'concat.csv.gz')

        # a one-element schema list; presumably yields a single input csv
        _, in_paths, expected = self.base_test_concat(
            n_rows, [schema], write=True, get_ref=True, dir=tmpdir
        )

        api.concatenate_csv(in_paths, out_path)

        matched = self.dfs_exact_match(expected, out_path)
        assert matched
Esempio n. 6
0
    def test_concat_csv_with_nans(self, tmpdir, n_rows):
        """
        concat two csvs that each contain a NaN in column A
        :param tmpdir: temporary directory to write in
        :param n_rows: number of rows in test csvs
        """
        dtypes = {v: "float" for v in 'ABCD'}

        concatenated = os.path.join(tmpdir, 'concat.csv.gz')

        dfs = self.make_test_dfs([dtypes, dtypes], n_rows)
        csvs = [
            os.path.join(tmpdir, "0.csv.gz"),
            os.path.join(tmpdir, "1.csv.gz")
        ]

        # blank out column A in the third row of every frame.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        # NOTE(review): row position 2 assumes n_rows >= 3 - confirm fixture
        for df in dfs:
            df.iloc[2, df.columns.get_loc("A")] = np.nan

        for df, csv in zip(dfs, csvs):
            api.write_dataframe_to_csv_and_yaml(df, csv, dtypes)

        # expected result: the in-memory frames stacked with a fresh index
        ref = pd.concat(dfs, ignore_index=True)
        api.concatenate_csv(csvs, concatenated)

        assert self.dfs_exact_match(ref, concatenated)
Esempio n. 7
0
    def test_concat_csv_no_header(self, tmpdir, n_rows):
        """
        concatenate header-less csvs into a header-less output
        :param tmpdir: temporary directory the test files live in
        :param n_rows: number of rows in each test csv
        """
        schema = dict.fromkeys('ABCD', "int")
        out_path = os.path.join(tmpdir, 'concat.csv.gz')

        # inputs are requested without a header line (write_head=False)
        _, in_paths, expected = self.base_test_concat(
            n_rows, [schema, schema],
            write=True, get_ref=True, dir=tmpdir, write_head=False
        )
        api.concatenate_csv(in_paths, out_path, write_header=False)

        matched = self.dfs_exact_match(expected, out_path)
        assert matched

        # read the raw file with plain pandas, ignoring the separate yaml
        raw = pd.read_csv(out_path)
        seen_columns = raw.columns.tolist()

        # none of the schema's column names may show up as a parsed column
        assert not any(name in seen_columns for name in schema)
Esempio n. 8
0
def concat(in_f, out_f, write_header, drop_duplicates):
    """
    forward a concatenation request to api.concatenate_csv
    :param in_f: iterable of inputs; materialised into a list before the call
    :param out_f: output argument, passed through unchanged
    :param write_header: passed through unchanged as the third argument
    :param drop_duplicates: passed through unchanged as the fourth argument
    """
    inputs = list(in_f)
    api.concatenate_csv(inputs, out_f, write_header, drop_duplicates)