Example #1
0
def merge_csv(in_filenames: Union[List[str], Dict[str, str]],
              out_filename: str,
              how: str,
              on: List[str],
              write_header: bool = True) -> None:
    """
    Create one gzipped CSV out of multiple gzipped CSVs.

    Every input is wrapped in a CsverveInput and read into a DataFrame; the
    frames are joined with utils.merge_frames, the per-input dtypes are
    combined with utils.merge_dtypes, and the result is handed to
    CsverveOutputDataFrame for writing.

    @param in_filenames: List of input file paths, or a dictionary whose
        values are the input file paths (the keys are ignored).
    @param out_filename: Path to newly merged CSV
    @param how: How to join DataFrames (inner, outer, left, right).
    @param on: List of column name(s) to join on.
    @param write_header: boolean, True = write header, False = don't write header
    @return: None
    """
    # Only the dictionary's values are used as paths.
    if isinstance(in_filenames, dict):
        in_filenames = list(in_filenames.values())

    data: List[CsverveInput] = [
        CsverveInput(infile) for infile in in_filenames
    ]

    # All inputs are read before their dtypes are collected, as in the
    # original statement order.
    dfs: List[pd.DataFrame] = [csvinput.read_csv() for csvinput in data]

    dtypes: List[Dict[str, str]] = [csvinput.dtypes for csvinput in data]

    merged_data: pd.DataFrame = utils.merge_frames(dfs, how, on)

    dtypes_: Dict[str, str] = utils.merge_dtypes(dtypes)

    csvoutput: CsverveOutputDataFrame = CsverveOutputDataFrame(
        merged_data, out_filename, dtypes_, write_header=write_header)
    csvoutput.write_df()
Example #2
0
    def test_merge_frames_multiple_cols(self, n_rows):
        """
        test merging of 2 dfs on multiple columns ("A" and "B") with an
        inner merge; the merged result must match the reference frame
        produced by base_merge_test
        """
        join_type = "inner"
        join_cols = ["A", "B"]
        suffixes = ["", ""]
        # every column of both simulated frames is declared as "int"
        frame_dtypes = [
            dict.fromkeys("ABC", "int"),
            dict.fromkeys("ABDF", "int"),
        ]

        frames, expected = self.base_merge_test(
            n_rows, join_type, join_cols, suffixes, frame_dtypes)

        result = utils.merge_frames(frames, how=join_type, on=join_cols)

        assert self.dfs_exact_match(expected, result)
Example #3
0
    def merge_frames_directional_test(self, length, direction):
        """
        merge 2 dfs on column "A" using the given direction as the "how"
        argument and compare against the reference from base_merge_test
        :param length: length of test dfs
        :param direction: direction to merge in (outer, inner etc.)
        """
        join_cols = ["A"]
        suffixes = ["", ""]
        # every column of both simulated frames is declared as "int"
        frame_dtypes = [
            dict.fromkeys("ABC", "int"),
            dict.fromkeys("ADF", "int"),
        ]

        frames, expected = self.base_merge_test(
            length, direction, join_cols, suffixes, frame_dtypes)

        result = utils.merge_frames(frames, how=direction, on=join_cols)

        assert self.dfs_exact_match(expected, result)
Example #4
0
    def test_merge_frames_with_nans(self, n_rows):
        """
        test merging of 2 dfs on 1 col which contains NaNs in each

        The join column "A" of both frames gets a NaN at row position 2, and
        the outer merge from utils.merge_frames is compared against a direct
        pandas merge of the same two frames.
        """
        dtypes1 = {v: "float" for v in 'ACD'}
        dtypes2 = {v: "float" for v in 'AEG'}
        how = "outer"
        on = ['A']

        dfs = self.make_mergeable_test_dfs([dtypes1, dtypes2], on, n_rows)

        # np.nan is the canonical spelling; the np.NaN alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        dfs[0].iloc[2, dfs[0].columns.get_loc(on[0])] = np.nan
        dfs[1].iloc[2, dfs[1].columns.get_loc(on[0])] = np.nan

        # Reference result: plain pandas merge of the two frames.
        ref = dfs[0].merge(dfs[1], how=how, on=on)

        merged = utils.merge_frames(dfs, how=how, on=on)

        assert self.dfs_exact_match(ref, merged)
Example #5
0
    def test_merge_frames_one_frame(self, n_rows):
        """
        merge a list holding just one df; the result must match that df
        :param n_rows: number of rows in simulated df
        :return: assertion
        """
        join_type = "inner"
        join_cols = ["A"]
        suffixes = ["", ""]
        single_dtypes = dict.fromkeys("ABC", "int")

        # get_ref=False: no reference frame is requested, only the inputs
        frames = self.base_merge_test(
            n_rows, join_type, join_cols, suffixes, [single_dtypes],
            get_ref=False)

        result = utils.merge_frames(frames, how=join_type, on=join_cols)

        assert self.dfs_exact_match(frames[0], result)