Example #1
def merge_csv(in_filenames: Union[List[str], Dict[str, str]],
              out_filename: str,
              how: str,
              on: List[str],
              write_header: bool = True) -> None:
    """
    Create one gzipped CSV out of multiple gzipped CSVs.

    @param in_filenames: List of input file paths, or a dictionary whose values are file paths.
    @param out_filename: Path to the newly merged gzipped CSV.
    @param how: How to join the DataFrames (inner, outer, left, right).
    @param on: List of column name(s) to join on.
    @param write_header: If True, write the column header to the output file.
    @return:
    """
    if isinstance(in_filenames, dict):
        in_filenames = list(in_filenames.values())

    data: List[CsverveInput] = [
        CsverveInput(infile) for infile in in_filenames
    ]

    dfs: List[pd.DataFrame] = [csvinput.read_csv() for csvinput in data]

    dtypes: List[Dict[str, str]] = [csvinput.dtypes for csvinput in data]

    merged_data: pd.DataFrame = utils.merge_frames(dfs, how, on)

    dtypes_: Dict[str, str] = utils.merge_dtypes(dtypes)

    csvoutput: CsverveOutputDataFrame = CsverveOutputDataFrame(
        merged_data, out_filename, dtypes_, write_header=write_header)
    csvoutput.write_df()
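
A minimal usage sketch, assuming merge_csv is importable as shown above; the file paths and join column are hypothetical:

merge_csv(
    in_filenames=["sample_a.csv.gz", "sample_b.csv.gz"],  # hypothetical input paths
    out_filename="merged.csv.gz",
    how="outer",
    on=["cell_id"],  # hypothetical join column
)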
Example #2
def add_col_from_dict(infile, col_data, outfile, dtypes, write_header=True):
    """
    Add columns to a gzipped CSV from a dictionary.

    @param infile: Path to the input gzipped CSV.
    @param col_data: Dictionary mapping new column names to the value(s) to fill them with.
    @param outfile: Path to the output gzipped CSV.
    @param dtypes: Dictionary of dtypes for the added columns.
    @param write_header: If True, write the column header to the output file.
    @return:
    """

    csvinput = CsverveInput(infile)
    csv_dtypes = csvinput.dtypes
    csvinput = csvinput.read_csv()

    for col_name, col_value in col_data.items():
        csvinput[col_name] = col_value

    dtypes = utils.merge_dtypes([csv_dtypes, dtypes])
    output = CsverveOutputDataFrame(csvinput,
                                    outfile,
                                    dtypes,
                                    write_header=write_header)
    output.write_df()
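
A minimal usage sketch, assuming add_col_from_dict is importable as shown above; the paths, column name, value, and dtype are hypothetical:

add_col_from_dict(
    "input.csv.gz",           # hypothetical input path
    {"sample_id": "SA123"},   # constant value broadcast to every row
    "output.csv.gz",
    {"sample_id": "str"},     # dtype for the added column
)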
Example #3
    def test_merge_dtypes_one_given(self):
        """
        test merging of list of 1 dtype dict
        """
        dtypes1 = {v: "float" for v in 'ACD'}
        ref = dtypes1

        merged_dtypes = utils.merge_dtypes([dtypes1])

        assert ref == merged_dtypes
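
The dtype-merging tests in this and the following examples exercise merge_dtypes as a union of column-to-dtype dicts whose shared columns agree; a minimal sketch consistent with them (an assumption, not the library's actual implementation):

def merge_dtypes_sketch(dtypes_list):
    """Union a list of column -> dtype dicts, rejecting conflicting entries."""
    merged = {}
    for dtype_map in dtypes_list:
        for column, dtype in dtype_map.items():
            if column in merged and merged[column] != dtype:
                raise ValueError("conflicting dtypes for column {}".format(column))
            merged[column] = dtype
    return merged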
Example #4
    def merge_dtype_test_types_mixed():
        """
        test merge dtypes that have mixed typing
        """
        types = ["int", "float", "bool", "str"]
        dtypes1 = {v: random.choice(types) for v in "ACD"}
        dtypes2 = {v: random.choice(types) for v in "ACD"}

        ref = {v: type for v in set(dtypes1.keys()).union(set(dtypes2.keys()))}

        merged_dtypes = utils.merge_dtypes([dtypes1, dtypes2])

        assert ref == merged_dtypes
Example #5
def concatenate_csv(inputfiles: Union[List[str], Dict[str, str]],
                    output: str,
                    write_header: bool = True,
                    drop_duplicates: bool = False) -> None:
    """
    Concatenate gzipped CSV files; the dtypes in their meta YAML files must be the same.

    @param inputfiles: List of gzipped CSV file paths, or a dictionary whose values are file paths.
    @param output: Path of the resulting concatenated gzipped CSV file and meta YAML.
    @param write_header: If True, write the column header to the output file.
    @param drop_duplicates: If True, drop duplicate rows from the concatenated output.
    @return:
    """
    if isinstance(inputfiles, dict):
        inputfiles = list(inputfiles.values())

    if not inputfiles:
        raise CsverveConcatException("nothing provided to concat")

    inputs: List[CsverveInput] = [
        CsverveInput(infile) for infile in inputfiles
    ]

    dtypes: Dict[str, str] = utils.merge_dtypes(
        [csvinput.dtypes for csvinput in inputs])

    headers: List[bool] = [csvinput.header for csvinput in inputs]

    columns: List[List[str]] = [csvinput.columns for csvinput in inputs]

    low_memory: bool = True
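    # The checks below disable the fast low-memory path when any input carries
    # a header row, when the column order differs between inputs, or when
    # duplicates must be dropped; those cases fall back to the pandas-based path.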
    if any(headers):
        low_memory = False

    if not all(columns[0] == elem for elem in columns):
        low_memory = False

    if drop_duplicates:
        low_memory = False

    if low_memory:
        concatenate_csv_files_quick_lowmem(inputfiles,
                                           output,
                                           dtypes,
                                           columns[0],
                                           write_header=write_header)
    else:
        concatenate_csv_files_pandas(inputfiles,
                                     output,
                                     dtypes,
                                     write_header=write_header,
                                     drop_duplicates=drop_duplicates)
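
A minimal usage sketch, assuming concatenate_csv is importable as shown above; the file paths are hypothetical and the inputs are assumed to share dtypes in their meta YAML files:

concatenate_csv(
    ["chunk_0.csv.gz", "chunk_1.csv.gz"],  # hypothetical input paths
    "concatenated.csv.gz",
    drop_duplicates=True,
)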
Example #6
    def merge_dtype_test_types_different(dtype):
        """
        test merging dtype dicts over different column sets (parametrised over dtype)
        """
        dtypes1 = {v: dtype for v in 'ACD'}
        dtypes2 = {v: dtype for v in 'ACDEF'}
        ref = {
            v: dtype
            for v in set(dtypes1.keys()).union(set(dtypes2.keys()))
        }

        merged_dtypes = utils.merge_dtypes([dtypes1, dtypes2])

        assert ref == merged_dtypes
Example #7
    def test_merge_dtypes(self):
        """
        basic sanity check - test merging of two dtype dicts
        """
        dtypes1 = {v: "float" for v in 'ACD'}
        dtypes2 = {v: "float" for v in 'ACDEF'}
        ref = {
            v: "float"
            for v in set(dtypes1.keys()).union(set(dtypes2.keys()))
        }

        merged_dtypes = utils.merge_dtypes([dtypes1, dtypes2])

        assert ref == merged_dtypes
Example #8
    def test_merge_dtypes_multiple_given(self, n_dtypes):
        """
        test merging of n_dtypes dtype dicts
        :param n_dtypes: number of dtypes to merge
        """
        dtypes = [{v: "int"
                   for v in "".join(self._str_list(3, "A"))}
                  for _ in range(n_dtypes)]

        # taken from https://stackoverflow.com/questions/9819602/union-of-dict-objects-in-python
        ref = dict(itertools.chain.from_iterable(dct.items()
                                                 for dct in dtypes))

        merged_dtypes = utils.merge_dtypes(dtypes)

        assert ref == merged_dtypes