def merge_csv(
        in_filenames: Union[List[str], Dict[str, str]],
        out_filename: str,
        how: str,
        on: List[str],
        write_header: bool = True
) -> None:
    """
    Create one gzipped CSV out of multiple gzipped CSVs.

    @param in_filenames: List of gzipped CSV file paths, or a dictionary whose values are file paths.
    @param out_filename: Path of the newly merged gzipped CSV.
    @param how: How to join the DataFrames (inner, outer, left, right).
    @param on: List of column names to join on.
    @param write_header: boolean, True = write header, False = don't write header.
    @return: None
    """
    if isinstance(in_filenames, dict):
        in_filenames = list(in_filenames.values())

    data: List[CsverveInput] = [CsverveInput(infile) for infile in in_filenames]

    dfs: List[pd.DataFrame] = [csvinput.read_csv() for csvinput in data]
    dtypes: List[Dict[str, str]] = [csvinput.dtypes for csvinput in data]

    merged_data: pd.DataFrame = utils.merge_frames(dfs, how, on)
    dtypes_: Dict[str, str] = utils.merge_dtypes(dtypes)

    csvoutput: CsverveOutputDataFrame = CsverveOutputDataFrame(
        merged_data, out_filename, dtypes_, write_header=write_header
    )
    csvoutput.write_df()
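# Usage sketch (not part of the module): merging two gzipped CSVs on a shared
# key column. The file names and the "cell_id" column are hypothetical; each
# input is assumed to have its metadata YAML alongside it.
#
#   merge_csv(
#       ["hmmcopy_metrics.csv.gz", "alignment_metrics.csv.gz"],
#       "merged_metrics.csv.gz",
#       how="outer",
#       on=["cell_id"],
#   )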
def add_col_from_dict(infile, col_data, outfile, dtypes, write_header=True):
    """
    Add columns to a gzipped CSV, one per entry in ``col_data``.

    @param infile: Path of the input gzipped CSV.
    @param col_data: Dictionary mapping new column names to the value to fill each column with.
    @param outfile: Path of the output gzipped CSV.
    @param dtypes: Dictionary mapping the new column names to their dtypes.
    @param write_header: boolean, True = write header, False = don't write header.
    @return: None
    """
    csvinput = CsverveInput(infile)
    csv_dtypes = csvinput.dtypes
    csvinput = csvinput.read_csv()

    for col_name, col_value in col_data.items():
        csvinput[col_name] = col_value

    dtypes = utils.merge_dtypes([csv_dtypes, dtypes])
    output = CsverveOutputDataFrame(
        csvinput, outfile, dtypes, write_header=write_header
    )
    output.write_df()
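# Usage sketch (not part of the module): adding constant-valued columns to an
# existing gzipped CSV. The file names, column names, and dtypes are hypothetical.
#
#   add_col_from_dict(
#       "metrics.csv.gz",
#       {"sample_id": "SA123", "library_id": "A98765"},
#       "metrics_annotated.csv.gz",
#       {"sample_id": "str", "library_id": "str"},
#   )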
def test_merge_dtypes_one_given(self):
    """
    test merging of a list containing a single dtype dict
    """
    dtypes1 = {v: "float" for v in 'ACD'}
    ref = dtypes1

    merged_dtypes = utils.merge_dtypes([dtypes1])

    assert ref == merged_dtypes
def merge_dtype_test_types_mixed():
    """
    test merging dtype dicts whose columns have mixed (but matching) types
    """
    types = ["int", "float", "bool", "str"]
    # pick one random type per column and use it in both dicts, so the
    # expected merged result is well-defined
    chosen = {v: random.choice(types) for v in "ACD"}
    dtypes1 = dict(chosen)
    dtypes2 = dict(chosen)

    ref = {v: chosen[v] for v in set(dtypes1.keys()).union(set(dtypes2.keys()))}

    merged_dtypes = utils.merge_dtypes([dtypes1, dtypes2])

    assert ref == merged_dtypes
def concatenate_csv(
        inputfiles: Union[List[str], Dict[str, str]],
        output: str,
        write_header: bool = True,
        drop_duplicates: bool = False
) -> None:
    """
    Concatenate gzipped CSV files; the dtypes in the meta YAML files must be the same.

    @param inputfiles: List of gzipped CSV file paths, or a dictionary whose values are file paths.
    @param output: Path of the resulting concatenated gzipped CSV file and meta YAML.
    @param write_header: boolean, True = write header, False = don't write header.
    @param drop_duplicates: boolean, True = drop duplicate rows from the concatenated output.
    @return: None
    """
    if isinstance(inputfiles, dict):
        inputfiles = list(inputfiles.values())

    if not inputfiles:
        raise CsverveConcatException("nothing provided to concat")

    inputs: List[CsverveInput] = [CsverveInput(infile) for infile in inputfiles]

    dtypes: Dict[str, str] = utils.merge_dtypes(
        [csvinput.dtypes for csvinput in inputs]
    )
    headers: List[bool] = [csvinput.header for csvinput in inputs]
    columns: List[List[str]] = [csvinput.columns for csvinput in inputs]

    # the quick low-memory path is only safe for headerless inputs with
    # identical column ordering and no deduplication
    low_memory: bool = True
    if any(headers):
        low_memory = False
    if not all(columns[0] == elem for elem in columns):
        low_memory = False
    if drop_duplicates:
        low_memory = False

    if low_memory:
        concatenate_csv_files_quick_lowmem(
            inputfiles, output, dtypes, columns[0], write_header=write_header
        )
    else:
        concatenate_csv_files_pandas(
            inputfiles, output, dtypes,
            write_header=write_header, drop_duplicates=drop_duplicates
        )
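# Usage sketch (not part of the module): concatenating per-chunk gzipped CSVs
# into a single output while dropping duplicate rows. The file names are
# hypothetical; all inputs are assumed to share the same dtypes in their
# metadata YAMLs.
#
#   concatenate_csv(
#       ["chunk_0.csv.gz", "chunk_1.csv.gz", "chunk_2.csv.gz"],
#       "all_chunks.csv.gz",
#       drop_duplicates=True,
#   )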
def merge_dtype_test_types_different(dtype):
    """
    test merging dtype dicts with different column sets, parametrized over dtype
    """
    dtypes1 = {v: dtype for v in 'ACD'}
    dtypes2 = {v: dtype for v in 'ACDEF'}

    ref = {
        v: dtype
        for v in set(dtypes1.keys()).union(set(dtypes2.keys()))
    }

    merged_dtypes = utils.merge_dtypes([dtypes1, dtypes2])

    assert ref == merged_dtypes
def test_merge_dtypes(self):
    """
    basic sanity check - test merging of two dtype dicts
    """
    dtypes1 = {v: "float" for v in 'ACD'}
    dtypes2 = {v: "float" for v in 'ACDEF'}

    ref = {
        v: "float"
        for v in set(dtypes1.keys()).union(set(dtypes2.keys()))
    }

    merged_dtypes = utils.merge_dtypes([dtypes1, dtypes2])

    assert ref == merged_dtypes
def test_merge_dtypes_multiple_given(self, n_dtypes):
    """
    test merging of n_dtypes dtype dicts

    :param n_dtypes: number of dtypes to merge
    """
    dtypes = [
        {v: "int" for v in "".join(self._str_list(3, "A"))}
        for _ in range(n_dtypes)
    ]

    # taken from https://stackoverflow.com/questions/9819602/union-of-dict-objects-in-python
    ref = dict(itertools.chain.from_iterable(dct.items() for dct in dtypes))

    merged_dtypes = utils.merge_dtypes(dtypes)

    assert ref == merged_dtypes