Example 1
def simple_annotate_csv(
    in_f: str,
    out_f: str,
    col_name: str,
    col_val: str,
    col_dtype: str,
    write_header: bool = False,
) -> None:
    """
    Simplified version of the annotate_csv method.
    Add column with the same value for all rows.

    @param in_f:
    @param out_f:
    @param col_name:
    @param col_val:
    @param col_dtype:
    @param write_header:
    @return:
    """
    csvinput = CsverveInput(in_f)
    metrics_df = csvinput.read_csv()
    metrics_df[col_name] = col_val

    csv_dtypes = csvinput.dtypes
    csv_dtypes[col_name] = col_dtype

    output = CsverveOutputDataFrame(metrics_df,
                                    out_f,
                                    csv_dtypes,
                                    write_header=write_header)
    output.write_df()
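A minimal usage sketch; the file paths, column name, and the 'str' dtype label are illustrative assumptions, not from the source:

simple_annotate_csv(
    in_f='metrics.csv.gz',          # hypothetical input CSV with a companion meta YAML
    out_f='metrics_tagged.csv.gz',  # hypothetical output path
    col_name='batch',
    col_val='batch_01',
    col_dtype='str',
)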
Example 2
def add_col_from_dict(infile, col_data, outfile, dtypes, write_header=True):
    """
    TODO: fill this in
    Add column to gzipped CSV.

    @param infile:
    @param col_data:
    @param outfile:
    @param dtypes:
    @param write_header:
    @return:
    """

    csvinput = CsverveInput(infile)
    csv_dtypes = csvinput.dtypes
    csvinput = csvinput.read_csv()

    for col_name, col_value in col_data.items():
        csvinput[col_name] = col_value

    dtypes = utils.merge_dtypes([csv_dtypes, dtypes])
    output = CsverveOutputDataFrame(csvinput,
                                    outfile,
                                    dtypes,
                                    write_header=write_header)
    output.write_df()
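A similar sketch for the dictionary variant; paths, column names, and dtype labels are hypothetical:

add_col_from_dict(
    infile='metrics.csv.gz',                      # hypothetical input
    col_data={'batch': 'batch_01', 'run': 'r1'},  # one constant value per new column
    outfile='metrics_tagged.csv.gz',              # hypothetical output
    dtypes={'batch': 'str', 'run': 'str'},        # dtypes for the new columns only
)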
Example 3
def merge_csv(in_filenames: Union[List[str], Dict[str, str]],
              out_filename: str,
              how: str,
              on: List[str],
              write_header: bool = True) -> None:
    """
    Create one gzipped CSV out of multiple gzipped CSVs.

    @param in_filenames: List of input CSV file paths, or a dictionary whose values are file paths.
    @param out_filename: Path to newly merged CSV
    @param how: How to join DataFrames (inner, outer, left, right).
    @param on: List of column name(s) to join on.
    @param write_header: boolean, True = write header, False = don't write header
    @return:
    """
    if isinstance(in_filenames, dict):
        in_filenames = list(in_filenames.values())

    data: List[CsverveInput] = [
        CsverveInput(infile) for infile in in_filenames
    ]

    dfs: List[pd.DataFrame] = [csvinput.read_csv() for csvinput in data]

    dtypes: List[Dict[str, str]] = [csvinput.dtypes for csvinput in data]

    merged_data: pd.DataFrame = utils.merge_frames(dfs, how, on)

    dtypes_: Dict[str, str] = utils.merge_dtypes(dtypes)

    csvoutput: CsverveOutputDataFrame = CsverveOutputDataFrame(
        merged_data, out_filename, dtypes_, write_header=write_header)
    csvoutput.write_df()
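A usage sketch assuming two hypothetical inputs that both carry a cell_id column:

merge_csv(
    in_filenames=['metrics_a.csv.gz', 'metrics_b.csv.gz'],  # hypothetical paths
    out_filename='merged.csv.gz',
    how='outer',
    on=['cell_id'],
)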
Example 4
def rewrite_csv_file(
    filepath: str,
    outputfile: str,
    write_header: bool = True,
    dtypes: Optional[Dict[str, str]] = None,
) -> None:
    """
    Generate headerless CSV files.

    @param filepath: File path of CSV.
    @param outputfile: File path of the headerless CSV to be generated.
    @param write_header: boolean, True = write header, False = don't write header.
    @param dtypes: Dictionary of pandas dtypes, where key = column name, value = dtype.
    @return:
    """

    if os.path.exists(filepath + '.yaml'):
        csvinput: Union[CsverveInput,
                        IrregularCsverveInput] = CsverveInput(filepath)
        df = csvinput.read_csv()

        csvoutput_df = CsverveOutputDataFrame(df,
                                              outputfile,
                                              write_header=write_header,
                                              dtypes=csvinput.dtypes)
        csvoutput_df.write_df()
    else:
        assert dtypes, 'dtypes required when the input has no meta YAML'
        csvinput = IrregularCsverveInput(filepath, dtypes)

        csvoutput_fs = CsverveOutputFileStream(outputfile,
                                               write_header=write_header,
                                               columns=csvinput.columns,
                                               dtypes=csvinput.dtypes)
        csvoutput_fs.rewrite_csv(filepath)
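A sketch of both branches; which one runs depends only on whether the companion YAML exists next to the input (paths and dtypes are hypothetical):

# input has a meta YAML: the dtypes argument is not needed
rewrite_csv_file('metrics.csv.gz', 'rewritten.csv.gz')

# irregular input without a YAML: dtypes must be supplied
rewrite_csv_file('raw.csv.gz', 'rewritten.csv.gz',
                 dtypes={'cell_id': 'str', 'reads': 'int'})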
Example 5
def annotate_csv(
    infile: str,
    annotation_df: pd.DataFrame,
    outfile: str,
    annotation_dtypes: Dict[str, str],
    on: str = "cell_id",
    write_header: bool = True,
) -> None:
    """
    TODO: fill this in
    @param infile:
    @param annotation_df:
    @param outfile:
    @param annotation_dtypes:
    @param on:
    @param write_header:
    @return:
    """

    csvinput = CsverveInput(infile)
    metrics_df = csvinput.read_csv()

    # get annotation rows that correspond to rows in on
    reformed_annotation = annotation_df[annotation_df[on].isin(metrics_df[on])]

    # do nothing if the annotation df is empty
    if reformed_annotation.empty:  # so we don't add NaNs
        return write_dataframe_to_csv_and_yaml(metrics_df,
                                               outfile,
                                               csvinput.dtypes,
                                               write_header=write_header)

    metrics_df = metrics_df.merge(reformed_annotation, on=on, how='outer')

    csv_dtypes = csvinput.dtypes

    for col, dtype in csv_dtypes.items():
        if col in annotation_dtypes:
            assert dtype == annotation_dtypes[col]

    csv_dtypes.update(annotation_dtypes)

    output = CsverveOutputDataFrame(metrics_df,
                                    outfile,
                                    csv_dtypes,
                                    write_header=write_header)
    output.write_df()
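A usage sketch with a hypothetical annotation table keyed on the default cell_id column:

annotation = pd.DataFrame({
    'cell_id': ['SA123-A90000-R10-C05'],  # hypothetical cell IDs
    'condition': ['treated'],
})
annotate_csv('metrics.csv.gz', annotation, 'annotated.csv.gz',
             annotation_dtypes={'condition': 'str'})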
Example 6
def process_annotation_file(filepath):
    """Read an annotation CSV and derive sample and library IDs from cell_id."""
    data = CsverveInput(filepath).read_csv()

    # cell_id is assumed to be '-'-delimited, with the sample and library
    # IDs at fixed positions from the end
    data['sample_id'] = [a.split('-')[-4] for a in data['cell_id']]
    data['library_id'] = [a.split('-')[-3] for a in data['cell_id']]

    # columns listed in the module-level _categorical_cols become Categoricals
    for col in _categorical_cols:
        if col in data:
            data[col] = pd.Categorical(data[col])

    return data
Example 7
def process_hmmcopy_data(filepath, usecols=None):
    """Read an HMMcopy CSV, optionally restricted to usecols, and derive sample and library IDs from cell_id."""
    data = CsverveInput(filepath).read_csv(usecols=usecols)

    data['sample_id'] = [a.split('-')[-4] for a in data['cell_id']]
    data['library_id'] = [a.split('-')[-3] for a in data['cell_id']]

    for col in _categorical_cols:
        if col in data:
            data[col] = pd.Categorical(data[col])

    return data
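Both loaders above rely on the same cell_id convention: splitting on '-' and taking the fourth- and third-from-last fields implies IDs shaped roughly like <sample>-<library>-<row>-<col>. A hypothetical illustration:

cell_id = 'SA123-A90000-R10-C05'  # hypothetical ID following that convention
print(cell_id.split('-')[-4])     # SA123  -> sample_id
print(cell_id.split('-')[-3])     # A90000 -> library_id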
Example 8
def concatenate_csv(inputfiles: List[str],
                    output: str,
                    write_header: bool = True,
                    drop_duplicates: bool = False) -> None:
    """
    Concatenate gzipped CSV files, dtypes in meta YAML files must be the same.

    @param inputfiles: List of gzipped CSV file paths, or a dictionary where the keys are file paths.
    @param output: Path of resulting concatenated gzipped CSV file and meta YAML.
    @param write_header: boolean, True = write header, False = don't write header.
    @return:
    """
    if isinstance(inputfiles, dict):
        inputfiles = list(inputfiles.values())

    if not inputfiles:
        raise CsverveConcatException("nothing provided to concat")

    inputs: List[CsverveInput] = [
        CsverveInput(infile) for infile in inputfiles
    ]

    dtypes: Dict[str, str] = utils.merge_dtypes(
        [csvinput.dtypes for csvinput in inputs])

    headers: List[bool] = [csvinput.header for csvinput in inputs]

    columns: List[List[str]] = [csvinput.columns for csvinput in inputs]

    # the quick low-memory streaming path is only safe when every input is
    # headerless, all inputs share identical columns, and no deduplication
    # is requested; otherwise fall back to the pandas-based path
    low_memory: bool = True
    if any(headers):
        low_memory = False

    if not all(columns[0] == elem for elem in columns):
        low_memory = False

    if drop_duplicates:
        low_memory = False

    if low_memory:
        concatenate_csv_files_quick_lowmem(inputfiles,
                                           output,
                                           dtypes,
                                           columns[0],
                                           write_header=write_header)
    else:
        concatenate_csv_files_pandas(inputfiles,
                                     output,
                                     dtypes,
                                     write_header=write_header,
                                     drop_duplicates=drop_duplicates)
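A usage sketch; the shard paths are hypothetical, and the header/column checks above decide which code path actually runs:

concatenate_csv(
    inputfiles=['chunk_0.csv.gz', 'chunk_1.csv.gz'],  # hypothetical shards
    output='combined.csv.gz',
    drop_duplicates=True,  # forces the pandas-based path
)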
Example 9
def read_csv(infile: str,
             chunksize: Optional[int] = None,
             usecols=None,
             dtype=None) -> pd.DataFrame:
    """
    Read in CSV file and return as a pandas DataFrame.

    Assumes a YAML meta file in the same path with the same name, with a .yaml extension.
    YAML file structure is atop this file.

    @param infile: Path to CSV file.
    @param chunksize: Number of rows to read at a time (optional, applies to large datasets).
    @param usecols: Restrict to specific columns (optional).
    @param dtype: Override the dtypes on specific columns (optional).
    @return: pandas DataFrame.
    """
    return CsverveInput(infile).read_csv(chunksize=chunksize,
                                         usecols=usecols,
                                         dtype=dtype)
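A minimal call with a hypothetical path and column names; usecols narrows the read and dtype overrides individual columns, as documented above:

df = read_csv('metrics.csv.gz',
              usecols=['cell_id', 'reads'],  # hypothetical columns
              dtype={'reads': 'int64'})      # override one column's dtype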
Example 10
def get_dtypes(infile):
    """Return the column dtypes recorded in the CSV's meta YAML file."""
    return CsverveInput(infile).dtypes
Example 11
def get_columns(infile):
    """Return the column names recorded in the CSV's meta YAML file."""
    return CsverveInput(infile).columns
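The two accessors pair naturally when inspecting a file before processing; the path is hypothetical:

path = 'metrics.csv.gz'
print(get_columns(path))  # column names from the meta YAML
print(get_dtypes(path))   # column -> dtype mapping from the meta YAML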