def concatenate_csv(inputs, output, low_memory=False):
    """
    Concatenate csv files by delegating to csvutils.

    :param inputs: csv inputs, forwarded unchanged to csvutils
    :param output: output path, forwarded unchanged to csvutils
    :param low_memory: when truthy, use
        csvutils.concatenate_csv_files_quick_lowmem instead of
        csvutils.concatenate_csv
    """
    # Choose the csvutils implementation first, then make a single call.
    concat = (
        csvutils.concatenate_csv_files_quick_lowmem
        if low_memory
        else csvutils.concatenate_csv
    )
    concat(inputs, output)
Beispiel #2
0
def concatenate_csv(inputs, output, data_type, low_memory=False):
    """
    Concatenate csv files, optionally passing reference dtypes to csvutils.

    :param inputs: csv inputs, forwarded unchanged to csvutils
    :param output: output path, forwarded unchanged to csvutils
    :param data_type: key into the mapping returned by dtypes(); when falsy
        no lookup is done and dtypes=None is passed on
    :param low_memory: when truthy, use
        csvutils.concatenate_csv_files_quick_lowmem instead of
        csvutils.concatenate_csv
    """
    ref_dtypes = dtypes()[data_type] if data_type else None

    if not low_memory:
        csvutils.concatenate_csv(inputs, output, dtypes=ref_dtypes)
        return

    csvutils.concatenate_csv_files_quick_lowmem(
        inputs, output, dtypes=ref_dtypes
    )
def collect_gc(infiles, outfile, tempdir):
    """
    Parse each per-cell input with GenerateCNMatrix and concatenate the
    parsed csvs into one output file.

    :param infiles: mapping of cell_id to input file
    :param outfile: path of the concatenated csv
    :param tempdir: directory for the intermediate "<cell_id>.parsed.csv"
        files; created through helpers.makedirs
    """
    helpers.makedirs(tempdir)

    parsed_csvs = []
    for cell_id, gc_file in infiles.items():
        parsed_csv = os.path.join(tempdir, "{}.parsed.csv".format(cell_id))
        GenerateCNMatrix(
            gc_file, parsed_csv, ',', 'NORMALIZED_COVERAGE', cell_id, 'gcbias'
        ).main()
        parsed_csvs.append(parsed_csv)

    # the 'metrics' entry of dtypes() is handed to csvutils for the merge
    csvutils.concatenate_csv(parsed_csvs, outfile, dtypes=dtypes()['metrics'])
Beispiel #4
0
    def test_concat_csv_multiple_files_to_concat(self, tmpdir, n_rows, n_frames):
        """
        concat n_frames csvs that all have the same int columns
        :param tmpdir: temp dir to test in
        :param n_rows: length of test dfs
        :param n_frames: number of csvs to concat
        """
        # one independent dtype dict per frame
        frame_dtypes = [dict.fromkeys('ABCD', "int") for _ in range(n_frames)]
        out_csv = os.path.join(tmpdir, 'concat.csv.gz')

        _, csvs, ref = self.base_test_concat(
            n_rows, frame_dtypes, write=True, get_ref=True, dir=tmpdir
        )

        csvutils.concatenate_csv(csvs, out_csv)

        assert self.dfs_exact_match(ref, out_csv)
Beispiel #5
0
    def test_concat_csv_input_as_dict(self, tmpdir, n_rows):
        """
        test concating a dictionary of csvs
        :param tmpdir: temp dir to test in
        :param n_rows: length of test dfs
        """
        dtypes = {v: "int" for v in 'ABCD'}
        concatenated = os.path.join(tmpdir, 'concat.csv.gz')

        dfs, csvs, ref = self.base_test_concat(n_rows, [dtypes, dtypes], write=True,
                                               get_ref=True, dir=tmpdir)

        csvutils.concatenate_csv({"a": csvs[0], "b": csvs[1]}, concatenated)

        # Without this check the test could only fail if the call raised;
        # compare against the reference like the other concat tests do.
        assert self.dfs_exact_match(ref, concatenated)
Beispiel #6
0
    def test_concat_csv_empty_inputs(self, tmpdir):
        """
        concat two csvs that were generated with zero rows
        :param tmpdir: tempdir to test in
        """
        column_types = dict.fromkeys('ABCD', "int")
        out_csv = os.path.join(tmpdir, 'concat.csv.gz')

        # n_rows is 0, so the input csvs carry no data
        _, csvs, ref = self.base_test_concat(
            0, [column_types, column_types],
            write=True, get_ref=True, dir=tmpdir
        )

        csvutils.concatenate_csv(csvs, out_csv)

        assert self.dfs_exact_match(out_csv, ref)
Beispiel #7
0
    def test_concat_csv_different_cols(self, tmpdir, n_rows):
        """
        concat two csvs whose column sets only partly overlap
        (A, B, C, D versus A, B, G, F), all float
        :param tmpdir: temp dir to test in
        :param n_rows: length of test dfs
        """
        first_cols = dict.fromkeys('ABCD', "float")
        second_cols = dict.fromkeys('ABGF', "float")

        out_csv = os.path.join(tmpdir, 'concat.csv.gz')

        _, csvs, ref = self.base_test_concat(
            n_rows, [first_cols, second_cols],
            write=True, get_ref=True, dir=tmpdir
        )

        csvutils.concatenate_csv(csvs, out_csv)

        assert self.dfs_exact_match(ref, out_csv)
Beispiel #8
0
    def test_concat_csv(self, tmpdir, n_rows):
        """
        sanity check: concat two csvs that have identical int columns
        :param tmpdir: temporary directory to write in
        :param n_rows: number of rows in test csvs
        """
        column_types = dict.fromkeys('ABCD', "int")
        out_csv = os.path.join(tmpdir, 'concat.csv.gz')

        _, csvs, ref = self.base_test_concat(
            n_rows, [column_types, column_types],
            write=True, get_ref=True, dir=tmpdir
        )

        csvutils.concatenate_csv(csvs, out_csv)

        assert self.dfs_exact_match(ref, out_csv)
Beispiel #9
0
    def test_concat_csv_one_file_to_concat(self, tmpdir, n_rows):
        """
        concat when only a single csv is generated
        :param tmpdir: temp dir to test in
        :param n_rows: length of test dfs
        """
        column_types = dict.fromkeys('ABCD', "int")
        out_csv = os.path.join(tmpdir, 'concat.csv.gz')

        # only one dtype dict is supplied, so only one test frame is requested
        _, single_input, ref = self.base_test_concat(
            n_rows, [column_types], write=True, get_ref=True, dir=tmpdir
        )

        csvutils.concatenate_csv(single_input, out_csv)

        assert self.dfs_exact_match(ref, out_csv)
Beispiel #10
0
def collect_metrics(flagstat_metrics, markdups_metrics, insert_metrics,
                    wgs_metrics, tempdir, merged_metrics):
    """
    Run CollectMetrics once per sample and concatenate the per-sample csvs.

    The samples are the keys of flagstat_metrics; the other three mappings
    are indexed with the same keys.

    :param flagstat_metrics: mapping of sample to flagstat metrics file
    :param markdups_metrics: mapping of sample to markdups metrics file
    :param insert_metrics: mapping of sample to insert metrics file
    :param wgs_metrics: mapping of sample to wgs metrics file
    :param tempdir: directory for the "<sample>_metrics.csv" files; created
        through helpers.makedirs
    :param merged_metrics: path of the concatenated csv
    """
    helpers.makedirs(tempdir)

    per_sample_csvs = []
    for sample in flagstat_metrics.keys():
        flagstat, markdups, insert, wgs = (
            flagstat_metrics[sample],
            markdups_metrics[sample],
            insert_metrics[sample],
            wgs_metrics[sample],
        )
        sample_csv = os.path.join(tempdir, sample + "_metrics.csv")
        per_sample_csvs.append(sample_csv)

        CollectMetrics(wgs, insert, flagstat, markdups, sample_csv, sample).main()

    csvutils.concatenate_csv(per_sample_csvs, merged_metrics)
Beispiel #11
0
def collect_gc(infiles, outfile, tempdir):
    """
    Parse each per-cell input with GenerateCNMatrix, concatenate the parsed
    csvs and convert the merged csv to hdf.

    :param infiles: mapping of cell_id to input file
    :param outfile: hdf output path; the table is written under
        '/alignment/gc_metrics'
    :param tempdir: directory for the intermediate csv files; created
        through helpers.makedirs
    """
    helpers.makedirs(tempdir)

    tempouts = []
    # dict.iteritems() exists only on Python 2; items() works on both 2 and 3
    for cell_id, infile in infiles.items():
        tempout = os.path.join(tempdir,
                               os.path.basename(infile) + ".parsed.csv")
        tempouts.append(tempout)
        gen_gc = GenerateCNMatrix(infile, tempout, ',', 'NORMALIZED_COVERAGE',
                                  cell_id, 'gcbias')
        gen_gc.main()

    merged_csv = os.path.join(tempdir, "merged_gc_metrics.csv")
    csvutils.concatenate_csv(tempouts, merged_csv)

    hdfutils.convert_csv_to_hdf(merged_csv, outfile, '/alignment/gc_metrics')
Beispiel #12
0
    def test_concat_csv_no_header(self, tmpdir, n_rows):
        """
        concat csvs that were written without a header line
        :param tmpdir: temporary directory to write in
        :param n_rows: number of rows in test csvs
        """
        column_types = dict.fromkeys('ABCD', "int")
        out_csv = os.path.join(tmpdir, 'concat.csv.gz')

        _, csvs, ref = self.base_test_concat(
            n_rows, [column_types, column_types],
            write=True, get_ref=True, dir=tmpdir, write_head=False
        )
        csvutils.concatenate_csv(csvs, out_csv, write_header=False)

        assert self.dfs_exact_match(ref, out_csv)

        # plain pandas read of the csv only; ignore separate yaml
        found_columns = pd.read_csv(out_csv).columns.tolist()

        # none of the column names may show up as a parsed header
        assert not any(col in found_columns for col in column_types)
Beispiel #13
0
    def test_concat_csv_with_nans(self, tmpdir, n_rows):
        """
        concat two csvs with NaNs
        :param tmpdir: temp dir to test in
        :param n_rows: length of test dfs; row index 2 is overwritten, so
            this assumes n_rows is at least 3
        """
        dtypes = {v: "float" for v in 'ABCD'}

        concatenated = os.path.join(tmpdir, 'concat.csv.gz')

        dfs = self.make_test_dfs([dtypes, dtypes], n_rows)
        csvs = [os.path.join(tmpdir, "0.csv.gz"),
                os.path.join(tmpdir, "1.csv.gz")]

        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while np.nan is available in every NumPy version
        dfs[0].iloc[2, dfs[0].columns.get_loc("A")] = np.nan
        dfs[1].iloc[2, dfs[1].columns.get_loc("A")] = np.nan

        csvutils.write_dataframe_to_csv_and_yaml(dfs[0], csvs[0], dtypes)
        csvutils.write_dataframe_to_csv_and_yaml(dfs[1], csvs[1], dtypes)

        # reference is the in-memory concat of the same frames
        ref = pd.concat(dfs, ignore_index=True)
        csvutils.concatenate_csv(csvs, concatenated)

        assert self.dfs_exact_match(ref, concatenated)
Beispiel #14
0
def merge_csvs(input_csvs, merged_csv):
    """
    Merge the input csv files into one csv.

    :param input_csvs: csv inputs, forwarded unchanged to csvutils
    :param merged_csv: output path, forwarded unchanged to csvutils
    """
    csvutils.concatenate_csv(
        input_csvs,
        merged_csv,
        write_header=True,
    )
def concatenate_csv(inputs, output):
    """
    Concatenate csv files through csvutils, passing write_header=True.

    :param inputs: csv inputs, forwarded unchanged to csvutils
    :param output: output path, forwarded unchanged to csvutils
    """
    csvutils.concatenate_csv(inputs, output, write_header=True)