Example #1
# Shared imports for the snippets below. The stdlib and pandas imports are
# certain; the single_cell paths are assumptions based on the pipeline's
# layout and may differ in the actual repo. (GenerateCNMatrix and
# CollectMetrics are pipeline script helpers; their paths are omitted.)
import logging
import os

import pandas as pd
import yaml

import pypeliner
import pypeliner.managed as mgd

from single_cell.utils import csvutils, helpers  # assumed path
from single_cell.workflows.align.dtypes import dtypes  # assumed path


def merge_fastq_screen_counts(all_detailed_counts, all_summary_counts,
                              merged_detailed_counts, merged_summary_counts):
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        # skip zero-byte placeholder files
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    if len(detailed_data) > 0:
        df = pd.concat(detailed_data)
    else:
        df = pd.DataFrame(
            columns=["cell_id", "readend", "human", "mouse", "count"])
    index_cols = [v for v in df.columns.values if v != "count"]

    # collapse duplicate rows across input files by summing their counts
    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_detailed_counts,
                                             write_header=True,
                                             dtypes=dtypes())

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
    ]

    if len(summary_counts) > 0:
        df = pd.concat(summary_counts)
    else:
        df = pd.DataFrame(columns=["cell_id", "fastqscreen_nohit"])

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    # sum each metric column per cell across the input files
    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_summary_counts,
                                             write_header=True,
                                             dtypes=dtypes())
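
A hedged usage sketch for the function above; the cell ids and file names are hypothetical, and plain lists of paths work as well as dicts:

all_detailed = {'cell_A1': 'A1.detailed.csv', 'cell_B1': 'B1.detailed.csv'}
all_summary = {'cell_A1': 'A1.summary.csv', 'cell_B1': 'B1.summary.csv'}

merge_fastq_screen_counts(
    all_detailed, all_summary,
    'merged_detailed.csv.gz', 'merged_summary.csv.gz')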
Example #2
def merge_fastq_screen_counts(all_detailed_counts, all_summary_counts,
                              merged_detailed_counts, merged_summary_counts):
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    df = pd.concat(detailed_data)

    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_detailed_counts,
                                             dtypes()['fastqscreen_detailed'],
                                             write_header=True)

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
    ]

    df = pd.concat(summary_counts)

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_summary_counts,
                                             dtypes()['metrics'],
                                             write_header=True)
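
Both versions lean on a project-level dtypes() helper, and they disagree on its shape: Example #1 passes the whole return value as the dtypes= keyword, while Example #2 indexes it by table name first. A minimal sketch matching the second usage, with hypothetical column names and types:

def dtypes():
    # hypothetical stand-in for the pipeline's dtypes module: a dict of
    # table name -> {column: pandas dtype}
    fastqscreen_detailed = {
        'cell_id': 'str',
        'readend': 'str',
        'human': 'int',
        'mouse': 'int',
        'count': 'int',
    }
    metrics = {
        'cell_id': 'str',
        'total_reads': 'int',
        'fastqscreen_nohit': 'int',
        'is_contaminated': 'bool',
    }
    return {'fastqscreen_detailed': fastqscreen_detailed, 'metrics': metrics}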
Example #3
def add_contamination_status(infile,
                             outfile,
                             reference='grch37',
                             ref_threshold=0.6,
                             alt_threshold=0.2,
                             strict_validation=True):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    fastqscreen_cols = [
        col for col in data.columns.values if col.startswith('fastqscreen_')
    ]

    reference = "fastqscreen_{}".format(reference)
    if reference not in fastqscreen_cols:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in fastqscreen_cols if not col == reference]

    data['is_contaminated'] = False

    # a cell is flagged as contaminated if too small a fraction of its reads
    # hit the reference genome, or too large a fraction hits any other genome
    perc_ref = data[reference] / data['total_reads']
    data.loc[perc_ref <= ref_threshold, 'is_contaminated'] = True

    for altcol in alts:
        perc_alt = data[altcol] / data['total_reads']
        data.loc[perc_alt > alt_threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(data,
                                             outfile,
                                             write_header=True,
                                             dtypes=dtypes()['metrics'])

    # get cells that are contaminated and have enough human reads
    check_df = data.loc[data['is_contaminated'] == True].copy()
    check_df['perc_ref'] = data[reference] / data['total_reads']
    check_df = check_df[check_df['perc_ref'] > ref_threshold]
    if strict_validation and (len(check_df) / len(data) > 0.2):
        logging.error("over 20% of cells are contaminated")
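
The thresholding itself is plain pandas; a self-contained toy run of just that logic, with hypothetical cell ids and genome columns (the real alt columns come from whatever fastqscreen_ columns the metrics table carries):

import pandas as pd

toy = pd.DataFrame({
    'cell_id': ['c1', 'c2'],
    'total_reads': [100, 100],
    'fastqscreen_grch37': [90, 40],   # reads hitting the reference genome
    'fastqscreen_mm10': [5, 55],      # reads hitting an alternate genome
})
toy['is_contaminated'] = (
    (toy['fastqscreen_grch37'] / toy['total_reads'] <= 0.6)
    | (toy['fastqscreen_mm10'] / toy['total_reads'] > 0.2)
)
print(toy[['cell_id', 'is_contaminated']])  # c1 False, c2 True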
Example #4
def collect_gc(infiles, outfile, tempdir):
    helpers.makedirs(tempdir)

    tempouts = []
    for cell_id, infile in infiles.items():
        tempout = os.path.join(tempdir, "{}.parsed.csv".format(cell_id))
        tempouts.append(tempout)
        gen_gc = GenerateCNMatrix(infile, tempout, ',', 'NORMALIZED_COVERAGE',
                                  cell_id, 'gcbias')
        gen_gc.main()

    csvutils.concatenate_csv(tempouts, outfile, dtypes=dtypes()['metrics'])
Example #5
def annotate_coverage_metrics(metrics, coverage_yaml, output):
    data = {}

    for cell_id, filename in coverage_yaml.items():
        with open(filename, 'rt') as reader:
            covdata = yaml.safe_load(reader)
            if 'cell_id' in covdata:
                assert covdata['cell_id'] == cell_id
                del covdata['cell_id']
            data[cell_id] = covdata

    csvutils.annotate_csv(metrics, data, output, dtypes()['metrics'])
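
Each input is a small per-cell YAML dict; a hedged sketch of the expected shape (the field names besides cell_id are hypothetical, only implied by the code):

# e.g. coverage_yaml == {'cell_A1': 'A1.coverage.yaml'} where the file holds:
#
#   cell_id: cell_A1
#   coverage_depth: 0.03
#   coverage_breadth: 0.02
#
# after the loop, data == {'cell_A1': {'coverage_depth': 0.03,
#                                      'coverage_breadth': 0.02}}
# and csvutils.annotate_csv joins those keys onto the metrics table by cell_id.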
Example #6
def collect_metrics(flagstat_metrics, markdups_metrics, insert_metrics,
                    wgs_metrics, tempdir, merged_metrics):
    helpers.makedirs(tempdir)
    sample_outputs = []

    for sample in flagstat_metrics.keys():
        flgstat = flagstat_metrics[sample]
        mkdup = markdups_metrics[sample]
        insrt = insert_metrics[sample]
        wgs = wgs_metrics[sample]
        outfile = os.path.join(tempdir, sample + "_metrics.csv.gz")
        sample_outputs.append(outfile)

        collmet = CollectMetrics(wgs, insrt, flgstat, mkdup, outfile, sample,
                                 dtypes()['metrics'])
        collmet.main()

    csvutils.concatenate_csv(sample_outputs, merged_metrics)
Example #7
def bam_metrics_workflow(bam_filename, summary_fastq_screen_count_per_cell,
                         alignment_metrics, gc_metrics,
                         markdups_metrics_percell, flagstat_metrics_percell,
                         wgs_metrics_percell, gc_metrics_percell,
                         gc_metrics_summary_percell, gc_metrics_pdf_percell,
                         insert_metrics_percell, insert_metrics_pdf_percell,
                         ref_genome, sample_info, config, cell_ids):
    markdups_metrics_percell = {cellid: markdups_metrics_percell[cellid]
                                for cellid in cell_ids}

    flagstat_metrics_percell = {cellid: flagstat_metrics_percell[cellid]
                                for cellid in cell_ids}

    wgs_metrics_percell = {cellid: wgs_metrics_percell[cellid]
                           for cellid in cell_ids}

    gc_metrics_percell = {cellid: gc_metrics_percell[cellid]
                          for cellid in cell_ids}

    gc_metrics_summary_percell = {cellid: gc_metrics_summary_percell[cellid]
                                  for cellid in cell_ids}

    gc_metrics_pdf_percell = {cellid: gc_metrics_pdf_percell[cellid]
                              for cellid in cell_ids}

    insert_metrics_percell = {cellid: insert_metrics_percell[cellid]
                              for cellid in cell_ids}

    insert_metrics_pdf_percell = {cellid: insert_metrics_pdf_percell[cellid]
                                  for cellid in cell_ids}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='get_duplication_wgs_flagstat_metrics',
        axes=('cell_id', ),
        func="single_cell.workflows.align.tasks.picard_wgs_dup",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.TempOutputFile("temp_markdup_bam.bam", 'cell_id'),
            mgd.OutputFile('markdups_metrics',
                           'cell_id',
                           fnames=markdups_metrics_percell),
            mgd.TempSpace('tempdir_markdups', 'cell_id'),
            ref_genome,
            mgd.OutputFile('wgs_metrics_percell',
                           'cell_id',
                           fnames=wgs_metrics_percell),
            config['picard_wgs_params'],
        ),
    )

    workflow.transform(
        name='bam_collect_gc_insert_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.tasks.picard_insert_gc_flagstat",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile('gc_metrics_percell',
                           'cell_id',
                           fnames=gc_metrics_percell),
            mgd.OutputFile('gc_metrics_summary_percell',
                           'cell_id',
                           fnames=gc_metrics_summary_percell),
            mgd.OutputFile('gc_metrics_pdf_percell',
                           'cell_id',
                           fnames=gc_metrics_pdf_percell),
            mgd.TempSpace('gc_tempdir', 'cell_id'),
            mgd.OutputFile('flagstat_metrics_percell',
                           'cell_id',
                           fnames=flagstat_metrics_percell),
            mgd.OutputFile('insert_metrics_percell',
                           'cell_id',
                           fnames=insert_metrics_percell),
            mgd.OutputFile('insert_metrics_pdf_percell',
                           'cell_id',
                           fnames=insert_metrics_pdf_percell),
        ),
    )

    workflow.transform(
        name='bam_coverage_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.coverage_metrics.get_coverage_data",
        axes=('cell_id', ),
        args=(mgd.InputFile('sorted_markdups',
                            'cell_id',
                            fnames=bam_filename,
                            extensions=['.bai']),
              mgd.TempOutputFile('coverage_metrics.yaml',
                                 'cell_id'), mgd.InputInstance('cell_id')),
    )

    workflow.transform(
        name="collect_gc_metrics",
        func="single_cell.workflows.align.tasks.collect_gc",
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        args=(mgd.InputFile('gc_metrics_percell',
                            'cell_id',
                            fnames=gc_metrics_percell),
              mgd.OutputFile(gc_metrics,
                             extensions=['.yaml']), mgd.TempSpace("temp_gc")),
    )

    workflow.transform(
        name='collect_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.tasks.collect_metrics",
        args=(
            mgd.InputFile('flagstat_metrics',
                          'cell_id',
                          axes_origin=[],
                          fnames=flagstat_metrics_percell),
            mgd.InputFile('markdups_metrics',
                          'cell_id',
                          axes_origin=[],
                          fnames=markdups_metrics_percell),
            mgd.InputFile('insert_metrics_percell',
                          'cell_id',
                          axes_origin=[],
                          fnames=insert_metrics_percell),
            mgd.InputFile('wgs_metrics_percell',
                          'cell_id',
                          axes_origin=[],
                          fnames=wgs_metrics_percell),
            mgd.TempSpace("tempdir_collect_metrics"),
            mgd.TempOutputFile("alignment_metrics.csv.gz",
                               extensions=['.yaml']),
        ),
    )

    workflow.transform(name='annotate_metrics',
                       ctx={
                           'mem': config['memory']['med'],
                           'ncpus': 1
                       },
                       func="single_cell.utils.csvutils.annotate_csv",
                       args=(
                           mgd.TempInputFile("alignment_metrics.csv.gz",
                                             extensions=['.yaml']),
                           sample_info,
                           mgd.TempOutputFile(
                               'alignment_metrics_annotated.csv.gz',
                               extensions=['.yaml']),
                       ),
                       kwargs={'annotation_dtypes': dtypes()['metrics']})

    workflow.transform(
        name='annotate_coverage_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func=
        "single_cell.workflows.align.coverage_metrics.annotate_coverage_metrics",
        args=(
            mgd.TempInputFile("alignment_metrics_annotated.csv.gz",
                              extensions=['.yaml']),
            mgd.TempInputFile('coverage_metrics.yaml', 'cell_id'),
            mgd.TempOutputFile('alignment_metrics_annotated_coverage.csv.gz',
                               extensions=['.yaml']),
        ))

    workflow.transform(
        name='add_fastqscreen_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.merge_csv",
        args=(
            [
                mgd.TempInputFile(
                    "alignment_metrics_annotated_coverage.csv.gz",
                    extensions=['.yaml']),
                mgd.InputFile(summary_fastq_screen_count_per_cell,
                              extensions=['.yaml']),
            ],
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
            'outer',
            ['cell_id'],
        ),
    )

    return workflow
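
A minimal sketch of how such a workflow object is typically executed with pypeliner; the config keys and all arguments are assumptions (per-cell path dicts and output paths assumed already in scope), not the pipeline's actual CLI:

import pypeliner
import pypeliner.app

# hypothetical runner config following pypeliner's app conventions
config = {'tmpdir': './pipeline_tmp', 'submit': 'local', 'maxjobs': 4}

pyp = pypeliner.app.Pypeline(config=config)

workflow = bam_metrics_workflow(
    bam_filename,                         # per-cell bam path dict
    summary_fastq_screen_count_per_cell,  # merged fastqscreen summary csv
    alignment_metrics, gc_metrics,        # final output paths
    markdups_metrics_percell, flagstat_metrics_percell,
    wgs_metrics_percell, gc_metrics_percell, gc_metrics_summary_percell,
    gc_metrics_pdf_percell, insert_metrics_percell, insert_metrics_pdf_percell,
    ref_genome, sample_info, config, cell_ids)

pyp.run(workflow)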