def merge_fastq_screen_counts(
        all_detailed_counts, all_summary_counts, merged_detailed_counts,
        merged_summary_counts):
    """Merge per-cell fastq screen count files into two merged tables.

    :param all_detailed_counts: iterable (or dict keyed by cell) of per-cell
        detailed count csv paths
    :param all_summary_counts: iterable (or dict keyed by cell) of per-cell
        summary count csv paths
    :param merged_detailed_counts: output path for the merged detailed table
    :param merged_summary_counts: output path for the merged summary table
    """
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        # skip zero-byte files: pd.read_csv raises EmptyDataError on them
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    if len(detailed_data) > 0:
        df = pd.concat(detailed_data)
    else:
        # no non-empty inputs: emit an empty table with the expected schema
        df = pd.DataFrame(
            columns=["cell_id", "readend", "human", "mouse", "count"])

    # collapse duplicate (cell_id, readend, ...) rows by summing their counts
    index_cols = [v for v in df.columns.values if v != "count"]
    df['count'] = df.groupby(index_cols)['count'].transform('sum')
    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_detailed_counts, write_header=True, dtypes=dtypes())

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    # skip zero-byte files here too, consistent with the detailed branch
    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
        if os.stat(countsfile).st_size > 0
    ]

    if len(summary_counts) > 0:
        df = pd.concat(summary_counts)
    else:
        # BUG FIX: original read `pd.DataFrame(columns - [...])` — a stray
        # `-` where the `columns=` keyword was intended (NameError at runtime)
        df = pd.DataFrame(columns=["cell_id", "fastqscreen_nohit"])

    # sum every per-organism column across duplicate rows for the same cell
    update_cols = [v for v in df.columns.values if v != 'cell_id']
    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')
    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_summary_counts, write_header=True, dtypes=dtypes())
def merge_fastq_screen_counts(
        all_detailed_counts, all_summary_counts, merged_detailed_counts,
        merged_summary_counts):
    """Merge per-cell fastq screen count files into two merged tables.

    :param all_detailed_counts: iterable (or dict keyed by cell) of per-cell
        detailed count csv paths
    :param all_summary_counts: iterable (or dict keyed by cell) of per-cell
        summary count csv paths
    :param merged_detailed_counts: output path for the merged detailed table
    :param merged_summary_counts: output path for the merged summary table
    """
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        # skip zero-byte files: pd.read_csv raises EmptyDataError on them
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    # BUG FIX: pd.concat raises ValueError on an empty list; fall back to an
    # empty table with the expected schema (as the sibling variant does)
    if len(detailed_data) > 0:
        df = pd.concat(detailed_data)
    else:
        df = pd.DataFrame(
            columns=["cell_id", "readend", "human", "mouse", "count"])

    # collapse duplicate (cell_id, readend, ...) rows by summing their counts
    index_cols = [v for v in df.columns.values if v != "count"]
    df['count'] = df.groupby(index_cols)['count'].transform('sum')
    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_detailed_counts, dtypes()['fastqscreen_detailed'],
        write_header=True)

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    # skip zero-byte files here too, consistent with the detailed branch
    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
        if os.stat(countsfile).st_size > 0
    ]

    # BUG FIX: same empty-list guard as above
    if len(summary_counts) > 0:
        df = pd.concat(summary_counts)
    else:
        df = pd.DataFrame(columns=["cell_id", "fastqscreen_nohit"])

    # sum every per-organism column across duplicate rows for the same cell
    update_cols = [v for v in df.columns.values if v != 'cell_id']
    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')
    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(
        df, merged_summary_counts, dtypes()['metrics'], write_header=True)
def add_contamination_status(
        infile, outfile, reference='grch37', ref_threshold=0.6,
        alt_threshold=0.2, strict_validation=True):
    """Flag cells as contaminated based on fastq screen organism counts.

    A cell is contaminated when fewer than ``ref_threshold`` of its reads map
    to the reference organism, or more than ``alt_threshold`` map to any
    alternate organism.

    :param infile: metrics csv (+yaml) with fastqscreen_* count columns
    :param outfile: output csv (+yaml) with an added is_contaminated column
    :param reference: organism suffix of the reference fastqscreen column
    :param ref_threshold: min fraction of reads on reference to be clean
    :param alt_threshold: max fraction of reads on an alternate organism
    :param strict_validation: when True, log an error if >20% of cells are
        contaminated
    :raises Exception: if the reference fastqscreen column is missing
    """
    data = csvutils.read_csv_and_yaml(infile)
    data = data.set_index('cell_id', drop=False)

    fastqscreen_cols = [
        col for col in data.columns.values if col.startswith('fastqscreen_')
    ]

    reference = "fastqscreen_{}".format(reference)
    if reference not in fastqscreen_cols:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in fastqscreen_cols if not col == reference]

    data['is_contaminated'] = False

    # too few reads on the reference organism -> contaminated
    perc_ref = data[reference] / data['total_reads']
    data.loc[perc_ref <= ref_threshold, 'is_contaminated'] = True

    # too many reads on any alternate organism -> contaminated
    for altcol in alts:
        perc_alt = data[altcol] / data['total_reads']
        data.loc[perc_alt > alt_threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)
    csvutils.write_dataframe_to_csv_and_yaml(
        data, outfile, write_header=True, dtypes=dtypes()['metrics'])

    # get cells that are contaminated and have enough human reads.
    # BUG FIX: .copy() so the perc_ref assignment below writes to a real
    # frame instead of a .loc view (pandas SettingWithCopyWarning)
    check_df = data.loc[data['is_contaminated'] == True].copy()
    check_df['perc_ref'] = data[reference] / data['total_reads']
    check_df = check_df[check_df['perc_ref'] > ref_threshold]

    # BUG FIX: guard the division against an empty input table
    if strict_validation and len(data) > 0 and (
            len(check_df) / len(data) > 0.2):
        logging.error("over 20% of cells are contaminated")
def collect_gc(infiles, outfile, tempdir):
    """Parse per-cell gc-bias metric files and concatenate them into one csv.

    :param infiles: dict mapping cell_id -> gc metrics file
    :param outfile: merged output csv path
    :param tempdir: scratch directory for the per-cell parsed csvs
    """
    helpers.makedirs(tempdir)

    parsed_csvs = []
    for cell, cell_infile in infiles.items():
        parsed = os.path.join(tempdir, "{}.parsed.csv".format(cell))
        # extract the NORMALIZED_COVERAGE column into a per-cell csv
        GenerateCNMatrix(
            cell_infile, parsed, ',', 'NORMALIZED_COVERAGE', cell,
            'gcbias').main()
        parsed_csvs.append(parsed)

    csvutils.concatenate_csv(parsed_csvs, outfile, dtypes=dtypes()['metrics'])
def annotate_coverage_metrics(metrics, coverage_yaml, output):
    """Annotate the metrics csv with per-cell coverage data from yaml files.

    :param metrics: input metrics csv (+yaml) path
    :param coverage_yaml: dict mapping cell_id -> per-cell coverage yaml path
    :param output: annotated output csv (+yaml) path
    :raises AssertionError: if a yaml file's embedded cell_id disagrees with
        its key in ``coverage_yaml``
    """
    data = {}
    for cell_id, filename in coverage_yaml.items():
        with open(filename, 'rt') as reader:
            # BUG FIX: yaml.load without an explicit Loader is deprecated
            # (and unsafe on untrusted input); these files are plain data,
            # so safe_load is sufficient
            covdata = yaml.safe_load(reader)
        # the embedded cell_id, if present, must match the mapping key
        if 'cell_id' in covdata:
            assert covdata['cell_id'] == cell_id
            del covdata['cell_id']
        data[cell_id] = covdata
    csvutils.annotate_csv(metrics, data, output, dtypes()['metrics'])
def collect_metrics(flagstat_metrics, markdups_metrics, insert_metrics,
                    wgs_metrics, tempdir, merged_metrics):
    """Combine per-cell picard/samtools metric files into one merged csv.

    :param flagstat_metrics: dict mapping sample -> flagstat metrics file
    :param markdups_metrics: dict mapping sample -> markdups metrics file
    :param insert_metrics: dict mapping sample -> insert size metrics file
    :param wgs_metrics: dict mapping sample -> wgs metrics file
    :param tempdir: scratch directory for the per-sample csvs
    :param merged_metrics: merged output csv path
    """
    helpers.makedirs(tempdir)

    per_sample_csvs = []
    for sample_id in flagstat_metrics:
        csv_path = os.path.join(tempdir, sample_id + "_metrics.csv.gz")
        per_sample_csvs.append(csv_path)
        # parse all four metric files for this sample into a single csv row
        CollectMetrics(
            wgs_metrics[sample_id], insert_metrics[sample_id],
            flagstat_metrics[sample_id], markdups_metrics[sample_id],
            csv_path, sample_id, dtypes()['metrics']).main()

    csvutils.concatenate_csv(per_sample_csvs, merged_metrics)
def bam_metrics_workflow(bam_filename,
                         summary_fastq_screen_count_per_cell,
                         alignment_metrics,
                         gc_metrics,
                         markdups_metrics_percell,
                         flagstat_metrics_percell,
                         wgs_metrics_percell,
                         gc_metrics_percell,
                         gc_metrics_summary_percell,
                         gc_metrics_pdf_percell,
                         insert_metrics_percell,
                         insert_metrics_pdf_percell,
                         ref_genome,
                         sample_info,
                         config,
                         cell_ids):
    """Build a pypeliner workflow computing per-cell bam QC metrics.

    Runs picard duplication/wgs/gc/insert/flagstat metrics per cell, merges
    and annotates the per-cell results, and finally joins in the fastq
    screen summary counts to produce the final alignment metrics table.

    :param bam_filename: dict mapping cell_id -> sorted, markdup'd bam path
    :param summary_fastq_screen_count_per_cell: merged fastq screen summary
        counts csv to join into the final metrics table
    :param alignment_metrics: final merged metrics output path
    :param gc_metrics: merged gc metrics output path
    :param *_percell: dicts mapping cell_id -> per-cell metric output paths
    :param ref_genome: reference genome fasta path
    :param sample_info: per-cell annotation data for annotate_csv
    :param config: pipeline config (memory limits, picard wgs params)
    :param cell_ids: cells to process; per-cell dicts are filtered to these
    :returns: the constructed pypeliner workflow
    """
    # restrict every per-cell output mapping to exactly the requested cells
    markdups_metrics_percell = dict([(cellid, markdups_metrics_percell[cellid])
                                     for cellid in cell_ids])
    flagstat_metrics_percell = dict([(cellid, flagstat_metrics_percell[cellid])
                                     for cellid in cell_ids])
    wgs_metrics_percell = dict([(cellid, wgs_metrics_percell[cellid])
                                for cellid in cell_ids])
    gc_metrics_percell = dict([(cellid, gc_metrics_percell[cellid])
                               for cellid in cell_ids])
    gc_metrics_summary_percell = dict([
        (cellid, gc_metrics_summary_percell[cellid]) for cellid in cell_ids
    ])
    gc_metrics_pdf_percell = dict([(cellid, gc_metrics_pdf_percell[cellid])
                                   for cellid in cell_ids])
    insert_metrics_percell = dict([(cellid, insert_metrics_percell[cellid])
                                   for cellid in cell_ids])
    insert_metrics_pdf_percell = dict([
        (cellid, insert_metrics_pdf_percell[cellid]) for cellid in cell_ids
    ])

    workflow = pypeliner.workflow.Workflow()

    # the 'cell_id' axis drives all per-cell (axes=('cell_id',)) transforms
    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    # per cell: picard mark-duplicates + wgs metrics
    workflow.transform(
        name='get_duplication_wgs_flagstat_metrics',
        axes=('cell_id', ),
        func="single_cell.workflows.align.tasks.picard_wgs_dup",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.TempOutputFile("temp_markdup_bam.bam", 'cell_id'),
            mgd.OutputFile('markdups_metrics',
                           'cell_id',
                           fnames=markdups_metrics_percell),
            mgd.TempSpace('tempdir_markdups', 'cell_id'),
            ref_genome,
            mgd.OutputFile('wgs_metrics_percell',
                           'cell_id',
                           fnames=wgs_metrics_percell),
            config['picard_wgs_params'],
        ),
    )

    # per cell: picard gc-bias + insert-size metrics and samtools flagstat
    workflow.transform(
        name='bam_collect_gc_insert_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.tasks.picard_insert_gc_flagstat",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile('gc_metrics_percell',
                           'cell_id',
                           fnames=gc_metrics_percell),
            mgd.OutputFile('gc_metrics_summary_percell',
                           'cell_id',
                           fnames=gc_metrics_summary_percell),
            mgd.OutputFile('gc_metrics_pdf_percell',
                           'cell_id',
                           fnames=gc_metrics_pdf_percell),
            mgd.TempSpace('gc_tempdir', 'cell_id'),
            mgd.OutputFile('flagstat_metrics_percell',
                           'cell_id',
                           fnames=flagstat_metrics_percell),
            mgd.OutputFile('insert_metrics_percell',
                           'cell_id',
                           fnames=insert_metrics_percell),
            mgd.OutputFile('insert_metrics_pdf_percell',
                           'cell_id',
                           fnames=insert_metrics_pdf_percell),
        ),
    )

    # per cell: coverage summary yaml (bam index required -> '.bai')
    workflow.transform(
        name='bam_coverage_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.coverage_metrics.get_coverage_data",
        axes=('cell_id', ),
        args=(mgd.InputFile('sorted_markdups',
                            'cell_id',
                            fnames=bam_filename,
                            extensions=['.bai']),
              mgd.TempOutputFile('coverage_metrics.yaml', 'cell_id'),
              mgd.InputInstance('cell_id')),
    )

    # merge the per-cell gc metrics into a single table
    workflow.transform(
        name="collect_gc_metrics",
        func="single_cell.workflows.align.tasks.collect_gc",
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        args=(mgd.InputFile('gc_metrics_percell',
                            'cell_id',
                            fnames=gc_metrics_percell),
              mgd.OutputFile(gc_metrics, extensions=['.yaml']),
              mgd.TempSpace("temp_gc")),
    )

    # merge flagstat/markdups/insert/wgs metrics into one per-cell table
    workflow.transform(
        name='collect_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.workflows.align.tasks.collect_metrics",
        args=(
            mgd.InputFile('flagstat_metrics',
                          'cell_id',
                          axes_origin=[],
                          fnames=flagstat_metrics_percell),
            mgd.InputFile('markdups_metrics',
                          'cell_id',
                          axes_origin=[],
                          fnames=markdups_metrics_percell),
            mgd.InputFile('insert_metrics_percell',
                          'cell_id',
                          axes_origin=[],
                          fnames=insert_metrics_percell),
            mgd.InputFile('wgs_metrics_percell',
                          'cell_id',
                          axes_origin=[],
                          fnames=wgs_metrics_percell),
            mgd.TempSpace("tempdir_collect_metrics"),
            mgd.TempOutputFile("alignment_metrics.csv.gz",
                               extensions=['.yaml']),
        ),
    )

    # annotate the merged metrics with the per-cell sample_info columns
    workflow.transform(name='annotate_metrics',
                       ctx={
                           'mem': config['memory']['med'],
                           'ncpus': 1
                       },
                       func="single_cell.utils.csvutils.annotate_csv",
                       args=(
                           mgd.TempInputFile("alignment_metrics.csv.gz",
                                             extensions=['.yaml']),
                           sample_info,
                           mgd.TempOutputFile(
                               'alignment_metrics_annotated.csv.gz',
                               extensions=['.yaml']),
                       ),
                       kwargs={'annotation_dtypes': dtypes()['metrics']})

    # add the per-cell coverage columns computed by bam_coverage_metrics
    workflow.transform(
        name='annotate_coverage_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func=
        "single_cell.workflows.align.coverage_metrics.annotate_coverage_metrics",
        args=(
            mgd.TempInputFile("alignment_metrics_annotated.csv.gz",
                              extensions=['.yaml']),
            mgd.TempInputFile('coverage_metrics.yaml', 'cell_id'),
            mgd.TempOutputFile('alignment_metrics_annotated_coverage.csv.gz',
                               extensions=['.yaml']),
        ))

    # outer-join the fastq screen summary counts on cell_id -> final output
    workflow.transform(
        name='add_fastqscreen_metrics',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.merge_csv",
        args=(
            [
                mgd.TempInputFile(
                    "alignment_metrics_annotated_coverage.csv.gz",
                    extensions=['.yaml']),
                mgd.InputFile(summary_fastq_screen_count_per_cell,
                              extensions=['.yaml']),
            ],
            mgd.OutputFile(alignment_metrics, extensions=['.yaml']),
            'outer',
            ['cell_id'],
        ),
    )

    return workflow