def main(args): logging.info("Converting %s%s to %s", "input" if args.infile is sys.stdin else args.infile, " from "+ args.in_fmt if args.in_fmt != 'auto' else '', args.out_fmt) if args.in_fmt == 'auto': args.in_fmt = tabio.sniff_region_format(args.infile) # Format-specific input options kwargs = {} if args.in_fmt == 'gff': if args.gff_tag: kwargs['tag'] = args.gff_tag if args.gff_type: kwargs['keep_type'] = args.gff_type elif args.in_fmt == 'refflat': if args.refflat_type == 'exon': kwargs['exons'] = True elif args.refflat_type == 'cds': kwargs['cds'] = True regions = tabio.read(args.infile, args.in_fmt, **kwargs) # Post-processing if args.flatten: regions = regions.flatten() elif args.merge: regions = regions.merge(bp=args.merge) tabio.write(regions, args.output, args.out_fmt)
def main(args): logging.info("Converting %s%s to %s", "input" if args.infile is sys.stdin else args.infile, "from "+ args.in_fmt if args.in_fmt != 'auto' else '', args.out_fmt) table = tabio.read(args.infile, args.in_fmt) tabio.write(table, args.output, args.out_fmt)
def sample_region_cov(bam_fname, regions, max_num=100):
    """Calculate read depth in a randomly sampled subset of regions."""
    midsize_regions = sample_midsize_regions(regions, max_num)
    with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
        tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
        f.flush()
        table = coverage.bedcov(f.name, bam_fname, 0)
    # Mean read depth across all sampled regions
    return table.basecount.sum() / (table.end - table.start).sum()
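A hedged usage sketch (the function also relies on a sample_midsize_regions helper defined elsewhere in its module); "sample.bam" and "baits.bed" are placeholder filenames, not from the original.

from skgenome import tabio

# Load candidate regions and estimate mean per-base depth from a sample of them.
regions = tabio.read_auto("baits.bed")
mean_depth = sample_region_cov("sample.bam", regions, max_num=100)
print("Estimated mean depth: %.1f" % mean_depth)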
def main(args): logging.info("Converting %s%s to %s", "input" if args.infile is sys.stdin else args.infile, "from " + args.in_fmt if args.in_fmt != 'auto' else '', args.out_fmt) # TODO - add back merge/flatten/exon options from refFlat2bed table = tabio.read(args.infile, args.in_fmt) tabio.write(table, args.output, args.out_fmt)
def _cmd_ztest(args):
    cnarr = cnvlib.read(args.cnarr)
    if args.segment:
        segments = cnvlib.read(args.segment)
        is_sample_female = None
    else:
        segments = None
        is_sample_female = verify_sample_sex(cnarr, args.sample_sex,
                                             args.male_reference)
    sig = do_ztest(cnarr, segments, args.male_reference, is_sample_female,
                   args.alpha, args.target)
    if len(sig):
        tabio.write(sig, args.output or sys.stdout)
def main(args):
    sample_counts = aggregate_gene_counts(args.gene_counts)
    sample_counts = rna.filter_probes(sample_counts)
    # DBG
    if args.output:
        sample_counts.to_csv(args.output + ".sample_counts.tsv",
                             sep='\t', index=True)
        print("Wrote", args.output + ".sample_counts.tsv",
              "with", len(sample_counts), "rows")

    if args.correlations:
        logging.info("Loading gene metadata "
                     "and TCGA gene expression/CNV profiles")
    else:
        logging.info("Loading gene metadata")
    gene_info = rna.load_gene_info(args.gene_resource, args.correlations)

    print("Aligning gene info to sample gene counts")
    gene_info, sample_counts, sample_data_log2 = rna.align_gene_info_to_samples(
        gene_info, sample_counts, None)

    print("Writing output files")
    # Summary table has log2-normalized values, not raw counts
    # ENH show both, with column header suffixes to distinguish?
    all_data = pd.concat([gene_info, sample_data_log2], axis=1)
    if args.output:
        all_data.to_csv(args.output, sep='\t', index=True)
        print("Wrote", args.output, "with", len(all_data), "rows")
    else:
        print(all_data.describe(), file=sys.stderr)

    if args.cnr_dir:
        # CNVkit files have both absolute and log2-normalized read counts
        cnrs = rna.attach_gene_info_to_cnr(sample_counts, sample_data_log2,
                                           gene_info)
        for cnr in cnrs:
            outfname = os.path.join(args.cnr_dir, cnr.sample_id + ".cnr")
            cnr = rna.correct_cnr(cnr)
            tabio.write(cnr, outfname, 'tab')
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rlibpath, by_count, skip_low, method, processes):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    sample_id = core.fbase(bam_fname)
    sample_pfx = os.path.join(output_dir, sample_id)

    raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                   processes)
    tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')
    raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
                                    processes)
    tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')

    cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
                       do_gc=True, do_edge=(method == "hybrid"),
                       do_rmask=True)
    tabio.write(cnarr, sample_pfx + '.cnr')

    logging.info("Segmenting %s.cnr ...", sample_pfx)
    segments = segmentation.do_segmentation(cnarr, 'cbs',
                                            rlibpath=rlibpath,
                                            skip_low=skip_low,
                                            processes=processes,
                                            **({'threshold': 1e-6}
                                               if method == 'wgs' else {}))
    tabio.write(segments, sample_pfx + '.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, segments)
        pyplot.savefig(sample_pfx + '-scatter.pdf', format='pdf',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.pdf", sample_pfx)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        outfname = sample_pfx + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               segments.shift_xx(male_reference, is_xx),
                               0.5, 3, outfname)
        logging.info("Wrote %s", outfname)
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rscript_path, by_count, skip_low, seq_method,
                     segment_method, processes, do_cluster, fasta=None):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    sample_id = core.fbase(bam_fname)
    sample_pfx = os.path.join(output_dir, sample_id)

    raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                   processes, fasta)
    tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')
    raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
                                    processes, fasta)
    tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')

    cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
                       do_gc=True, do_edge=(seq_method == "hybrid"),
                       do_rmask=True, do_cluster=do_cluster)
    tabio.write(cnarr, sample_pfx + '.cnr')

    logging.info("Segmenting %s.cnr ...", sample_pfx)
    segments = segmentation.do_segmentation(cnarr, segment_method,
                                            rscript_path=rscript_path,
                                            skip_low=skip_low,
                                            processes=processes,
                                            **({'threshold': 1e-6}
                                               if seq_method == 'wgs' else {}))

    logging.info("Post-processing %s.cns ...", sample_pfx)
    # TODO/ENH take centering shift & apply to .cnr for use in segmetrics
    seg_metrics = segmetrics.do_segmetrics(cnarr, segments,
                                           interval_stats=['ci'], alpha=0.5,
                                           smoothed=True)
    tabio.write(seg_metrics, sample_pfx + '.cns')

    # Remove likely false-positive breakpoints
    seg_call = call.do_call(seg_metrics, method="none", filters=['ci'])
    # Calculate another segment-level test p-value
    seg_alltest = segmetrics.do_segmetrics(cnarr, seg_call,
                                           location_stats=['p_ttest'])
    # Finally, assign absolute copy number values to each segment
    seg_alltest.center_all("median")
    seg_final = call.do_call(seg_alltest, method="threshold")
    tabio.write(seg_final, sample_pfx + '.call.cns')

    # Test for single-bin CNVs separately
    seg_bintest = bintest.do_bintest(cnarr, seg_call, target_only=True)
    tabio.write(seg_bintest, sample_pfx + '.bintest.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, seg_final)
        pyplot.savefig(sample_pfx + '-scatter.png', format='png',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.png", sample_pfx)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        outfname = sample_pfx + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               seg_final.shift_xx(male_reference, is_xx),
                               0.5, 3, outfname)
        logging.info("Wrote %s", outfname)
def main(args):
    annot = tabio.read_auto(args.annotate)
    cnarr = read_cna(args.cnv_file)
    # Label each bin with the gene name(s) it overlaps in the annotation;
    # bins covered by no annotated feature get '-'.
    cnarr['gene'] = annot.into_ranges(cnarr, 'gene', '-')
    tabio.write(cnarr, args.output or sys.stdout)
AP.add_argument("cnn_files", nargs='+', help="""CNVkit coverage files to update (*.targetcoverage.cnn, *.antitargetcoverage.cnn).""") AP.add_argument("-d", "--output-dir", default=".", help="""Directory to write output .cnn files.""") AP.add_argument("-s", "--suffix", default=".updated", help="""Filename suffix to add before the '.cnn' extension in output files. [Default: %(default)s]""") args = AP.parse_args() for fname in args.cnn_files: cnarr = cnvlib.read(fname) # Convert coverage depths from log2 scale to absolute scale. # NB: The log2 values are un-centered in CNVkit v0.7.0(?) through v0.7.11; # earlier than that, the average 'depth' will be about 1.0. cnarr['depth'] = np.exp2(cnarr['log2']) # Rename "Background" bins to "Antitarget" # NB: The default off-target bin name was changed in CNVkit v0.9.0 cnarr['gene'] = cnarr['gene'].replace("Background", cnvlib.params.ANTITARGET_NAME) cnarr.sort_columns() # Construct the output filename base, ext = os.path.basename(fname).rsplit('.', 1) if '.' in base: base, zone = base.rsplit('.', 1) out_fname = '.'.join((base + args.suffix, zone, ext)) else: # e.g. reference.cnn or .cnr file, no "*.targetcoverage.*" in name out_fname = '.'.join((base + args.suffix, ext)) tabio.write(cnarr, os.path.join(args.output_dir, out_fname))
default=".", help="""Directory to write output .cnn files.""") AP.add_argument( "-s", "--suffix", default=".updated", help="""Filename suffix to add before the '.cnn' extension in output files. [Default: %(default)s]""") args = AP.parse_args() for fname in args.cnn_files: cnarr = cnvlib.read(fname) # Convert coverage depths from log2 scale to absolute scale. # NB: The log2 values are un-centered in CNVkit v0.7.0(?) through v0.7.11; # earlier than that, the average 'depth' will be about 1.0. cnarr['depth'] = np.exp2(cnarr['log2']) # Rename "Background" bins to "Antitarget" # NB: The default off-target bin name was changed in CNVkit v0.9.0 cnarr['gene'] = cnarr['gene'].replace("Background", cnvlib.params.ANTITARGET_NAME) cnarr.sort_columns() # Construct the output filename base, ext = os.path.basename(fname).rsplit('.', 1) if '.' in base: base, zone = base.rsplit('.', 1) out_fname = '.'.join((base + args.suffix, zone, ext)) else: # e.g. reference.cnn or .cnr file, no "*.targetcoverage.*" in name out_fname = '.'.join((base + args.suffix, ext)) tabio.write(cnarr, os.path.join(args.output_dir, out_fname))
def clipped_rolling_mean(values, window):
    # Clip extreme log2 values so outlier bins don't dominate the window mean
    clipped = values.clip(-3, 3)
    smoothed = clipped.rolling(window, min_periods=1, center=True).mean()
    return smoothed.values


def smooth_by_arm(cnarr, window):
    logr_chunks = [clipped_rolling_mean(cnarm['log2'], window)
                   for _chrom, cnarm in cnarr.by_arm()]
    d = cnarr.data.assign(log2=np.concatenate(logr_chunks))
    return cnarr.as_dataframe(d)


AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument('cnr_fnames', nargs='+')
AP.add_argument('-w', '--window', type=int, default=100,
                help="Window size for smoothing.")
AP.add_argument('-d', '--output-dir', default='.')
args = AP.parse_args()

for fname in args.cnr_fnames:
    cnr = cnvlib.read(fname)
    cnr = smooth_by_arm(cnr, args.window)
    base, ext = os.path.basename(fname).rsplit(".", 1)
    outfname = "{}/{}.tsmooth{}.{}".format(args.output_dir, base,
                                           args.window, ext)
    tabio.write(cnr, outfname)
    print("Wrote", outfname, file=sys.stderr)
def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
    """Run coverage on one sample, write to file."""
    cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes)
    tabio.write(cnarr, out_fname)
    return out_fname
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets."""
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(os.path.basename(fasta)
                                              )[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")
        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all sequencing-accessible area
                    # (it doesn't take that long compared to WGS coverage);
                    # user-provided access might be something else that
                    # excludes a significant number of mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', None, access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as
                    # we do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth, target_avg_size), _ = autobin.do_autobin(
                    bam_fname, *autobin_args, bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    target_arr = target.do_target(bait_arr, annotate, short_names, True,
                                  **({'avg_size': target_avg_size}
                                     if target_avg_size else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
    if method == "hybrid":
        # Build antitarget BED from the given targets
        anti_kwargs = {}
        if access_bed:
            anti_kwargs['access'] = tabio.read_auto(access_bed)
        if antitarget_avg_size:
            anti_kwargs['avg_bin_size'] = antitarget_avg_size
        if antitarget_min_size:
            anti_kwargs['min_bin_size'] = antitarget_min_size
        anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
    else:
        # No antitargets for wgs, amplicon
        anti_arr = GA([])
    tabio.write(anti_arr, antitarget_bed, "bed4")

    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed,
                                              fasta, male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage, target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn',
                                by_count, procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage, antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))
        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames, antitarget_fnames,
                                         fasta, male_reference, None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
def genes_in_segments(segarr, gene_info):
    return gene_info.into_ranges(segarr, 'gene', '-', join_unique)


if __name__ == '__main__':
    AP = argparse.ArgumentParser(description=__doc__)
    AP.add_argument('seg_files', nargs='+',
                    help="Segmented aCGH data in SEG format.")
    AP.add_argument('-g', '--gene-resource', metavar="FILE", required=True,
                    # default="data/ensembl-gene-info.hg38.tsv",
                    help="Ensembl BioMart-derived gene info table.")
    AP.add_argument('-d', '--output-dir', metavar='PATH', default='.',
                    help="Output directory.")
    args = AP.parse_args()

    gene_info = load_gene_info(args.gene_resource, None, None)
    bad_genes = ['Metazoa_SRP', '5S_rRNA', 'Y_RNA',
                 'U1', 'U2', 'U3', 'U4', 'U5', 'U6', 'U7', 'U8',
                 'uc_338', 'Clostridiales-1']
    gene_info = gene_info[~gene_info['gene'].isin(bad_genes)]
    gene_info = GA(gene_info.loc[:, ('chromosome', 'start', 'end', 'gene')])
    for seg_fname in args.seg_files:
        seg = tabio.read(seg_fname, 'seg')
        # Assign gene names to segments using genomic coordinates from gene_info
        seg['gene'] = genes_in_segments(seg, gene_info)
        outfname = os.path.join(args.output_dir,
                                basename(seg_fname) + ".acgh.cns")
        tabio.write(seg, outfname, 'tab')
        print("Wrote", outfname)
import sys

import pandas as pd

from cnvlib.cnary import CopyNumArray as CNA
from cnvlib.params import NULL_LOG2_COVERAGE
from skgenome import tabio

READ_LENGTH = 150  # Not super important


def parse_coords(coords):
    """Parse a 'chrom:start-end' string into 0-based coordinates."""
    chrom, rest = coords.split(':', 1)
    start, end = rest.split('-')
    return chrom, int(start) - 1, int(end)


table = pd.read_table(sys.argv[1])
chroms, starts, ends = zip(*table['Name'].apply(parse_coords))
depths = READ_LENGTH * table['NumReads'] / table['Length']
norm_depth = table['TPM'] / table['TPM'][depths > 0].median()
log2_ratios = safe_log2(norm_depth, NULL_LOG2_COVERAGE)
weights = table['EffectiveLength'] / table['EffectiveLength'].max()
cnarr = CNA.from_columns({
    'chromosome': chroms,
    'start': starts,  # np.array(starts) - 1,
    'end': ends,
    'gene': '-',
    'log2': log2_ratios,
    'depth': depths,
    'weight': weights,
})
cnarr.sort()
tabio.write(cnarr, sys.stdout)
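The snippet calls a safe_log2 helper that is defined elsewhere in the original script (CNVkit keeps a function of this name in cnvlib.rna). A minimal stand-in, assuming the intent is to log2-transform the normalized depths while flooring zero or negative values at min_log2; this sketch is not necessarily the original implementation.

import numpy as np

def safe_log2(values, min_log2):
    # Plausible stand-in: log2 of positive values, with zeros/negatives
    # (and values below the floor) clamped to min_log2.
    values = np.asarray(values, dtype=float)
    out = np.full(values.shape, float(min_log2))
    ok = values > 0
    out[ok] = np.maximum(np.log2(values[ok]), min_log2)
    return out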
import argparse
import logging

from skgenome import tabio

logging.basicConfig(level=logging.INFO, format="%(message)s")

AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument('refflat',
                help="UCSC refFlat.txt for the reference genome.")
AP.add_argument('-e', '--exons', action='store_true',
                help="""Emit each exon instead of the whole gene regions.""")
AP.add_argument('-f', '--flatten', action='store_true',
                help="""Flatten overlapping regions, keeping original
                boundaries. Not recommended with --exons.""")
AP.add_argument('-m', '--merge', metavar='BASEPAIRS', nargs='?',
                type=int, const=1,
                help="""Merge overlapping regions with different names.
                Recommended with --exons. Optional argument value is the
                number of overlapping bases between two regions to trigger
                a merge. [Default: %(const)s]""")
AP.add_argument('-o', '--output', help="Output filename.")
args = AP.parse_args()

regions = tabio.read(args.refflat, 'refflat', exons=args.exons)
if args.flatten:
    regions = regions.flatten()
elif args.merge:
    regions = regions.merge(bp=args.merge)
tabio.write(regions, args.output, 'bed4')
def create_master_report(self, time_point, normal_time_point, report_file):
    master_metadata = []
    sample = self.patientid + "_" + time_point
    normal_sample = self.patientid + "_" + normal_time_point
    self.annodf = get_sample_info(self.patient_dir)
    self.annodf['gender'] = (self.annodf.gender
                             .replace('W', 'female')
                             .replace('M', 'male'))
    if sample in corr_purity:
        logger.info("correcting purity from {} to {}".format(
            self.annodf.loc[sample].purity, corr_purity[sample]))
        self.annodf.set_value(sample, 'purity', corr_purity[sample])
    cnr_filename = join(self.out_folder, "{}.cnr".format(sample))
    logger.debug("Getting log2 ratio df for sample {}".format(sample))
    cnvkit_cnr = self.get_log2_ratio_df(sample)
    cnvkit_cns = self.get_segments_df(sample, cnvkit_cnr)
    cnvkit_vaf = self.get_cnvkit_vaf(sample, normal_sample)
    logger.debug('pipeline provenance set to (not_bcbio): {}'.format(
        self.not_bcbio))
    if self.not_bcbio:
        try:
            gender = self.annodf.loc[sample.replace('CR', 'REL')
                                     .replace('REL2', 'REL')].gender
        except KeyError:
            gender = None
        try:
            purity = self.annodf.loc[sample].purity
            logger.info('purity: {}'.format(purity))
        except KeyError:
            purity = 100
        logging.info({"gender": gender, "purity": purity})
        calling_method = 'clonal' if purity > 90 else 'threshold'
        cnvkit_called = cnvkit_call(cnvkit_cns, variants=cnvkit_vaf,
                                    is_sample_female=(gender == 'female'),
                                    is_reference_male=(gender == 'male'),
                                    purity=purity / 100,
                                    method=calling_method)
        if time_point != normal_time_point:
            breaks = pd.DataFrame(cnvkit_breaks(cnvkit_cnr, cnvkit_called)
                                  ).fillna("").replace('nan', '')
            breaks.columns = ['Gene', 'Chrom.', 'Location', 'Change',
                              'ProbesLeft', 'ProbesRight']
            #print(breaks.query("Gene != ''"))
            gainloss = pd.DataFrame(
                cnvkit_gainloss(cnvkit_cnr, segments=cnvkit_called,
                                male_reference=(gender == 'male')))
            #gainloss = pd.DataFrame(cnvkit_gainloss(cnvkit_cnr, min_probes=1,
            #                                        male_reference=gender == 'male'))
            print(cnr_filename.replace('.cnr', '.gainloss'))
            gainloss.to_csv(cnr_filename.replace('.cnr', '.gainloss'),
                            sep="\t", index=None)
    else:
        cnvkit_called = pd.read_table(
            get_log2_ratio_file(self.patient_dir, self.not_bcbio,
                                sample).replace('.cnr', '-call.cns'),
            dtype={'chromosome': 'str'}
        ).loc[lambda df: df.chromosome.str.startswith('GL') == False]
        cnvkit_called = CopyNumArray(cnvkit_called)
    #cnvkit_cnr.write(cnr_filename)
    tabio.write(cnvkit_cnr, cnr_filename)
    #cnvkit_called.write(cnr_filename.replace(".cnr", ".called.cns"))
    tabio.write(cnvkit_called, cnr_filename.replace(".cnr", ".called.cns"))
    metadata_instance = "called"
    logger.info(metadata_instance)
    master_metadata.append(metadata_instance)
    do_plots = True
    if do_plots:
        pylab.rcParams['figure.figsize'] = (25, 8)
        cnvkit_scatterplot(cnarr=cnvkit_cnr, segments=cnvkit_called,
                           variants=cnvkit_vaf, do_trend=True, title=sample)
        savefig(join(self.out_folder, '{}.karyotype.png'.format(sample)))
        pylab.clf()
    #write_df(cnvkit_called.reset_index(), report_file,
    #         metadata=master_metadata, index=False)
    return cnvkit_called.data
AP_access.add_argument('-l', '--min-length', metavar='TARGET_SIZE',
                       type=int, default=50,
                       help="""Minimum region length to accept as captured.
                       [Default: %(default)s]""")
args = AP.parse_args()

# ENH: can we reserve multiple cores for htslib?
if args.processes < 1:
    args.processes = None

if args.targets:
    baits = filter_targets(args.targets, args.sample_bams, args.processes,
                           args.fasta)
else:
    baits = scan_targets(args.access, args.sample_bams,
                         0.5 * args.min_depth,  # More sensitive 1st pass
                         args.min_gap, args.min_length, args.processes)
baits = normalize_depth_log2_filter(baits, args.min_depth)
tabio.write(baits, args.output or sys.stdout, 'bed')
if args.coverage:
    baits['log2'] = np.log2(baits['depth'] / baits['depth'].median())
    tabio.write(baits, args.coverage, 'tab')
"""Extract target and antitarget BED files from a CNVkit reference file. Once you have a stable CNVkit reference for your platform, you can use this script to drop the "bad" bins from your target and antitarget BED files and avoid unnecessarily calculating coverage in those bins during future runs. This script is also useful to recover the target and antitarget BED files that match the reference if those BED files are missing or you're not sure which ones are correct. """ import argparse import logging import cnvlib from cnvlib import reference from skgenome import tabio logging.basicConfig(level=logging.INFO, format="%(message)s") AP = argparse.ArgumentParser(description=__doc__) AP.add_argument("reference", help="Reference file.") AP.add_argument("-o", "--output", help="Output base name (extensions added automatically).") args = AP.parse_args() ref = cnvlib.read(args.reference) targets, antitargets = reference.reference2regions(ref) name = args.output or ref.sample_id tabio.write(targets, name + '.target.bed', 'bed4') tabio.write(antitargets, name + '.antitarget.bed', 'bed4')