def _from_intervals(coords):
    """Wrap (start, end, gene) records in a GA on the placeholder 'chr0'.

    `coords` is any iterable of 3-item records; it is materialized into a
    DataFrame with columns 'start', 'end', 'gene', a constant 'chromosome'
    column is added, and the resulting GA has `sort_columns()` applied
    before being returned.
    """
    table = pd.DataFrame(list(coords), columns=['start', 'end', 'gene'])
    table = table.assign(chromosome='chr0')
    result = GA(table)
    result.sort_columns()
    return result
def reference2regions(refarr):
    """Partition a reference into on-target and antitarget regions.

    Rows whose 'gene' value is listed in `params.ANTITARGET_ALIASES` are
    treated as antitargets; every other row is a target. Only the
    coordinate and gene columns are carried over.

    Returns a `(targets, antitargets)` pair.
    """
    region_cols = ('chromosome', 'start', 'end', 'gene')
    antitarget_mask = refarr['gene'].isin(params.ANTITARGET_ALIASES)
    all_regions = GA(refarr.data.loc[:, region_cols],
                     {'sample_id': 'reference'})
    return all_regions[~antitarget_mask], all_regions[antitarget_mask]
def reference2regions(refarr):
    """Partition a reference into on-target and antitarget regions.

    Rows whose 'gene' value equals 'Background' are treated as
    antitargets; every other row is a target. Only the coordinate and
    gene columns are carried over.

    Returns a `(targets, antitargets)` pair.
    """
    region_cols = ('chromosome', 'start', 'end', 'gene')
    background_mask = (refarr['gene'] == 'Background')
    all_regions = GA(refarr.data.loc[:, region_cols],
                     {'sample_id': 'reference'})
    return all_regions[~background_mask], all_regions[background_mask]
def scan_targets(access_bed, sample_bams, min_depth, min_gap, min_length,
                 procs):
    """Locate enriched (baited) regions by scanning depth across BAM files.

    `access_bed` is split into chunks by `parallel.to_chunks`; each chunk is
    handed to `_scan_depth` together with the BAM list and the thresholds,
    using a pool chosen by `parallel.pick_pool(procs)`. Every chunk file is
    removed as soon as its result has been collected. The per-chunk tables
    are concatenated into one GA whose 'depth' column is divided by the
    number of BAM files.
    """
    logging.info("Scanning for enriched regions in:\n %s",
                 '\n '.join(sample_bams))
    collected = []
    # ENH: a context manager could take over removal of the chunk files.
    with parallel.pick_pool(procs) as pool:
        jobs = ((chunk, sample_bams, min_depth, min_gap, min_length)
                for chunk in parallel.to_chunks(access_bed))
        for chunk_fname, chunk_baits in pool.map(_scan_depth, jobs):
            collected.append(chunk_baits)
            parallel.rm(chunk_fname)
    merged = GA(pd.concat(collected))
    merged['depth'] /= len(sample_bams)
    return merged
def batch_make_reference(normal_bams, target_bed, antitarget_bed, male_reference, fasta, annotate, short_names, target_avg_size, access_bed, antitarget_avg_size, antitarget_min_size, output_reference, output_dir, processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets.

    Steps, in order:

    1. Validate inputs for the "wgs" and "amplicon" methods.
    2. For "wgs", fill in a missing `target_bed` (from `access_bed`, or by
       running `access.do_access` on `fasta` and writing the result as a
       BED3 file in the current directory) and, if `target_avg_size` is
       falsy, pick a bin size (via `autobin.do_autobin` when normals are
       given, otherwise 5000).
    3. Run `target.do_target` and write the result to
       "<base>.target.bed"; `target_bed` is then rebound to that path.
    4. If `antitarget_bed` is falsy, write "<base>.antitarget.bed":
       computed by `antitarget.do_antitarget` for method "hybrid", or an
       empty `GA([])` otherwise.
    5. With no normal BAMs, build a flat reference
       (`reference.do_reference_flat`); otherwise submit
       `batch_write_coverage` for each normal BAM on both BED files and
       combine the results with `reference.do_reference`.
    6. Write the reference to `output_reference` (defaulting to
       "reference.cnn" inside `output_dir`).

    "<base>" is the basename of `target_bed` without its extension,
    prefixed with `output_dir` when that is truthy.

    Returns
    -------
    tuple
        `(output_reference, target_bed, antitarget_bed)` -- the paths of
        the reference file and of the (possibly newly written) target and
        antitarget BED files.

    Raises
    ------
    ValueError
        For "wgs"/"amplicon" if `antitarget_bed` is given or if
        `access_bed` and `target_bed` are both given but differ; for
        "wgs" if none of `target_bed`, `access_bed`, `fasta` is given.
    """
    # Protocols without antitargets: reject contradictory inputs up front.
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be " "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be " "different." % method)
    # Stays None unless the WGS autobin fallback below reads target_bed early.
    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option " "(e.g. refFlat.txt) to help locate genes " "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext( os.path.basename(fasta))[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, " "--access, or --fasta options.")
        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth, target_avg_size), _ = autobin.do_autobin(bam_fname, *autobin_args, bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d", wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000
    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)
    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    # Only pass avg_size when one was given or computed above; otherwise
    # do_target is called without that keyword.
    target_arr = target.do_target( bait_arr, annotate, short_names, True, **({ 'avg_size': target_avg_size } if target_avg_size else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    # From here on, target_bed refers to the processed file just written.
    target_bed = new_target_fname
    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")
    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed, fasta, male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Two coverage jobs (target + antitarget) are submitted per
            # normal BAM; give each job an equal share, at least 1.
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append( pool.submit(batch_write_coverage, target_bed, nbam, sample_pfx + '.targetcoverage.cnn', by_count, procs_per_cnn))
                anti_futures.append( pool.submit(batch_write_coverage, antitarget_bed, nbam, sample_pfx + '.antitargetcoverage.cnn', by_count, procs_per_cnn))
        # Collect the value returned by each submitted coverage job.
        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames, antitarget_fnames, fasta, male_reference, None, do_gc=True, do_edge=(method == "hybrid"), do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
def idxstats2ga(table, bam_fname):
    """Turn a per-chromosome length table into whole-chromosome regions.

    Each row becomes a region starting at 0 and ending at that row's
    `length`; only 'chromosome', 'start' and 'end' are kept. The BAM
    filename is recorded in the GA's metadata under 'filename'.
    """
    with_coords = table.assign(start=0, end=table.length)
    regions = with_coords.loc[:, ('chromosome', 'start', 'end')]
    return GA(regions, meta_dict={'filename': bam_fname})
def idxstats2ga(table):
    """Turn a per-chromosome length table into whole-chromosome regions.

    Each row becomes a region starting at 0 and ending at that row's
    `length`; only 'chromosome', 'start' and 'end' are kept.
    """
    with_coords = table.assign(start=0, end=table.length)
    regions = with_coords.loc[:, ('chromosome', 'start', 'end')]
    return GA(regions)
'-g', '--gene-resource', metavar="FILE", required=True, # default="data/ensembl-gene-info.hg38.tsv", help="Ensembl BioMart-derived gene info table.") AP.add_argument('-d', '--output-dir', metavar='PATH', default='.', help="Output directory.") args = AP.parse_args() gene_info = load_gene_info(args.gene_resource, None, None) bad_genes = [ 'Metazoa_SRP', '5S_rRNA', 'Y_RNA', 'U1', 'U2', 'U3', 'U4', 'U5', 'U6', 'U7', 'U8', 'uc_338', 'Clostridiales-1' ] gene_info = gene_info[~gene_info['gene'].isin(bad_genes)] gene_info = GA(gene_info.loc[:, ('chromosome', 'start', 'end', 'gene')]) for seg_fname in args.seg_files: seg = tabio.read(seg_fname, 'seg') # Assign gene names to segments using genomic coordinates from gene_info seg['gene'] = genes_in_segments(seg, gene_info) outfname = os.path.join(args.output_dir, basename(seg_fname) + ".acgh.cns") tabio.write(seg, outfname, 'tab') print("Wrote", outfname)