def __init__(self, in_bam_filename, bc_freq, bc_map, read_freq=None, regions_file=None, extend=0):
    """
    in_bam_filename: BAM file
    bc_freq: numpy array with barcode frequencies
    bc_map: dict from barcodes to indices (into bc_freq and read_freq)
    read_freq: numpy array with read frequencies
    regions_file: BED file with target regions (or None). If provided, only
        reads from these regions are counted, after extending each region by
        "extend".
    extend: how far to extend target regions; must be a non-negative integer.
    """
    self.bam = in_bam_filename
    self.bc_freq = bc_freq
    self.bc_map = bc_map
    self.read_freq = read_freq
    if regions_file is None:
        self.regions = None
    else:
        self.regions = {}
        extend = max(extend, 0)
        bed_iterator = tk_io.get_bed_iterator(regions_file)
        for chrom, start, stop in bed_iterator:
            # Pad each target region by "extend", clipping at the chromosome start.
            start = max(0, start - extend)
            stop = stop + extend
            if chrom not in self.regions:
                self.regions[chrom] = []
            self.regions[chrom].append((start, stop))
        for chrom, region_list in self.regions.iteritems():
            self.regions[chrom] = tk_regions.Regions(region_list)
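
# A minimal usage sketch for the constructor above. "ReadCounter" is a
# hypothetical stand-in for the enclosing class (its real name is not shown
# here); the BAM path, barcode whitelist, and frequencies are illustrative.
def _example_build_counter():
    import numpy as np
    barcodes = ['AAACGGGT-1', 'AAACTCGA-1', 'AAAGATGC-1']
    bc_map = {bc: i for i, bc in enumerate(barcodes)}  # barcode -> index
    bc_freq = np.array([0.5, 0.3, 0.2])                # frequency per index
    # Count only reads inside targets.bed, padded by 100 bp on each side.
    return ReadCounter('sample.bam', bc_freq, bc_map,
                       regions_file='targets.bed', extend=100)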
def main(args, outs):
    vc_mode, variant_caller, precalled_file, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    fasta_path = tk_reference.get_fasta(args.reference_path)
    bedfile = outs.default + ".bed"

    # Collect the regions to call over: the target regions restricted to this
    # locus, or the whole locus if there is no targets file.
    regions = Regions()
    if args.targets_file is not None:
        for (chrom, start, end) in tk_io.get_bed_iterator(args.targets_file, args.locus):
            regions.add_region((start, end))
    else:
        regions.add_region((start, stop))

    # Optionally exclude high-coverage regions before calling.
    if (vc_mode != "precalled") and args.high_coverage_excluded_bed is not None:
        coverage_regions = get_coverage_regions(args)
        regions = regions.intersect(coverage_regions)

    # Write the regions as a BED file for the variant caller.
    bed_length = 0
    with open(bedfile, 'w') as bed_writer:
        for (start, end) in regions.get_region_list():
            bed_writer.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
            bed_length += 1

    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        # Copy the pre-called variants that fall in this chunk's regions.
        outs.default = None
        precalled_vars_path = args.split_input
        vcf = tk_io.VariantFileReader(precalled_vars_path)
        with open(outs.precalled, "w") as file_write:
            output = tk_io.VariantFileWriter(
                file_write, template_file=open(precalled_vars_path))
            for record in tk_io.get_variant_iterator_pos(vcf, bedfile, args.locus):
                output.write_record(record)
    if vc_mode != "precalled":
        outs.precalled = None
        primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
        if bed_length > 0 and chrom in primary_contigs:
            vc.run_variant_caller(variant_caller, gatk_path, args.__mem_gb,
                                  fasta_path, args.input, outs.default, bedfile)
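
# A self-contained sketch of the BED-writing step in main() above: write a set
# of (start, end) regions on one chromosome in three-column BED form and count
# the lines, mirroring how bed_length is tallied. Paths and coordinates are
# illustrative.
def _example_write_locus_bed(bedfile='chunk.bed'):
    chrom = 'chr1'
    regions = [(10000, 20000), (35000, 42000)]  # 0-based, half-open intervals
    bed_length = 0
    with open(bedfile, 'w') as bed_writer:
        for (start, end) in regions:
            bed_writer.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
            bed_length += 1
    return bed_length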
def mask_profile(profile, bin_size, mask_bed_file, cutoff=0, keep_gt_cutoff=True):
    """Return a copy of profile with bins masked out (set to NaN) based on the
    per-region values in mask_bed_file. If keep_gt_cutoff is True, bins with
    value <= cutoff are masked; otherwise bins with value > cutoff are masked."""

    # Auxiliary function: NaN out the bin containing "start" on "chrom".
    def mask_bin(masked_profile, chrom, start, end, bin_size):
        bin_index = int(math.floor(start / float(bin_size)))
        try:
            masked_profile[chrom][bin_index] = float("NaN")
        except Exception as e:
            print(chrom + ":" + str(start) + "-" + str(end) + " not found")
            print(e)

    masked_profile = copy.deepcopy(profile)
    # TODO: Test - the input dictionaries profile and mask must have identical keys
    # TODO: Test - for each key (chrom), the arrays profile[chrom] and mask[chrom]
    #       must have identical lengths
    mask_iterator = get_bed_iterator(bed_file_name=mask_bed_file, locus=None)
    for (chrom, start, end, value) in mask_iterator:
        if value > cutoff:
            # Mask bins above the cutoff unless we are keeping them.
            if not keep_gt_cutoff:
                mask_bin(masked_profile, chrom, start, end, bin_size)
        elif keep_gt_cutoff:
            # Mask bins at or below the cutoff.
            mask_bin(masked_profile, chrom, start, end, bin_size)
    return masked_profile
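
# A worked example of the bin arithmetic in mask_bin above: with bin_size =
# 10000, a BED record starting at 25000 lands in bin floor(25000 / 10000) = 2.
# The profile below and the BED file name are illustrative; with a mask line
# "chr1  20000  30000  0.9", cutoff=0.5, and keep_gt_cutoff=False, bin 2 of
# chr1 would be set to NaN, i.e. masked['chr1'] -> [0., 1., nan, 3., 4.].
def _example_mask_one_bin():
    import numpy as np
    profile = {'chr1': np.arange(5, dtype=float)}  # five bins of size 10000
    return mask_profile(profile, bin_size=10000, mask_bed_file='mask.bed',
                        cutoff=0.5, keep_gt_cutoff=False)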
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.coverage is None or args.bait_file is None:
        outs.bait_csv = None
        return

    # Check which coverage tracks are present in the HDF5 file.
    f = h5py.File(args.coverage, 'r')
    has_basic = ('coverage_deduped' in f) and ('mapq30_coverage_deduped' in f)
    has_subsampling = ('coverage_subsampled' in f) and ('coverage_deduped_subsampled' in f)
    f.close()
    if not has_basic:
        return

    fasta = tenkit.reference.open_reference(args.reference_path)
    df = p.DataFrame()
    coverage_reader = tenkit.hdf5.DataFrameReader(args.coverage)

    # Query the subsampled tracks only if they exist.
    query_cols = ['coverage_deduped', 'mapq30_coverage_deduped']
    if has_subsampling:
        query_cols += ['coverage_deduped_subsampled', 'coverage_subsampled']

    # One row per bait: mean coverage over the bait interval plus its GC content.
    bed_iterator = tk_io.get_bed_iterator(args.bait_file)
    for chrom, start, end in bed_iterator:
        coverage = coverage_reader.query([chrom, start, end],
                                         query_cols=query_cols,
                                         coords=False)
        mean_cov = coverage.mean()
        gc = get_gc(chrom, (start, end), fasta)
        row = {'chrom': chrom, 'start': start, 'end': end,
               'tag': args.tag, 'gc': gc}
        for col in query_cols:
            row[col] = mean_cov[col]
        df = df.append(row, ignore_index=True)
    df.to_csv(outs.bait_csv)
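
# A minimal sketch of consuming the per-bait CSV written above; the column
# names match the row dict keys used in df.append(), and the path is
# illustrative. Groups baits into GC deciles and reports mean deduped coverage
# per decile, a typical downstream summary.
def _example_summarize_bait_csv(csv_path='bait.csv'):
    import pandas as pd
    baits = pd.read_csv(csv_path)
    gc_decile = (baits['gc'] * 10).astype(int)
    return baits.groupby(gc_decile)['coverage_deduped'].mean()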
def split(args):
    win = args.window_size
    input_bam = tk_bam.create_bam_infile(args.input)
    chrom_len_map = dict(zip(input_bam.references, input_bam.lengths))
    input_bam.close()

    max_mem_in_gb = 4  # Be a little conservative
    chunk_size = get_max_chunk(win, max_mem_in_gb)

    if args.restrict_locus is not None:
        locus_chrom, locus_start, locus_stop = tk_io.get_locus_info(args.restrict_locus)
        assert locus_chrom in chrom_len_map
        locus_start = max(0, locus_start)
        locus_stop = min(locus_stop, chrom_len_map[locus_chrom])

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    genome_size = np.sum([length for chrom, length in chrom_len_map.iteritems()
                          if chrom in primary_contigs])

    prev_chrom = ''
    tot_bp = 0
    starts = []
    stops = []
    chunks = []

    # Genome-wide windows
    if args.restrict_locus is not None:
        chunks.append({'chrom': locus_chrom, 'starts': [locus_start],
                       'stops': [locus_stop], '__mem_gb': 8})
    else:
        for chrom, length in chrom_len_map.iteritems():
            # Skip chrY for female samples and skip non-primary contigs.
            if args.sex is not None and args.sex.lower() in ['f', 'female'] and chrom in ['Y', 'chrY']:
                continue
            if chrom not in primary_contigs:
                continue
            nchunks = int(np.ceil(length / float(chunk_size)))
            # Divide the windows as evenly as possible across the chunks. This
            # also makes sure that all chunks except the last have sizes that
            # are multiples of the window size.
            win_per_chunk = int(np.ceil(length / float(nchunks * win)))
            new_chunk_size = win_per_chunk * win
            for c in range(nchunks):
                chunk_start = c * new_chunk_size
                chunk_stop = min((c + 1) * new_chunk_size, length)
                chunks.append({'chrom': chrom, 'starts': [chunk_start],
                               'stops': [chunk_stop], '__mem_gb': 8})

    # Target-centered windows. If the targets (plus the extension) cover too
    # much of the genome, skip these.
    if args.targets is not None and args.target_extend is not None:
        target_regions = []
        bed_iterator = tk_io.get_bed_iterator(args.targets)
        for chrom, start, stop in bed_iterator:
            if args.sex is not None and args.sex.lower() in ['f', 'female'] and chrom in ['Y', 'chrY']:
                continue
            if chrom not in primary_contigs:
                continue
            stop = min(chrom_len_map[chrom], stop)
            if args.restrict_locus is None or (chrom == locus_chrom and
                                               overlaps(start, stop, locus_start, locus_stop)):
                target_regions.append((chrom, start, stop))
        target_regions = sort_and_merge(target_regions, args.target_extend)
        target_size = np.sum([stop - start for _, start, stop in target_regions])
        if target_size / float(genome_size) < MIN_TARGET_FRAC:
            for (chrom, start, stop) in target_regions:
                # Flush the accumulated regions into a chunk at chromosome
                # boundaries or once a chunk holds more than ~10 Mb.
                if (prev_chrom != chrom and prev_chrom != '') or tot_bp > 1e7:
                    chunks.append({'chrom': str(prev_chrom), 'starts': starts,
                                   'stops': stops, '__mem_gb': 8})
                    starts = []
                    stops = []
                    tot_bp = 0
                tot_bp += (stop - start)
                prev_chrom = chrom
                starts.append(start)
                stops.append(stop)
            if prev_chrom != '':
                chunks.append({'chrom': str(prev_chrom), 'starts': starts,
                               'stops': stops, '__mem_gb': 8})
    return {'chunks': chunks}
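
# A worked example of the even-chunking arithmetic in split() above: a 250 Mb
# chromosome with chunk_size = 30 Mb and win = 10 kb yields nchunks =
# ceil(250/30) = 9 chunks of 27,780,000 bp each (2778 windows), with the last
# chunk clipped to the chromosome end. The numbers are illustrative, not the
# pipeline's defaults.
def _example_even_chunks(length=250000000, chunk_size=30000000, win=10000):
    import numpy as np
    nchunks = int(np.ceil(length / float(chunk_size)))
    win_per_chunk = int(np.ceil(length / float(nchunks * win)))
    new_chunk_size = win_per_chunk * win
    return [(c * new_chunk_size, min((c + 1) * new_chunk_size, length))
            for c in range(nchunks)]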