def main(args, outs):
    """Write the BED of regions to call in this chunk, then emit its variants.

    The regions come from ``args.targets_file`` restricted to ``args.locus``
    when a targets file is given, otherwise from the locus itself. Unless the
    mode is "precalled", and when ``args.high_coverage_excluded_bed`` is set,
    they are intersected with the regions from ``get_coverage_regions``.

    In "precalled" and "precalled_plus" modes the records of
    ``args.split_input`` that fall in the BED are copied to
    ``outs.precalled``. In every mode except "precalled" the variant caller is
    run on ``args.input`` when the BED is non-empty and the chromosome is a
    primary contig.
    """
    vc_mode, variant_caller, _, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    fasta_path = tk_reference.get_fasta(args.reference_path)
    # Derived before outs.default may be cleared further down.
    bedfile = outs.default + ".bed"

    regions = Regions()
    if args.targets_file is not None:
        # The iterator is restricted to args.locus; `chrom` is rebound to the
        # chromosome of the last target seen.
        for (chrom, start, end) in tk_io.get_bed_iterator(args.targets_file,
                                                          args.locus):
            regions.add_region((start, end))
    else:
        regions.add_region((start, stop))

    if vc_mode != "precalled" and args.high_coverage_excluded_bed is not None:
        regions = regions.intersect(get_coverage_regions(args))

    bed_length = 0
    with open(bedfile, 'w') as bed_writer:
        for (start, end) in regions.get_region_list():
            bed_writer.write(chrom + "\t" + str(start) + "\t" + str(end) + "\n")
            bed_length += 1

    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        # NOTE(review): in "precalled_plus" mode outs.default is cleared here
        # but still passed to the variant caller below -- confirm intended.
        outs.default = None
        precalled_vars_path = args.split_input
        vcf = tk_io.VariantFileReader(precalled_vars_path)
        # The template handle is now closed together with the output file
        # instead of being left to the garbage collector.
        with open(outs.precalled, "w") as file_write, \
                open(precalled_vars_path) as template_file:
            output = tk_io.VariantFileWriter(file_write,
                                             template_file=template_file)
            variant_iter = tk_io.get_variant_iterator_pos(vcf, bedfile,
                                                          args.locus)
            for record in variant_iter:
                output.write_record(record)

    if vc_mode != "precalled":
        outs.precalled = None
        primary_contigs = tk_reference.load_primary_contigs(
            args.reference_path)
        if bed_length > 0 and chrom in primary_contigs:
            vc.run_variant_caller(variant_caller, gatk_path, args.__mem_gb,
                                  fasta_path, args.input, outs.default,
                                  bedfile)
def call_haploid(haplotype, bam, locus, reference_path, variant_caller,
                 gatk_path, mem_gb):
    """Call variants in haploid mode on the reads tagged with one haplotype.

    Reads of ``bam`` inside ``locus`` whose ``HP`` tag equals ``haplotype``
    are copied to ``hap<haplotype>.bam`` in the working directory. That BAM is
    indexed and passed to the variant caller with ``haploid_mode=True``; the
    resulting VCF is canonicalized and tabix-indexed.

    Returns:
        ``(vcf_path, bam_in)``: the path ``hap<haplotype>.vcf.gz`` and an open
        reader on the haplotype BAM.
    """
    hap_label = "hap" + str(haplotype)
    bam_name = hap_label + ".bam"
    haploid_bam, _ = tenkit.bam.create_bam_outfile(bam_name, None, None,
                                                   template=bam)
    (chrom, start, stop) = tk_io.get_locus_info(locus)
    try:
        for read in bam.fetch(chrom, start, stop):
            readhap = dict(read.tags).get('HP')
            # Reads without an HP tag are skipped.
            if readhap is not None and int(readhap) == haplotype:
                haploid_bam.write(read)
    finally:
        # Close the writer even if the scan fails.
        haploid_bam.close()
    tk_bam.index(bam_name)

    tmp_vcf_name = "tmp_" + hap_label + ".vcf"
    vcf_name = hap_label + ".vcf"
    fasta_path = tk_ref.get_fasta(reference_path)
    vc.run_variant_caller(variant_caller, gatk_path, mem_gb, fasta_path,
                          bam_name, tmp_vcf_name, haploid_mode=True)
    longranger.variants.canonicalize(tmp_vcf_name, vcf_name)
    tenkit.tabix.index_vcf(vcf_name)
    bam_in = tk_bam.create_bam_infile(bam_name)
    return (vcf_name + ".gz", bam_in)
def main(args, outs):
    """Write a BED of coverage spikes found inside ``args.locus``.

    Coverage read from ``args.coverage`` is divided by ``max(1.0, args.mean)``
    and positions whose normalized coverage exceeds 10 are kept. Runs of
    consecutive positions are merged into half-open ``[start, end)`` intervals
    and written to ``outs.spikes``. When there is no locus or no usable mean,
    ``outs.spikes`` is set to None and nothing is written.
    """
    if args.locus is None:
        outs.spikes = None
        return
    # Covers both a missing mean (None) and a zero mean.
    if not args.mean:
        outs.spikes = None
        return

    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)
    cov = tenkit.hdf5.read_data_frame_indexed(args.coverage,
                                              [(chrom, start, stop)],
                                              query_cols=['pos', 'coverage'])
    # args.mean should not be below 1 unless something else went wrong
    # (e.g. a missing args.cov_hist); clamp so the division stays sane.
    cov.coverage /= max(1.0, args.mean)
    cov = cov[cov.coverage > 10]

    spikes_pos = cov["pos"].values
    # Indices at which a new run of consecutive positions begins.
    breaks = list(np.where(np.diff(spikes_pos) != 1)[0] + 1)
    # Guard on the positions rather than on the breaks: a single contiguous
    # run has no breaks but is still a spike and must be reported.
    if len(spikes_pos) > 0:
        starts = [spikes_pos[b] for b in [0] + breaks]
        ends = [spikes_pos[b - 1] + 1 for b in breaks + [len(spikes_pos)]]
    else:
        starts = []
        ends = []

    with open(outs.spikes, "w") as fout:
        for s, e in zip(starts, ends):
            fout.write(chrom + "\t" + str(s) + "\t" + str(e) + "\n")
def main(args, outs):
    """Report, per fixed-size window, the fraction that is confident.

    Every locus in ``args.loci`` is walked in steps of ``args.window_size``.
    Each window is intersected with the confident regions of its chromosome
    (as returned by ``get_conf_regions``), and one tab-separated row
    ``chrom, start, end, fraction`` is written to ``outs.confident_windows``.
    The last window of a locus may extend past the locus end.
    """
    args.coerce_strings()
    outs.coerce_strings()

    if args.confident_regions is None:
        confident_regions = None
    else:
        # Close the BED handle once parsed instead of leaking it.
        # NOTE(review): assumes get_target_regions reads the file eagerly.
        with open(args.confident_regions) as regions_file:
            confident_regions = tk_io.get_target_regions(regions_file)

    window_size = args.window_size
    with open(outs.confident_windows, "w") as outfile:
        for (chrom, start, end) in (tk_io.get_locus_info(l)
                                    for l in args.loci):
            conf_regions = get_conf_regions(chrom, confident_regions)
            location = start
            while location < end:
                window_end = location + window_size
                region = tk_regions.Regions(regions=[(location, window_end)])
                size = region.intersect(conf_regions).get_total_size()
                percent = tk_stats.robust_divide(float(size),
                                                 float(window_size))
                row = [chrom, location, window_end, percent]
                outfile.write("\t".join(map(str, row)) + "\n")
                location = window_end
def main(args, outs):
    """Phase (or pass through) the variants of one locus and index the VCF.

    Variants are read from ``args.input``. With ``args.do_phasing`` set, a
    ``Phaser`` writes phased records and per-fragment phasing; otherwise
    ``pass_variants`` copies the records with phasing information stripped.
    Outputs go to ``outs.default`` and ``outs.fragment_phasing`` with any
    trailing ".gz" removed; the VCF is then sorted and tabix-indexed.
    """
    args.coerce_strings()
    outs.coerce_strings()

    def strip_gz(path):
        # str.strip('.gz') removes any of the characters '.', 'g', 'z' from
        # both ends; only the literal suffix should be dropped.
        return path[:-len('.gz')] if path.endswith('.gz') else path

    input_vfr = tk_io.VariantFileReader(args.input)
    chrom, start, stop = tk_io.get_locus_info(args.locus)
    vcf_path = strip_gz(outs.default)
    fragment_path = strip_gz(outs.fragment_phasing)
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)

    # Add the component name and the version of the phasing code
    new_source = "10X/pipelines/stages/snpindels/phase_snpindels %s" % martian.get_pipelines_version()
    new_filters = [
        ("10X_PHASING_INCONSISTENT",
         "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with phasing."),
        ("10X_HOMOPOLYMER_UNPHASED_INSERTION",
         "Unphased insertions in homopolymer regions tend to be false positives"),
    ]
    new_formats = [
        ("PS", 1, "Integer", "ID of Phase Set for Variant"),
        ("PQ", 1, "Integer",
         "Phred QV indicating probability at this variant is incorrectly phased"),
        ("JQ", 1, "Integer",
         "Phred QV indicating probability of a phasing switch error in gap prior to this variant"),
    ]

    # All three handles are closed on every exit path, including errors.
    with open(vcf_path, 'w') as output_file, \
            open(fragment_path, 'w') as fragment_output_file, \
            open(args.input) as template_file:
        vfw = tk_io.VariantFileWriter(output_file,
                                      template_file=template_file,
                                      new_source=new_source,
                                      new_format_fields=new_formats,
                                      new_filters=new_filters)
        if args.do_phasing:
            phaser = Phaser(input_vfr, args.fragments, chrom, start, stop,
                            args.bc_mix_prob, args.min_junction_hap_conf,
                            args.min_var_hap_conf, args.hap_block_buffer_size,
                            args.hap_block_size, args.max_reassign_rounds,
                            vc_mode)
            phaser.call_haps(vfw, fragment_output_file)
        else:
            pass_variants(input_vfr, vfw, chrom, start, stop,
                          strip_phasing_info=True)

    tk_tabix.sort_unique_tabix_vcf(vcf_path)
def get_coverage_regions(args):
    """Return the regions of ``args.high_coverage_excluded_bed`` on the
    chromosome of ``args.locus``.

    An empty ``Regions`` is returned when the BED file has no entry for that
    chromosome.
    """
    (chrom, _, _) = tk_io.get_locus_info(args.locus)
    # Close the BED handle once parsed instead of leaking it.
    # NOTE(review): assumes get_target_regions reads the file eagerly.
    with open(args.high_coverage_excluded_bed) as bed_file:
        chrom_regions = tk_io.get_target_regions(bed_file).get(chrom)
    # Test the lookup result: a freshly constructed Regions object is never
    # None, so the original post-construction check could not fire.
    if chrom_regions is None:
        return Regions()
    return Regions(chrom_regions)
def main(args, outs):
    """Append per-haplotype coverage of every locus to ``outs.hap_coverage``.

    For each locus in ``args.loci`` a coverage data frame is computed by
    ``get_hap_coverage`` from ``args.possorted_bam`` and ``args.phase_set_h5``
    and appended to the output HDF5 file.
    """
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    try:
        for locus in args.loci:
            (chrom, start, stop) = tk_io.get_locus_info(locus)
            cov_df = get_hap_coverage(in_bam, args.phase_set_h5, chrom, start,
                                      stop, cov_quals=COV_QUALS)
            tk_hdf5.append_data_frame(outs.hap_coverage, cov_df)
    finally:
        # Release the BAM handle even when a locus fails.
        in_bam.close()
def split(args):
    """Define one chunk per chromosome.

    Without ``args.restrict_locus`` every reference sequence of the input BAM
    gets a chunk; with it, only the chromosome of that locus does.

    Returns:
        ``{'chunks': [{'chrom': name}, ...]}``
    """
    bam_in = tk_bam.create_bam_infile(args.input)
    try:
        if args.restrict_locus is None:
            chunk_defs = [{'chrom': chrom} for chrom in bam_in.references]
        else:
            chrom, _, _ = tk_io.get_locus_info(args.restrict_locus)
            chunk_defs = [{'chrom': chrom}]
    finally:
        # The BAM is only needed for its reference names.
        bam_in.close()
    return {'chunks': chunk_defs}
def main(args, outs):
    """Run freebayes on ``args.input`` restricted to ``args.locus``.

    A one-line BED describing the locus is written to a martian-made
    'region.bed' path and handed to freebayes as its target file; whatever
    freebayes prints on stdout is captured in ``outs.output``.
    """
    reference_fasta = cr_utils.get_reference_genome_fasta(args.reference_path)
    locus_chrom, locus_start, locus_stop = tk_io.get_locus_info(args.locus)

    region_bed = martian.make_path('region.bed')
    with open(region_bed, 'w') as bed_handle:
        bed_fields = [locus_chrom, str(locus_start), str(locus_stop)]
        bed_handle.write("\t".join(bed_fields) + "\n")

    command = ['freebayes']
    command.extend(['-f', reference_fasta])
    command.extend(['-b', args.input])
    command.append('-0')
    command.extend(['-t', region_bed])

    with open(outs.output, 'w') as vcf_handle:
        subprocess.check_call(command, stdout=vcf_handle)
def main(args, outs):
    """Run the ``molecular_count`` tool on the query regions of one locus.

    Without ``args.fragments`` all outputs are set to None. Otherwise the rows
    of ``args.queryRegions`` on the locus chromosome with
    ``start <= reg_start`` and ``reg_end < end`` are copied to a temporary BED
    next to ``outs.stat``. If no row qualifies, ``outs.rPM`` is set to None
    and the tool is not run.
    """
    args.coerce_strings()
    outs.coerce_strings()

    if not args.fragments:
        outs.rPM = None
        outs.rPMFiles = None
        outs.stat = None
        outs.fracPhased = None
        outs.covStat = None
        return

    cmd = ["molecular_count"]
    if args.wgsmode:
        cmd.append("--wgs")

    tmp_bed_file = outs.stat + "_tmp.bed"
    (chrom, start, end) = tk_io.get_locus_info(args.locus)
    nrows = 0
    with open(tmp_bed_file, "w") as fout:
        with open(args.queryRegions) as f:
            for l in f:
                items = l.split()
                if not items:
                    # Tolerate blank lines instead of failing on items[0].
                    continue
                reg_chrom = items[0]
                reg_start = int(items[1])
                reg_end = int(items[2])
                if chrom == reg_chrom and reg_start >= start and reg_end < end:
                    nrows += 1
                    fout.write(l)

    if nrows == 0:
        # NOTE(review): only rPM is cleared here, unlike the no-fragments
        # path above -- confirm the other outputs are meant to stay set.
        outs.rPM = None
        return

    # specify the target file
    cmd += ["-b", tmp_bed_file]
    cmd += [
        "-f", args.fragments,
        "-p", args.fragment_phasing,
        "-m", args.phased_possorted_bam,
        "--rPM", outs.rPM,
        "--stat", outs.stat,
        "--fracPhased", outs.fracPhased,
        "--mapq", str(args.mapq),
        "--overlap", str(args.overlap),
        "--covstat", str(outs.covStat),
    ]
    print("Running cmd:")
    print(" ".join(cmd))
    # Pass an argument vector rather than a shell string so that paths with
    # spaces or shell metacharacters reach the tool unmodified.
    subprocess.check_call(cmd)
def main(args, outs):
    """For each slice produce a fasta file sampling reads from that slice.

    We split our section of the genome into a bunch of windows of
    ``args.window_size`` bases. For each window we sample an identical number
    of paired end reads. The name of each read encodes the true position that
    it was sampled from.

    Sets ``outs.tmp`` to the path of the fasta file and
    ``outs.samples_per_bin`` to the number of samples drawn per window.
    """
    # Grab basic stats for the read lengths and quality scores
    with open(args.basic_stats) as stats_fp:
        stats = json.load(stats_fp)

    # Fix the random seed
    np.random.seed(0)

    # Info is a map we use everywhere to track the sampling parameters.
    # r1_len: the length of read1
    # r2_len: the length of read2
    # insert_size_map: a map of insert-size (as a string) to frequency
    # q_score_map a map of quality score (as a string) to frequency
    info = {'r1_len': stats['r1_len'], 'r2_len': stats['r2_len']}
    info['q_score_map'] = {
        '30': stats['bc_q30_bases'],
        '20': stats['bc_q20_bases'] - stats['bc_q30_bases'],
        '0': stats['bc_tot_bases'] - stats['bc_q20_bases']
    }
    with open(args.insert_sizes) as stats_is_fp:
        info['insert_size_map'] = json.load(stats_is_fp)['60']

    # How many samples will we make from each window?
    samples = int(
        round(2.0 * args.target_coverage *
              (float(args.window_size) /
               (stats['r1_len'] + stats['r2_len']))))
    martian.log_info("Using %i samples per %i bin" %
                     (samples, args.window_size))

    output_path = martian.make_path("chnk.fasta")
    ref = reference.open_reference(args.reference_path)
    # The output is closed even if sampling fails part-way through.
    with open(output_path, "w") as output:
        # Loop over every window in every loci.
        for (chrom, start, end) in (tk_io.get_locus_info(l)
                                    for l in args.loci):
            cur = start
            while cur < end:
                # Sample |samples| reads from chrom:cur-chrom:cur+window_size
                # and put the results in the output file
                perbin(chrom, cur, ref, output, info, args.window_size,
                       samples)
                cur += args.window_size

    outs.tmp = output_path
    outs.samples_per_bin = samples
def pack_loci(loci):
    """Group consecutive loci into batches of roughly equal total size.

    Loci are accumulated in input order; a batch is closed as soon as its
    summed length reaches a quarter of ``PARALLEL_LOCUS_SIZE``. Any loci left
    over form a final, smaller batch.

    Returns:
        A list of lists of locus strings.
    """
    batches = []
    pending = []
    pending_bp = 0
    for locus in loci:
        (_, locus_start, locus_end) = tk_io.get_locus_info(locus)
        pending.append(locus)
        pending_bp += locus_end - locus_start
        if pending_bp < 0.25 * PARALLEL_LOCUS_SIZE:
            continue
        batches.append(pending)
        pending, pending_bp = [], 0
    if pending:
        batches.append(pending)
    return batches
def validate_loci(bam, loci, whitelist):
    """Assert that ``loci`` tile each covered chromosome without gaps.

    Per chromosome, the loci must be non-empty, start at 0, abut one another
    exactly, and end at ``chrom_size(bam, chrom)``. The set of chromosomes
    covered must equal ``whitelist`` when one is given, otherwise the BAM's
    references. Any violation raises ``AssertionError``.
    """
    parsed = sorted([tk_io.get_locus_info(locus) for locus in loci])
    covered_chroms = []
    for (chrom, chrom_loci) in groupby(parsed, lambda item: item[0]):
        cursor = 0
        for (_, locus_start, locus_end) in sorted(chrom_loci,
                                                  key=lambda item: item[1]):
            assert(locus_end - locus_start > 0)
            assert(locus_start == cursor)
            cursor = locus_end
        assert(cursor == chrom_size(bam, chrom))
        covered_chroms.append(chrom)

    expected_chroms = bam.references if whitelist is None else whitelist
    assert(set(covered_chroms) == set(expected_chroms))
def merge_haploid(novel_vcf, putative_vcf, locus, output_filename, bam,
                  reference_pyfasta, args, add_gl = True):
    """Merge haploid-called variants with the putative variants of a locus.

    Records from ``novel_vcf`` and ``putative_vcf`` (the latter fetched over
    ``locus``) are paired by ``pair_iter``. A putative record is written with
    ``HAPLOCALLED=0`` when it passes filters or its filter list is anything
    other than exactly ``['10X_QUAL_FILTER']``. Otherwise, if a novel record
    exists, it is annotated, marked ``HAPLOCALLED=1`` and written unless its
    QUAL is missing. Failing that, the putative record is written with
    ``HAPLOCALLED=0``.
    """
    # Distinct names for the readers: the loop below used to rebind `novel`.
    novel_reader = tenkit.bio_io.VariantFileReader(novel_vcf)
    putative_reader = tenkit.bio_io.VariantFileReader(putative_vcf)
    (chrom, start, stop) = tk_io.get_locus_info(locus)

    # The template handle is closed together with the output.
    with open(output_filename, 'w') as output, \
            open(putative_vcf) as template_file:
        output_vcf = tenkit.bio_io.VariantFileWriter(
            output, template_file=template_file)
        record_pairs = pair_iter(
            novel_reader.record_getter(),
            putative_reader.record_getter(fetch_chrom=chrom,
                                          fetch_start=start,
                                          fetch_end=stop))
        for (novel, putative) in record_pairs:
            if putative is not None and (
                    tk_io.get_record_passes_filters(putative) or
                    tk_io.get_record_filters(putative) != ['10X_QUAL_FILTER']):
                putative.INFO['HAPLOCALLED'] = 0
                output_vcf.write_record(putative)
            elif novel is not None:
                tk_io.set_record_phase_set(novel, get_phase_set(novel, bam))
                tk_io.set_record_phase_qual(novel, 25)
                tk_io.set_record_junction_qual(novel, 25)
                populate_fields(novel, bam, reference_pyfasta, args)
                novel.INFO['HAPLOCALLED'] = 1
                if add_gl:
                    tk_io.set_record_genotype_likelihoods(
                        novel, calculate_psuedo_genotype_likelihoods(novel))
                # freebayes ploidy 1 gives some variants with '.' as the
                # qual. These are extremely low quality not worth even
                # tracking
                if novel.QUAL is not None:
                    output_vcf.write_record(novel)
            else:
                # NOTE(review): relies on pair_iter never yielding a pair in
                # which both records are None -- confirm.
                putative.INFO['HAPLOCALLED'] = 0
                output_vcf.write_record(putative)
def join(args, outs, chunk_defs, chunk_outs):
    """Stitch per-chunk phasing results into genome-wide outputs.

    Chunk fragment files and VCFs (paths taken without their ".gz" suffix)
    are stitched by ``stitcher.multi_join_parallel``, combined into the final
    fragment file and VCF, and the fragment file is tabix-indexed.
    ``outs.vc_precalled`` mirrors ``outs.default`` when ``args.vc_precalled``
    is set and is None otherwise.
    """
    args.coerce_strings()
    outs.coerce_strings()

    def strip_gz(path):
        # str.strip('.gz') removes any of the characters '.', 'g', 'z' from
        # both ends; only the literal suffix should be dropped.
        return path[:-len('.gz')] if path.endswith('.gz') else path

    frag_files = [strip_gz(c.fragment_phasing) for c in chunk_outs]
    vcf_files = [strip_gz(c.default) for c in chunk_outs]
    loci = [tk_io.get_locus_info(c.locus) for c in chunk_defs]
    out_frag_base = strip_gz(outs.fragment_phasing)
    out_vcf_base = strip_gz(outs.default)

    # stitch phase blocks
    (stitched_chrom_vcfs, stitched_chrom_frags) = stitcher.multi_join_parallel(
        frag_files, vcf_files, loci, args.__threads)

    # combine the chromosome-level outputs
    combine_frags(out_frag_base, stitched_chrom_frags)
    tk_io.combine_vcfs(out_vcf_base, stitched_chrom_vcfs)

    if args.vc_precalled is not None:
        outs.vc_precalled = outs.default
    else:
        outs.vc_precalled = None

    # final indexing
    pysam.tabix_index(out_frag_base, seq_col=0, start_col=1, end_col=2)
def main(args, outs):
    """Find runs of low or haplotype-imbalanced coverage in each locus.

    Per locus, q30 coverage per haplotype is read from ``args.hap_coverage``,
    binned by ``args.bin_size`` and averaged per (bin, phase_set). A bin is
    "low" when its total coverage is under 0.8x the locus mean, or when total
    coverage is under the mean and one haplotype has less than 0.8x the
    coverage of the other. Runs of low bins at least ``args.min_len`` long are
    recorded as pairs of breakpoint windows around the run start and stop.

    The breakpoint tuples are pickled to ``outs.loci`` and the per-locus mean
    coverage is written as TSV to ``outs.cov_summary``.
    """
    reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']
    ext_cols = list(sel_cols)
    ext_cols.append('total_cov')

    out_loci = []
    summary_df = None
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov = reader.query((chrom, start, stop))
        # The builtin int replaces the np.int alias, which newer NumPy
        # releases no longer provide; the resulting dtype is the same.
        cov['bin'] = np.array(cov['pos'] / args.bin_size, dtype=int)
        cov['total_cov'] = cov[sel_cols].sum(axis=1)
        mean_cov = np.mean(cov['total_cov'])
        summary_df = pd.concat([
            summary_df,
            pd.DataFrame(
                {
                    'chrom': chrom,
                    'start': start,
                    'stop': stop,
                    'mean_cov': mean_cov
                },
                index=[0])
        ], ignore_index=True)

        # Remove very small phase sets. These tend to be single-SNP phase sets
        # and can result from erroneous SNPs.
        cov = cov.groupby('phase_set').filter(lambda x: len(x) > 1000)
        sum_df = cov.groupby(['bin', 'phase_set'])[ext_cols].mean().reset_index()
        sum_df['low'] = sum_df.total_cov < 0.8 * mean_cov
        sum_df['low_hap0'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap0 < 0.8 * sum_df.cov_q30_hap1)
        sum_df['low_hap1'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap1 < 0.8 * sum_df.cov_q30_hap0)

        if not sum_df.empty:
            any_low = np.logical_or(
                sum_df.low, np.logical_or(sum_df.low_hap1, sum_df.low_hap0))
            bins = np.array(sum_df['bin'])
            # Sentinel bin so the stop of the last run can be looked up.
            bins = np.concatenate([bins, [np.max(bins) + 1]])
            pos = 0
            # Get runs of 0s and 1s in any_low
            for bit, group in groupby(any_low):
                group_size = len(list(group))
                group_start = bins[pos] * args.bin_size
                group_stop = bins[pos + group_size] * args.bin_size
                region_len = group_stop - group_start
                if bit and region_len >= args.min_len:
                    out_loci.append((chrom,
                                     max(0, group_start - args.bin_size),
                                     group_start + args.bin_size,
                                     chrom,
                                     max(0, group_stop - args.bin_size),
                                     group_stop + args.bin_size))
                pos += group_size

    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)
    # NOTE(review): summary_df is still None when args.loci is empty, so this
    # call would fail -- confirm whether an empty loci list can occur.
    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
def split(args):
    """Build chunk definitions for genome-wide and target-centered windows.

    Each chunk is a dict with a 'chrom', parallel 'starts' / 'stops' lists
    and a fixed '__mem_gb' of 8.

    * With ``args.restrict_locus`` set, a single chunk covers that locus,
      clipped to the chromosome length taken from the BAM header.
    * Otherwise every primary contig (skipping Y / chrY when ``args.sex`` is
      female) is cut into chunks whose size is a multiple of
      ``args.window_size``.
    * When both ``args.targets`` and ``args.target_extend`` are given, the
      merged target regions get additional chunks, provided they cover less
      than ``MIN_TARGET_FRAC`` of the primary-contig genome size.

    Returns:
        ``{'chunks': [...]}``
    """
    win = args.window_size
    input_bam = tk_bam.create_bam_infile(args.input)
    chroms = input_bam.references
    chrom_lengths = input_bam.lengths
    # Map each reference name to its length as recorded in the BAM header.
    chrom_len_map = {}
    for i, chrom in enumerate(chroms):
        chrom_len_map[chrom] = chrom_lengths[i]
    input_bam.close()

    max_mem_in_gb = 4  # Be a little conservative
    chunk_size = get_max_chunk(win, max_mem_in_gb)

    if not args.restrict_locus is None:
        locus_chrom, locus_start, locus_stop = tk_io.get_locus_info(
            args.restrict_locus)
        assert (locus_chrom in chrom_len_map)
        # Clip the requested locus to the bounds of its chromosome.
        locus_start = max(0, locus_start)
        locus_stop = min(locus_stop, chrom_len_map[locus_chrom])

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    # Total length of the primary contigs present in the BAM header.
    genome_size = np.sum([
        length for chrom, length in chrom_len_map.iteritems()
        if chrom in primary_contigs
    ])

    # Accumulator state for the target-centered chunks built further down.
    prev_chrom = ''
    tot_bp = 0
    starts = []
    stops = []
    chunks = []

    # Genome-wide windows
    if not args.restrict_locus is None:
        chunks.append({
            'chrom': locus_chrom,
            'starts': [locus_start],
            'stops': [locus_stop],
            '__mem_gb': 8
        })
    else:
        for chrom, length in chrom_len_map.iteritems():
            # Skip the Y chromosome for female samples.
            if not args.sex is None and args.sex.lower() in [
                    'f', 'female'
            ] and chrom in ['Y', 'chrY']:
                continue
            if not chrom in primary_contigs:
                continue
            nchunks = int(np.ceil(length / float(chunk_size)))
            # Divide as evenly as possible the windows across the chunks
            # This also makes sure that all chunks except the last will
            # have sizes that are multiples of the window size.
            win_per_chunk = int(np.ceil(length / float(nchunks * win)))
            new_chunk_size = win_per_chunk * win
            for c in range(nchunks):
                chunk_start = c * new_chunk_size
                chunk_stop = min((c + 1) * new_chunk_size, length)
                chunks.append({
                    'chrom': chrom,
                    'starts': [chunk_start],
                    'stops': [chunk_stop],
                    '__mem_gb': 8
                })

    # Target-centered windows. If the targets (plus the extent) cover too much
    # of the genome, then skip these.
    if not args.targets is None and not args.target_extend is None:
        target_regions = []
        bed_iterator = tk_io.get_bed_iterator(args.targets)
        for chrom, start, stop in bed_iterator:
            if not args.sex is None and args.sex.lower() in [
                    'f', 'female'
            ] and chrom in ['Y', 'chrY']:
                continue
            if not chrom in primary_contigs:
                continue
            # Clip the target to the end of its chromosome.
            stop = min(chrom_len_map[chrom], stop)
            # Under a restricted locus, keep only targets that overlap it.
            if args.restrict_locus is None or (
                    chrom == locus_chrom and
                    overlaps(start, stop, locus_start, locus_stop)):
                target_regions.append((chrom, start, stop))

        target_regions = sort_and_merge(target_regions, args.target_extend)
        target_size = np.sum(
            [stop - start for _, start, stop in target_regions])

        if target_size / float(genome_size) < MIN_TARGET_FRAC:
            for (chrom, start, stop) in target_regions:
                # Flush the accumulated regions when the chromosome changes
                # or more than 1e7 bp have been collected.
                if (prev_chrom != chrom and
                        prev_chrom != '') or tot_bp > 1 * 1e7:
                    chunks.append({
                        'chrom': str(prev_chrom),
                        'starts': starts,
                        'stops': stops,
                        '__mem_gb': 8
                    })
                    starts = []
                    stops = []
                    tot_bp = 0
                tot_bp += (stop - start)
                prev_chrom = chrom
                starts.append(start)
                stops.append(stop)
            # Flush whatever remains after the last target region.
            if prev_chrom != '':
                chunks.append({
                    'chrom': str(prev_chrom),
                    'starts': starts,
                    'stops': stops,
                    '__mem_gb': 8
                })

    return {'chunks': chunks}
def main(args, outs):
    """Outputs barcode file.

    Every barcoded read on ``args.chrom`` (or inside ``args.restrict_locus``)
    is written as ``chrom, start, end, barcode`` to a temporary file, which
    is then sorted by ``tk_tabix.sort_bc_loc_tabix``. The read positions of
    each barcode are passed to ``infer_fragments`` and the resulting
    fragments, padded by ``BUFFER`` on both sides, are written to
    ``outs.contig_output`` together with the barcode and the read count. Both
    temporary files are deleted on success.
    """
    args.coerce_strings()
    unsorted_temp_name = martian.make_path(outs.contig_output + '_TEMPUNSORTED')
    sorted_temp_name = martian.make_path(outs.contig_output + '_TEMPSORTED')
    base_dir = os.path.dirname(outs.contig_output)
    window_size = args.window_size

    def write_fragments(out_file, bc_seq, bc_poses):
        # Emit one padded fragment row per fragment inferred for bc_seq.
        frags = infer_fragments(bc_poses, window_size)
        for (frag_chrom, frag_start, frag_end, num_reads) in frags:
            out_file.write('\t'.join([
                frag_chrom,
                str(frag_start - BUFFER),
                str(frag_end + BUFFER), bc_seq,
                str(num_reads)
            ]) + '\n')

    # Output the raw poses
    bam_in = tk_bam.create_bam_infile(args.input)
    try:
        chroms = bam_in.references
        with open(unsorted_temp_name, 'w') as unsorted_temp_file:
            unsorted_temp_file.write(
                '\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ']) + '\n')
            if args.restrict_locus is None:
                bam_iter = bam_in.fetch(args.chrom)
            else:
                restrict_chrom, restrict_start, restrict_stop = \
                    tk_io.get_locus_info(args.restrict_locus)
                assert (args.chrom == restrict_chrom)
                bam_iter = bam_in.fetch(restrict_chrom, restrict_start,
                                        restrict_stop)
            for read in bam_iter:
                chrom = chroms[read.tid]
                start = read.pos
                end = read.aend
                if end is None:
                    # No aligned end available: fall back to read length.
                    end = start + len(read.seq)
                bc = tk_io.get_read_barcode(read)
                if bc is not None:
                    unsorted_temp_file.write(
                        '\t'.join([chrom, str(start), str(end), bc]) + '\n')
    finally:
        bam_in.close()

    # Sort the poses
    tk_tabix.sort_bc_loc_tabix(unsorted_temp_name, sorted_temp_name,
                               temp_dir_name=base_dir)

    # Infer the contig locations.
    # The header of the contig output is written during join.
    with open(outs.contig_output, 'w') as contig_output_file, \
            open(sorted_temp_name, 'r') as sorted_temp_file:
        sorted_temp_file.readline()  # skip the header row
        old_bc_seq = None
        bc_poses = []
        for line in sorted_temp_file:
            (chrom, start, end, bc_seq) = line.strip('\n').split('\t')
            start = int(start)
            end = int(end)
            if bc_seq != old_bc_seq:
                if old_bc_seq is not None:
                    write_fragments(contig_output_file, old_bc_seq, bc_poses)
                bc_poses = []
                old_bc_seq = bc_seq
            bc_poses.append((chrom, start, end))
        # Output for the last barcode
        if old_bc_seq is not None:
            write_fragments(contig_output_file, old_bc_seq, bc_poses)

    # Delete the temporaries directly rather than spawning `rm`.
    os.remove(sorted_temp_name)
    os.remove(unsorted_temp_name)
def main(args, outs):
    """Write per-haplotype molecule coverage of a locus as run-length rows.

    The locus is processed in 10 kb regions. In each region, reads passing
    the pair / duplicate / mapping-quality filters are grouped by their ``MI``
    tag and every group adds one unit of coverage over the merged extent of
    its reads, in the track given by the group's first ``HP`` tag (0 when
    absent). Each track is discretized by ``Disc`` and written as
    ``chrom, start, end, value`` rows to ``outs.hp0`` / ``hp1`` / ``hp2``
    (paths taken without a trailing ".gz").
    """
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    def strip_gz(path):
        # str.strip(".gz") removes any of the characters '.', 'g', 'z' from
        # both ends; only the literal suffix should be dropped.
        return path[:-len(".gz")] if path.endswith(".gz") else path

    # further split each chunk into regions of 10 kb
    regionSize = 10000
    regionStarts = range(start, stop, regionSize)
    regionEnds = list(regionStarts[1:]) + [stop]

    # read the bam file
    samfile = pysam.Samfile(args.bam_infile, "rb")
    fouts = []
    try:
        for out_path in (outs.hp0, outs.hp1, outs.hp2):
            fouts.append(open(strip_gz(out_path), 'w'))

        for rStart, rEnd in zip(regionStarts, regionEnds):
            print("\n%s %s" % (rStart, rEnd))
            barcode_reads = {}
            barcode_hp = {}
            # initialize the coverage track
            coverage = n.zeros((3, rEnd - rStart))
            for r in samfile.fetch(chrom, rStart, rEnd):
                if not r.is_proper_pair:
                    continue
                if r.is_duplicate or not r.is_paired or r.is_qcfail or \
                        r.is_secondary or r.is_unmapped or r.mapq < 20 or \
                        r.tid != r.rnext or abs(r.pos - r.pnext) > 5000:
                    continue
                tags = dict(r.tags)
                if "MI" not in tags:
                    continue
                mid = tags["MI"]
                hp = 0
                if "HP" in tags:
                    hp = tags["HP"]
                if mid not in barcode_hp:
                    # The first read seen decides the molecule's haplotype.
                    barcode_hp[mid] = hp
                    barcode_reads[mid] = tk_regions.Regions()
                barcode_reads[mid].add_region((r.pos, r.aend))

            # add the unique sequenced mol coverage
            for bc in barcode_reads:
                hp = barcode_hp[bc]
                for rgs in barcode_reads[bc]:
                    rgs_start = max(rStart, rgs[0])
                    rgs_end = min(rEnd, rgs[1])
                    if rgs_start >= rgs_end:
                        # Nothing left after clipping to the region.
                        continue
                    coverage[hp][rgs_start - rStart:(rgs_end - rStart)] += 1

            for hp in range(3):
                disc_cov = Disc(coverage[hp])
                # sel[i] is True where the value changes between i and i + 1,
                # so a new run starts at index i + 1.
                sel = (disc_cov[:-1] != disc_cov[1:])
                print(sel.sum())
                change_points = n.arange(1, len(disc_cov))[sel]
                # numpy.append takes (arr, values, axis); the run boundaries
                # therefore have to be assembled in two steps.
                boundaries = n.append(0, change_points)
                boundaries = n.append(boundaries, len(disc_cov))
                print("%s %s %s" % (disc_cov.size, sel.sum(), boundaries.size))
                for i in range(len(boundaries) - 1):
                    fouts[hp].write(
                        "%s\t%d\t%d\t%d\n" %
                        (chrom, boundaries[i] + rStart,
                         boundaries[i + 1] + rStart, disc_cov[boundaries[i]]))
    finally:
        for fout in fouts:
            fout.close()
        samfile.close()
def split(args):
    """Define phasing chunks and their memory requests.

    The chunk size is derived from the estimated het-SNP rate so that highly
    heterozygous samples get smaller chunks. A chunk is phased only when
    fragments are available and its chromosome is a primary contig that is
    not excluded for the sample's sex (X, Y and mitochondrial contigs for
    male or unspecified sex; Y and mitochondrial contigs for female). Memory
    for phased chunks is estimated from the expected number of het SNPs and
    barcodes per SNP, with a floor of 4 GB.

    Returns:
        ``{'chunks': [...], 'join': {'__mem_gb': 16, '__threads': 4}}``
    """
    if args.sex is None or args.sex.lower() in ['m', 'male']:
        remove_chroms = ['chrX', 'chrY', 'chrM', 'X', 'Y', 'MT', 'M']
    elif args.sex.lower() in ['f', 'female']:
        remove_chroms = ['chrY', 'chrM', 'Y', 'MT', 'M']
    else:
        martian.throw("Unrecognized sex: %s" % args.sex)

    input_bam = tk_bam.create_bam_infile(args.bam_file)
    try:
        primary_contigs = tenkit.reference.load_primary_contigs(
            args.reference_path)

        # estimate density of het snps and barcodes
        primary_contig_lengths = [
            (chrom, length)
            for (chrom, length) in zip(input_bam.references, input_bam.lengths)
            if chrom in primary_contigs
        ]
        (frac_het_snps, bcs_per_het_snp,
         het_rate) = smooth_sample_bcs_and_het_snps(args.input,
                                                    primary_contig_lengths)
        martian.log_info(
            "Fraction of SNPs that are het: %f, BCs per SNP: %f, Hets per bp: %f"
            % (frac_het_snps, bcs_per_het_snp, het_rate))

        # Set up dynamic chunk sizes to make smaller chunks for highly het
        # organisms
        het_rate = max(min(0.05, het_rate), 0.0001)
        parallel_locus_size = min(tenkit.constants.PARALLEL_LOCUS_SIZE,
                                  tk_stats.robust_divide(60000, het_rate))

        if args.restrict_locus is None:
            loci = tk_bam.generate_tiling_windows(
                input_bam, parallel_locus_size,
                overlap=args.chunk_stitching_overlap)
        else:
            loci = [args.restrict_locus]

        chunks = []
        for (idx, locus) in enumerate(loci):
            (chrom, start, end) = tk_io.get_locus_info(locus)
            if args.fragments is None or chrom not in primary_contigs \
                    or chrom in remove_chroms:
                mem_gb = 3
                martian.log_info("Chunk %d: No phasing, requesting %d GB" %
                                 (idx, mem_gb))
                chunk = {'locus': locus, '__mem_gb': mem_gb,
                         'do_phasing': False}
            else:
                est_het_snps = round(
                    frac_het_snps *
                    count_records(args.input, chrom, start, end))
                # empirical memory usage
                est_gb = np.ceil(0.5 + 400.0 / 1e9 * est_het_snps *
                                 bcs_per_het_snp)
                min_gb = 4
                if np.isnan(est_gb):
                    mem_gb = min_gb
                else:
                    mem_gb = max(min_gb, int(est_gb))
                # mem_gb is an integer; log it like the no-phasing branch.
                martian.log_info(
                    "Chunk %d: Estimated %f het SNPs, requesting %d GB" %
                    (idx, est_het_snps, mem_gb))
                chunk = {'locus': locus, '__mem_gb': mem_gb,
                         'do_phasing': True}
            chunks.append(chunk)
    finally:
        # The BAM is only needed while the chunk list is being built.
        input_bam.close()

    return {'chunks': chunks, 'join': {'__mem_gb': 16, '__threads': 4}}
def main(args, outs):
    """Compute candidate deletions from haplotype coverage, as BEDPE.

    Without a barcode whitelist an empty BEDPE is written. Otherwise, per
    locus, the state path from ``get_candidate_del_loci`` is turned into
    candidates: runs of the "on" state whose length is within
    ``[args.min_del_len, args.max_del_len]`` and that do not overlap a long
    stretch of out-of-range coverage. Runs with total coverage below
    ``MIN_COV`` are added as well. Candidates separated by at most
    ``MIN_GAP`` are merged, the length limits are re-applied, and each
    survivor is written to ``outs.del_candidates`` with ``TYPE=DEL``.
    """
    if args.barcode_whitelist is None:
        # write empty dataframe
        tk_sv_io.write_sv_df_to_bedpe(None, outs.del_candidates)
        martian.log_info('Data seem un-barcoded. No deletion candidates will be computed.')
        return

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    del_loci = []
    try:
        for (chrom, start, stop) in (tk_io.get_locus_info(l)
                                     for l in args.loci):
            cov_df = get_hap_coverage(in_bam, None, chrom, start, stop,
                                      cov_quals=[30])
            best_path = get_candidate_del_loci(
                cov_df, transition_prob=args.transition_prob,
                het_read_prob=args.het_read_prob)

            # Get regions with good coverage for a het del (not too high,
            # not too low)
            bad_cov = np.logical_or(cov_df['total_cov'] < MIN_COV,
                                    cov_df['total_cov'] > MAX_COV)
            bad_regions = tk_regions.Regions([
                (s, e) for (s, e) in group_bit_arr(bad_cov, start)
                if e - s > args.min_bad_region])

            # Group the states of the HMM and exclude bad regions
            pos = start
            out_loci = []
            for bit, group in groupby(best_path):
                group_size = len(list(group))
                group_start = pos
                group_stop = group_start + group_size
                if bit and group_size >= args.min_del_len and \
                        group_size <= args.max_del_len and \
                        not bad_regions.overlapping_regions(group_start,
                                                            group_stop):
                    out_loci.append((chrom, group_start, group_stop))
                pos += group_size

            # Get regions that look like hom dels
            hom_del_loci = group_bit_arr(cov_df['total_cov'] < MIN_COV, start)
            out_loci.extend([(chrom, s, e) for (s, e) in hom_del_loci])
            out_loci = sorted(out_loci)

            # Now merge deletion candidates that are separated by short
            # non-dels
            if out_loci:
                new_out_loci = []
                last_locus = out_loci[0]
                for locus in out_loci[1:]:
                    if locus[1] - last_locus[2] > MIN_GAP:
                        new_out_loci.append(last_locus)
                        last_locus = locus
                    else:
                        last_locus = (last_locus[0],
                                      min(locus[1], last_locus[1]),
                                      max(locus[2], last_locus[2]))
                new_out_loci.append(last_locus)
                del_loci.extend(new_out_loci)
    finally:
        # Release the BAM handle even when a locus fails.
        in_bam.close()

    final_loci = [locus for locus in del_loci
                  if locus[2] - locus[1] >= args.min_del_len and
                  locus[2] - locus[1] <= args.max_del_len]
    info_strs = ['TYPE=DEL' for _ in final_loci]

    chroms = [locus[0] for locus in final_loci]
    # The builtin int replaces the np.int alias, which newer NumPy releases
    # no longer provide; the resulting dtype is the same.
    starts1 = np.array([locus[1] for locus in final_loci], dtype=int)
    starts2 = np.array([locus[2] for locus in final_loci], dtype=int)
    sv_df = tk_sv_io.create_sv_df(chroms, starts1, starts1 + 1,
                                  chroms, starts2, starts2 + 1,
                                  np.arange(len(chroms)), 1,
                                  info_strs = info_strs)
    tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.del_candidates)
def main(args, outs):
    """Write per-haplotype read and fragment coverage of a locus as bedGraph.

    Two families of four tracks are produced: read coverage and
    fragment ("bc") coverage, each for haplotype label 0, 1, 2 and the total.
    Reads take their label from the ``HP`` tag (0 when absent). Fragments
    from ``args.fragment_phasing`` get label 1 when ``h0 >= 0.95``, label 2
    when ``h1 >= 0.95`` (which wins if both hold), and 0 otherwise. Each track
    is discretized by ``Disc`` and written run-length encoded to the matching
    ``outs.hp_read_*`` / ``outs.hp_bc_*`` path, with a trailing ".bw" replaced
    by ".bedGraph".
    """
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    def bedgraph_path(path):
        # str.strip(".bw") removes any of the characters '.', 'b', 'w' from
        # both ends; only the literal suffix should be dropped.
        base = path[:-len(".bw")] if path.endswith(".bw") else path
        return base + ".bedGraph"

    dtypes = {
        '#chrom': "str",
        "frag_start": "int64",
        "frag_end": "int64",
        "h0": "float64",
        "h1": "float64"
    }
    frags = pd.read_csv(args.fragment_phasing, sep="\t", compression='gzip',
                        usecols=["#chrom", "frag_start", "frag_end", "h0", "h1"],
                        dtype=dtypes)
    # Copy the selection so the column edits below act on an independent
    # frame rather than on a slice of the parsed table.
    frags = frags[(frags["#chrom"] == chrom) & (frags["frag_end"] >= start) &
                  (frags["frag_start"] <= stop)].copy()
    frags["hp"] = 0
    frags.loc[frags["h0"] >= 0.95, 'hp'] = 1
    frags.loc[frags["h1"] >= 0.95, 'hp'] = 2
    del frags["h0"]
    del frags["h1"]

    # further split each chunk into regions of 10 kb
    regionSize = 10000
    regionStarts = range(start, stop, regionSize)
    regionEnds = list(regionStarts[1:]) + [stop]

    out_paths = [
        [outs.hp_read_0, outs.hp_read_1, outs.hp_read_2, outs.hp_read_t],
        [outs.hp_bc_0, outs.hp_bc_1, outs.hp_bc_2, outs.hp_bc_t],
    ]

    # read the bam file
    samfile = pysam.Samfile(args.possorted_bam, "rb")
    fouts = [[], []]
    try:
        for kind in range(2):
            for out_path in out_paths[kind]:
                fouts[kind].append(open(bedgraph_path(out_path), 'w'))

        # work with small region at a time to avoid large memory
        for rStart, rEnd in zip(regionStarts, regionEnds):
            frags2 = frags[(frags["frag_end"] >= rStart) &
                           (frags["frag_start"] <= rEnd)]
            # coverage[0] holds read counts, coverage[1] fragment counts;
            # row 3 of each is the total over all haplotype labels.
            coverage = [np.zeros((4, rEnd - rStart)),
                        np.zeros((4, rEnd - rStart))]
            print("\n%s %s" % (rStart, rEnd))

            ## read count
            for r in samfile.fetch(chrom, rStart, rEnd):
                if not r.is_proper_pair:
                    continue
                if r.is_duplicate or (not r.is_paired) or r.is_qcfail or \
                        r.is_secondary or r.is_unmapped or r.mapq < 30 or \
                        r.tid != r.rnext or abs(r.pos - r.pnext) > 5000:
                    continue
                tags = dict(r.tags)
                if "MI" not in tags:
                    continue
                hp = 0
                if "HP" in tags:
                    hp = tags["HP"]
                s = max(rStart, r.pos)
                e = min(rEnd, r.aend)
                if s >= e:
                    continue
                coverage[0][hp][(s - rStart):(e - rStart)] += 1
                coverage[0][3][(s - rStart):(e - rStart)] += 1

            ## bc count
            for _, row in frags2.iterrows():
                s = max(rStart, row["frag_start"])
                e = min(rEnd, row["frag_end"])
                if s >= e:
                    continue
                coverage[1][3][(s - rStart):(e - rStart)] += 1
                coverage[1][row["hp"]][(s - rStart):(e - rStart)] += 1

            # discretization and then print out in the bedGraph format
            for kind in range(2):  ## read and then bc counts
                for hp in range(4):
                    disc_cov = Disc(coverage[kind][hp])
                    # sel[i] is True where the value changes between i and
                    # i + 1, so a new run starts at index i + 1. The index
                    # array is sized to match sel exactly.
                    sel = (disc_cov[:-1] != disc_cov[1:])
                    print(sel.sum())
                    change_points = np.arange(1, len(disc_cov))[sel]
                    boundaries = np.append(0, change_points)
                    boundaries = np.append(boundaries, len(disc_cov))
                    print("%s %s %s" % (disc_cov.size, sel.sum(),
                                        boundaries.size))
                    for i in range(len(boundaries) - 1):
                        fouts[kind][hp].write(
                            "%s\t%d\t%d\t%d\n" %
                            (chrom, boundaries[i] + rStart,
                             boundaries[i + 1] + rStart,
                             disc_cov[boundaries[i]]))
    finally:
        for handles in fouts:
            for handle in handles:
                handle.close()
        samfile.close()
def main(args, outs):
    """Flag heterozygous variants whose barcodes contradict their phasing.

    Nothing is written for mitochondrial loci, nor for Y loci of female
    samples. Otherwise every record of ``args.variants`` inside ``args.locus``
    is copied to ``outs.default`` (path taken without a trailing ".gz").
    Records that already fail filters, have a single allele call, or are
    homozygous are copied unchanged. For the remaining records
    ``barcode_aware_filter`` is consulted, and SNPs that fail it receive the
    ``BARCODE_AWARE_FILTER`` filter -- always in "call" mode, and in
    "precalled_plus" mode only when the record has a ``TENX`` INFO key.
    """
    vc_mode, _, _, _ = tk_io.get_vc_mode(args.vc_precalled, args.vc_mode)
    (chrom, start, stop) = tk_io.get_locus_info(args.locus)
    chrom = str(chrom)

    # A missing sex must not crash on .lower(); it is treated as not female.
    is_female = args.sex is not None and args.sex.lower() in ["f", "female"]
    if chrom in ['chrM', 'MT', 'M'] or (is_female and chrom in ["chrY", "Y"]):
        return

    def strip_gz(path):
        # str.strip(".gz") removes any of the characters '.', 'g', 'z' from
        # both ends; only the literal suffix should be dropped.
        return path[:-len(".gz")] if path.endswith(".gz") else path

    AH_0_BH_0 = (
        'AH_0_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    AH_1_BH_1 = (
        'AH_1_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_0_BH_1 = (
        'AH_0_BH_1', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 0 which are on reads that have support for the allele which has been phased as haplotype 1'
    )
    AH_1_BH_0 = (
        'AH_1_BH_0', '1', 'Integer',
        'Number of barcodes that have been called as supporting haplotype 1 which are on reads that have support for the allele which has been phased as haplotype 0'
    )
    BX_HAP_OR = (
        'BX_HAP_OR', '1', 'Float',
        "Barcode aware haplotype filtering score (log odds ratio currently)")
    BARCODE_AWARE_FILTER = [(
        "BARCODE_AWARE_FILTER",
        "Uses haplotype information from the fragments and the alleles to filter some variants that are not consistent with haplotype (ie variants should have most of their allele haplotype 0 alleles coming from barcodes whose fragments are haplotype 0 etc)"
    )]
    extra_fields = [AH_0_BH_0, AH_1_BH_1, AH_0_BH_1, AH_1_BH_0, BX_HAP_OR]

    fragment_barcode_info = pysam.Tabixfile(args.fragment_phasing)
    try:
        input_variants = tk_io.VariantFileReader(args.variants)
        # The template handle is closed together with the output.
        with open(strip_gz(outs.default), 'w') as output_file, \
                open(args.variants, 'r') as template_file:
            output_variants = tk_io.VariantFileWriter(
                output_file,
                template_file=template_file,
                new_info_fields=extra_fields,
                new_filters=BARCODE_AWARE_FILTER)
            variant_iterator = tk_io.get_variant_iterator_pos(
                input_variants, None, args.locus)
            for record in variant_iterator:
                sample = record.samples[0]
                ref = tk_io.get_record_ref(record)
                alt_alleles = tk_io.get_record_alt_alleles(record)

                if not tk_io.get_record_passes_filters(record):
                    output_variants.write_record(record)
                    continue

                if len(sample.gt_alleles) > 1:
                    genotype_1 = int(sample.gt_alleles[0])
                    genotype_2 = int(sample.gt_alleles[1])
                    if genotype_1 == genotype_2:
                        # homozygous, can't filter this way
                        output_variants.write_record(record)
                        continue
                else:
                    # single allele call, can't filter this way either
                    output_variants.write_record(record)
                    continue

                chrom = tk_io.get_record_chrom(record)
                if not chrom == "chrM":
                    variant_barcode_info = load_variant_barcode_phasing_info(
                        record, fragment_barcode_info)
                    if not barcode_aware_filter(record, variant_barcode_info):
                        if record.FILTER is None:
                            record.FILTER = []
                        # Only SNPs are flagged; for precalled_plus, only
                        # records carrying a TENX INFO key.
                        if tk_io.get_var_type(ref, alt_alleles[0]) == "S" and (
                                (vc_mode == 'call') or
                                (vc_mode == "precalled_plus" and
                                 "TENX" in record.INFO)):
                            record.FILTER.append("BARCODE_AWARE_FILTER")
                output_variants.write_record(record)
    finally:
        fragment_barcode_info.close()