def preprocess_bam_to_bed(bam, output):
    """Convert a local BAM file into a de-duplicated BED file of intervals.

    Steps, in order:

    1. Convert the BAM to BED with pybedtools.
    2. Keep only reads for which ``is_valid_interval(start, end)`` is truthy
       (processed in chunks so that large files fit in memory).
    3. Overwrite ``end`` with ``define_interval(start)`` -- the 101bp input
       window according to the log message; the helper lives elsewhere --
       and set ``name`` to ``"-"``.
    4. Drop duplicate rows.
    5. Keep only chr1-chr22, chrX and chrY.
    6. Write the result to ``output`` and clean up pybedtools temp files.

    Args:
        bam: Path to a local BAM file.
        output: Destination path of the resulting BED file.
    """
    # convert bam to bed
    vprint("Converting bam to bed...")
    # Keep the caller's path in `bam`: it is reused below as the progress-bar
    # label, which must be a string rather than a BedTool object.
    bam_tool = BedTool(bam)
    bed = bam_tool.bam_to_bed()

    # filter intervals
    vprint("Filter reads by size...")
    bed_chunk_iter = bed.to_dataframe(chunksize=10000000)  # chunk large file
    chunks = []
    for chunk in bed_chunk_iter:
        keep = (
            chunk[["start", "end"]]
            .swifter.progress_bar(enable=True, desc=str(bam))
            .apply(lambda row: is_valid_interval(row["start"], row["end"]), axis=1)
        )
        chunks.append(chunk[keep])
    bed_df = pd.concat(chunks)

    # 101bp interval for input
    vprint("Define 101bp intervals...")
    bed_df["end"] = (
        bed_df["start"].swifter.progress_bar(enable=True).apply(define_interval)
    )
    bed_df["name"] = "-"

    # remove duplicates
    vprint("Drop duplicate intervals...")
    bed_df.drop_duplicates(inplace=True)

    # TODO extraneous chromosomes?
    vprint("Remove extra chromosomes...")
    chromosomes = [f'chr{c}' for c in [*range(1, 23), 'X', 'Y']]
    bed_df = bed_df.loc[bed_df['chrom'].isin(chromosomes)]

    # Save result
    vprint(f"Saving {bed_df.shape[0]} intervals...")
    BedTool.from_dataframe(bed_df).moveto(output)

    # cleanup tmp files
    # NOTE(review): remove_all=True appears to also delete pybedtools temp
    # files left by other sessions sharing the temp dir -- confirm this is
    # acceptable where the function is run concurrently.
    pybedtools.cleanup(remove_all=True)

    vprint("Done.")
# Convert FASTQ to dict
# `fq` is read four entries at a time: entry 0 is the read name, entry 1 the
# sequence and entry 3 the quality string (entry 2 is ignored). A trailing
# incomplete record is never stored.
fq_dict = {}
for record_start in range(0, len(fq) - 3, 4):
    read_name = fq[record_start]
    dna = fq[record_start + 1]
    qual = fq[record_start + 3]
    # Record one read into dict
    fq_dict[read_name] = (dna, qual)
read_names = list(fq_dict)

# Convert miniBAM to BED
U1U11 = BedTool(bed_path)
bam = BedTool(bam_tmp)
reads_as_bed = bam.bam_to_bed()
bed = reads_as_bed.intersect(U1U11, wa=True, wb=True).to_dataframe()

# Names are compared against FASTQ read names, hence the '@' prefix.
bed['SELF_NAME'] = '@' + bed['name']
name_parts = [self_name.split('/') for self_name in bed['SELF_NAME']]
bed['QNAME'] = [parts[0] for parts in name_parts]  # QNAME as in BAM
bed['ORDER'] = [parts[1] for parts in name_parts]  # First or second in pair

# Add read names for mates
is_first = bed['ORDER'] == '1'
is_second = bed['ORDER'] == '2'
bed.loc[is_first, 'MATE_NAME'] = bed.loc[is_first, 'QNAME'] + '/2'
bed.loc[is_second, 'MATE_NAME'] = bed.loc[is_second, 'QNAME'] + '/1'

# Add some True/False indicators
bed['SELF'] = bed['SELF_NAME'].isin(read_names)  # is self in FASTQ?
bed['MATE'] = bed['MATE_NAME'].isin(read_names)  # is mate in FASTQ?
# Whether the pair can be found in the FASTQ
bed['PAIRED'] = np.logical_and(bed['SELF'], bed['MATE'])

# Remove reads that are not in FASTQ
# Input
# Exactly four positional arguments are required; anything else prints the
# usage string to stderr and exits with status 1.
if len(sys.argv) != 5:
    sys.stderr.write(
        'Incorrect arguments. Usage: postprocess_realign.py realign_BAM original_BAM out_prefix U1_bed'
    )
    sys.exit(1)

# realnBAM:   bowtie2 re-aligned bam (rbam for short)
# oriBAM:     original miniBAM (obam)
# out_prefix: prefix for output files
# U1U11:      U1U11 w/ pseudo genes
realnBAM, oriBAM, out_prefix, U1U11 = sys.argv[1:5]

####
# Remove non-optimal multiple alignments
####
rbam_BT = BedTool(realnBAM)  # rbam in BedTool
rbed_DF = rbam_BT.bam_to_bed(tag='AS').to_dataframe()

# Bowtie2 end-to-end returns negative scores so bedtools add 256 to the AS
# for negative values. Therefore, 0 score should actually be fixed to 256
is_zero_score = rbed_DF.score == 0
rbed_DF.loc[is_zero_score, 'score'] = 256

# Only keep multi-alignments with max AS per read:
# build the (name, best score) pairs, then select exactly those rows from a
# frame indexed by (name, score).
max_score = rbed_DF.pivot_table(
    index='name', values='score', aggfunc=max
).reset_index()
keep = max_score.set_index(['name', 'score']).index.values.tolist()
rbed_DF = rbed_DF.set_index(['name', 'score']).sort_index()
rbed_DF = rbed_DF.loc[keep, ].reset_index()
rbed_DF = rbed_DF.loc[:, ('chrom', 'start', 'end', 'name', 'score', 'strand')]

# number of times that each read aligns
multimap_ct = rbed_DF['name'].value_counts()

# Create the filtered version of rbed and intersect with core U1U11
rbed_BT = BedTool.from_dataframe(rbed_DF).intersect(U1U11, wa=True, wb=True)
def determine_coverage(bedfile,bamfile,target_site_mutations): #bams and bed through bedtools target_sites = BedTool(bedfile) bambam = BedTool(bamfile) bambed = bambam.bam_to_bed().merge() covered_ts = target_sites.intersect(bambed) non_covered_ts = target_sites.subtract(bambed) site_summary = {} """ Get states for sites associated with each chrom. """ for bed,state in zip([non_covered_ts,covered_ts],['no_coverage','no_mutation']): for line in bed: # break out parts of bed. # bed must have 5 columns. chrom, start,stop,name,strand = line if chrom not in site_summary: site_summary[chrom] = {} assert name not in site_summary[chrom],textwrap.dedent("""\ {n} is a repeated site name on {c}. All site names must be unique. Please modify the name in your target site bed file. Sites should be named <gene><3 letter AA symbol><ref. AA position> Examples: alsGly121 epspsLys201 """.format(n=name,c=chrom)) site_summary[chrom][name]=[int(start),state] # give start position and False for if its covered by reads. #read in scored target_site mutations. f = open(target_site_mutations) tsm = f.read().rstrip().split('\n') if len(tsm) >1 : #only these will have TSMs for line in tsm[1::]: sample, aa_sub, chrom,pos,codon_pos,ref,alt = line.split('\t') key_name = aa_sub[:-3] assert chrom in site_summary,textwrap.dedent("""\ {c} appears in target site mutations file: {tsm} but not in the supplied bed file: {b} """.format(c=chrom,tsm=target_site_mutations, b=bedfile)) assert key_name in site_summary[chrom],textwrap.dedent("""\ {c} appears to be referenced in target site mutations file: {tsm} but not in the supplied bed file: {b} It may be that amino acid symbol is incorrectly formatted. this program requires a 3 letter symbol, e.g. Pro for proline. 
The error was produced by slicing the last 3 items off this string: {s} """.format(c=key_name,tsm=target_site_mutations, b=bedfile,s=aa_sub)) site_summary[chrom][key_name][-1] =textwrap.dedent("""\ {aa} DNA:{ref}\u2192{alt} codon_pos:{c} """.format(aa=aa_sub[-9:: ], ref=ref,alt=alt,c=codon_pos) ) print(site_summary[chrom][key_name][-1]) """ for chrom in site_summary:
"""
Make read x gene matrix for weight calculation

Reads a bowtie2 re-aligned and filtered BAM, intersects its alignments with
the U1/U11 annotation BED, and writes ``<out_prefix>.mstat.csv``: one row per
read, one column per intersected gene holding the number of alignments, plus
``num_align`` (total alignments of the read), ``U1U11_map`` (alignments
falling in the listed U1/U11 genes) and ``Other`` (1 if the read also aligns
outside those genes, else 0).
"""
#!/usr/bin/env python
# coding: utf-8

import os
import sys

import numpy as np
import pandas as pd
import pysam
from pybedtools import BedTool

# Input
if len(sys.argv) == 4:
    bam = sys.argv[1]  # bowtie2 re-aligned & filtered bam (rbam for short)
    out_prefix = sys.argv[2]
    U1U11 = sys.argv[3]  # U1U11 w/ pseudo genes
else:
    sys.stderr.write('Incorrect arguments. Usage: postprocess_realign.py realign_BAM out_prefix U1_bed')
    sys.exit(1)

rbam_BT = BedTool(bam)  # rbam in BedTool
rbed_BT = rbam_BT.bam_to_bed(ed=True)
rbed_DF = rbed_BT.to_dataframe()

# Intersect with U1U11
rbed_U1U11 = rbed_BT.intersect(U1U11, wa=True, wb=True).to_dataframe(
    names=('chrom', 'start', 'end', 'name', 'score', 'strand',
           'chrom2', 'start2', 'end2', 'gene', 'score2', 'strand2'))

# Make read x gene matrix
multimap_ct = rbed_DF.name.value_counts()  # number of times that each read aligns
ct_mat = rbed_U1U11.pivot_table(index='name', columns='gene', values='start', aggfunc=len)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
ct_mat = ct_mat.fillna(0).astype(int)
ct_mat['num_align'] = multimap_ct[ct_mat.index]
ct_mat['U1U11_map'] = ct_mat.loc[:, ('RNU1-1', 'RNU1-2', 'RNU1-27P', 'RNU1-28P',
                                     'RNU1-3', 'RNU1-4', 'RNVU1-18', 'RNU11')].sum(axis=1)
# A read with more alignments than U1/U11 hits also maps somewhere else.
ct_mat['Other'] = np.where(ct_mat.num_align > ct_mat.U1U11_map, 1, 0)
ct_mat.to_csv(out_prefix + '.mstat.csv')
realnBAM = sys.argv[1] # bowtie2 re-aligned bam (rbam for short) out_prefix = sys.argv[2] else: sys.stderr.write( 'Incorrect arguments. Usage: postprocess_realign.py realign_BAM out_prefix' ) sys.exit(1) #### # Remove non-optimal multiple alignments #### rbam_BT = BedTool(realnBAM) # rbam in BedTool cnames = [ 'chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'name', 'ed', 'strand1', 'strand2' ] rbed_DF = pd.read_table(rbam_BT.bam_to_bed(ed=True, mate1=True, bedpe=True).fn, header=None, names=cnames) rbed_DF = rbed_DF.loc[rbed_DF['chrom1'] != '.'] rbed_DF = rbed_DF.loc[rbed_DF['chrom2'] != '.'] # Only keep multi-alignments with min ED per pair min_ed = rbed_DF.pivot_table(index='name', values='ed', aggfunc=min).reset_index() rbed_DF = rbed_DF.set_index(['name', 'ed']).sort_index() keep = min_ed.set_index(['name', 'ed']).index.values.tolist() rbed_DF = rbed_DF.loc[keep, ].reset_index() # bedpe to BED mate1 = rbed_DF.loc[:, ('chrom1', 'start1', 'end1')] mate1['name'] = rbed_DF['name'] + '/1' mate2 = rbed_DF.loc[:, ('chrom2', 'start2', 'end2')] mate2['name'] = rbed_DF['name'] + '/2'
fn_sorted = tmpdir + "/sorted.bam" fn_fixedmates = tmpdir + "/fixedmates.bam" # sort by id logging.debug("calling samtools sort") pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix") # fix mate information # also removes secondary and unmapped reads logging.debug("calling samtools fixmates") pysam.fixmate("-r", fn_sorted, fn_fixedmates) # bedtools bam2bed alns = BedTool(fn_fixedmates) alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True) # determine alignment ends and write to file with (open(args.outfile, "w") if args.outfile is not None else stdout) as out: for i in alns_bedpe: chrom = i.fields[0] fmstart = i.fields[1] fmend = i.fields[2] smstart = i.fields[4] smend = i.fields[5] readid = i.fields[6] score = i.fields[7] fmstrand = i.fields[8] if fmstrand == "+": start = fmstart end = smend