def parse(self):
    idx = HTSeq.GenomicArrayOfSets("auto", stranded=False)

    if self.gtf_file:  # can be None if no GTF file is provided
        log.info("Loading " + self.gtf_file)
        gtf_file = HTSeq.GFF_Reader(self.gtf_file, end_included=True)

        n = 0
        for feature in gtf_file:
            if feature.type == "gene":
                if 'gene_name' in feature.attr:
                    name = feature.attr['gene_name']
                elif 'Name' in feature.attr:
                    name = feature.attr['Name']
                elif 'gene' in feature.attr:
                    name = feature.attr['gene']
                else:
                    name = feature.name

                if feature.iv.chrom[0:3] == 'chr':
                    feature.iv.chrom = feature.iv.chrom[3:]

                idx[feature.iv] += name
                n += 1

        log.info("Loaded " + str(n) + " features")

    return idx
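
# Illustrative sketch (not part of the original module): how the unstranded
# GenomicArrayOfSets returned by parse() can be queried for overlapping gene
# names. The region and the expected gene name are hypothetical examples.
def _example_query_gene_index(idx):
    region = HTSeq.GenomicInterval('21', 42836000, 42903000, '.')
    genes = set()
    for _iv, names in idx[region].steps():  # steps() yields (interval, set) pairs
        genes |= names
    return genes  # e.g. {'TMPRSS2'} if such a gene overlaps the region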
def add_regions_from_bed(self, regions_bed_file):
    log.info("Parsing regions blacklist file: " + str(regions_bed_file))

    header = True
    with open(regions_bed_file, 'r') as fh:
        for line in fh:
            if not header:
                params = line.strip("\t\n ").split("\t")
                if len(params) > 1:
                    for i in [1, 2]:
                        params[i] = int(params[i])

                    d = params[2] - params[1]
                    if d < 1:
                        raise ValueError("Too small region (starts are 0-based, ends are 1-based, like BED):\n" + line)

                    if len(params) >= 5:
                        self.add_region(params[0], params[1], params[2], params[3], params[4])
                    else:
                        self.add_region(params[0], params[1], params[2], params[3], None)
            else:
                header = False

    log.info("Added " + str(self.r) + " regions to the blacklist")
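
# Illustrative sketch (hypothetical data): writes a minimal region-blacklist
# file in the format add_regions_from_bed() expects: one header line, then
# tab-separated chrom/start/end/strand[/label], with 0-based starts and
# 1-based ends (BED-like), so end - start must be >= 1.
def _example_write_region_blacklist(path):
    with open(path, 'w') as fh:
        fh.write("chrom\tstart\tend\tstrand\treason\n")
        fh.write("1\t121167645\t121185225\t-\trRNA-repeat\n")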
def add_junctions_from_file(self, junction_file):
    log.info("Parsing junction blacklist file: " + str(junction_file))

    header = True
    with open(junction_file, 'r') as fh:
        for line in fh:
            if not header:
                params = line.strip().split("\t")
                if len(params) > 1:
                    for i in [1, 2, 5, 6]:
                        params[i] = int(params[i])

                    d1 = params[2] - params[1]
                    d2 = params[6] - params[5]
                    if d1 < 1 or d2 < 1:
                        raise ValueError("Too small junction (starts are 0-based, ends are 1-based, like BED):\n" + line)

                    # order the two regions deterministically: lowest chromosome
                    # (and, on ties, lowest start position) first
                    if (params[4] < params[0]) or (params[0] == params[4] and params[5] < params[1]):
                        reg1 = (params[4], params[5], params[6], params[7])
                        reg2 = (params[0], params[1], params[2], params[3])
                    else:
                        reg1 = (params[0], params[1], params[2], params[3])
                        reg2 = (params[4], params[5], params[6], params[7])

                    if len(params) >= 9:
                        self.add_junction(reg1, reg2, params[8])
                    else:
                        self.add_junction(reg1, reg2, None)
            else:
                header = False

    log.info("Added " + str(self.j) + " junctions to the blacklist")
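
# Illustrative sketch (hypothetical data): a junction-blacklist line pairs two
# BED-like regions on one row: chromA, startA, endA, strandA, chromB, startB,
# endB, strandB, plus an optional ninth label column.
def _example_write_junction_blacklist(path):
    with open(path, 'w') as fh:
        fh.write("chromA\tstartA\tendA\tstrandA\tchromB\tstartB\tendB\tstrandB\treason\n")
        fh.write("7\t98985000\t98987000\t+\t7\t99013000\t99015000\t-\tread-through\n")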
def integrate(self, output_table, gtf_file, fasta_file):
    log.info("Integrating results")

    def insert_in_index(index, entries, score, i):
        if score not in index:
            index[score] = {}

        key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + \
            entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')|' + str(i)
        index[score][key] = entries

    with open(output_table, 'w') as fh_out:
        header = self.header.split("\t")
        header = "\t".join(header[:-5] + [
            'full-gene-dysregulation', 'frameshift=0', 'frameshift=+1',
            'frameshift=+2', 'splice-motif-edit-distance',
            "exons from (5')", "exons to (3')"
        ] + header[-5:])
        fh_out.write("shared-id\tfusion\t" + header)

        # index used to find duplicates
        self.idx = HTSeq.GenomicArrayOfSets("auto", stranded=True)

        # index used to annotate gene names: TMPRSS2->ERG
        gene_annotation = GeneAnnotation(gtf_file)
        dfs = DetectFrameShifts(gtf_file) if gtf_file else None
        ffs = Fasta(fasta_file) if fasta_file else None

        intronic_linear = []
        remainder = []

        def insert(pos, e):
            if pos[0][0:3] == 'chr':
                chrom = pos[0][3:]
            else:
                chrom = pos[0]

            # position_accession = HTSeq.GenomicPosition(pos[0], pos[1], pos[2])
            position_accession = HTSeq.GenomicInterval(chrom, pos[1], pos[1] + 1, pos[2])
            position = self.idx[position_accession]
            position += e

        # Find 'duplicates', i.e. fusions that belong to the same event
        log.info("Searching for intronic and exonic breaks that belong to the same event")
        for e in self:
            if dfs and e.RNAstrandA != '.' and e.RNAstrandB != '.':
                done_breaks = set([])

                if e.donorA > e.donorB:
                    exons_from, exons_to, frame_shifts = dfs.evaluate([e.chrA, e.posA, e.RNAstrandA], [e.chrB, e.posB, e.RNAstrandB], 2)
                else:
                    exons_from, exons_to, frame_shifts = dfs.evaluate([e.chrB, e.posB, e.RNAstrandB], [e.chrA, e.posA, e.RNAstrandA], 2)

                done_breaks.add(e.chrA + ':' + str(e.posA) + '/' + str(e.posA + 1) + '(' + e.strandA + ')->' +
                                e.chrB + ':' + str(e.posB) + '/' + str(e.posB + 1) + '(' + e.strandB + ')')

                fgd = [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
                frameshifts_0 = [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
                frameshifts_1 = [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
                frameshifts_2 = [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]

                for additional_breaks in e.structure.split('&'):
                    if additional_breaks != '':
                        params = additional_breaks.split(':(')
                        n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])

                        posAB = params[0].split(':')
                        posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])

                        if params[0] not in done_breaks and n_split_reads > 0:
                            if e.donorA > e.donorB:  # same swap rule as above
                                exons_from_, exons_to_, frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
                            else:
                                exons_from_, exons_to_, frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)

                            exons_from += exons_from_
                            exons_to += exons_to_
                            del (exons_from_, exons_to_)

                            fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
                            frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
                            frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
                            frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]

                        done_breaks.add(params[0])

                e.exons_from = sorted(list(set(exons_from)))
                e.exons_to = sorted(list(set(exons_to)))
                del (exons_from, exons_to)

                e.fgd = ','.join(sorted(list(set(fgd))))
                e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
                e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
                e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))
                del (fgd, frameshifts_0, frameshifts_1, frameshifts_2)

            if ffs:
                e.is_on_splice_junction_motif(ffs)

            if e.x_onic == 'intronic' and e.circ_lin == 'linear':
                intronic_linear.append(e)
            else:
                remainder.append(e)

            insert((e.chrA, e.posA, e.strandA), e)
            insert((e.chrB, e.posB, e.strandB), e)

        if ffs is not None:
            ffs.close()

        # Reorder
        log.info("Re-order and find matching entries")

        idx2 = {}
        q = 0
        for e in intronic_linear:
            results_split = [set([]), set([])]
            positions = [(e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB)]

            for pos_i in [0, 1]:
                pos = positions[pos_i]

                # scan up to 200 kb from the break, relative to strand
                if pos[2] == '-':
                    pos1 = pos[1] - 200000
                    pos2 = pos[1]
                else:
                    pos1 = pos[1]
                    pos2 = pos[1] + 200000

                if pos[0][0:3] == 'chr':
                    chrom = pos[0][3:]
                else:
                    chrom = pos[0]

                for step in self.idx[HTSeq.GenomicInterval(chrom, max(0, pos1), pos2, pos[2])].steps():
                    for e2 in [_ for _ in step[1] if _ != e]:
                        if e2.strandA == e.strandA and e2.strandB == e.strandB:
                            results_split[pos_i].add(e2)

            results = results_split[0].intersection(results_split[1])

            top_result = (None, 9999999999999)
            for r in results:
                d1 = (r.posA - e.posA)
                d2 = (r.posB - e.posB)
                sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))
                shared_score = math.sqrt((pow(e.score, 2) + pow(r.score, 2)) * 0.5)
                penalty = 1.0 * sq_d / shared_score
                if penalty < top_result[1]:
                    top_result = (r, penalty)

            if top_result[0]:
                insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q)
            else:
                insert_in_index(idx2, [e], e.score, q)
            q += 1

        for e in remainder:
            insert_in_index(idx2, [e], e.score, q)
            q += 1

        log.info("Determining fusion gene names and generating output")

        # Generate output
        i = 1
        exported = set([])
        for score in sorted(idx2.keys(), reverse=True):
            for key in sorted(idx2[score].keys()):
                added = 0
                for entry in idx2[score][key]:
                    if entry not in exported:
                        acceptors_donors = entry.get_donors_acceptors(gene_annotation)
                        line = entry.line[:-5] + [
                            entry.fgd, entry.frameshift_0, entry.frameshift_1,
                            entry.frameshift_2, entry.edit_dist_to_splice_motif,
                            ",".join(entry.exons_from), ",".join(entry.exons_to)
                        ] + entry.line[-5:]

                        fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n")
                        exported.add(entry)
                        added += 1

                if added > 0:
                    i += 1
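
# Illustrative sketch (not part of the original module): the pairing penalty
# used in integrate(), isolated as a standalone function. Two candidate
# breakpoint pairs are matched by the Euclidean distance between their
# positions, scaled by the quadratic mean of their scores, so high-scoring
# events tolerate larger distances. The argument values are hypothetical.
def _example_pairing_penalty(posA1, posB1, score1, posA2, posB2, score2):
    sq_d = math.sqrt((posA2 - posA1) ** 2 + (posB2 - posB1) ** 2)
    shared_score = math.sqrt((score1 ** 2 + score2 ** 2) * 0.5)
    return sq_d / shared_score  # lower penalty = better match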
def classify(self, output_file, only_valid, blacklist, min_chim_overhang, ffpe_mismatch_ratio):
    log.info("Loading " + output_file + " [only_valid=" + {True: 'true', False: 'false'}[only_valid] + "]")

    n = 0
    k = 0
    with open(output_file, 'w') as fh:
        fh.write(str(self.get_header()))
        for e in self:
            if isinstance(e, str):
                fh.write(e)
            else:
                def classify_intronic_exonic():
                    n_edges_max = int(round(0.00575 * e.score + 5.75, 0))
                    if e.n_edges >= n_edges_max:
                        e.x_onic = 'exonic'

                classify_intronic_exonic()

                status = []
                n += 1

                # all_entropy_min_f1 = 0.705 + (math.atan((e.score - 150.0) * 0.005) * 0.035)
                # all_entropy_min_f2 = -0.1375 * math.tanh((e.score - 2250.0) / 900.0) + 0.6175
                all_entropy_min = -0.26 * math.tanh((e.score - 25.0) / 20.0) + 0.6225
                all_entropy_max = -1.0 * (max(e.score, 171) - 175.0) / (5.0 + max(e.score, 171) - 175.0) + (1.0 + 0.965)
                if e.entropy_all_edges < all_entropy_min:
                    status.append("entropy=" + str(e.entropy_bp_edge) + '<' + str(round(all_entropy_min, 4)))
                if e.entropy_all_edges > all_entropy_max:
                    status.append("entropy=" + str(e.entropy_bp_edge) + '>' + str(round(all_entropy_max, 4)))

                # @todo subfunc
                n_disco_min = int(round(pow(((e.n_nodes - 2) * 0.22), 1.7)))
                if e.n_discordant_reads < n_disco_min:
                    status.append("n_discordant_reads=" + str(e.n_discordant_reads) + "<" + str(n_disco_min))

                # @todo subfunc
                n_support_min = (0.215 * pow(max(0, e.n_nodes) - 1.0, 1.59)) + 6.5
                n_support_min = int(round(n_support_min))
                if e.n_supporting_reads < n_support_min:
                    status.append("n_support=" + str(e.n_supporting_reads) + "<" + str(n_support_min))

                # @todo subfunc
                n_disco_max = int(round(math.pow(22.0 * e.n_split_reads, 0.9) + 13))
                if e.n_split_reads < 100:
                    n_disco_min = int(round(math.pow(0.0195 * e.n_split_reads, 1.95)))
                elif e.n_split_reads >= 100 and e.n_split_reads < 125:
                    n_disco_min = 4
                elif e.n_split_reads >= 125 and e.n_split_reads < 325:
                    n_disco_min = int(round((0.135 * (e.n_split_reads - 200.0)) + 14.0))
                else:
                    n_disco_min = int(round(30.875 + (e.n_split_reads - 325) * 0.024))

                if e.n_discordant_reads > n_disco_max:
                    status.append("n_disco=" + str(e.n_discordant_reads) + ">" + str(n_disco_max))
                if e.n_discordant_reads < n_disco_min:
                    status.append("n_disco=" + str(e.n_discordant_reads) + "<" + str(n_disco_min))

                # @todo subfunc
                n_split_min = int(round((0.32 * e.n_supporting_reads) - pow((0.1 * e.n_supporting_reads), 1.15) - 4.0))
                if e.n_supporting_reads < 385:
                    n_split_max = int(round((0.986 * e.n_supporting_reads) - pow(0.00535 * e.n_supporting_reads, 3.99 - ((1.0 / 15000.0) * e.n_supporting_reads))))
                else:
                    n_split_max = int(round(0.94 * e.n_supporting_reads))
                if e.n_split_reads < n_split_min:
                    status.append("n_split=" + str(e.n_split_reads) + "<" + str(n_split_min))
                if e.n_split_reads > n_split_max:
                    status.append("n_split=" + str(e.n_split_reads) + ">" + str(n_split_max))

                # @todo subfunc
                slope = 51.0
                bp_pos_stddev_max = -(slope * e.nodes_edge) + 15 + (2.0 * slope)
                if e.bp_pos_stddev > bp_pos_stddev_max:
                    status.append("bp_pos_stddev=" + str(e.bp_pos_stddev) + ">" + str(bp_pos_stddev_max))

                # @todo subfunc
                clips_min = (0.19 * e.score) - 25.0
                clips_max = (0.84 * e.score) + 550.0
                if e.clips < clips_min:
                    status.append("clips=" + str(e.clips) + "<" + str(clips_min))
                if e.clips > clips_max:
                    status.append("clips=" + str(e.clips) + ">" + str(clips_max))

                # @todo subfunc
                blacklisted = blacklist.is_blacklisted((e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB))
                if len(blacklisted) > 0:
                    status.append("blacklist=" + '&'.join(blacklisted))

                # @todo subfunc
                log_ratio_slope_max = (3.6 / 2)
                log_ratio_rvalue_max = (0.8 / 2)
                log_ratio_slope = abs(math.log((e.lr_A_slope + 0.0001) / (e.lr_B_slope + 0.0001)))
                log_ratio_rvalue = abs(math.log((e.lr_A_rvalue + 0.0001) / (e.lr_B_rvalue + 0.0001)))
                if log_ratio_slope > log_ratio_slope_max:
                    status.append("log_ratio_slope=" + str(round(log_ratio_slope, 2)) + ">" + str(round(log_ratio_slope_max, 2)))
                if log_ratio_rvalue > log_ratio_rvalue_max:
                    status.append("log_ratio_rvalue=" + str(round(log_ratio_rvalue, 2)) + ">" + str(round(log_ratio_rvalue_max, 2)))

                # @todo subfunc
                # FFPE material tends to have a substantially higher number of mismatches
                # per base, though they are randomly distributed. If a v2 of dr-disco ever
                # incorporates the concordant reads, this variable could be determined by
                # some kind of calibration. For now we only estimate the mismatch ratio,
                # without per-position entropy:
                #
                # [CGCGCTATATCTCGATCGCCCTTAGAGATCCTTTCGAGAGAGCTCTAGAGCG]  some kind of reference sequence
                #  CGCG*TATAT*TC          TTTC*AGAGAGCT*TAG     randomly dispersed mismatches
                #  CGCG*TATAT*TCGAT       TTCGAGAG*GCTCT        (right-hand example) are more
                #   GCG*TATAT*TCG         T*CGAGAGAG*TCTA       trustworthy than recurrent ones
                #   GCG*TATAT*TCGA        TTCG*GAGAGCTCTA
                #    CG*TATAT*TCGAT       TTCGAG*GAGCTCTAG
                #     G*TATAT*TCG         TCGA*AGA*CTCT
                if ffpe_mismatch_ratio:
                    log_value_max = -6.45 - ((e.score + 6750.0) / (4400.0 - (e.score + 6750.0)))
                else:
                    log_value_max = -4.7

                log_value = math.log((float(e.mismatches) + 0.0000001) / float(e.alignment_score))
                if log_value > log_value_max:
                    status.append("many_muts=" + str(round(log_value, 2)) + ">" + str(round(log_value_max, 2)))

                # @todo subfunc
                lr_a = e.lr_A_pvalue * e.lr_A_intercept
                lr_b = e.lr_A_pvalue * e.lr_B_intercept
                lr_symmetry_max = -e.score / (0.11 + (0.0246 * (e.score))) + 41
                n_lr_symmetry = pow(pow(lr_a, 2) + pow(lr_b, 2), 0.5)
                if n_lr_symmetry >= lr_symmetry_max:
                    status.append("n_lr_symmetry=" + str(round(n_lr_symmetry, 2)) + ">=" + str(round(lr_symmetry_max, 2)))

                # @todo subfunc
                chim_overhang = min(e.break_A_max_AS, e.break_B_max_AS)
                if chim_overhang < min_chim_overhang:
                    status.append("chim_overhang=" + str(chim_overhang) + "<" + str(min_chim_overhang))

                # @todo subfunc
                if e.score <= 150:
                    lr_intercept_max = (-31.0 * ((e.score + 100.0) / (1800.0 + e.score + 100.0))) + 85.5
                else:
                    lr_intercept_max = ((e.score - 150.0) * 0.0225) + 81.71951

                if e.lr_A_intercept > lr_intercept_max:
                    status.append("lr_A_intercept=" + str(e.lr_A_intercept) + ">" + "{:.12g}".format(lr_intercept_max))
                if e.lr_B_intercept > lr_intercept_max:
                    status.append("lr_B_intercept=" + str(e.lr_B_intercept) + ">" + "{:.12g}".format(lr_intercept_max))

                # @todo subfunc
                sqrt_entropy_bps_ab = pow(pow(e.entropy_disco_bps_A, 2) + pow(e.entropy_disco_bps_B, 2), 0.5)
                if e.entropy_all_edges <= 0.85:
                    sqrt_entropy_bps_ab_max = 0.475
                else:
                    sqrt_entropy_bps_ab_max = 3.4 * e.entropy_all_edges - 2.415
                if sqrt_entropy_bps_ab > sqrt_entropy_bps_ab_max:
                    status.append("sqrt_entropy_bps_ab=" + "{:.12g}".format(sqrt_entropy_bps_ab) + ">" + str(round(sqrt_entropy_bps_ab_max, 5)))

                if len(status) == 0:
                    e.status = 'valid'
                    fh.write(str(e))
                    k += 1
                elif not only_valid:
                    e.status = ','.join(status)
                    fh.write(str(e))

    log.info("Classified " + str(k) + "/" + str(n) + " as valid")
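
# Illustrative sketch (not part of the original module): the score-dependent
# entropy window applied in classify(), isolated so it can be inspected or
# plotted. The constants are the same as above; the example score is arbitrary.
def _example_entropy_window(score):
    ent_min = -0.26 * math.tanh((score - 25.0) / 20.0) + 0.6225
    ent_max = -1.0 * (max(score, 171) - 175.0) / (5.0 + max(score, 171) - 175.0) + (1.0 + 0.965)
    return ent_min, ent_max  # e.g. _example_entropy_window(500) -> (~0.36, ~0.98)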
def index_gtf(self):
    """
    The GTF file must have CDS entries for coding sequences, and each CDS
    entry must have:
     - a source
     - a gene_name attribute
     - a transcript_id attribute
     - a transcript_version attribute
     - an exon_number attribute

    GTF files provided by Ensembl satisfy these requirements.
    """
    log.info("Loading GTF file " + self.gtf_file + " for protein frameshift analysis")

    def load_gtf_per_transcript():
        transcript_idx = {}

        gtf_file = HTSeq.GFF_Reader(self.gtf_file, end_included=True)
        for feature in gtf_file:
            gtf_type = feature.type.lower()
            if gtf_type in ['cds', 'exon']:
                try:
                    transcript_id = feature.attr['gene_name'] + '(' + feature.attr['transcript_id'] + '.' + feature.attr['transcript_version'] + ')-' + feature.source
                    if transcript_id not in transcript_idx:
                        transcript_idx[transcript_id] = {}

                    exon_number = int(feature.attr['exon_number'])
                    if exon_number not in transcript_idx[transcript_id]:
                        transcript_idx[transcript_id][exon_number] = {'exon': None, 'cds': None}

                    if gtf_type in ['exon', 'cds']:
                        transcript_idx[transcript_id][exon_number][gtf_type] = feature
                except KeyError:
                    log.warn("Warning: the GTF file lacks certain attributes (gene_name, transcript_id or transcript_version); skipping frameshift detection. Ensembl GTF files are known to be compatible.")
                    # there is no GFF_Reader.close(), so break out instead:
                    break

        return transcript_idx

    def insert_transcript_idx(transcript_idx):
        def clean_chrom(chrom):
            if chrom[0:3] == 'chr':
                return chrom[3:]
            else:
                return chrom

        def calc_from(feature):
            if feature.iv.strand == '+':
                return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom), feature.iv.end, feature.iv.end + 1, feature.iv.strand)
            elif feature.iv.strand == '-':
                return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom), feature.iv.start, feature.iv.start + 1, feature.iv.strand)

        def calc_to(feature):
            if feature.iv.strand == '+':
                return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom), feature.iv.start, feature.iv.start + 1, feature.iv.strand)
            elif feature.iv.strand == '-':
                return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom), feature.iv.end, feature.iv.end + 1, feature.iv.strand)

        """
        @todo change to:
        for exon in exons:
            if there is a CDS with a matching exon id:
                if there is no stop codon, or this exon id is the last exon id:
                    last_coding_exon = exon_id
            if there is a start_codon with the same exon id:
                first_coding_exon = exon_id

        - see if this is pre-coding
          - add to pre-coding 'from' idx
          - add to pre-coding 'to' list
          - when fusing into this exon, transcription still has to start
        - see if this is the first coding exon
          - add to normal 'from' list
          - add to pre-coding 'to' list
        - see if this is an in-between coding exon
          - add to normal 'from' list
          - add to normal 'to' list
        - see if this is the last coding exon
          - add to normal 'to' list
        """
        for transcript_id in transcript_idx:
            coding = "pre"
            cumulative_offset = 0

            exon_ids = sorted(transcript_idx[transcript_id].keys())
            for e in exon_ids:
                exon = transcript_idx[transcript_id][e]
                if coding == "pre":
                    if exon['cds'] is None:
                        # - pre coding; distances are not relevant
                        self.gene_annotation_to_fgd[calc_to(exon['exon'])] += (transcript_id)
                        self.gene_annotation_from_fgd[calc_from(exon['exon'])] += (transcript_id)
                    else:
                        # - first coding
                        length = (exon['cds'].iv.end - exon['cds'].iv.start) + cumulative_offset
                        off1 = length % 3
                        off2 = -length % 3

                        self.gene_annotation_from[calc_from(exon['exon'])] += (transcript_id, off1)
                        self.gene_annotation_to_fgd[calc_to(exon['exon'])] += (transcript_id)

                        cumulative_offset = off1
                        coding = True
                elif coding is True:
                    if e == exon_ids[-1] or transcript_idx[transcript_id][e + 1]['cds'] is None:
                        # - last coding
                        self.gene_annotation_to[calc_to(exon['exon'])] += (transcript_id, off2)
                        coding = "post"
                    else:
                        # - middle coding
                        self.gene_annotation_to[calc_to(exon['exon'])] += (transcript_id, off2)

                        length = (exon['cds'].iv.end - exon['cds'].iv.start) + cumulative_offset
                        off1 = length % 3
                        off2 = -length % 3

                        self.gene_annotation_from[calc_from(exon['exon'])] += (transcript_id, off1)

                        cumulative_offset = off1
                # else:
                #     - post coding

    transcript_idx = load_gtf_per_transcript()
    insert_transcript_idx(transcript_idx)
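
# Illustrative sketch (not part of the original module): how the per-exon frame
# offsets above behave. For a cumulative coding length `length`, off1 is how
# many bases spill into the next codon, and off2 is how many bases are still
# needed to complete the current codon. The example length is hypothetical.
def _example_frame_offsets(length):
    off1 = length % 3   # e.g. length=125 -> off1=2
    off2 = -length % 3  # e.g. length=125 -> off2=1 (125 + 1 is divisible by 3)
    return off1, off2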
def convert(self, bam_file_discordant_fixed, temp_dir):
    basename, ext = os.path.splitext(os.path.basename(self.input_alignment_file))
    basename = temp_dir.rstrip("/") + "/" + basename

    # @TODO / consider todo - start straight from sam
    # samtools view -bS samples/7046-004-041_discordant.Chimeric.out.sam > samples/7046-004-041_discordant.Chimeric.out.unsorted.bam

    log.info("Convert into a name-sorted bam file, to get all reads with the same name adjacent to each other")
    pysam.sort("-o", basename + ".name-sorted.bam", "-n", self.input_alignment_file)

    log.info("Fixing sam file")
    sam_file_discordant = pysam.AlignmentFile(basename + ".name-sorted.bam", "rb")
    header = sam_file_discordant.header
    header['RG'] = []
    header['PG'] = []
    fh = pysam.AlignmentFile(basename + ".name-sorted.fixed.sam", "wb", header=header)
    for read in sam_file_discordant:
        tag = read.get_tag('RG')
        if tag in ['spanning_singleton_1', 'spanning_singleton_1_r', 'spanning_singleton_2', 'spanning_singleton_2_r']:
            read.is_paired = False
            read.is_read1 = False
            read.is_read2 = False
            read.next_reference_id = None
            read.next_reference_start = None
            read.set_tag('RG', None)
            read.set_tag('SA', None)
            read.set_tag('FI', None)
            read.set_tag('LB', None)
        fh.write(read)
    fh.close()

    log.info("Converting fixed file into BAM")
    fhq = open(basename + ".name-sorted.fixed.bam", "wb")
    fhq.write(pysam.view('-bS', basename + ".name-sorted.fixed.sam"))
    fhq.close()

    log.info("Sorting position based fixed file")
    pysam.sort("-o", basename + ".sorted.fixed.bam", basename + ".name-sorted.fixed.bam")

    log.info("Indexing the position sorted bam file")
    pysam.index(basename + ".sorted.fixed.bam")

    log.info("Cleaning up temp files")
    for fname in [basename + ".name-sorted.bam", basename + ".name-sorted.fixed.sam", basename + ".name-sorted.fixed.bam"]:
        log.debug("=> " + fname)
        os.remove(fname)

    log.info("Moving to final destination")
    shutil.move(basename + ".sorted.fixed.bam", bam_file_discordant_fixed)
    shutil.move(basename + ".sorted.fixed.bam" + ".bai", bam_file_discordant_fixed + ".bai")
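
# Illustrative usage sketch (hypothetical paths and class name): convert() turns
# a chimeric SAM/BAM alignment into a position-sorted, indexed BAM with
# normalized flags and tags. Only the temp-file workflow above is taken from
# the source; `ChimericAlignment` is an assumed wrapper class name.
#
#   alignment = ChimericAlignment("samples/sample_discordant.Chimeric.out.bam")
#   alignment.convert("sample_discordant.fixed.bam", "/tmp")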
def convert(self, bam_file_discordant_fixed, temp_dir):
    def randstr(n):
        return ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(n))

    h = hashlib.new('sha256')
    h.update(str_to_bytearray(self.input_alignment_file))
    uid = h.hexdigest() + randstr(24)

    basename, ext = os.path.splitext(os.path.basename(self.input_alignment_file))
    basename = temp_dir.rstrip("/") + "/" + basename + '-' + uid

    # @TODO / consider todo - start straight from sam
    # samtools view -bS samples/7046-004-041_discordant.Chimeric.out.sam > samples/7046-004-041_discordant.Chimeric.out.unsorted.bam

    log.info("Convert into a name-sorted bam file, to get all reads with the same name adjacent to each other")
    pysam.sort("-o", basename + ".name-sorted.bam", "-n", self.input_alignment_file)

    log.info("Fixing sam file")
    sam_file_discordant = pysam.AlignmentFile(basename + ".name-sorted.bam", "rb")
    header = sam_file_discordant.header.to_dict()
    header['RG'] = [{
        'ID': 'discordant_mates',
        'DS': 'This read has a discordant mate pair'
    }, {
        'ID': 'silent_mate',
        'DS': 'Reads of this type are not discordant while their mate is'
    }, {
        'ID': 'spanning_paired_1',
        'DS': 'This read was aligned to two locations and also has an aligned mate'
    }, {
        'ID': 'spanning_paired_1_r',
        'DS': 'This read was aligned to two locations and also has an aligned mate (strand type r)'
    }, {
        'ID': 'spanning_paired_1_s',
        'DS': 'This read was aligned to two locations and also has an aligned mate (strand type s)'
    }, {
        'ID': 'spanning_paired_1_t',
        'DS': 'This read was aligned to two locations and also has an aligned mate (strand type t)'
    }, {
        'ID': 'spanning_paired_2',
        'DS': 'This read was aligned to two locations and also has an aligned mate'
    }, {
        'ID': 'spanning_paired_2_r',
        'DS': 'This read was aligned to two locations and also has an aligned mate (strand type r)'
    }, {
        'ID': 'spanning_paired_2_s',
        'DS': 'This read was aligned to two locations and also has an aligned mate (strand type s)'
    }, {
        'ID': 'spanning_paired_2_t',
        'DS': 'This read was aligned to two locations and also has an aligned mate (strand type t)'
    }, {
        'ID': 'spanning_singleton_1',
        'DS': 'This read was aligned to two locations but has no aligned mate'
    }, {
        'ID': 'spanning_singleton_1_r',
        'DS': 'This read was aligned to two locations but has no aligned mate'
    }, {
        'ID': 'spanning_singleton_2',
        'DS': 'This read was aligned to two locations but has no aligned mate'
    }, {
        'ID': 'spanning_singleton_2_r',
        'DS': 'This read was aligned to two locations but has no aligned mate'
    }]
    header['PG'] = [{
        'ID': 'drdisco_fix_chimeric',
        'PN': 'drdisco fix-chimeric',
        'CL': '',
        'VN': __version__
    }]

    fh = pysam.AlignmentFile(basename + ".name-sorted.fixed.sam", "wb", header=header)
    last_read_name = False
    alignments = []
    for read in sam_file_discordant:
        if read.qname != last_read_name:
            if len(alignments) > 0:
                self.reconstruct_alignments(alignments, sam_file_discordant, fh)
            alignments = []
            last_read_name = read.qname
        alignments.append(read)

    if len(alignments) > 0:
        self.reconstruct_alignments(alignments, sam_file_discordant, fh)
    else:
        os.remove(basename + ".name-sorted.bam")
        os.remove(basename + ".name-sorted.fixed.sam")
        err = "No reads were found, fixing empty sam/bam file: " + self.input_alignment_file
        log.error(err)
        raise Exception(err)
    fh.close()

    log.info("Converting fixed file into BAM")
    fhq = open(basename + ".name-sorted.fixed.bam", "wb")
    fhq.write(pysam.view('-bS', basename + ".name-sorted.fixed.sam"))
    fhq.close()

    log.info("Sorting position based fixed file")
    pysam.sort("-o", basename + ".sorted.fixed.bam", basename + ".name-sorted.fixed.bam")

    log.info("Indexing the position sorted bam file")
    pysam.index(basename + ".sorted.fixed.bam")

    log.info("Cleaning up temp files")
    for fname in [basename + ".name-sorted.bam", basename + ".name-sorted.fixed.sam", basename + ".name-sorted.fixed.bam"]:
        log.debug("=> " + fname)
        os.remove(fname)

    log.info("Moving to final destination")
    shutil.move(basename + ".sorted.fixed.bam", bam_file_discordant_fixed)
    shutil.move(basename + ".sorted.fixed.bam" + ".bai", bam_file_discordant_fixed + ".bai")