def sort_read_stats_by_read_name(input_file, output_file, tmp_dir):
    """Sort a tab-delimited read stats file by its read name column.

    The sort key of each line is the field at index ``QNAME_COL``.  Sorting
    is delegated to ``batch_sort``, which receives ``tmp_dir`` as its only
    temporary directory.

    :param input_file: path of the unsorted read stats file
    :param output_file: path the sorted lines are written to
    :param tmp_dir: directory handed to ``batch_sort`` via ``tempdirs``
    """
    def read_name_key(record):
        # split no further than needed to isolate the read name column
        columns = record.strip().split('\t', QNAME_COL + 1)
        return columns[QNAME_COL]

    batch_sort(input=input_file,
               output=output_file,
               key=read_name_key,
               buffer_size=32000,
               tempdirs=[tmp_dir])
def sort_read_stats_by_read_name(input_file, output_file, tmp_dir):
    """Write a copy of ``input_file`` sorted by read name to ``output_file``.

    Lines are tab-delimited; the key used for ordering is the column at
    index ``QNAME_COL``.  The actual sorting is performed by ``batch_sort``
    with ``tmp_dir`` passed as its single ``tempdirs`` entry.
    """
    sort_options = {
        "buffer_size": 32000,
        "tempdirs": [tmp_dir],
    }

    def qname_of(text):
        # limit the number of splits: columns past QNAME_COL are not needed
        stripped = text.strip()
        return stripped.split('\t', QNAME_COL + 1)[QNAME_COL]

    batch_sort(input=input_file, output=output_file, key=qname_of,
               **sort_options)
def sort_bedpe(input_file, output_file, tmp_dir):
    """Sort a BEDPE file by paired chromosome/position.

    The key of each tab-delimited line is the tuple of columns
    ``(0, 3, 1, 4)``.  The columns are compared as strings; no numeric
    conversion is applied.  Sorting is delegated to ``batch_sort`` with
    ``tmp_dir`` passed as its single ``tempdirs`` entry.
    """
    def paired_position_key(record):
        cols = record.strip().split('\t')
        return (cols[0], cols[3], cols[1], cols[4])

    batch_sort(input=input_file,
               output=output_file,
               key=paired_position_key,
               buffer_size=32000,
               tempdirs=[tmp_dir])
def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Group chimeras by breakpoint and write breakpoint FASTA and map files.

    ``input_file`` is first sorted by its breakpoint name field into
    ``breakpoint_sorted_chimera_file``.  The sorted file is then parsed with
    ``Chimera.parse`` and, for every distinct breakpoint name, two records
    are written:

    * a FASTA record in ``breakpoint_fasta_file`` (breakpoint name and the
      breakpoint sequence passed through ``split_seq``)
    * a tab-delimited line in ``breakpoint_map_file`` holding the breakpoint
      name, the breakpoint sequence and the comma-separated, sorted names
      of the chimeras sharing that breakpoint

    The sequence stored for a breakpoint is the 5' + 3' breakpoint sequence
    of the first chimera encountered with that breakpoint name.

    :param input_file: chimera file to read
    :param breakpoint_sorted_chimera_file: output path of the sorted chimeras
    :param breakpoint_map_file: output path of the breakpoint -> chimera map
    :param breakpoint_fasta_file: output path of the breakpoint sequences
    :param tmp_dir: directory handed to ``batch_sort`` via ``tempdirs``
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]

    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])

    def write_breakpoint(fastafh, mapfh, name, seq, names):
        # names are always sorted so that every map line, including the
        # one for the final breakpoint, has a deterministic order
        print >>fastafh, ">%s\n%s" % (name, split_seq(seq))
        print >>mapfh, "%s\t%s\t%s" % (name, seq, ",".join(sorted(names)))

    # parse and build breakpoint -> chimera map; the 'with' blocks make
    # sure all three files are closed even when parsing fails
    with open(breakpoint_fasta_file, "w") as fastafh:
        with open(breakpoint_map_file, "w") as mapfh:
            with open(breakpoint_sorted_chimera_file) as chimerafh:
                prev_breakpoint_name = None
                prev_seq = None
                chimera_names = set()
                for c in Chimera.parse(chimerafh):
                    if c.breakpoint_name != prev_breakpoint_name:
                        # breakpoint changed: flush the finished group
                        if chimera_names:
                            write_breakpoint(fastafh, mapfh,
                                             prev_breakpoint_name,
                                             prev_seq, chimera_names)
                            chimera_names = set()
                        prev_seq = (c.breakpoint_seq_5p +
                                    c.breakpoint_seq_3p)
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
                # flush the last group
                if chimera_names:
                    write_breakpoint(fastafh, mapfh, prev_breakpoint_name,
                                     prev_seq, chimera_names)
def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Write breakpoint FASTA and breakpoint -> chimera map files.

    Steps:

    1. sort ``input_file`` by its breakpoint name field into
       ``breakpoint_sorted_chimera_file`` (via ``batch_sort``)
    2. parse the sorted file with ``Chimera.parse`` and collect the names
       of consecutive chimeras that share a breakpoint name
    3. for each breakpoint write a FASTA record to
       ``breakpoint_fasta_file`` and a tab-delimited line
       ``name<TAB>sequence<TAB>comma-separated sorted chimera names`` to
       ``breakpoint_map_file``

    The sequence recorded for a breakpoint is taken from the first chimera
    of its group (5' breakpoint sequence followed by the 3' one).
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]

    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])

    # parse and build breakpoint -> chimera map
    fastafh = open(breakpoint_fasta_file, "w")
    try:
        mapfh = open(breakpoint_map_file, "w")
        try:
            chimerafh = open(breakpoint_sorted_chimera_file)
            try:
                def flush(name, seq, names):
                    # write to fasta
                    print >>fastafh, ">%s\n%s" % (name, split_seq(seq))
                    # write to map file; names are sorted for every group
                    # (the last one included) so the output is reproducible
                    print >>mapfh, "%s\t%s\t%s" % (
                        name, seq, ",".join(sorted(names)))

                prev_breakpoint_name = None
                prev_seq = None
                chimera_names = set()
                for c in Chimera.parse(chimerafh):
                    seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                    if c.breakpoint_name != prev_breakpoint_name:
                        if chimera_names:
                            flush(prev_breakpoint_name, prev_seq,
                                  chimera_names)
                            chimera_names = set()
                        prev_seq = seq
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
                # the final group has no following record to trigger it
                if chimera_names:
                    flush(prev_breakpoint_name, prev_seq, chimera_names)
            finally:
                chimerafh.close()
        finally:
            mapfh.close()
    finally:
        fastafh.close()
def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    """Enumerate pairs of 5'/3' discordant clusters linked by shared reads.

    The cluster-annotated BAM file is sorted by read name.  For every
    read-name group returned by ``parse_pe_reads`` the reads are split into
    5' reads (orientation tag equal to ``ORIENTATION_5P``) and 3' reads
    (everything else), and each 5'/3' combination is recorded as
    ``(5' cluster id, 3' cluster id, read name)``.  These records are
    sorted by the two cluster ids, grouped with
    ``parse_and_group_cluster_pairs`` and written to ``cluster_pair_file``
    as tab-delimited lines ``pair_id, id5p, id3p, comma-separated qnames``
    where ``pair_id`` counts up from 0.

    Temporary files created in ``tmp_dir`` are removed before returning.

    :param discordant_bam_file: BAM file whose reads carry the
        ``ORIENTATION_TAG`` and ``DISCORDANT_CLUSTER_TAG`` tags
    :param cluster_pair_file: output path of the cluster pair table
    :param tmp_dir: directory for intermediate files
    :returns: ``config.JOB_SUCCESS``
    """
    #
    # sort the BAM file that has cluster annotations by read name
    #
    logging.debug("Sorting newly annotated discordant BAM file by read name")
    # Only the base name of the input is used: joining the full input path
    # would discard tmp_dir for absolute paths and point into a directory
    # that does not exist for relative paths containing a directory part.
    bam_root = os.path.splitext(os.path.basename(discordant_bam_file))[0]
    qname_sorted_bam_prefix = os.path.join(tmp_dir, bam_root + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file,
               qname_sorted_bam_prefix)
    #
    # iterate through named-sorted bam file write cluster pairs
    #
    logging.debug("Enumerating cluster pairs")
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    with open(tmp_cluster_file, 'w') as tmp_cluster_fh:
        bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # group into 5' and 3' reads
                reads5p = []
                reads3p = []
                for reads in pe_reads:
                    for r in reads:
                        if r.opt(ORIENTATION_TAG) == ORIENTATION_5P:
                            reads5p.append(r)
                        else:
                            reads3p.append(r)
                # iterate through possible pairs
                for r5p in reads5p:
                    id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                    for r3p in reads3p:
                        id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                        print >>tmp_cluster_fh, '\t'.join(
                            map(str, (id5p, id3p, r5p.qname)))
        finally:
            bamfh.close()
    #
    # sort cluster pairs
    #
    logging.debug("Sorting cluster pairs")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")

    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    batch_sort(input=tmp_cluster_file,
               output=tmp_sorted_cluster_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # write cluster pairs
    #
    logging.debug("Grouping cluster pairs")
    with open(cluster_pair_file, "w") as outfh:
        with open(tmp_sorted_cluster_file) as sortedfh:
            grouped = parse_and_group_cluster_pairs(sortedfh)
            for pair_id, (id5p, id3p, qnames) in enumerate(grouped):
                print >>outfh, '\t'.join(
                    map(str, [pair_id, id5p, id3p, ','.join(qnames)]))
    # remove temporary files
    for tmp_path in (qname_sorted_bam_file, tmp_cluster_file,
                     tmp_sorted_cluster_file):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return config.JOB_SUCCESS
def nominate_single_mapped_spanning_reads(chimera_file,
                                          single_mapped_bam_file,
                                          single_mapped_fastq_file,
                                          tmp_dir):
    """Write a FASTQ file of reads that could cross a chimera breakpoint.

    Single-mapped reads are matched against the chimeras twice using
    ``parse_sync_chimera_with_bam``: once with ``chimera_file`` and
    ``OrientationTags.FIVEPRIME``, and once with a copy of the chimera file
    sorted by 3' transcript name and ``OrientationTags.THREEPRIME``.  For
    every matched read the read name, the opposite read number and the
    values of its ``R2`` and ``Q2`` tags are collected (presumably the
    mate's sequence and qualities, as defined by the SAM optional field
    specification -- confirm against how the BAM file is produced).  The
    collected records are sorted by (read name, read number) and each
    distinct (read name, read number) is written exactly once to
    ``single_mapped_fastq_file`` via ``to_fastq``.

    Intermediate files are created in ``tmp_dir`` and are not removed.

    :returns: ``config.JOB_SUCCESS``
    """
    def write_candidates(outfh, chimera_path, orientation):
        # one tab-delimited line per read matched to a chimera
        for clist, reads in parse_sync_chimera_with_bam(
                chimera_path, single_mapped_bam_file, orientation):
            # TODO: test more specifically that read has a chance to
            # cross breakpoint
            for r in reads:
                # reverse read number
                readnum = 1 if r.is_read1 else 0
                print >>outfh, '\t'.join(
                    map(str, [r.qname, readnum, r.opt("R2"), r.opt("Q2")]))

    def sort_by_3p_partner(line):
        fields = line.strip().split('\t', Chimera.TX_NAME_3P_FIELD + 1)
        return fields[Chimera.TX_NAME_3P_FIELD]

    def sort_by_qname(line):
        fields = line.strip().split('\t')
        return (fields[0], int(fields[1]))

    # find sequences that could cross a breakpoint
    tmp_seqs_to_remap = os.path.join(tmp_dir, "tmp_singlemap_seqs.txt")
    tmp_chimera_file_sorted_3p = os.path.join(tmp_dir,
                                              "tmp_chimeras.sorted3p.bedpe")
    with open(tmp_seqs_to_remap, "w") as f:
        # search for matches to 5' chimeras
        logging.debug("Matching single-mapped frags to 5' chimeras")
        write_candidates(f, chimera_file, OrientationTags.FIVEPRIME)
        # sort chimeras by 3' partner
        logging.debug("Sorting chimeras by 3' transcript")
        batch_sort(input=chimera_file,
                   output=tmp_chimera_file_sorted_3p,
                   key=sort_by_3p_partner,
                   buffer_size=32000,
                   tempdirs=[tmp_dir])
        # search for matches to 3' chimeras
        logging.debug("Matching single-mapped frags to 3' chimeras")
        write_candidates(f, tmp_chimera_file_sorted_3p,
                         OrientationTags.THREEPRIME)
    #
    # now sort the file of sequences by read name/number to
    # eliminate duplicates
    #
    tmp_sorted_seqs_to_remap = os.path.join(tmp_dir,
                                            "tmp_singlemap_seqs.sorted.txt")
    batch_sort(input=tmp_seqs_to_remap,
               output=tmp_sorted_seqs_to_remap,
               key=sort_by_qname,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # read file and write fastq, ignoring duplicates
    #
    with open(single_mapped_fastq_file, "w") as fqfh:
        with open(tmp_sorted_seqs_to_remap) as seqfh:
            prev = None
            for line in seqfh:
                fields = line.strip().split('\t')
                qname, readnum = fields[0], int(fields[1])
                seq, qual = fields[2], fields[3]
                cur = (qname, readnum)
                # the input is sorted, so duplicates are adjacent; emit a
                # record the first time its key is seen.  Writing on a key
                # change *and* after the loop would drop the first read
                # and duplicate the last one.
                if cur == prev:
                    continue
                print >>fqfh, to_fastq(qname, readnum, seq, qual)
                prev = cur
    # TODO: remove temporary files (tmp_chimera_file_sorted_3p,
    # tmp_seqs_to_remap, tmp_sorted_seqs_to_remap)
    return config.JOB_SUCCESS
def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    """Build the table of 5'/3' discordant cluster pairs.

    Processing steps:

    1. sort ``discordant_bam_file`` by read name into ``tmp_dir``
    2. for each read-name group from ``parse_pe_reads``, separate reads
       whose ``ORIENTATION_TAG`` equals ``ORIENTATION_5P`` from all other
       reads and write one line ``id5p, id3p, qname`` per 5'/3'
       combination, where the ids are the reads'
       ``DISCORDANT_CLUSTER_TAG`` values
    3. sort those lines by ``(id5p, id3p)``
    4. group them with ``parse_and_group_cluster_pairs`` and write
       ``pair_id, id5p, id3p, comma-separated qnames`` lines to
       ``cluster_pair_file`` (``pair_id`` starts at 0)
    5. delete the intermediate files

    :returns: ``config.JOB_SUCCESS``
    """
    #
    # sort the BAM file that has cluster annotations by read name
    #
    logging.debug("Sorting newly annotated discordant BAM file by read name")
    # Strip the directory part of the input path before joining: with the
    # full path os.path.join() ignores tmp_dir when the input is absolute
    # and produces a non-existent nested path when it is relative.
    bam_name = os.path.basename(discordant_bam_file)
    qname_sorted_bam_prefix = os.path.join(
        tmp_dir, os.path.splitext(bam_name)[0] + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file,
               qname_sorted_bam_prefix)
    #
    # iterate through named-sorted bam file write cluster pairs
    #
    logging.debug("Enumerating cluster pairs")
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    tmp_cluster_fh = open(tmp_cluster_file, 'w')
    try:
        bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # group into 5' and 3' reads
                reads5p = []
                reads3p = []
                for reads in pe_reads:
                    for r in reads:
                        orientation = r.opt(ORIENTATION_TAG)
                        if orientation == ORIENTATION_5P:
                            reads5p.append(r)
                        else:
                            reads3p.append(r)
                # iterate through possible pairs
                for r5p in reads5p:
                    for r3p in reads3p:
                        id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                        id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                        print >>tmp_cluster_fh, '\t'.join(
                            map(str, (id5p, id3p, r5p.qname)))
        finally:
            bamfh.close()
    finally:
        tmp_cluster_fh.close()
    #
    # sort cluster pairs
    #
    logging.debug("Sorting cluster pairs")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")

    def sortfunc(line):
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    batch_sort(input=tmp_cluster_file,
               output=tmp_sorted_cluster_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # write cluster pairs
    #
    logging.debug("Grouping cluster pairs")
    pair_id = 0
    outfh = open(cluster_pair_file, "w")
    try:
        sorted_fh = open(tmp_sorted_cluster_file)
        try:
            for id5p, id3p, qnames in \
                    parse_and_group_cluster_pairs(sorted_fh):
                print >>outfh, '\t'.join(
                    map(str, [pair_id, id5p, id3p, ','.join(qnames)]))
                pair_id += 1
        finally:
            sorted_fh.close()
    finally:
        outfh.close()
    # remove temporary files
    if os.path.exists(qname_sorted_bam_file):
        os.remove(qname_sorted_bam_file)
    if os.path.exists(tmp_cluster_file):
        os.remove(tmp_cluster_file)
    if os.path.exists(tmp_sorted_cluster_file):
        os.remove(tmp_sorted_cluster_file)
    return config.JOB_SUCCESS
def resolve_discordant_reads(input_file, output_file, isize_dist,
                             min_isize_prob, tmp_dir):
    """Keep only the best-scoring alignments of each read in every chimera.

    Read statistics for the chimeras in ``input_file`` are produced by
    ``make_discordant_read_stats_file`` and sorted by read name.  For each
    read only the records with the highest ``score_tuple`` are kept.  The
    surviving records are re-sorted by chimera name, the chimeras are
    sorted by name, and both streams are walked in step with
    ``parse_sync_chimeras_read_stats``.  For every chimera the encompassing
    fragments are reduced to those whose
    ``(qname, tid5p, pos5p, tid3p, pos3p)`` matches a surviving record with
    ``isize_prob`` not below ``min_isize_prob``; its score is reset to
    ``get_num_frags()`` and it is written to ``output_file``.

    Intermediate files are created in ``tmp_dir`` and are not removed.

    :param input_file: chimera file to resolve
    :param output_file: path of the rewritten chimera file
    :param isize_dist: passed through to ``make_discordant_read_stats_file``
    :param min_isize_prob: minimum ``isize_prob`` of a usable alignment
    :param tmp_dir: directory for intermediate files
    """
    #
    # parse chimeras and output reads to a file
    #
    logging.debug("Getting discordant read information")
    read_stats_file = os.path.join(tmp_dir, "read_stats.txt")
    make_discordant_read_stats_file(input_file, read_stats_file, isize_dist)
    #
    # now sort the read/chimera stats list
    #
    logging.debug("Sorting reads by read name")
    sorted_read_stats_file = os.path.join(tmp_dir,
                                          "read_stats.rname_sorted.txt")
    sort_read_stats_by_read_name(read_stats_file, sorted_read_stats_file,
                                 tmp_dir)
    #
    # parse reads by read name
    #
    logging.debug("Choosing best read groups")
    resolved_read_stats_file = os.path.join(
        tmp_dir, "read_stats.rname_sorted.resolved.txt")
    with open(resolved_read_stats_file, "w") as f:
        with open(sorted_read_stats_file) as statsfh:
            for rname, readstats in group_by_attr(
                    ChimeraStats.parse(statsfh), 'qname'):
                # build a dictionary of stats -> read/chimeras
                stats_dict = collections.defaultdict(list)
                for s in readstats:
                    stats_dict[s.score_tuple].append(s)
                # use only the best key; only the maximum is needed, so
                # there is no reason to sort all of the keys
                for s in stats_dict[max(stats_dict)]:
                    # output read -> chimera relationships
                    print >>f, '\t'.join(map(str, s.to_list()))
    #
    # re-sort by chimera name
    #
    logging.debug("Resorting reads by chimera name")

    def sort_reads_by_chimera_name(line):
        return line.strip().split('\t',
                                  CHIMERA_NAME_COL + 1)[CHIMERA_NAME_COL]

    sorted_resolved_read_stats_file = os.path.join(
        tmp_dir, "read_stats.chimera_name_sorted.resolved.txt")
    batch_sort(input=resolved_read_stats_file,
               output=sorted_resolved_read_stats_file,
               key=sort_reads_by_chimera_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    logging.debug("Resorting chimeras by name")

    def sort_chimeras_by_name(line):
        return line.strip().split('\t',
                                  Chimera.NAME_FIELD + 1)[Chimera.NAME_FIELD]

    sorted_chimera_file = os.path.join(tmp_dir,
                                       "spanning_chimeras.name_sorted.txt")
    batch_sort(input=input_file,
               output=sorted_chimera_file,
               key=sort_chimeras_by_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # parse and rebuild chimeras based on best reads
    #
    logging.debug("Rewriting chimeras with lists of 'best' reads")
    with open(output_file, "w") as f:
        # need to sync chimeras with stats
        for c, stats in parse_sync_chimeras_read_stats(
                sorted_chimera_file, sorted_resolved_read_stats_file):
            # parse and make lookup set of the resolved alignments
            good_alignments = set()
            for s in stats:
                if s.isize_prob < min_isize_prob:
                    continue
                good_alignments.add((s.qname, s.tid5p, s.pos5p,
                                     s.tid3p, s.pos3p))
            # replace encompassing frags with resolved alignments
            new_encomp_frags = []
            for dpair in c.encomp_frags:
                # get alignment tuple
                aln = (dpair[0].qname, dpair[0].tid, dpair[0].pos,
                       dpair[1].tid, dpair[1].pos)
                if aln in good_alignments:
                    new_encomp_frags.append(dpair)
            c.encomp_frags = new_encomp_frags
            c.score = c.get_num_frags()
            print >>f, '\t'.join(map(str, c.to_list()))
def resolve_discordant_reads(input_file, output_file, isize_dist,
                             min_isize_prob, tmp_dir):
    """Rewrite chimeras so they keep only each read's best alignments.

    Pipeline:

    1. ``make_discordant_read_stats_file`` writes per-read statistics for
       the chimeras in ``input_file``
    2. the statistics are sorted by read name and, per read, only the
       records sharing the largest ``score_tuple`` are retained
    3. the retained records are sorted by chimera name and the chimeras in
       ``input_file`` are sorted by name
    4. ``parse_sync_chimeras_read_stats`` pairs each chimera with its
       records; encompassing fragments are kept only when their
       ``(qname, tid5p, pos5p, tid3p, pos3p)`` tuple belongs to a record
       whose ``isize_prob`` is not below ``min_isize_prob``
    5. the chimera score is set to ``get_num_frags()`` and the chimera is
       written to ``output_file``

    All intermediate files live in ``tmp_dir`` and are left in place.
    """
    #
    # parse chimeras and output reads to a file
    #
    logging.debug("Getting discordant read information")
    read_stats_file = os.path.join(tmp_dir, "read_stats.txt")
    make_discordant_read_stats_file(input_file, read_stats_file, isize_dist)
    #
    # now sort the read/chimera stats list
    #
    logging.debug("Sorting reads by read name")
    sorted_read_stats_file = os.path.join(tmp_dir,
                                          "read_stats.rname_sorted.txt")
    sort_read_stats_by_read_name(read_stats_file, sorted_read_stats_file,
                                 tmp_dir)
    #
    # parse reads by read name
    #
    logging.debug("Choosing best read groups")
    resolved_read_stats_file = os.path.join(
        tmp_dir, "read_stats.rname_sorted.resolved.txt")
    f = open(resolved_read_stats_file, "w")
    try:
        # keep a reference to the input handle so it can be closed instead
        # of being left to the garbage collector
        statsfh = open(sorted_read_stats_file)
        try:
            for rname, readstats in group_by_attr(
                    ChimeraStats.parse(statsfh), 'qname'):
                # build a dictionary of stats -> read/chimeras
                stats_dict = collections.defaultdict(list)
                for s in readstats:
                    # add key/value pairs
                    stats_dict[s.score_tuple].append(s)
                # use only the best key
                best_key = max(stats_dict)
                for s in stats_dict[best_key]:
                    # output read -> chimera relationships
                    print >>f, '\t'.join(map(str, s.to_list()))
        finally:
            statsfh.close()
    finally:
        f.close()
    #
    # re-sort by chimera name
    #
    logging.debug("Resorting reads by chimera name")

    def sort_reads_by_chimera_name(line):
        return line.strip().split('\t',
                                  CHIMERA_NAME_COL + 1)[CHIMERA_NAME_COL]

    sorted_resolved_read_stats_file = os.path.join(
        tmp_dir, "read_stats.chimera_name_sorted.resolved.txt")
    batch_sort(input=resolved_read_stats_file,
               output=sorted_resolved_read_stats_file,
               key=sort_reads_by_chimera_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    logging.debug("Resorting chimeras by name")

    def sort_chimeras_by_name(line):
        return line.strip().split('\t',
                                  Chimera.NAME_FIELD + 1)[Chimera.NAME_FIELD]

    sorted_chimera_file = os.path.join(tmp_dir,
                                       "spanning_chimeras.name_sorted.txt")
    batch_sort(input=input_file,
               output=sorted_chimera_file,
               key=sort_chimeras_by_name,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # parse and rebuild chimeras based on best reads
    #
    logging.debug("Rewriting chimeras with lists of 'best' reads")
    f = open(output_file, "w")
    try:
        # need to sync chimeras with stats
        for c, stats in parse_sync_chimeras_read_stats(
                sorted_chimera_file, sorted_resolved_read_stats_file):
            # parse and make lookup set of the resolved alignments
            good_alignments = set()
            for s in stats:
                if s.isize_prob < min_isize_prob:
                    continue
                good_alignments.add((s.qname, s.tid5p, s.pos5p,
                                     s.tid3p, s.pos3p))
            # replace encompassing frags with resolved alignments
            new_encomp_frags = []
            for dpair in c.encomp_frags:
                # get alignment tuple
                aln = (dpair[0].qname, dpair[0].tid, dpair[0].pos,
                       dpair[1].tid, dpair[1].pos)
                if aln in good_alignments:
                    new_encomp_frags.append(dpair)
            c.encomp_frags = new_encomp_frags
            c.score = c.get_num_frags()
            print >>f, '\t'.join(map(str, c.to_list()))
    finally:
        f.close()
def nominate_single_mapped_spanning_reads(chimera_file,
                                          single_mapped_bam_file,
                                          single_mapped_fastq_file,
                                          tmp_dir):
    """Collect candidate breakpoint-spanning reads into a FASTQ file.

    Reads from ``single_mapped_bam_file`` are matched to chimeras with
    ``parse_sync_chimera_with_bam``, first against ``chimera_file`` with
    ``OrientationTags.FIVEPRIME`` and then against a copy of the chimera
    file sorted by 3' transcript name with ``OrientationTags.THREEPRIME``.
    Each match contributes a record made of the read name, the opposite
    read number (1 for read1, 0 otherwise) and the read's ``R2`` / ``Q2``
    tag values.  NOTE(review): by the SAM specification those tags carry
    the mate's sequence and base qualities -- confirm that the upstream
    step fills them that way.

    The records are sorted by (read name, read number) and one FASTQ entry
    per distinct key is written to ``single_mapped_fastq_file`` using
    ``to_fastq``.  Intermediate files in ``tmp_dir`` are not deleted.

    :returns: ``config.JOB_SUCCESS``
    """
    # find sequences that could cross a breakpoint
    tmp_seqs_to_remap = os.path.join(tmp_dir, "tmp_singlemap_seqs.txt")
    f = open(tmp_seqs_to_remap, "w")
    try:
        # search for matches to 5' chimeras
        logging.debug("Matching single-mapped frags to 5' chimeras")
        for clist, reads in parse_sync_chimera_with_bam(
                chimera_file, single_mapped_bam_file,
                OrientationTags.FIVEPRIME):
            # TODO: test more specifically that read has a chance to
            # cross breakpoint
            for r in reads:
                # reverse read number
                readnum = 1 if r.is_read1 else 0
                print >>f, '\t'.join(
                    map(str, [r.qname, readnum, r.opt("R2"), r.opt("Q2")]))
        # sort chimeras by 3' partner
        logging.debug("Sorting chimeras by 3' transcript")

        def sort_by_3p_partner(line):
            fields = line.strip().split('\t', Chimera.TX_NAME_3P_FIELD + 1)
            return fields[Chimera.TX_NAME_3P_FIELD]

        tmp_chimera_file_sorted_3p = os.path.join(
            tmp_dir, "tmp_chimeras.sorted3p.bedpe")
        batch_sort(input=chimera_file,
                   output=tmp_chimera_file_sorted_3p,
                   key=sort_by_3p_partner,
                   buffer_size=32000,
                   tempdirs=[tmp_dir])
        # search for matches to 3' chimeras
        logging.debug("Matching single-mapped frags to 3' chimeras")
        for clist, reads in parse_sync_chimera_with_bam(
                tmp_chimera_file_sorted_3p, single_mapped_bam_file,
                OrientationTags.THREEPRIME):
            # TODO: test more specifically that read has a chance to
            # cross breakpoint
            for r in reads:
                # reverse read number
                readnum = 1 if r.is_read1 else 0
                print >>f, '\t'.join(
                    map(str, [r.qname, readnum, r.opt("R2"), r.opt("Q2")]))
    finally:
        f.close()
    #
    # now sort the file of sequences by read name/number to
    # eliminate duplicates
    #
    def sort_by_qname(line):
        fields = line.strip().split('\t')
        return (fields[0], int(fields[1]))

    tmp_sorted_seqs_to_remap = os.path.join(tmp_dir,
                                            "tmp_singlemap_seqs.sorted.txt")
    batch_sort(input=tmp_seqs_to_remap,
               output=tmp_sorted_seqs_to_remap,
               key=sort_by_qname,
               buffer_size=32000,
               tempdirs=[tmp_dir])
    #
    # read file and write fastq, ignoring duplicates
    #
    fqfh = open(single_mapped_fastq_file, "w")
    try:
        seqfh = open(tmp_sorted_seqs_to_remap)
        try:
            prev = None
            for line in seqfh:
                fields = line.strip().split('\t')
                qname, readnum, seq, qual = (fields[0], int(fields[1]),
                                             fields[2], fields[3])
                cur = (qname, readnum)
                # Duplicates are adjacent after sorting, so a record is
                # written when its key first appears.  Deferring the write
                # to the next key change would emit the *following*
                # record's fields, losing the first read and repeating the
                # last one.
                if cur != prev:
                    print >>fqfh, to_fastq(qname, readnum, seq, qual)
                    prev = cur
        finally:
            seqfh.close()
    finally:
        fqfh.close()
    # TODO: remove temporary files (tmp_chimera_file_sorted_3p,
    # tmp_seqs_to_remap, tmp_sorted_seqs_to_remap)
    return config.JOB_SUCCESS