def write_output(transcripts, cluster_shelve_file, cluster_pair_file,
                 read_name_file, output_file, annotation_source="ensembl"):
    """Write one chimera record per discordant cluster pair to output_file.

    The output starts with a header line ('#' followed by the tab-joined
    ``Chimera._fields``) and then contains ``str(chimera)`` on its own line
    for every cluster pair parsed from ``cluster_pair_file``.

    Args:
        transcripts: passed to ``build_genome_transcript_trees`` to build the
            genome-coordinate -> transcript lookup structures.
        cluster_shelve_file: path of the cluster shelve database (opened
            read-only).
        cluster_pair_file: path of the discordant cluster pair file.
        read_name_file: path of the read name file. It is opened for reading
            but its contents are not used by this function.
        output_file: path of the file to (over)write.
        annotation_source: forwarded unchanged to ``make_chimera``.

    Returns:
        ``config.JOB_SUCCESS``.

    All handles are released even when building a chimera or writing fails.
    """
    # load cluster and read name database files
    cluster_shelve = shelve.open(cluster_shelve_file, 'r')
    try:
        # NOTE(review): the read name file is never read here; it is still
        # opened so that a missing/unreadable path fails exactly as before.
        with open(read_name_file, 'r'):
            # map genome coordinates to transcripts
            logging.debug(
                "Creating mapping between genome coordinates and transcripts")
            transcript_dict, genome_tx_trees = build_genome_transcript_trees(
                transcripts)
            logging.debug("Writing output")
            with open(output_file, "w") as outfh:
                # file.write(...) + '\n' emits the same bytes as the Python 2
                # 'print >>fh' statement and also works on Python 3
                outfh.write('#' + '\t'.join(Chimera._fields) + '\n')
                with open(cluster_pair_file) as pair_fh:
                    for cluster_pair in \
                            parse_discordant_cluster_pair_file(pair_fh):
                        c = make_chimera(cluster_pair, cluster_shelve,
                                         transcript_dict, genome_tx_trees,
                                         annotation_source)
                        outfh.write(str(c) + '\n')
    finally:
        # shelve objects are not context managers on Python 2
        cluster_shelve.close()
    return config.JOB_SUCCESS
def process_spanning_alignments(cluster_shelve_file, cluster_pair_file,
                                bam_file, output_sam_file,
                                output_cluster_pair_file,
                                local_anchor_length):
    """Annotate cluster pairs with the names of breakpoint-spanning reads.

    Walks the breakpoint alignments in ``bam_file`` grouped by cluster pair
    (via ``_parse_bam_by_cluster_pair``) in step with the records of
    ``cluster_pair_file``. For every cluster pair a tab-delimited line
    ``pair_id, id5p, id3p, qnames, spanning_qnames`` is written to
    ``output_cluster_pair_file``; the last column is empty for pairs without
    alignments in the BAM file. The read pairs returned by
    ``nominate_spanning_reads`` are written to ``output_sam_file``.

    Args:
        cluster_shelve_file: path of the cluster shelve database (opened
            read-only).
        cluster_pair_file: path of the input discordant cluster pair file.
        bam_file: path of the BAM file with breakpoint alignments.
        output_sam_file: path of the SAM file (with header) to write,
            using ``bam_file`` as the header template.
        output_cluster_pair_file: path of the annotated cluster pair file.
        local_anchor_length: forwarded to ``nominate_spanning_reads``.

    Returns:
        ``config.JOB_SUCCESS``.

    Raises:
        StopIteration: if the BAM file yields a pair id that is not found in
            the remaining cluster pair records (presumably both inputs are
            expected to be in the same pair order -- confirm with caller).
    """
    def write_pair(outfh, cluster_pair, spanning_qnames):
        # one output record; an empty spanning_qnames gives an empty column
        fields = [cluster_pair.pair_id, cluster_pair.id5p,
                  cluster_pair.id3p, ','.join(cluster_pair.qnames),
                  ','.join(spanning_qnames)]
        # write(... + '\n') matches the Python 2 'print >>fh' output
        outfh.write('\t'.join(map(str, fields)) + '\n')

    # every opened handle is registered here so that all of them are
    # released in the finally clause, even if processing fails midway
    handles = []
    try:
        # load cluster database file
        cluster_shelve = shelve.open(cluster_shelve_file, 'r')
        handles.append(cluster_shelve)
        # parse breakpoint alignments and output spanning reads
        bamfh = pysam.Samfile(bam_file, "rb")
        handles.append(bamfh)
        outsamfh = pysam.Samfile(output_sam_file, "wh", template=bamfh)
        handles.append(outsamfh)
        outfh = open(output_cluster_pair_file, "w")
        handles.append(outfh)
        pair_fh = open(cluster_pair_file)
        handles.append(pair_fh)
        cluster_pair_iter = parse_discordant_cluster_pair_file(pair_fh)
        # get cluster reads from BAM file
        num_spanning_reads = 0
        for pair_id, cluster_reads in _parse_bam_by_cluster_pair(bamfh):
            # synch with cluster pair file; next() works on Python 2.6+ and 3
            cluster_pair = next(cluster_pair_iter)
            while pair_id != cluster_pair.pair_id:
                # no spanning reads here
                write_pair(outfh, cluster_pair, ())
                cluster_pair = next(cluster_pair_iter)
            # get spanning read alignments
            spanning_reads = nominate_spanning_reads(cluster_pair,
                                                     cluster_shelve,
                                                     bamfh,
                                                     cluster_reads,
                                                     local_anchor_length)
            spanning_qnames = sorted(set(r5p.qname
                                         for r5p, r3p in spanning_reads))
            # write new cluster pair file
            write_pair(outfh, cluster_pair, spanning_qnames)
            # write spanning reads to SAM file
            for r5p, r3p in spanning_reads:
                outsamfh.write(r5p)
                outsamfh.write(r3p)
            num_spanning_reads += len(spanning_reads)
        # finish outputting remaining clusters
        for cluster_pair in cluster_pair_iter:
            write_pair(outfh, cluster_pair, ())
        logging.debug("\tFound %d spanning read alignments" %
                      (num_spanning_reads))
    finally:
        # close in reverse order of opening (the SAM writer before the BAM
        # file it used as its template)
        for handle in reversed(handles):
            handle.close()
    return config.JOB_SUCCESS
def process_spanning_alignments(cluster_shelve_file, cluster_pair_file,
                                bam_file, output_sam_file,
                                output_cluster_pair_file,
                                local_anchor_length):
    """Annotate cluster pairs with the names of breakpoint-spanning reads.

    Walks the breakpoint alignments in ``bam_file`` grouped by cluster pair
    (via ``_parse_bam_by_cluster_pair``) in step with the records of
    ``cluster_pair_file``. For every cluster pair a tab-delimited line
    ``pair_id, id5p, id3p, qnames, spanning_qnames`` is written to
    ``output_cluster_pair_file``; the last column is empty for pairs without
    alignments in the BAM file. The read pairs returned by
    ``nominate_spanning_reads`` are written to ``output_sam_file``.

    Args:
        cluster_shelve_file: path of the cluster shelve database (opened
            read-only).
        cluster_pair_file: path of the input discordant cluster pair file.
        bam_file: path of the BAM file with breakpoint alignments.
        output_sam_file: path of the SAM file (with header) to write,
            using ``bam_file`` as the header template.
        output_cluster_pair_file: path of the annotated cluster pair file.
        local_anchor_length: forwarded to ``nominate_spanning_reads``.

    Returns:
        ``config.JOB_SUCCESS``.

    Raises:
        StopIteration: if the BAM file yields a pair id that is not found in
            the remaining cluster pair records (presumably both inputs are
            expected to be in the same pair order -- confirm with caller).
    """
    def write_pair(outfh, cluster_pair, spanning_qnames):
        # one output record; an empty spanning_qnames gives an empty column
        fields = [cluster_pair.pair_id, cluster_pair.id5p,
                  cluster_pair.id3p, ','.join(cluster_pair.qnames),
                  ','.join(spanning_qnames)]
        # write(... + '\n') matches the Python 2 'print >>fh' output
        outfh.write('\t'.join(map(str, fields)) + '\n')

    # every opened handle is registered here so that all of them are
    # released in the finally clause, even if processing fails midway
    handles = []
    try:
        # load cluster database file
        cluster_shelve = shelve.open(cluster_shelve_file, 'r')
        handles.append(cluster_shelve)
        # parse breakpoint alignments and output spanning reads
        bamfh = pysam.Samfile(bam_file, "rb")
        handles.append(bamfh)
        outsamfh = pysam.Samfile(output_sam_file, "wh", template=bamfh)
        handles.append(outsamfh)
        outfh = open(output_cluster_pair_file, "w")
        handles.append(outfh)
        pair_fh = open(cluster_pair_file)
        handles.append(pair_fh)
        cluster_pair_iter = parse_discordant_cluster_pair_file(pair_fh)
        # get cluster reads from BAM file
        num_spanning_reads = 0
        for pair_id, cluster_reads in _parse_bam_by_cluster_pair(bamfh):
            # synch with cluster pair file; next() works on Python 2.6+ and 3
            cluster_pair = next(cluster_pair_iter)
            while pair_id != cluster_pair.pair_id:
                # no spanning reads here
                write_pair(outfh, cluster_pair, ())
                cluster_pair = next(cluster_pair_iter)
            # get spanning read alignments
            spanning_reads = nominate_spanning_reads(cluster_pair,
                                                     cluster_shelve,
                                                     bamfh,
                                                     cluster_reads,
                                                     local_anchor_length)
            spanning_qnames = sorted(set(r5p.qname
                                         for r5p, r3p in spanning_reads))
            # write new cluster pair file
            write_pair(outfh, cluster_pair, spanning_qnames)
            # write spanning reads to SAM file
            for r5p, r3p in spanning_reads:
                outsamfh.write(r5p)
                outsamfh.write(r3p)
            num_spanning_reads += len(spanning_reads)
        # finish outputting remaining clusters
        for cluster_pair in cluster_pair_iter:
            write_pair(outfh, cluster_pair, ())
        logging.debug("\tFound %d spanning read alignments" %
                      (num_spanning_reads))
    finally:
        # close in reverse order of opening (the SAM writer before the BAM
        # file it used as its template)
        for handle in reversed(handles):
            handle.close()
    return config.JOB_SUCCESS
def realign_across_breakpoints(index_dir, discordant_bam_file,
                               unpaired_bam_file, cluster_shelve_file,
                               cluster_pair_file, breakpoint_bam_file,
                               log_dir, tmp_dir, num_processors,
                               local_anchor_length, local_multihits):
    """Extract putative breakpoint reads and realign them with bowtie2.

    For every cluster pair in ``cluster_pair_file`` the FASTQ lines produced
    by ``_get_cluster_breakpoint_fastq`` are written to
    ``config.BREAKPOINT_FASTQ_FILE`` inside ``tmp_dir``. That FASTQ file is
    then passed to ``bowtie2_align_local``, which writes
    ``breakpoint_bam_file``.

    Args:
        index_dir: directory holding the transcriptome index, genome index
            and transcript feature file named by ``config``.
        discordant_bam_file: path of the BAM file of discordant reads.
        unpaired_bam_file: path of the BAM file of unpaired reads.
        cluster_shelve_file: path of the cluster shelve database (opened
            read-only).
        cluster_pair_file: path of the discordant cluster pair file.
        breakpoint_bam_file: output path handed to ``bowtie2_align_local``.
        log_dir: directory in which ``config.BREAKPOINT_LOG_FILE`` is placed.
        tmp_dir: directory for the intermediate FASTQ file; created if it
            does not exist.
        num_processors: forwarded to ``bowtie2_align_local``.
        local_anchor_length: forwarded to ``bowtie2_align_local``.
        local_multihits: forwarded to ``bowtie2_align_local``.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    # every opened handle is registered here so that all of them are
    # released in the finally clause, even if extraction fails midway
    handles = []
    num_seqs = 0
    try:
        # load cluster database file
        cluster_shelve = shelve.open(cluster_shelve_file, 'r')
        handles.append(cluster_shelve)
        # open discordant reads file
        discordant_bamfh = pysam.Samfile(discordant_bam_file, "rb")
        handles.append(discordant_bamfh)
        unpaired_bamfh = pysam.Samfile(unpaired_bam_file, "rb")
        handles.append(unpaired_bamfh)
        # create tmp dir if it does not exist
        if tmp_dir and not os.path.isdir(tmp_dir):
            os.makedirs(tmp_dir)
        fastq_file = os.path.join(tmp_dir, config.BREAKPOINT_FASTQ_FILE)
        fastq_fh = open(fastq_file, 'w')
        handles.append(fastq_fh)
        pair_fh = open(cluster_pair_file)
        handles.append(pair_fh)
        # iterate through cluster pairs and get breakpoint reads
        logging.debug("Extracting breakpoint spanning sequences")
        for cluster_pair in parse_discordant_cluster_pair_file(pair_fh):
            for fastq_line in _get_cluster_breakpoint_fastq(cluster_pair,
                                                            cluster_shelve,
                                                            discordant_bamfh,
                                                            unpaired_bamfh):
                # write(... + '\n') matches the Python 2 'print >>fh' output
                fastq_fh.write(str(fastq_line) + '\n')
                # counts FASTQ lines written, as the original did
                num_seqs += 1
    finally:
        # the FASTQ file must be flushed/closed before bowtie2 reads it; the
        # shelve is not needed by the alignment step either
        for handle in reversed(handles):
            handle.close()
    logging.debug("\tFound %d putative breakpoint spanning sequences" %
                  (num_seqs))
    # use bowtie2 local alignment to find spanning reads
    transcriptome_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    genome_index = os.path.join(index_dir, config.GENOME_INDEX)
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    log_file = os.path.join(log_dir, config.BREAKPOINT_LOG_FILE)
    logging.debug("Realigning breakpoint spanning sequences")
    # NOTE(review): the result of bowtie2_align_local is not inspected, so
    # JOB_SUCCESS is returned unless it raises -- confirm this is intended
    bowtie2_align_local(transcriptome_index,
                        genome_index,
                        transcript_file,
                        fastq_file,
                        breakpoint_bam_file,
                        log_file,
                        local_anchor_length=local_anchor_length,
                        local_multihits=local_multihits,
                        num_processors=num_processors)
    return config.JOB_SUCCESS
def realign_across_breakpoints(index_dir, discordant_bam_file,
                               unpaired_bam_file, cluster_shelve_file,
                               cluster_pair_file, breakpoint_bam_file,
                               log_dir, tmp_dir, num_processors,
                               local_anchor_length, local_multihits):
    """Extract putative breakpoint reads and realign them with bowtie2.

    For every cluster pair in ``cluster_pair_file`` the FASTQ lines produced
    by ``_get_cluster_breakpoint_fastq`` are written to
    ``config.BREAKPOINT_FASTQ_FILE`` inside ``tmp_dir``. That FASTQ file is
    then passed to ``bowtie2_align_local``, which writes
    ``breakpoint_bam_file``.

    Args:
        index_dir: directory holding the transcriptome index, genome index
            and transcript feature file named by ``config``.
        discordant_bam_file: path of the BAM file of discordant reads.
        unpaired_bam_file: path of the BAM file of unpaired reads.
        cluster_shelve_file: path of the cluster shelve database (opened
            read-only).
        cluster_pair_file: path of the discordant cluster pair file.
        breakpoint_bam_file: output path handed to ``bowtie2_align_local``.
        log_dir: directory in which ``config.BREAKPOINT_LOG_FILE`` is placed.
        tmp_dir: directory for the intermediate FASTQ file; created if it
            does not exist.
        num_processors: forwarded to ``bowtie2_align_local``.
        local_anchor_length: forwarded to ``bowtie2_align_local``.
        local_multihits: forwarded to ``bowtie2_align_local``.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    # every opened handle is registered here so that all of them are
    # released in the finally clause, even if extraction fails midway
    handles = []
    num_seqs = 0
    try:
        # load cluster database file
        cluster_shelve = shelve.open(cluster_shelve_file, 'r')
        handles.append(cluster_shelve)
        # open discordant reads file
        discordant_bamfh = pysam.Samfile(discordant_bam_file, "rb")
        handles.append(discordant_bamfh)
        unpaired_bamfh = pysam.Samfile(unpaired_bam_file, "rb")
        handles.append(unpaired_bamfh)
        # create tmp dir if it does not exist
        if tmp_dir and not os.path.isdir(tmp_dir):
            os.makedirs(tmp_dir)
        fastq_file = os.path.join(tmp_dir, config.BREAKPOINT_FASTQ_FILE)
        fastq_fh = open(fastq_file, 'w')
        handles.append(fastq_fh)
        pair_fh = open(cluster_pair_file)
        handles.append(pair_fh)
        # iterate through cluster pairs and get breakpoint reads
        logging.debug("Extracting breakpoint spanning sequences")
        for cluster_pair in parse_discordant_cluster_pair_file(pair_fh):
            for fastq_line in _get_cluster_breakpoint_fastq(cluster_pair,
                                                            cluster_shelve,
                                                            discordant_bamfh,
                                                            unpaired_bamfh):
                # write(... + '\n') matches the Python 2 'print >>fh' output
                fastq_fh.write(str(fastq_line) + '\n')
                # counts FASTQ lines written, as the original did
                num_seqs += 1
    finally:
        # the FASTQ file must be flushed/closed before bowtie2 reads it; the
        # shelve is not needed by the alignment step either
        for handle in reversed(handles):
            handle.close()
    logging.debug("\tFound %d putative breakpoint spanning sequences" %
                  (num_seqs))
    # use bowtie2 local alignment to find spanning reads
    transcriptome_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    genome_index = os.path.join(index_dir, config.GENOME_INDEX)
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    log_file = os.path.join(log_dir, config.BREAKPOINT_LOG_FILE)
    logging.debug("Realigning breakpoint spanning sequences")
    # NOTE(review): the result of bowtie2_align_local is not inspected, so
    # JOB_SUCCESS is returned unless it raises -- confirm this is intended
    bowtie2_align_local(transcriptome_index,
                        genome_index,
                        transcript_file,
                        fastq_file,
                        breakpoint_bam_file,
                        log_file,
                        local_anchor_length=local_anchor_length,
                        local_multihits=local_multihits,
                        num_processors=num_processors)
    return config.JOB_SUCCESS
def write_output(transcripts, cluster_shelve_file, cluster_pair_file,
                 read_name_file, output_file, annotation_source="ensembl"):
    """Write one chimera record per discordant cluster pair to output_file.

    The output starts with a header line ('#' followed by the tab-joined
    ``Chimera._fields``) and then contains ``str(chimera)`` on its own line
    for every cluster pair parsed from ``cluster_pair_file``.

    Args:
        transcripts: passed to ``build_genome_transcript_trees`` to build the
            genome-coordinate -> transcript lookup structures.
        cluster_shelve_file: path of the cluster shelve database (opened
            read-only).
        cluster_pair_file: path of the discordant cluster pair file.
        read_name_file: path of the read name file. It is opened for reading
            but its contents are not used by this function.
        output_file: path of the file to (over)write.
        annotation_source: forwarded unchanged to ``make_chimera``.

    Returns:
        ``config.JOB_SUCCESS``.

    All handles are released even when building a chimera or writing fails.
    """
    # load cluster and read name database files
    cluster_shelve = shelve.open(cluster_shelve_file, 'r')
    try:
        # NOTE(review): the read name file is never read here; it is still
        # opened so that a missing/unreadable path fails exactly as before.
        with open(read_name_file, 'r'):
            # map genome coordinates to transcripts
            logging.debug(
                "Creating mapping between genome coordinates and transcripts")
            transcript_dict, genome_tx_trees = build_genome_transcript_trees(
                transcripts)
            logging.debug("Writing output")
            with open(output_file, "w") as outfh:
                # file.write(...) + '\n' emits the same bytes as the Python 2
                # 'print >>fh' statement and also works on Python 3
                outfh.write('#' + '\t'.join(Chimera._fields) + '\n')
                with open(cluster_pair_file) as pair_fh:
                    for cluster_pair in \
                            parse_discordant_cluster_pair_file(pair_fh):
                        c = make_chimera(cluster_pair, cluster_shelve,
                                         transcript_dict, genome_tx_trees,
                                         annotation_source)
                        outfh.write(str(c) + '\n')
    finally:
        # shelve objects are not context managers on Python 2
        cluster_shelve.close()
    return config.JOB_SUCCESS