Example #1
import logging
import os
import shelve

import pysam

# Project-local helpers used below (config, Chimera, bowtie2_align_local,
# build_genome_transcript_trees, make_chimera, nominate_spanning_reads,
# parse_discordant_cluster_pair_file, _get_cluster_breakpoint_fastq,
# _parse_bam_by_cluster_pair) are assumed to be imported from the
# surrounding package.


def write_output(transcripts,
                 cluster_shelve_file,
                 cluster_pair_file,
                 read_name_file,
                 output_file,
                 annotation_source="ensembl"):
    # load cluster and read name database files
    cluster_shelve = shelve.open(cluster_shelve_file, 'r')
    read_name_fh = open(read_name_file, 'r')
    # map genome coordinates to transcripts
    logging.debug(
        "Creating mapping between genome coordinates and transcripts")
    transcript_dict, genome_tx_trees = build_genome_transcript_trees(
        transcripts)
    logging.debug("Writing output")
    outfh = open(output_file, "w")
    print >> outfh, '#' + '\t'.join(Chimera._fields)
    for cluster_pair in parse_discordant_cluster_pair_file(
            open(cluster_pair_file)):
        c = make_chimera(cluster_pair, cluster_shelve, transcript_dict,
                         genome_tx_trees, annotation_source)
        print >> outfh, str(c)
    # cleanup
    outfh.close()
    read_name_fh.close()
    cluster_shelve.close()
    return config.JOB_SUCCESS
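
A minimal invocation sketch: every file path below is a placeholder, and
`transcripts` is assumed to have been loaded beforehand by the package's
transcript-feature reader.

# hypothetical call; all paths are placeholders
retcode = write_output(transcripts,
                       cluster_shelve_file='clusters.shelve',
                       cluster_pair_file='cluster_pairs.txt',
                       read_name_file='read_names.txt',
                       output_file='chimeras.txt')
assert retcode == config.JOB_SUCCESS
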
def process_spanning_alignments(cluster_shelve_file, 
                                cluster_pair_file,
                                bam_file, 
                                output_sam_file,
                                output_cluster_pair_file,
                                local_anchor_length):
    # load cluster database file
    cluster_shelve = shelve.open(cluster_shelve_file, 'r')
    # parse breakpoint alignments and output spanning reads
    bamfh = pysam.Samfile(bam_file, "rb")
    outsamfh = pysam.Samfile(output_sam_file, "wh", template=bamfh)
    outfh = open(output_cluster_pair_file, "w")
    cluster_pair_iter = parse_discordant_cluster_pair_file(
        open(cluster_pair_file))
    # get cluster reads from BAM file
    num_spanning_reads = 0
    for pair_id, cluster_reads in _parse_bam_by_cluster_pair(bamfh):
        # sync with the cluster pair file; both inputs are assumed to be
        # ordered by pair id
        cluster_pair = next(cluster_pair_iter)
        while pair_id != cluster_pair.pair_id:
            # no spanning reads for this cluster pair
            print >> outfh, '\t'.join(map(str, [cluster_pair.pair_id,
                                                cluster_pair.id5p,
                                                cluster_pair.id3p,
                                                ','.join(cluster_pair.qnames),
                                                '']))
            cluster_pair = next(cluster_pair_iter)
        # get spanning read alignments
        spanning_reads = nominate_spanning_reads(cluster_pair, 
                                                 cluster_shelve, 
                                                 bamfh,
                                                 cluster_reads,
                                                 local_anchor_length)
        spanning_qnames = sorted(set(r5p.qname for r5p, r3p in spanning_reads))
        # write new cluster pair file
        print >> outfh, '\t'.join(map(str, [cluster_pair.pair_id,
                                            cluster_pair.id5p,
                                            cluster_pair.id3p,
                                            ','.join(cluster_pair.qnames),
                                            ','.join(spanning_qnames)]))
        # write spanning reads to SAM file
        for r5p, r3p in spanning_reads:
            outsamfh.write(r5p)
            outsamfh.write(r3p)
        num_spanning_reads += len(spanning_reads)
    # finish outputting remaining clusters
    for cluster_pair in cluster_pair_iter:
        print >> outfh, '\t'.join(map(str, [cluster_pair.pair_id,
                                            cluster_pair.id5p,
                                            cluster_pair.id3p,
                                            ','.join(cluster_pair.qnames),
                                            '']))
    logging.debug("\tFound %d spanning read alignments" % (num_spanning_reads))
    outsamfh.close()
    outfh.close()
    bamfh.close()
    cluster_shelve.close()
    return config.JOB_SUCCESS
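
The loop above is a merge-join over two streams that are both ordered by
pair id: cluster pairs from the text file and grouped alignments from the
BAM file; pairs absent from the BAM stream are written with an empty
spanning-read column. A self-contained sketch of the same pattern, using
plain tuples in place of the project's cluster-pair records and BAM reads:

# toy streams of (pair_id, payload), both sorted by pair_id; the
# second stream only contains ids that actually had alignments
pairs = iter([(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd')])
bam_groups = iter([(1, ['r1']), (3, ['r2', 'r3'])])
for pair_id, reads in bam_groups:
    record = next(pairs)
    while pair_id != record[0]:
        print('%d: no spanning reads' % record[0])
        record = next(pairs)
    print('%d: %d spanning read(s)' % (pair_id, len(reads)))
# flush pairs with no matching alignments at the tail
for record in pairs:
    print('%d: no spanning reads' % record[0])
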
def realign_across_breakpoints(index_dir, 
                               discordant_bam_file,
                               unpaired_bam_file,
                               cluster_shelve_file, 
                               cluster_pair_file, 
                               breakpoint_bam_file,
                               log_dir,
                               tmp_dir,
                               num_processors,
                               local_anchor_length,
                               local_multihits):
    # load cluster database file
    cluster_shelve = shelve.open(cluster_shelve_file, 'r')
    # open discordant reads file
    discordant_bamfh = pysam.Samfile(discordant_bam_file, "rb")
    unpaired_bamfh = pysam.Samfile(unpaired_bam_file, "rb")
    # create tmp dir if it does not exist
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    fastq_file = os.path.join(tmp_dir, config.BREAKPOINT_FASTQ_FILE)
    fastq_fh = open(fastq_file, 'w')
    # iterate through cluster pairs and get breakpoint reads
    logging.debug("Extracting breakpoint spanning sequences")
    num_seqs = 0
    for cluster_pair in parse_discordant_cluster_pair_file(
            open(cluster_pair_file)):
        for fastq_line in _get_cluster_breakpoint_fastq(cluster_pair,
                                                        cluster_shelve,
                                                        discordant_bamfh,
                                                        unpaired_bamfh):
            print >> fastq_fh, fastq_line
            num_seqs += 1
    fastq_fh.close()
    discordant_bamfh.close()
    unpaired_bamfh.close()
    logging.debug("\tFound %d putative breakpoint spanning sequences" %
                  (num_seqs))
    # use bowtie2 local alignment to find spanning reads 
    transcriptome_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    genome_index = os.path.join(index_dir, config.GENOME_INDEX)
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    log_file = os.path.join(log_dir, config.BREAKPOINT_LOG_FILE)
    logging.debug("Realigning breakpoint spanning sequences")
    bowtie2_align_local(transcriptome_index,
                        genome_index,
                        transcript_file,
                        fastq_file,
                        breakpoint_bam_file,
                        log_file,
                        local_anchor_length=local_anchor_length,
                        local_multihits=local_multihits,
                        num_processors=num_processors)
    cluster_shelve.close()
    return config.JOB_SUCCESS
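
The two functions above run back to back in the pipeline:
realign_across_breakpoints produces the breakpoint BAM that
process_spanning_alignments then reads, and the rewritten cluster pair
file feeds write_output shown earlier. A hypothetical ordering sketch,
with every path and parameter value a placeholder:

# hypothetical driver; all arguments are placeholders
retcode = realign_across_breakpoints('index_dir', 'discordant.bam',
                                     'unpaired.bam', 'clusters.shelve',
                                     'cluster_pairs.txt', 'breakpoints.bam',
                                     log_dir='log', tmp_dir='tmp',
                                     num_processors=4,
                                     local_anchor_length=8,
                                     local_multihits=1)
assert retcode == config.JOB_SUCCESS
retcode = process_spanning_alignments('clusters.shelve', 'cluster_pairs.txt',
                                      'breakpoints.bam', 'spanning.sam',
                                      'spanning_cluster_pairs.txt',
                                      local_anchor_length=8)
assert retcode == config.JOB_SUCCESS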