# Example #1
# 0
def run_chimerascan(runconfig):
    """Run the chimerascan pipeline described by *runconfig*.

    The run configuration is validated, the output, log and tmp directories
    are created when missing, and the configuration is written out as XML.
    The stages then run in order: paired-end Bowtie alignment, insert size
    profiling, segmented alignment of the initially unaligned reads, read
    pairing, discordant read discovery, sequence extension, sorting, chimera
    nomination and filtering, junction index construction, spanning read
    alignment, merging, filtering and ranking.  A stage is skipped (and
    logged as "[SKIPPED]") when `up_to_date` reports that its output is
    current with respect to the inputs it is checked against.

    Args:
        runconfig: run configuration object.  It must provide
            `check_config()` and `to_xml()` plus the attributes read below
            (`output_dir`, `index_dir`, `fastq_files`, `trim5`, `trim3`,
            the Bowtie settings and the filter thresholds).

    Returns:
        JOB_SUCCESS when every stage has either completed or been skipped.

    Raises:
        SystemExit: with JOB_ERROR when the configuration is invalid, or
            with the tool's return code when a Bowtie alignment or
            bowtie-build reports a non-zero exit status.
    """
    # normal run
    if not runconfig.check_config():
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    with open(runconfig_xml_file, "w") as fh:
        print >> fh, xmlstring
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(
            runconfig.fastq_files,
            bowtie_index,
            aligned_bam_file,
            unaligned_fastq_param,
            maxmultimap_fastq_param,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            trim5=runconfig.trim5,
            trim3=runconfig.trim3,
            library_type=runconfig.library_type,
            num_processors=runconfig.num_processors,
            fastq_format=runconfig.fastq_format,
            multihits=runconfig.multihits,
            mismatches=runconfig.mismatches,
            bowtie_bin=runconfig.bowtie_bin,
            bowtie_mode=bowtie_mode,
            log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        with open(isize_dist_file, "r") as isize_fh:
            isize_dist.from_file(isize_fh)
    else:
        logging.info("Profiling insert size distribution")
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        try:
            isize_dist.from_bam(bamfh,
                                min_isize=min_fragment_length,
                                max_isize=runconfig.max_fragment_length,
                                max_samples=config.ISIZE_MAX_SAMPLES)
        finally:
            bamfh.close()
        # close the handle explicitly so the file is flushed to disk
        with open(isize_dist_file, "w") as isize_fh:
            isize_dist.to_file(isize_fh)
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.UNALIGNED_FASTQ_FILES
    ]
    # get the segments used in discordant alignment to know the effective
    # read length used to align.  we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    # NOTE(review): freshness is checked against the original FASTQ inputs
    # rather than unaligned_fastq_files -- confirm this is intended.
    if all(up_to_date(discordant_bam_file, fq)
           for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        # NOTE(review): the result of align() is not checked here, unlike the
        # other Bowtie steps -- confirm whether it reports failures.
        align(unaligned_fastq_files,
              runconfig.fastq_format,
              bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        try:
            paired_bamfh = pysam.Samfile(paired_bam_file, "wb",
                                         template=bamfh)
            try:
                merge_read_pairs(bamfh, paired_bamfh,
                                 runconfig.min_fragment_length,
                                 runconfig.max_fragment_length,
                                 library_type)
            finally:
                paired_bamfh.close()
        finally:
            bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file)
            and up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        try:
            find_discordant_reads(bamfh,
                                  discordant_gene_bedpe_file,
                                  discordant_genome_bedpe_file,
                                  gene_feature_file,
                                  max_indel_size=runconfig.max_indel_size,
                                  max_isize=runconfig.max_fragment_length,
                                  max_multihits=runconfig.multihits,
                                  library_type=library_type,
                                  padding=padding)
        finally:
            bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir,
                     config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info(
            "[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files, discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(sorted_discordant_gene_bedpe_file,
                  extended_discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if up_to_date(encompassing_bedpe_file,
                  sorted_discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        # nested 'with' blocks guarantee both handles are closed (and the
        # output flushed) before the next stage reads the file
        with open(sorted_discordant_gene_bedpe_file, "r") as sorted_fh:
            with open(encompassing_bedpe_file, "w") as encomp_fh:
                nominate_chimeras(sorted_fh,
                                  encomp_fh,
                                  gene_feature_file,
                                  trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir,
                     config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(
            encompassing_bedpe_file,
            filtered_encomp_bedpe_file,
            gene_feature_file,
            max_multimap=runconfig.filter_max_multimaps,
            multimap_cov_ratio=runconfig.filter_multimap_ratio,
            max_isize=-1,
            strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    # TODO: nominating spanning reads (nominate_spanning_reads) is skipped
    # for now; all of the unaligned reads are simply realigned instead
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, fq)
           for fq in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        # concatenate the unaligned FASTQ files into a single file
        with open(spanning_fastq_file, "w") as outfh:
            for fq in unaligned_fastq_files:
                with open(fq) as infh:
                    shutil.copyfileobj(infh, outfh)
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file)
            and up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        with open(junc_fasta_file, "w") as junc_fasta_fh:
            with open(junc_map_file, "w") as junc_map_fh:
                bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                        ref_fasta_file,
                                        spanning_read_length,
                                        junc_fasta_fh,
                                        junc_map_fh)
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if up_to_date(bowtie_spanning_index_file, junc_fasta_file):
        logging.info(
            "[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [
            runconfig.bowtie_build_bin, junc_fasta_file, bowtie_spanning_index
        ]
        with open(os.path.join(log_dir, "bowtie_build.log"), "w") as logfh:
            retcode = subprocess.call(args, stdout=logfh, stderr=logfh)
        # fail here, like the alignment steps do, instead of letting the
        # spanning alignment run against a missing or broken index
        if retcode != 0:
            logging.error("bowtie-build failed with error code %d" %
                          (retcode))
            sys.exit(retcode)
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file)
            and up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file)
            and up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info(
            "[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file,
                                  junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file,
                      ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS
def run_chimerascan(runconfig):
    """Run the chimerascan pipeline described by *runconfig*.

    The run configuration is validated, the output, log and tmp directories
    are created when missing, and the configuration is written out as XML.
    The stages then run in order: paired-end Bowtie alignment, insert size
    profiling, segmented alignment of the initially unaligned reads, read
    pairing, discordant read discovery, sequence extension, sorting, chimera
    nomination and filtering, junction index construction, spanning read
    alignment, merging, filtering and ranking.  A stage is skipped (and
    logged as "[SKIPPED]") when `up_to_date` reports that its output is
    current with respect to the inputs it is checked against.

    Args:
        runconfig: run configuration object.  It must provide
            `check_config()` and `to_xml()` plus the attributes read below
            (`output_dir`, `index_dir`, `fastq_files`, `trim5`, `trim3`,
            the Bowtie settings and the filter thresholds).

    Returns:
        JOB_SUCCESS when every stage has either completed or been skipped.

    Raises:
        SystemExit: with JOB_ERROR when the configuration is invalid, or
            with the tool's return code when a Bowtie alignment or
            bowtie-build reports a non-zero exit status.
    """
    # normal run
    if not runconfig.check_config():
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    with open(runconfig_xml_file, "w") as fh:
        print >>fh, xmlstring
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(runconfig.fastq_files,
                                bowtie_index,
                                aligned_bam_file,
                                unaligned_fastq_param,
                                maxmultimap_fastq_param,
                                min_fragment_length=min_fragment_length,
                                max_fragment_length=runconfig.max_fragment_length,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                library_type=runconfig.library_type,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        with open(isize_dist_file, "r") as isize_fh:
            isize_dist.from_file(isize_fh)
    else:
        logging.info("Profiling insert size distribution")
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        try:
            isize_dist.from_bam(bamfh,
                                min_isize=min_fragment_length,
                                max_isize=runconfig.max_fragment_length,
                                max_samples=config.ISIZE_MAX_SAMPLES)
        finally:
            bamfh.close()
        # close the handle explicitly so the file is flushed to disk
        with open(isize_dist_file, "w") as isize_fh:
            isize_dist.to_file(isize_fh)
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.UNALIGNED_FASTQ_FILES]
    # get the segments used in discordant alignment to know the effective
    # read length used to align.  we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    # NOTE(review): freshness is checked against the original FASTQ inputs
    # rather than unaligned_fastq_files -- confirm this is intended.
    if all(up_to_date(discordant_bam_file, fq)
           for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        # NOTE(review): the result of align() is not checked here, unlike the
        # other Bowtie steps -- confirm whether it reports failures.
        align(unaligned_fastq_files, runconfig.fastq_format, bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        try:
            paired_bamfh = pysam.Samfile(paired_bam_file, "wb",
                                         template=bamfh)
            try:
                merge_read_pairs(bamfh, paired_bamfh,
                                 runconfig.min_fragment_length,
                                 runconfig.max_fragment_length,
                                 library_type)
            finally:
                paired_bamfh.close()
        finally:
            bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file) and
            up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        try:
            find_discordant_reads(bamfh,
                                  discordant_gene_bedpe_file,
                                  discordant_genome_bedpe_file,
                                  gene_feature_file,
                                  max_indel_size=runconfig.max_indel_size,
                                  max_isize=runconfig.max_fragment_length,
                                  max_multihits=runconfig.multihits,
                                  library_type=library_type,
                                  padding=padding)
        finally:
            bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir,
                     config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files,
                         discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(sorted_discordant_gene_bedpe_file,
                  extended_discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if up_to_date(encompassing_bedpe_file,
                  sorted_discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        # nested 'with' blocks guarantee both handles are closed (and the
        # output flushed) before the next stage reads the file
        with open(sorted_discordant_gene_bedpe_file, "r") as sorted_fh:
            with open(encompassing_bedpe_file, "w") as encomp_fh:
                nominate_chimeras(sorted_fh,
                                  encomp_fh,
                                  gene_feature_file,
                                  trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir,
                     config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(encompassing_bedpe_file,
                                     filtered_encomp_bedpe_file,
                                     gene_feature_file,
                                     max_multimap=runconfig.filter_max_multimaps,
                                     multimap_cov_ratio=runconfig.filter_multimap_ratio,
                                     max_isize=-1,
                                     strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    # TODO: nominating spanning reads (nominate_spanning_reads) is skipped
    # for now; all of the unaligned reads are simply realigned instead
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, fq)
           for fq in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        # concatenate the unaligned FASTQ files into a single file
        with open(spanning_fastq_file, "w") as outfh:
            for fq in unaligned_fastq_files:
                with open(fq) as infh:
                    shutil.copyfileobj(infh, outfh)
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and
            up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        with open(junc_fasta_file, "w") as junc_fasta_fh:
            with open(junc_map_file, "w") as junc_map_fh:
                bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                        ref_fasta_file,
                                        spanning_read_length,
                                        junc_fasta_fh,
                                        junc_map_fh)
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if up_to_date(bowtie_spanning_index_file, junc_fasta_file):
        logging.info("[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [runconfig.bowtie_build_bin, junc_fasta_file,
                bowtie_spanning_index]
        with open(os.path.join(log_dir, "bowtie_build.log"), "w") as logfh:
            retcode = subprocess.call(args, stdout=logfh, stderr=logfh)
        # fail here, like the alignment steps do, instead of letting the
        # spanning alignment run against a missing or broken index
        if retcode != 0:
            logging.error("bowtie-build failed with error code %d" %
                          (retcode))
            sys.exit(retcode)
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and
            up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and
            up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info("[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file, junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file, ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS