def run_chimerascan(runconfig):
    """Run the full chimera detection pipeline described by *runconfig*.

    The steps run in a fixed order: paired-end bowtie alignment, insert
    size profiling, segmented alignment of the initially unaligned reads,
    read pairing, discordant read discovery, chimera nomination and
    filtering, junction index construction, spanning read alignment,
    merging, filtering and ranking.  Each step is guarded by an
    ``up_to_date(output, input)`` check and is logged as "[SKIPPED]" when
    that check passes, so an interrupted run can be resumed.

    runconfig: run configuration object.  It must provide
        ``check_config()``, ``to_xml()`` and the attributes read below
        (``output_dir``, ``index_dir``, ``fastq_files``, trimming,
        fragment length, bowtie and filter settings).

    Returns JOB_SUCCESS once the ranked chimera file has been written.

    Calls ``sys.exit`` with JOB_ERROR when the configuration is invalid,
    or with the tool's return code when bowtie or bowtie-build fails.
    """
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    with open(runconfig_xml_file, "w") as fh:
        print >> fh, xmlstring
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir,
                                         config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(
            runconfig.fastq_files,
            bowtie_index,
            aligned_bam_file,
            unaligned_fastq_param,
            maxmultimap_fastq_param,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            trim5=runconfig.trim5,
            trim3=runconfig.trim3,
            library_type=runconfig.library_type,
            num_processors=runconfig.num_processors,
            fastq_format=runconfig.fastq_format,
            multihits=runconfig.multihits,
            mismatches=runconfig.mismatches,
            bowtie_bin=runconfig.bowtie_bin,
            bowtie_mode=bowtie_mode,
            log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        with open(isize_dist_file, "r") as isize_fh:
            isize_dist.from_file(isize_fh)
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        try:
            isize_dist.from_bam(bamfh,
                                min_isize=min_fragment_length,
                                max_isize=runconfig.max_fragment_length,
                                max_samples=max_isize_samples)
        finally:
            bamfh.close()
        # close the output explicitly so the distribution is fully on disk
        with open(isize_dist_file, "w") as isize_fh:
            isize_dist.to_file(isize_fh)
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.UNALIGNED_FASTQ_FILES]
    # get the segments used in discordant alignment to know the effective
    # read length used to align. we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    if all(up_to_date(discordant_bam_file, fq)
           for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files,
              runconfig.fastq_format,
              bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        try:
            paired_bamfh = pysam.Samfile(paired_bam_file, "wb",
                                         template=bamfh)
            try:
                # NOTE(review): this passes runconfig.min_fragment_length,
                # not the read-length-adjusted min_fragment_length computed
                # above -- confirm that is intended
                merge_read_pairs(bamfh, paired_bamfh,
                                 runconfig.min_fragment_length,
                                 runconfig.max_fragment_length,
                                 library_type)
            finally:
                paired_bamfh.close()
        finally:
            bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file) and
            up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        try:
            find_discordant_reads(bamfh,
                                  discordant_gene_bedpe_file,
                                  discordant_genome_bedpe_file,
                                  gene_feature_file,
                                  max_indel_size=runconfig.max_indel_size,
                                  max_isize=runconfig.max_fragment_length,
                                  max_multihits=runconfig.multihits,
                                  library_type=library_type,
                                  padding=padding)
        finally:
            bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info(
            "[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files,
                         discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file,
                   extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(encompassing_bedpe_file,
                   sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        # nested 'with' blocks guarantee both handles are closed (and the
        # output flushed) before the filtering step reads the file
        with open(sorted_discordant_gene_bedpe_file, "r") as infh:
            with open(encompassing_bedpe_file, "w") as outfh:
                nominate_chimeras(infh, outfh, gene_feature_file,
                                  trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir, config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(
            encompassing_bedpe_file,
            filtered_encomp_bedpe_file,
            gene_feature_file,
            max_multimap=runconfig.filter_max_multimaps,
            multimap_cov_ratio=runconfig.filter_multimap_ratio,
            max_isize=-1,
            strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, fq)
           for fq in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        # concatenate all unaligned fastq files into one spanning fastq file
        with open(spanning_fastq_file, "w") as outfh:
            for fq in unaligned_fastq_files:
                with open(fq) as infh:
                    shutil.copyfileobj(infh, outfh)
    # TODO: skip this step for now, and simply realign all the reads
    # spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE)
    # if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and
    #     up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)):
    #     logging.info("[SKIPPED] Nominating junction spanning reads")
    # else:
    #     logging.info("Nominating junction spanning reads")
    #     nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'),
    #                             open(filtered_encomp_bedpe_file, 'r'),
    #                             open(spanning_fastq_file, 'w'))
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and
            up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        # both outputs must be closed before bowtie-build reads the fasta
        with open(junc_fasta_file, "w") as junc_fasta_fh:
            with open(junc_map_file, "w") as junc_map_fh:
                bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                        ref_fasta_file,
                                        spanning_read_length,
                                        junc_fasta_fh,
                                        junc_map_fh)
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info(
            "[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [runconfig.bowtie_build_bin, junc_fasta_file,
                bowtie_spanning_index]
        with open(os.path.join(log_dir, "bowtie_build.log"), "w") as build_log:
            retcode = subprocess.call(args, stdout=build_log,
                                      stderr=build_log)
        # abort on failure, as the bowtie alignment steps do, instead of
        # going on to align against a missing or partial index
        if retcode != 0:
            logging.error("bowtie-build failed with error code %d" %
                          (retcode))
            sys.exit(retcode)
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and
            up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and
            up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info(
            "[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file,
                                  junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file,
                      ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS
def run_chimerascan(runconfig):
    """Run the full chimera detection pipeline described by *runconfig*.

    The steps run in a fixed order: paired-end bowtie alignment, insert
    size profiling, segmented alignment of the initially unaligned reads,
    read pairing, discordant read discovery, chimera nomination and
    filtering, junction index construction, spanning read alignment,
    merging, filtering and ranking.  Each step is guarded by an
    ``up_to_date(output, input)`` check and is logged as "[SKIPPED]" when
    that check passes, so an interrupted run can be resumed.

    runconfig: run configuration object.  It must provide
        ``check_config()``, ``to_xml()`` and the attributes read below
        (``output_dir``, ``index_dir``, ``fastq_files``, trimming,
        fragment length, bowtie and filter settings).

    Returns JOB_SUCCESS once the ranked chimera file has been written.

    Calls ``sys.exit`` with JOB_ERROR when the configuration is invalid,
    or with the tool's return code when bowtie or bowtie-build fails.
    """
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    with open(runconfig_xml_file, "w") as fh:
        print >> fh, xmlstring
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir,
                                         config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(
            runconfig.fastq_files,
            bowtie_index,
            aligned_bam_file,
            unaligned_fastq_param,
            maxmultimap_fastq_param,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            trim5=runconfig.trim5,
            trim3=runconfig.trim3,
            library_type=runconfig.library_type,
            num_processors=runconfig.num_processors,
            fastq_format=runconfig.fastq_format,
            multihits=runconfig.multihits,
            mismatches=runconfig.mismatches,
            bowtie_bin=runconfig.bowtie_bin,
            bowtie_mode=bowtie_mode,
            log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        with open(isize_dist_file, "r") as isize_fh:
            isize_dist.from_file(isize_fh)
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        try:
            isize_dist.from_bam(bamfh,
                                min_isize=min_fragment_length,
                                max_isize=runconfig.max_fragment_length,
                                max_samples=max_isize_samples)
        finally:
            bamfh.close()
        # close the output explicitly so the distribution is fully on disk
        with open(isize_dist_file, "w") as isize_fh:
            isize_dist.to_file(isize_fh)
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.UNALIGNED_FASTQ_FILES]
    # get the segments used in discordant alignment to know the effective
    # read length used to align. we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    if all(up_to_date(discordant_bam_file, fq)
           for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files,
              runconfig.fastq_format,
              bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        try:
            paired_bamfh = pysam.Samfile(paired_bam_file, "wb",
                                         template=bamfh)
            try:
                # NOTE(review): this passes runconfig.min_fragment_length,
                # not the read-length-adjusted min_fragment_length computed
                # above -- confirm that is intended
                merge_read_pairs(bamfh, paired_bamfh,
                                 runconfig.min_fragment_length,
                                 runconfig.max_fragment_length,
                                 library_type)
            finally:
                paired_bamfh.close()
        finally:
            bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file) and
            up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        try:
            find_discordant_reads(bamfh,
                                  discordant_gene_bedpe_file,
                                  discordant_genome_bedpe_file,
                                  gene_feature_file,
                                  max_indel_size=runconfig.max_indel_size,
                                  max_isize=runconfig.max_fragment_length,
                                  max_multihits=runconfig.multihits,
                                  library_type=library_type,
                                  padding=padding)
        finally:
            bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info(
            "[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files,
                         discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file,
                   extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(encompassing_bedpe_file,
                   sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        # nested 'with' blocks guarantee both handles are closed (and the
        # output flushed) before the filtering step reads the file
        with open(sorted_discordant_gene_bedpe_file, "r") as infh:
            with open(encompassing_bedpe_file, "w") as outfh:
                nominate_chimeras(infh, outfh, gene_feature_file,
                                  trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir, config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(
            encompassing_bedpe_file,
            filtered_encomp_bedpe_file,
            gene_feature_file,
            max_multimap=runconfig.filter_max_multimaps,
            multimap_cov_ratio=runconfig.filter_multimap_ratio,
            max_isize=-1,
            strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, fq)
           for fq in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        # concatenate all unaligned fastq files into one spanning fastq file
        with open(spanning_fastq_file, "w") as outfh:
            for fq in unaligned_fastq_files:
                with open(fq) as infh:
                    shutil.copyfileobj(infh, outfh)
    # TODO: skip this step for now, and simply realign all the reads
    # spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE)
    # if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and
    #     up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)):
    #     logging.info("[SKIPPED] Nominating junction spanning reads")
    # else:
    #     logging.info("Nominating junction spanning reads")
    #     nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'),
    #                             open(filtered_encomp_bedpe_file, 'r'),
    #                             open(spanning_fastq_file, 'w'))
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and
            up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        # both outputs must be closed before bowtie-build reads the fasta
        with open(junc_fasta_file, "w") as junc_fasta_fh:
            with open(junc_map_file, "w") as junc_map_fh:
                bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                        ref_fasta_file,
                                        spanning_read_length,
                                        junc_fasta_fh,
                                        junc_map_fh)
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info(
            "[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [runconfig.bowtie_build_bin, junc_fasta_file,
                bowtie_spanning_index]
        with open(os.path.join(log_dir, "bowtie_build.log"), "w") as build_log:
            retcode = subprocess.call(args, stdout=build_log,
                                      stderr=build_log)
        # abort on failure, as the bowtie alignment steps do, instead of
        # going on to align against a missing or partial index
        if retcode != 0:
            logging.error("bowtie-build failed with error code %d" %
                          (retcode))
            sys.exit(retcode)
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and
            up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and
            up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info(
            "[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file,
                                  junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file,
                      ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS