def check_fastq_files(fastq_files, segment_length, trim5, trim3):
    """Validate the input FASTQ files against the requested segment length.

    Every path in ``fastq_files`` must be an existing regular file.  The
    read length of each file is obtained with ``get_read_length`` and all
    files must agree on it.

    Returns the read length of the first file on success, or ``False``
    (after logging an error) when the mates have different read lengths.

    Raises ``AlignError`` when a file is missing, when ``segment_length``
    is below ``MIN_SEGMENT_LENGTH``, or when it exceeds the read length
    left after removing ``trim5`` and ``trim3`` bases.
    """
    # every mate file must exist; record its read length as we go
    lengths = []
    for mate, path in enumerate(fastq_files):
        if not os.path.isfile(path):
            raise AlignError("mate '%d' fastq file '%s' is not valid" %
                             (mate, path))
        logging.debug("Checking read length for file %s" % (path))
        current = get_read_length(path)
        lengths.append(current)
        logging.debug("Read length for file %s: %d" % (path, current))
    # mates must share one read length
    distinct_lengths = set(lengths)
    if len(distinct_lengths) > 1:
        logging.error("read lengths mate1=%d and mate2=%d are unequal" %
                      (lengths[0], lengths[1]))
        return False
    full_length = lengths[0]
    usable_length = full_length - trim5 - trim3
    # the segment may not be shorter than the allowed minimum ...
    if segment_length < MIN_SEGMENT_LENGTH:
        raise AlignError("segment length (%d) too small (min is %d)" %
                         (segment_length, MIN_SEGMENT_LENGTH))
    # ... nor longer than what is left of the read after trimming
    if segment_length > usable_length:
        raise AlignError("segment length (%d) longer than trimmed read length (%d)" %
                         (segment_length, usable_length))
    return full_length
def check_command_line_args(options, args, parser):
    """Validate the positional arguments and option values for a run.

    ``args`` must hold at least three items: the two mate FASTQ paths
    followed by the output directory.  Every failed check is reported
    through ``parser.error``; a processor count below
    ``config.BASE_PROCESSORS`` only produces a logged warning.

    Checks performed, in order: both FASTQ files exist, the mates have the
    same read length, ``options.segment_length`` does not exceed the read
    length, the output path is not an existing non-directory, the
    bowtie-build and bowtie binaries pass ``check_executable``, and the
    index directory and bowtie index file exist.
    """
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    fastq_files = args[0:2]
    output_dir = args[2]
    # check that input fastq files exist
    read_lengths = []
    for mate, fastq_file in enumerate(fastq_files):
        # test the file of the current mate; checking args[0] here would
        # let a missing second mate file go unnoticed
        if not os.path.isfile(fastq_file):
            parser.error("mate '%d' fastq file '%s' is not valid" %
                         (mate, fastq_file))
        logging.debug("Checking read length for file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        parser.error("read lengths mate1=%d and mate2=%d are unequal" %
                     (read_lengths[0], read_lengths[1]))
    # check that seed length < read length
    if any(options.segment_length > rlen for rlen in read_lengths):
        parser.error("seed length %d cannot be longer than read length" %
                     (options.segment_length))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error("Output directory name '%s' exists and is not a valid directory" %
                     (output_dir))
    # check that bowtie-build program exists
    if check_executable(options.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # check that bowtie program exists
    if check_executable(options.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        parser.error("bowtie binary not found or not executable")
    # check that alignment index exists
    if os.path.isdir(options.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
    else:
        parser.error("chimerascan alignment index directory '%s' not valid" %
                     (options.index_dir))
    # check that alignment index file exists
    align_index_file = os.path.join(options.index_dir, config.BOWTIE_INDEX_FILE)
    if os.path.isfile(align_index_file):
        logging.debug("Checking for bowtie index file... found")
    else:
        parser.error("chimerascan bowtie index file '%s' invalid" %
                     (align_index_file))
    # check for sufficient processors
    if options.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow program to run efficiently")
def check_config(self):
    """Validate the run configuration stored on this object.

    Reads ``self.fastq_files``, ``self.segment_length``,
    ``self.output_dir``, ``self.bowtie_build_bin``, ``self.bowtie_bin``,
    ``self.index_dir`` and ``self.num_processors``.  Every failed check is
    logged with ``logging.error`` and the remaining checks still run, so a
    single call reports all problems found.

    Returns True when every check passed and False otherwise.  A processor
    count below ``config.BASE_PROCESSORS`` only logs a warning and does not
    affect the result.
    """
    # check that input fastq files exist
    config_passed = True
    read_lengths = []
    for mate, fastq_file in enumerate(self.fastq_files):
        if not os.path.isfile(fastq_file):
            logging.error("mate '%d' fastq file '%s' is not valid" %
                          (mate, fastq_file))
            config_passed = False
            # there is nothing to measure for a missing file; skip the
            # read-length probe so the remaining checks can still run
            continue
        logging.debug("Checking file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("File %s read length=%d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                      (read_lengths[0], read_lengths[1]))
        config_passed = False
    # check that seed length < read length
    if any(self.segment_length > rlen for rlen in read_lengths):
        logging.error("seed length %d cannot be longer than read length" %
                      (self.segment_length))
        config_passed = False
    # check that output dir is not a regular file
    if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
        logging.error("Output directory name '%s' exists and is not a valid directory" %
                      (self.output_dir))
        config_passed = False
    # check that bowtie-build program exists
    if check_executable(self.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        logging.error("bowtie-build binary not found or not executable")
        config_passed = False
    # check that bowtie program exists
    if check_executable(self.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        logging.error("bowtie binary not found or not executable")
        config_passed = False
    # check that alignment index exists
    if os.path.isdir(self.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
        # check that alignment index file exists
        align_index_file = os.path.join(self.index_dir, config.BOWTIE_INDEX_FILE)
        if os.path.isfile(align_index_file):
            logging.debug("Checking for bowtie index file... found")
        else:
            logging.error("chimerascan bowtie index file '%s' invalid" %
                          (align_index_file))
            config_passed = False
    else:
        logging.error("chimerascan alignment index directory '%s' not valid" %
                      (self.index_dir))
        config_passed = False
    # check for sufficient processors
    if self.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow program to run efficiently")
    return config_passed
def align_pe_full(fastq_files, bowtie_index, output_bam_file,
                  unaligned_fastq_param, maxmultimap_fastq_param,
                  min_fragment_length=0, max_fragment_length=1000,
                  trim5=0, trim3=0, library_type="fr",
                  num_processors=1, fastq_format="phred33-quals",
                  multihits=100, mismatches=2,
                  bowtie_bin="bowtie", bowtie_mode="-n",
                  log_file=None):
    """Run a paired-end bowtie alignment and pipe its SAM output to a BAM writer.

    Two child processes are started: bowtie, whose stdout is captured in a
    pipe, and a second Python process running this very file
    (``sys.executable __file__``) that reads the pipe on stdin and is given
    ``output_bam_file`` and ``fastq_files[0]`` as arguments.

    Parameters map onto the bowtie command line as follows:
    ``min_fragment_length``/``max_fragment_length`` -> ``--minins``/``--maxins``,
    ``trim5``/``trim3`` -> ``--trim5``/``--trim3``,
    ``library_type`` and ``fastq_format`` -> ``--<value>`` flags,
    ``unaligned_fastq_param`` -> ``--un``, ``maxmultimap_fastq_param`` -> ``--max``,
    ``multihits`` -> both ``-k`` and ``-m`` (and ``--multihits`` of the
    converter), ``mismatches`` -> the value of the ``bowtie_mode`` flag.
    When ``bowtie_mode`` is ``"-n"`` the read length of the first FASTQ
    file is passed as ``-l``.

    If ``log_file`` is given, the converter's stderr is written to it.

    Returns the converter's exit status when it is non-zero, otherwise the
    exit status of bowtie.
    """
    read_length = get_read_length(fastq_files[0])
    args = [bowtie_bin, "-q", "-S",
            "-p", str(num_processors),
            "--%s" % fastq_format,
            "-k", str(multihits),
            "-m", str(multihits),
            bowtie_mode, str(mismatches),
            "--minins", min_fragment_length,
            "--maxins", max_fragment_length,
            "--trim5", trim5,
            "--trim3", trim3,
            "--%s" % library_type,
            "--un", unaligned_fastq_param,
            "--max", maxmultimap_fastq_param]
    # use the entire read length as the "seed" here
    if bowtie_mode == "-n":
        args.extend(["-l", str(read_length)])
    args += [bowtie_index,
             "-1", fastq_files[0],
             "-2", fastq_files[1]]
    # materialize a list: the arguments are consumed twice (debug message
    # and Popen), which a lazy map object would not survive
    args = [str(arg) for arg in args]
    logging.debug("Bowtie alignment args: %s" % (' '.join(args)))
    aln_p = subprocess.Popen(args, stdout=subprocess.PIPE)
    logfh = None
    try:
        # pipe the bowtie SAM output to a filter that writes BAM format
        args = [sys.executable, __file__,
                "--multihits", str(multihits),
                output_bam_file, fastq_files[0]]
        logging.debug("SAM to BAM converter args: %s" % (' '.join(args)))
        if log_file is not None:
            logfh = open(log_file, "w")
        retcode = subprocess.call(args, stdin=aln_p.stdout, stderr=logfh)
    finally:
        if logfh is not None:
            logfh.close()
        # drop our copy of the pipe's read end so that bowtie is not left
        # blocked on a full pipe if the converter stopped reading early,
        # then reap bowtie so it never lingers as a zombie
        aln_p.stdout.close()
        aln_retcode = aln_p.wait()
    if retcode != 0:
        return retcode
    return aln_retcode
def align_pe_full(fastq_files, bowtie_index, output_bam_file,
                  unaligned_fastq_param, maxmultimap_fastq_param,
                  min_fragment_length=0, max_fragment_length=1000,
                  trim5=0, trim3=0, library_type="fr",
                  num_processors=1, fastq_format="phred33-quals",
                  multihits=100, mismatches=2,
                  bowtie_bin="bowtie", bowtie_mode="-n",
                  log_file=None):
    """Run a paired-end bowtie alignment and pipe its SAM output to a BAM writer.

    Two child processes are started: bowtie, whose stdout is captured in a
    pipe, and a second Python process running this very file
    (``sys.executable __file__``) that reads the pipe on stdin and is given
    ``output_bam_file`` and ``fastq_files[0]`` as arguments.

    Parameters map onto the bowtie command line as follows:
    ``min_fragment_length``/``max_fragment_length`` -> ``--minins``/``--maxins``,
    ``trim5``/``trim3`` -> ``--trim5``/``--trim3``,
    ``library_type`` and ``fastq_format`` -> ``--<value>`` flags,
    ``unaligned_fastq_param`` -> ``--un``, ``maxmultimap_fastq_param`` -> ``--max``,
    ``multihits`` -> both ``-k`` and ``-m`` (and ``--multihits`` of the
    converter), ``mismatches`` -> the value of the ``bowtie_mode`` flag.
    When ``bowtie_mode`` is ``"-n"`` the read length of the first FASTQ
    file is passed as ``-l``.

    If ``log_file`` is given, the converter's stderr is written to it.

    Returns the converter's exit status when it is non-zero, otherwise the
    exit status of bowtie.
    """
    read_length = get_read_length(fastq_files[0])
    args = [
        bowtie_bin, "-q", "-S",
        "-p", str(num_processors),
        "--%s" % fastq_format,
        "-k", str(multihits),
        "-m", str(multihits),
        bowtie_mode, str(mismatches),
        "--minins", min_fragment_length,
        "--maxins", max_fragment_length,
        "--trim5", trim5,
        "--trim3", trim3,
        "--%s" % library_type,
        "--un", unaligned_fastq_param,
        "--max", maxmultimap_fastq_param
    ]
    # use the entire read length as the "seed" here
    if bowtie_mode == "-n":
        args.extend(["-l", str(read_length)])
    args += [bowtie_index, "-1", fastq_files[0], "-2", fastq_files[1]]
    # materialize a list: the arguments are consumed twice (debug message
    # and Popen), which a lazy map object would not survive
    args = [str(arg) for arg in args]
    logging.debug("Bowtie alignment args: %s" % (' '.join(args)))
    aln_p = subprocess.Popen(args, stdout=subprocess.PIPE)
    logfh = None
    try:
        # pipe the bowtie SAM output to a filter that writes BAM format
        args = [
            sys.executable, __file__,
            "--multihits", str(multihits),
            output_bam_file, fastq_files[0]
        ]
        logging.debug("SAM to BAM converter args: %s" % (' '.join(args)))
        if log_file is not None:
            logfh = open(log_file, "w")
        retcode = subprocess.call(args, stdin=aln_p.stdout, stderr=logfh)
    finally:
        if logfh is not None:
            logfh.close()
        # drop our copy of the pipe's read end so that bowtie is not left
        # blocked on a full pipe if the converter stopped reading early,
        # then reap bowtie so it never lingers as a zombie
        aln_p.stdout.close()
        aln_retcode = aln_p.wait()
    if retcode != 0:
        return retcode
    return aln_retcode
def check_command_line_args(options, args, parser):
    """Validate the positional arguments and option values for a run.

    ``args`` must hold at least three items: the two mate FASTQ paths
    followed by the output directory.  Every failed check is reported
    through ``parser.error``; a processor count below
    ``config.BASE_PROCESSORS`` only produces a logged warning.

    Checks performed, in order: both FASTQ files exist, the mates have the
    same read length, ``options.segment_length`` does not exceed the read
    length, the output path is not an existing non-directory, the
    bowtie-build and bowtie binaries pass ``check_executable``, and the
    index directory and bowtie index file exist.
    """
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    fastq_files = args[0:2]
    output_dir = args[2]
    # check that input fastq files exist
    read_lengths = []
    for mate, fastq_file in enumerate(fastq_files):
        # test the file of the current mate; checking args[0] here would
        # let a missing second mate file go unnoticed
        if not os.path.isfile(fastq_file):
            parser.error("mate '%d' fastq file '%s' is not valid" %
                         (mate, fastq_file))
        logging.debug("Checking read length for file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        parser.error("read lengths mate1=%d and mate2=%d are unequal" %
                     (read_lengths[0], read_lengths[1]))
    # check that seed length < read length
    if any(options.segment_length > rlen for rlen in read_lengths):
        parser.error("seed length %d cannot be longer than read length" %
                     (options.segment_length))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error("Output directory name '%s' exists and is not a valid directory" %
                     (output_dir))
    # check that bowtie-build program exists
    if check_executable(options.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # check that bowtie program exists
    if check_executable(options.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        parser.error("bowtie binary not found or not executable")
    # check that alignment index exists
    if os.path.isdir(options.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
    else:
        parser.error("chimerascan alignment index directory '%s' not valid" %
                     (options.index_dir))
    # check that alignment index file exists
    align_index_file = os.path.join(options.index_dir, config.BOWTIE_INDEX_FILE)
    if os.path.isfile(align_index_file):
        logging.debug("Checking for bowtie index file... found")
    else:
        parser.error("chimerascan bowtie index file '%s' invalid" %
                     (align_index_file))
    # check for sufficient processors
    if options.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow program to run efficiently")
def run_chimerascan(runconfig):
    """Run the full chimera detection pipeline described by ``runconfig``.

    The configuration is validated with ``runconfig.check_config()``, the
    output, log and tmp directories are created when missing, and the run
    configuration is written out as XML.  The pipeline steps then run in
    order; each step is skipped (and logged as ``[SKIPPED]``) when
    ``up_to_date`` reports that its output is current with respect to its
    inputs:

    paired-end alignment, insert size profiling, segmented alignment of
    the initially unaligned reads, read pairing, discordant read
    discovery, sequence extension, sorting, chimera nomination,
    encompassing-chimera filtering, junction FASTA extraction, junction
    index building, spanning read alignment, merging of spanning and
    encompassing evidence, spanning-chimera filtering and ranking.

    Returns ``JOB_SUCCESS`` when the pipeline completes.  Calls
    ``sys.exit`` with ``JOB_ERROR`` for an invalid configuration, or with
    the failing exit status when a bowtie or bowtie-build step returns
    non-zero.
    """
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE)
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir, config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = original_read_length - runconfig.trim5 - runconfig.trim3
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir, config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir, config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(runconfig.fastq_files,
                                bowtie_index,
                                aligned_bam_file,
                                unaligned_fastq_param,
                                maxmultimap_fastq_param,
                                min_fragment_length=min_fragment_length,
                                max_fragment_length=runconfig.max_fragment_length,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                library_type=runconfig.library_type,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir, config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        isize_fh = open(isize_dist_file, "r")
        try:
            isize_dist.from_file(isize_fh)
        finally:
            isize_fh.close()
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        isize_dist.from_bam(bamfh, min_isize=min_fragment_length,
                            max_isize=runconfig.max_fragment_length,
                            max_samples=max_isize_samples)
        # close the output explicitly so the distribution is on disk
        # before anything else looks at the file
        isize_fh = open(isize_dist_file, "w")
        try:
            isize_dist.to_file(isize_fh)
        finally:
            isize_fh.close()
        bamfh.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir, "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.UNALIGNED_FASTQ_FILES]
    # get the segments used in discordant alignment to know the effective
    # read length used to align. we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    if all(up_to_date(discordant_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files, runconfig.fastq_format,
              bowtie_index, discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        paired_bamfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        merge_read_pairs(bamfh, paired_bamfh,
                         runconfig.min_fragment_length,
                         runconfig.max_fragment_length,
                         library_type)
        paired_bamfh.close()
        bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file) and
        up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        find_discordant_reads(bamfh,
                              discordant_gene_bedpe_file,
                              discordant_genome_bedpe_file,
                              gene_feature_file,
                              max_indel_size=runconfig.max_indel_size,
                              max_isize=runconfig.max_fragment_length,
                              max_multihits=runconfig.multihits,
                              library_type=library_type,
                              padding=padding)
        bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files,
                         discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file,
                   extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(encompassing_bedpe_file,
                   sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        # the output is read by the next step, so close it explicitly
        # instead of relying on garbage collection to flush it
        sorted_fh = open(sorted_discordant_gene_bedpe_file, "r")
        encomp_fh = open(encompassing_bedpe_file, "w")
        try:
            nominate_chimeras(sorted_fh,
                              encomp_fh,
                              gene_feature_file,
                              trim=config.EXON_JUNCTION_TRIM_BP)
        finally:
            encomp_fh.close()
            sorted_fh.close()
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir, config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(encompassing_bedpe_file,
                                     filtered_encomp_bedpe_file,
                                     gene_feature_file,
                                     max_multimap=runconfig.filter_max_multimaps,
                                     multimap_cov_ratio=runconfig.filter_multimap_ratio,
                                     max_isize=-1,
                                     strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, f) for f in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        outfh = open(spanning_fastq_file, "w")
        try:
            for f in unaligned_fastq_files:
                infh = open(f)
                try:
                    shutil.copyfileobj(infh, outfh)
                finally:
                    infh.close()
        finally:
            outfh.close()
    # TODO: skip this step for now, and simply realign all the reads
    # spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE)
    # if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and
    #     up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)):
    #     logging.info("[SKIPPED] Nominating junction spanning reads")
    # else:
    #     logging.info("Nominating junction spanning reads")
    #     nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'),
    #                             open(filtered_encomp_bedpe_file, 'r'),
    #                             open(spanning_fastq_file, 'w'))
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and
        up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        # bowtie-build reads the FASTA file right after this step, so
        # both outputs are closed explicitly before moving on
        junc_fasta_fh = open(junc_fasta_file, "w")
        junc_map_fh = open(junc_map_file, "w")
        try:
            bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                    ref_fasta_file,
                                    spanning_read_length,
                                    junc_fasta_fh,
                                    junc_map_fh)
        finally:
            junc_map_fh.close()
            junc_fasta_fh.close()
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info("[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [runconfig.bowtie_build_bin, junc_fasta_file,
                bowtie_spanning_index]
        build_logfh = open(os.path.join(log_dir, "bowtie_build.log"), "w")
        try:
            retcode = subprocess.call(args, stdout=build_logfh,
                                      stderr=build_logfh)
        finally:
            build_logfh.close()
        # stop here on failure, like the other bowtie steps, rather than
        # aligning against a missing or incomplete index
        if retcode != 0:
            logging.error("bowtie-build failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and
        up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and
        up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info("[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file, junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file, ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS
def check_config(self):
    """Validate the run configuration stored on this object.

    Reads ``self.fastq_files``, ``self.segment_length``,
    ``self.output_dir``, ``self.bowtie_build_bin``, ``self.bowtie_bin``,
    ``self.index_dir`` and ``self.num_processors``.  Every failed check is
    logged with ``logging.error`` and the remaining checks still run, so a
    single call reports all problems found.

    Returns True when every check passed and False otherwise.  A processor
    count below ``config.BASE_PROCESSORS`` only logs a warning and does not
    affect the result.
    """
    # check that input fastq files exist
    config_passed = True
    read_lengths = []
    for mate, fastq_file in enumerate(self.fastq_files):
        if not os.path.isfile(fastq_file):
            logging.error("mate '%d' fastq file '%s' is not valid" %
                          (mate, fastq_file))
            config_passed = False
            # there is nothing to measure for a missing file; skip the
            # read-length probe so the remaining checks can still run
            continue
        logging.debug("Checking file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("File %s read length=%d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                      (read_lengths[0], read_lengths[1]))
        config_passed = False
    # check that seed length < read length
    if any(self.segment_length > rlen for rlen in read_lengths):
        logging.error("seed length %d cannot be longer than read length" %
                      (self.segment_length))
        config_passed = False
    # check that output dir is not a regular file
    if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
        logging.error(
            "Output directory name '%s' exists and is not a valid directory" %
            (self.output_dir))
        config_passed = False
    # check that bowtie-build program exists
    if check_executable(self.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        logging.error("bowtie-build binary not found or not executable")
        config_passed = False
    # check that bowtie program exists
    if check_executable(self.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        logging.error("bowtie binary not found or not executable")
        config_passed = False
    # check that alignment index exists
    if os.path.isdir(self.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
        # check that alignment index file exists
        align_index_file = os.path.join(self.index_dir, config.BOWTIE_INDEX_FILE)
        if os.path.isfile(align_index_file):
            logging.debug("Checking for bowtie index file... found")
        else:
            logging.error("chimerascan bowtie index file '%s' invalid" %
                          (align_index_file))
            config_passed = False
    else:
        logging.error(
            "chimerascan alignment index directory '%s' not valid" %
            (self.index_dir))
        config_passed = False
    # check for sufficient processors
    if self.num_processors < config.BASE_PROCESSORS:
        logging.warning(
            "Please specify >=2 processes using '-p' to allow program to run efficiently"
        )
    return config_passed
def run_chimerascan(runconfig):
    """Run the full chimerascan pipeline described by *runconfig*.

    The pipeline is a fixed sequence of steps. Each step writes one or
    more files and is skipped (logged as "[SKIPPED]") when ``up_to_date``
    reports that its outputs are current with respect to its inputs, so an
    interrupted run can be resumed by calling this function again.

    Args:
        runconfig: run configuration object; must provide
            ``check_config()``, ``to_xml()`` and the attributes read
            below (fastq_files, output_dir, index_dir, trim5, trim3, ...).

    Returns:
        JOB_SUCCESS when every step completed.

    Exits (via ``sys.exit``):
        with JOB_ERROR if ``runconfig.check_config()`` fails, or with the
        return code of bowtie / bowtie-build if one of them returns a
        non-zero status.
    """
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    with open(runconfig_xml_file, "w") as fh:
        print >>fh, xmlstring
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir,
                                         config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(runconfig.fastq_files,
                                bowtie_index,
                                aligned_bam_file,
                                unaligned_fastq_param,
                                maxmultimap_fastq_param,
                                min_fragment_length=min_fragment_length,
                                max_fragment_length=runconfig.max_fragment_length,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                library_type=runconfig.library_type,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        with open(isize_dist_file, "r") as isize_fh:
            isize_dist.from_file(isize_fh)
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        isize_dist.from_bam(bamfh, min_isize=min_fragment_length,
                            max_isize=runconfig.max_fragment_length,
                            max_samples=max_isize_samples)
        # close the output explicitly so the file is complete on disk
        with open(isize_dist_file, "w") as isize_fh:
            isize_dist.to_file(isize_fh)
        bamfh.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.UNALIGNED_FASTQ_FILES]
    # get the segments used in discordant alignment to know the effective
    # read length used to align. this is used to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    if all(up_to_date(discordant_bam_file, fq)
           for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files,
              runconfig.fastq_format,
              bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        paired_bamfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        merge_read_pairs(bamfh, paired_bamfh,
                         runconfig.min_fragment_length,
                         runconfig.max_fragment_length,
                         library_type)
        paired_bamfh.close()
        bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file) and
            up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        find_discordant_reads(bamfh,
                              discordant_gene_bedpe_file,
                              discordant_genome_bedpe_file,
                              gene_feature_file,
                              max_indel_size=runconfig.max_indel_size,
                              max_isize=runconfig.max_fragment_length,
                              max_multihits=runconfig.multihits,
                              library_type=library_type,
                              padding=padding)
        bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files,
                         discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file,
                   extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = \
        os.path.join(tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(encompassing_bedpe_file,
                   sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        # input is opened before the output, so a missing input does not
        # truncate an existing output file
        with open(sorted_discordant_gene_bedpe_file, "r") as infh:
            with open(encompassing_bedpe_file, "w") as outfh:
                nominate_chimeras(infh, outfh, gene_feature_file,
                                  trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir, config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(encompassing_bedpe_file,
                                     filtered_encomp_bedpe_file,
                                     gene_feature_file,
                                     max_multimap=runconfig.filter_max_multimaps,
                                     multimap_cov_ratio=runconfig.filter_multimap_ratio,
                                     max_isize=-1,
                                     strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    # TODO: nominating only the candidate spanning reads
    # (nominate_spanning_reads) is skipped for now; all of the unaligned
    # reads are concatenated and realigned instead
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, f) for f in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        with open(spanning_fastq_file, "w") as outfh:
            for f in unaligned_fastq_files:
                with open(f) as infh:
                    shutil.copyfileobj(infh, outfh)
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and
            up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        # both outputs must be flushed and closed before bowtie-build
        # (an external process) reads the junction fasta file below
        with open(junc_fasta_file, "w") as junc_fasta_fh:
            with open(junc_map_file, "w") as junc_map_fh:
                bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                        ref_fasta_file,
                                        spanning_read_length,
                                        junc_fasta_fh,
                                        junc_map_fh)
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info("[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [runconfig.bowtie_build_bin, junc_fasta_file,
                bowtie_spanning_index]
        with open(os.path.join(log_dir, "bowtie_build.log"), "w") as logfh:
            retcode = subprocess.call(args, stdout=logfh, stderr=logfh)
        # stop here on failure, like the bowtie alignment steps, instead
        # of aligning against a missing or partial index
        if retcode != 0:
            logging.error("bowtie-build failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and
            up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and
            up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info("[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file, junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file, ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    # the tmp dir is intentionally kept (the rmtree call is disabled)
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS