Example #1
def check_fastq_files(fastq_files, segment_length, trim5, trim3):
    # check that input fastq files exist
    read_lengths = []
    for mate,fastq_file in enumerate(fastq_files):
        if not os.path.isfile(fastq_file):
            raise AlignError("mate '%d' fastq file '%s' is not valid" % 
                             (mate, fastq_file))
        logging.debug("Checking read length for file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" % 
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        logging.error("read lengths mate1=%d and mate2=%d are unequal" % 
                      (read_lengths[0], read_lengths[1]))
        return False
    rlen = read_lengths[0]
    trimmed_rlen = rlen - trim5 - trim3
    # check that segment length >= MIN_SEGMENT_LENGTH 
    if segment_length < MIN_SEGMENT_LENGTH:
        raise AlignError("segment length (%d) too small (min is %d)" % 
                         (segment_length, MIN_SEGMENT_LENGTH))
    # check that segment length does not exceed the trimmed read length
    if segment_length > trimmed_rlen:
        raise AlignError("segment length (%d) longer than trimmed read length (%d)" % 
                         (segment_length, trimmed_rlen))
    return read_lengths[0]
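Both of the checks above lean on a get_read_length helper that is not part of these examples. A minimal sketch of what such a helper presumably does, returning the sequence length of the first record in a FASTQ file (the behaviour is inferred from how the function is called; compressed or malformed input is not handled here):

def get_read_length(fastq_file):
    # Sketch: length of the first read in a plain-text FASTQ file.
    # Assumes the standard four-line-per-record layout; gzipped files and
    # malformed records are deliberately ignored in this sketch.
    fh = open(fastq_file)
    try:
        fh.readline()                # @read name line
        seq = fh.readline().strip()  # sequence line
        return len(seq)
    finally:
        fh.close()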
Example #2
def check_command_line_args(options, args, parser):
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    fastq_files = args[0:2]
    output_dir = args[2]
    # check that input fastq files exist
    read_lengths = []
    for mate, fastq_file in enumerate(fastq_files):
        if not os.path.isfile(fastq_file):
            parser.error("mate '%d' fastq file '%s' is not valid" %
                         (mate, fastq_file))
        logging.debug("Checking read length for file %s" % (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" %
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        parser.error("read lengths mate1=%d and mate2=%d are unequal" %
                     (read_lengths[0], read_lengths[1]))
    # check that seed length does not exceed read length
    if any(options.segment_length > rlen for rlen in read_lengths):
        parser.error("seed length %d cannot be longer than read length" %
                     (options.segment_length))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error(
            "Output directory name '%s' exists and is not a valid directory" %
            (output_dir))
    if check_executable(options.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # check that bowtie program exists
    if check_executable(options.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        parser.error("bowtie binary not found or not executable")
    # check that alignment index exists
    if os.path.isdir(options.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
    else:
        parser.error("chimerascan alignment index directory '%s' not valid" %
                     (options.index_dir))
    # check that alignment index file exists
    align_index_file = os.path.join(options.index_dir,
                                    config.BOWTIE_INDEX_FILE)
    if os.path.isfile(align_index_file):
        logging.debug("Checking for bowtie index file... found")
    else:
        parser.error("chimerascan bowtie index file '%s' invalid" %
                     (align_index_file))
    # check for sufficient processors
    if options.num_processors < config.BASE_PROCESSORS:
        logging.warning(
            "Please specify >=2 processes using '-p' to allow program to run efficiently"
        )
 def check_config(self):
     # check that input fastq files exist
     config_passed = True
     read_lengths = []
     for mate,fastq_file in enumerate(self.fastq_files):
         if not os.path.isfile(fastq_file):
             logging.error("mate '%d' fastq file '%s' is not valid" % 
                           (mate, fastq_file))
             config_passed = False
             continue  # cannot measure the read length of a missing file
         read_lengths.append(get_read_length(fastq_file))
         logging.debug("Checking file %s" % (fastq_file))
         logging.debug("File %s read length=%d" % (fastq_file, read_lengths[-1]))
     # check that mate read lengths are equal
     if len(set(read_lengths)) > 1:
         logging.error("Unequal read lengths mate1=%d and mate2=%d" % 
                       (read_lengths[0], read_lengths[1]))
         config_passed = False
     # check that seed length does not exceed read length
     if any(self.segment_length > rlen for rlen in read_lengths):
         logging.error("seed length %d cannot be longer than read length" % 
                      (self.segment_length))
         config_passed = False
     # check that output dir is not a regular file
     if os.path.exists(self.output_dir) and (not os.path.isdir(self.output_dir)):
         logging.error("Output directory name '%s' exists and is not a valid directory" % 
                       (self.output_dir))
         config_passed = False
     if check_executable(self.bowtie_build_bin):
         logging.debug("Checking for 'bowtie-build' binary... found")
     else:
         logging.error("bowtie-build binary not found or not executable")
         config_passed = False
     # check that bowtie program exists
     if check_executable(self.bowtie_bin):
         logging.debug("Checking for 'bowtie' binary... found")
     else:
         logging.error("bowtie binary not found or not executable")
         config_passed = False
     # check that alignment index exists
     if os.path.isdir(self.index_dir):
         logging.debug("Checking for chimerascan index directory... found")
         # check that alignment index file exists
         align_index_file = os.path.join(self.index_dir, config.BOWTIE_INDEX_FILE)
         if os.path.isfile(align_index_file):
             logging.debug("Checking for bowtie index file... found")
         else:
             logging.error("chimerascan bowtie index file '%s' invalid" % (align_index_file))
             config_passed = False
     else:
         logging.error("chimerascan alignment index directory '%s' not valid" % 
                       (self.index_dir))
         config_passed = False
     # check for sufficient processors
     if self.num_processors < config.BASE_PROCESSORS:
         logging.warning("Please specify >=2 processes using '-p' to allow program to run efficiently")
     return config_passed
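check_executable is another helper the validation routines call without defining here. A plausible minimal version, treating its argument either as an explicit path or as a program name to look up on PATH (the real chimerascan helper may behave differently):

import os

def check_executable(program):
    # Sketch: True if 'program' is an executable file, given either as a
    # direct path or as a name found on the PATH environment variable.
    def is_exe(path):
        return os.path.isfile(path) and os.access(path, os.X_OK)
    if os.path.dirname(program):
        return is_exe(program)
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        if is_exe(os.path.join(directory, program)):
            return True
    return False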
Example #4
def align_pe_full(fastq_files, 
                  bowtie_index,
                  output_bam_file, 
                  unaligned_fastq_param,
                  maxmultimap_fastq_param,
                  min_fragment_length=0,
                  max_fragment_length=1000,
                  trim5=0,
                  trim3=0,
                  library_type="fr",
                  num_processors=1, 
                  fastq_format="phred33-quals", 
                  multihits=100, 
                  mismatches=2, 
                  bowtie_bin="bowtie", 
                  bowtie_mode="-n",
                  log_file=None):
    read_length = get_read_length(fastq_files[0])     
    args = [bowtie_bin, "-q", "-S", 
            "-p", str(num_processors),
            "--%s" % fastq_format,
            "-k", str(multihits),
            "-m", str(multihits),
            bowtie_mode, str(mismatches),
            "--minins", min_fragment_length,
            "--maxins", max_fragment_length,
            "--trim5", trim5,
            "--trim3", trim3,
            "--%s" % library_type,
            "--un", unaligned_fastq_param,
            "--max", maxmultimap_fastq_param]
    # use the entire read length as the "seed" here
    if bowtie_mode == "-n":
        args.extend(["-l", str(read_length)])
    args += [bowtie_index, 
             "-1", fastq_files[0],
             "-2", fastq_files[1]]
    #aligned_sam_file]
    args = map(str, args)
    logging.debug("Bowtie alignment args: %s" % (' '.join(args)))
    aln_p = subprocess.Popen(args, stdout=subprocess.PIPE)
    # pipe the bowtie SAM output to a filter that writes BAM format
    args = [sys.executable, __file__, "--multihits", str(multihits),
            output_bam_file, fastq_files[0]]
    logging.debug("SAM to BAM converter args: %s" % (' '.join(args)))
    if log_file is not None:
        logfh = open(log_file, "w")
    else:
        logfh = None    
    retcode = subprocess.call(args, stdin=aln_p.stdout, stderr=logfh)
    if logfh is not None:
        logfh.close()
    if retcode != 0:
        return retcode
    return aln_p.wait()
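align_pe_full chains two processes: bowtie writes SAM to stdout, and a second invocation of this module (via sys.executable and __file__) converts that stream to BAM, with the converter's exit code checked before the aligner's. The generic pattern, reduced to a self-contained toy producer/consumer pair (the real commands are the bowtie and converter argument lists built above), looks like this:

import subprocess
import sys

# Toy stand-ins for the bowtie process and the SAM-to-BAM converter: the
# producer prints a few lines, the consumer counts whatever reaches its stdin.
producer_args = [sys.executable, "-c",
                 "import sys; sys.stdout.write('read1\\nread2\\nread3\\n')"]
consumer_args = [sys.executable, "-c",
                 "import sys; sys.stdout.write('%d lines\\n' % len(sys.stdin.readlines()))"]

producer = subprocess.Popen(producer_args, stdout=subprocess.PIPE)
retcode = subprocess.call(consumer_args, stdin=producer.stdout)
# Drop our copy of the pipe's read end so a still-running producer would get
# SIGPIPE instead of blocking forever on a full pipe.
producer.stdout.close()
if retcode != 0:
    sys.exit(retcode)      # report the consumer's failure first, as align_pe_full does
sys.exit(producer.wait())  # otherwise propagate the producer's exit status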
Example #5
def align_pe_full(fastq_files,
                  bowtie_index,
                  output_bam_file,
                  unaligned_fastq_param,
                  maxmultimap_fastq_param,
                  min_fragment_length=0,
                  max_fragment_length=1000,
                  trim5=0,
                  trim3=0,
                  library_type="fr",
                  num_processors=1,
                  fastq_format="phred33-quals",
                  multihits=100,
                  mismatches=2,
                  bowtie_bin="bowtie",
                  bowtie_mode="-n",
                  log_file=None):
    read_length = get_read_length(fastq_files[0])
    args = [
        bowtie_bin, "-q", "-S", "-p",
        str(num_processors),
        "--%s" % fastq_format, "-k",
        str(multihits), "-m",
        str(multihits), bowtie_mode,
        str(mismatches), "--minins", min_fragment_length, "--maxins",
        max_fragment_length, "--trim5", trim5, "--trim3", trim3,
        "--%s" % library_type, "--un", unaligned_fastq_param, "--max",
        maxmultimap_fastq_param
    ]
    # use the entire read length as the "seed" here
    if bowtie_mode == "-n":
        args.extend(["-l", str(read_length)])
    args += [bowtie_index, "-1", fastq_files[0], "-2", fastq_files[1]]
    #aligned_sam_file]
    args = map(str, args)
    logging.debug("Bowtie alignment args: %s" % (' '.join(args)))
    aln_p = subprocess.Popen(args, stdout=subprocess.PIPE)
    # pipe the bowtie SAM output to a filter that writes BAM format
    args = [
        sys.executable, __file__, "--multihits",
        str(multihits), output_bam_file, fastq_files[0]
    ]
    logging.debug("SAM to BAM converter args: %s" % (' '.join(args)))
    if log_file is not None:
        logfh = open(log_file, "w")
    else:
        logfh = None
    retcode = subprocess.call(args, stdin=aln_p.stdout, stderr=logfh)
    if logfh is not None:
        logfh.close()
    if retcode != 0:
        return retcode
    return aln_p.wait()
def check_command_line_args(options, args, parser):
    # check command line arguments
    if len(args) < 3:
        parser.error("Incorrect number of command line arguments")
    fastq_files = args[0:2]
    output_dir = args[2]
    # check that input fastq files exist
    read_lengths = []
    for mate,fastq_file in enumerate(fastq_files):
        if not os.path.isfile(fastq_file):
            parser.error("mate '%d' fastq file '%s' is not valid" % 
                         (mate, fastq_file))
        logging.debug("Checking read length for file %s" % 
                      (fastq_file))
        read_lengths.append(get_read_length(fastq_file))
        logging.debug("Read length for file %s: %d" % 
                      (fastq_file, read_lengths[-1]))
    # check that mate read lengths are equal
    if len(set(read_lengths)) > 1:
        parser.error("read lengths mate1=%d and mate2=%d are unequal" % 
                     (read_lengths[0], read_lengths[1]))
    # check that seed length does not exceed read length
    if any(options.segment_length > rlen for rlen in read_lengths):
        parser.error("seed length %d cannot be longer than read length" % 
                     (options.segment_length))
    # check that output dir is not a regular file
    if os.path.exists(output_dir) and (not os.path.isdir(output_dir)):
        parser.error("Output directory name '%s' exists and is not a valid directory" % 
                     (output_dir))
    if check_executable(options.bowtie_build_bin):
        logging.debug("Checking for 'bowtie-build' binary... found")
    else:
        parser.error("bowtie-build binary not found or not executable")
    # check that bowtie program exists
    if check_executable(options.bowtie_bin):
        logging.debug("Checking for 'bowtie' binary... found")
    else:
        parser.error("bowtie binary not found or not executable")
    # check that alignment index exists
    if os.path.isdir(options.index_dir):
        logging.debug("Checking for chimerascan index directory... found")
    else:
        parser.error("chimerascan alignment index directory '%s' not valid" % 
                     (options.index_dir))
    # check that alignment index file exists
    align_index_file = os.path.join(options.index_dir, config.BOWTIE_INDEX_FILE)
    if os.path.isfile(align_index_file):
        logging.debug("Checking for bowtie index file... found")
    else:
        parser.error("chimerascan bowtie index file '%s' invalid" % (align_index_file))
    # check for sufficient processors
    if options.num_processors < config.BASE_PROCESSORS:
        logging.warning("Please specify >=2 processes using '-p' to allow program to run efficiently")
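check_command_line_args takes the (options, args, parser) triple of an optparse-style interface. A hedged sketch of how a caller might build that parser; the option strings, defaults, and destinations below are inferred from the attributes the function reads and are not the actual chimerascan options:

from optparse import OptionParser

def make_parser():
    # Sketch only: destinations mirror the attributes read by
    # check_command_line_args (segment_length, bowtie_bin, bowtie_build_bin,
    # index_dir, num_processors); flag spellings and defaults are guesses.
    parser = OptionParser(usage="%prog [options] <mate1.fq> <mate2.fq> <output_dir>")
    parser.add_option("--segment-length", dest="segment_length", type="int", default=25)
    parser.add_option("--bowtie-bin", dest="bowtie_bin", default="bowtie")
    parser.add_option("--bowtie-build-bin", dest="bowtie_build_bin", default="bowtie-build")
    parser.add_option("--index-dir", dest="index_dir", default="index")
    parser.add_option("-p", "--processors", dest="num_processors", type="int", default=2)
    return parser

if __name__ == "__main__":
    parser = make_parser()
    options, args = parser.parse_args()
    check_command_line_args(options, args, parser)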
Example #7
def run_chimerascan(runconfig):
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = original_read_length - runconfig.trim5 - runconfig.trim3
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(
            runconfig.fastq_files,
            bowtie_index,
            aligned_bam_file,
            unaligned_fastq_param,
            maxmultimap_fastq_param,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            trim5=runconfig.trim5,
            trim3=runconfig.trim3,
            library_type=runconfig.library_type,
            num_processors=runconfig.num_processors,
            fastq_format=runconfig.fastq_format,
            multihits=runconfig.multihits,
            mismatches=runconfig.mismatches,
            bowtie_bin=runconfig.bowtie_bin,
            bowtie_mode=bowtie_mode,
            log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        isize_dist.from_file(open(isize_dist_file, "r"))
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        isize_dist.from_bam(bamfh,
                            min_isize=min_fragment_length,
                            max_isize=runconfig.max_fragment_length,
                            max_samples=max_isize_samples)
        isize_dist.to_file(open(isize_dist_file, "w"))
        bamfh.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.UNALIGNED_FASTQ_FILES
    ]
    # get the segments used in discordant alignment to know the effective
    # read length used to align.  we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    if all(
            up_to_date(discordant_bam_file, fq)
            for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files,
              runconfig.fastq_format,
              bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        paired_bamfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        merge_read_pairs(bamfh, paired_bamfh, runconfig.min_fragment_length,
                         runconfig.max_fragment_length, library_type)
        paired_bamfh.close()
        bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file)
            and up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        find_discordant_reads(bamfh,
                              discordant_gene_bedpe_file,
                              discordant_genome_bedpe_file,
                              gene_feature_file,
                              max_indel_size=runconfig.max_indel_size,
                              max_isize=runconfig.max_fragment_length,
                              max_multihits=runconfig.multihits,
                              library_type=library_type,
                              padding=padding)
        bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir,
                     config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info(
            "[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files, discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file,
                   extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(encompassing_bedpe_file,
                   sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        nominate_chimeras(open(sorted_discordant_gene_bedpe_file, "r"),
                          open(encompassing_bedpe_file, "w"),
                          gene_feature_file,
                          trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir,
                     config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(
            encompassing_bedpe_file,
            filtered_encomp_bedpe_file,
            gene_feature_file,
            max_multimap=runconfig.filter_max_multimaps,
            multimap_cov_ratio=runconfig.filter_multimap_ratio,
            max_isize=-1,
            strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, f) for f in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        outfh = open(spanning_fastq_file, "w")
        for f in unaligned_fastq_files:
            shutil.copyfileobj(open(f), outfh)
        outfh.close()
    # TODO: skip this step for now, and simply realign all the reads


#    spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE)
#    if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and
#        up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)):
#        logging.info("[SKIPPED] Nominating junction spanning reads")
#    else:
#        logging.info("Nominating junction spanning reads")
#        nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'),
#                                open(filtered_encomp_bedpe_file, 'r'),
#                                open(spanning_fastq_file, 'w'))
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file)
            and up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                ref_fasta_file, spanning_read_length,
                                open(junc_fasta_file, "w"),
                                open(junc_map_file, "w"))
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info(
            "[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [
            runconfig.bowtie_build_bin, junc_fasta_file, bowtie_spanning_index
        ]
        f = open(os.path.join(log_dir, "bowtie_build.log"), "w")
        subprocess.call(args, stdout=f, stderr=f)
        f.close()
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file)
            and up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file)
            and up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info(
            "[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file,
                                  junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file,
                      ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS
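Almost every stage of run_chimerascan is guarded by up_to_date(target, source), which is not defined in these examples. Judging from how it is called, a minimal make-style version would look roughly like the following; the non-empty-file requirement is an assumption:

import os

def up_to_date(outfile, infile):
    # Sketch: consider 'outfile' current if it exists, is non-empty, and is
    # at least as new as 'infile'; otherwise the stage has to be (re)run.
    if not os.path.exists(outfile):
        return False
    if os.path.getsize(outfile) == 0:
        return False
    return os.path.getmtime(outfile) >= os.path.getmtime(infile)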
Example #8
 def check_config(self):
     # check that input fastq files exist
     config_passed = True
     read_lengths = []
     for mate, fastq_file in enumerate(self.fastq_files):
         if not os.path.isfile(fastq_file):
             logging.error("mate '%d' fastq file '%s' is not valid" %
                           (mate, fastq_file))
             config_passed = False
             continue  # cannot measure the read length of a missing file
         read_lengths.append(get_read_length(fastq_file))
         logging.debug("Checking file %s" % (fastq_file))
         logging.debug("File %s read length=%d" %
                       (fastq_file, read_lengths[-1]))
     # check that mate read lengths are equal
     if len(set(read_lengths)) > 1:
         logging.error("Unequal read lengths mate1=%d and mate2=%d" %
                       (read_lengths[0], read_lengths[1]))
         config_passed = False
     # check that seed length does not exceed read length
     if any(self.segment_length > rlen for rlen in read_lengths):
         logging.error("seed length %d cannot be longer than read length" %
                       (self.segment_length))
         config_passed = False
     # check that output dir is not a regular file
     if os.path.exists(
             self.output_dir) and (not os.path.isdir(self.output_dir)):
         logging.error(
             "Output directory name '%s' exists and is not a valid directory"
             % (self.output_dir))
         config_passed = False
     if check_executable(self.bowtie_build_bin):
         logging.debug("Checking for 'bowtie-build' binary... found")
     else:
         logging.error("bowtie-build binary not found or not executable")
         config_passed = False
     # check that bowtie program exists
     if check_executable(self.bowtie_bin):
         logging.debug("Checking for 'bowtie' binary... found")
     else:
         logging.error("bowtie binary not found or not executable")
         config_passed = False
     # check that alignment index exists
     if os.path.isdir(self.index_dir):
         logging.debug("Checking for chimerascan index directory... found")
         # check that alignment index file exists
         align_index_file = os.path.join(self.index_dir,
                                         config.BOWTIE_INDEX_FILE)
         if os.path.isfile(align_index_file):
             logging.debug("Checking for bowtie index file... found")
         else:
             logging.error("chimerascan bowtie index file '%s' invalid" %
                           (align_index_file))
             config_passed = False
     else:
         logging.error(
             "chimerascan alignment index directory '%s' not valid" %
             (self.index_dir))
         config_passed = False
     # check for sufficient processors
     if self.num_processors < config.BASE_PROCESSORS:
         logging.warning(
             "Please specify >=2 processes using '-p' to allow program to run efficiently"
         )
     return config_passed
def run_chimerascan(runconfig):
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))        
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE)
    fh = open(runconfig_xml_file, "w")
    print >>fh, xmlstring
    fh.close()
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)    
    gene_feature_file = os.path.join(runconfig.index_dir, config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = original_read_length - runconfig.trim5 - runconfig.trim3
    min_fragment_length = max(runconfig.min_fragment_length, 
                              trimmed_read_length)
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir, config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir, config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")    
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:    
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(runconfig.fastq_files, 
                                bowtie_index,
                                aligned_bam_file, 
                                unaligned_fastq_param,
                                maxmultimap_fastq_param,
                                min_fragment_length=min_fragment_length,
                                max_fragment_length=runconfig.max_fragment_length,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                library_type=runconfig.library_type,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))    
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir, config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        isize_dist.from_file(open(isize_dist_file, "r"))
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        isize_dist.from_bam(bamfh, min_isize=min_fragment_length, 
                            max_isize=runconfig.max_fragment_length, 
                            max_samples=max_isize_samples)
        isize_dist.to_file(open(isize_dist_file, "w"))
        bamfh.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % 
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(), 
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir, "bowtie_segmented_alignment.log")    
    unaligned_fastq_files = [os.path.join(tmp_dir, fq) for fq in config.UNALIGNED_FASTQ_FILES]
    # get the segments used in discordant alignment to know the effective
    # read length used to align.  we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length, 
                                       segment_length=runconfig.segment_length, 
                                       segment_trim=True, 
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" % 
                  (segmented_read_length))
    if all(up_to_date(discordant_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files, runconfig.fastq_format, bowtie_index,
              discordant_bam_file, 
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors, 
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5, 
              trim3=runconfig.trim3, 
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches, 
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        paired_bamfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        merge_read_pairs(bamfh, paired_bamfh, 
                         runconfig.min_fragment_length,
                         runconfig.max_fragment_length,
                         library_type)
        paired_bamfh.close() 
        bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file) and
        up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        find_discordant_reads(bamfh, 
                              discordant_gene_bedpe_file,
                              discordant_genome_bedpe_file, 
                              gene_feature_file,
                              max_indel_size=runconfig.max_indel_size,
                              max_isize=runconfig.max_fragment_length,
                              max_multihits=runconfig.multihits,
                              library_type=library_type,
                              padding=padding)
        bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, 
                     config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file, discordant_gene_bedpe_file):
        logging.info("[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files, 
                         discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file, extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:        
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file, sorted_discordant_gene_bedpe_file)        
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)        
    if (up_to_date(encompassing_bedpe_file, sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:        
        logging.info("Nominating chimeras from discordant reads")
        nominate_chimeras(open(sorted_discordant_gene_bedpe_file, "r"),
                          open(encompassing_bedpe_file, "w"),
                          gene_feature_file,                          
                          trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir,
                     config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(encompassing_bedpe_file,
                                     filtered_encomp_bedpe_file,
                                     gene_feature_file,
                                     max_multimap=runconfig.filter_max_multimaps,
                                     multimap_cov_ratio=runconfig.filter_multimap_ratio,
                                     max_isize=-1,
                                     strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    spanning_fastq_file = os.path.join(runconfig.output_dir, 
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, f) for f in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        outfh = open(spanning_fastq_file, "w")
        for f in unaligned_fastq_files:
            shutil.copyfileobj(open(f), outfh)
        outfh.close()        
    # TODO: skip this step for now, and simply realign all the reads
#    spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE)
#    if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and 
#        up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)):
#        logging.info("[SKIPPED] Nominating junction spanning reads")
#    else:
#        logging.info("Nominating junction spanning reads")
#        nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'),
#                                open(filtered_encomp_bedpe_file, 'r'),
#                                open(spanning_fastq_file, 'w'))    
    #
    # Extract junction sequences from chimeras file
    #        
    ref_fasta_file = os.path.join(runconfig.index_dir, config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)    
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file) and
        up_to_date(junc_map_file, filtered_encomp_bedpe_file)):        
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:        
        logging.info("Extracting junction read sequences")
        bedpe_to_junction_fasta(filtered_encomp_bedpe_file, ref_fasta_file,                                
                                spanning_read_length, 
                                open(junc_fasta_file, "w"),
                                open(junc_map_file, "w"))
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info("[SKIPPED] Building bowtie index for junction-spanning reads")
    else:        
        logging.info("Building bowtie index for junction-spanning reads")
        args = [runconfig.bowtie_build_bin, junc_fasta_file, bowtie_spanning_index]
        f = open(os.path.join(log_dir, "bowtie_build.log"), "w")
        subprocess.call(args, stdout=f, stderr=f)
        f.close()
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")        
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file) and
        up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:            
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file, 
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,                                 
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))    
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir, config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file) and
        up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info("[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file, junc_map_file, 
                                  raw_chimera_bedpe_file,
                                  anchor_min=0, 
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob    
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file, 
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir, 
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file, ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    # 
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #    
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS
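run_chimerascan repeats the same skip-or-run guard for every stage. Purely as a refactoring sketch (not part of chimerascan), the pattern can be pulled into one helper so each stage becomes a single call; the up_to_date logic here is the same assumption as in the sketch after Example #7:

import logging
import os

def run_stage(name, outputs, inputs, func, *args, **kwargs):
    # Sketch: run 'func' only if some output is missing, empty, or older than
    # one of its inputs, mirroring the up_to_date() guards used above.
    def up_to_date(outfile, infile):
        return (os.path.exists(outfile) and os.path.getsize(outfile) > 0 and
                os.path.getmtime(outfile) >= os.path.getmtime(infile))
    if outputs and inputs and all(up_to_date(o, i) for o in outputs for i in inputs):
        logging.info("[SKIPPED] %s" % name)
        return None
    logging.info(name)
    return func(*args, **kwargs)

With such a helper, the read-pairing step above could be written as a single call, e.g. run_stage("Pairing aligned reads", [paired_bam_file], [discordant_bam_file], pair_reads_func), where pair_reads_func wraps the pysam open/merge/close block.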