Example #1
0
def output_rpkm(sample,
                output_dir,
                settings_info,
                rna_base,
                logger):
    """
    Output RPKM tables for the sample.

    Takes as input:

    - sample: a sample object; its label, ribosub_bam_filename and
      qc.qc_results["num_mapped"] are read
    - output_dir: output directory
    - settings_info: settings information; "readlen" is read from it
    - rna_base: an RNABase object; its tables_to_const_exons mapping
      (table name -> constitutive exons object) is iterated
    - logger: logger used for info/critical messages

    For every table, the target file is <output_dir>/<table_name>.rpkm.
    Tables whose target file already exists are skipped.

    Returns the RPKM filename of the last table iterated (not all of
    them), or None if rna_base has no constitutive exon tables.

    Calls sys.exit(1) if a table needs computing and the sample has
    0 mapped reads.
    """
    # Output RPKM information for all constitutive exon tables
    # in the RNA Base
    print("Outputting RPKM for: %s" % (sample.label))
    # NOTE(review): rpkm_tables collects every table's filename, but only
    # the last filename is returned to the caller.
    rpkm_tables = {}
    # Bound up front so the code after the loop never sees an
    # unbound name when there are no tables to iterate.
    rpkm_output_filename = None
    # items() (rather than iteritems()) is valid on Python 2 and 3
    for table_name, const_exons in rna_base.tables_to_const_exons.items():
        rpkm_output_filename = "%s.rpkm" % (os.path.join(output_dir,
                                                         table_name))
        rpkm_tables[table_name] = rpkm_output_filename
        if os.path.isfile(rpkm_output_filename):
            logger.info("  - Skipping RPKM output, found %s"
                        % (rpkm_output_filename))
            print("  - Skipping RPKM output, %s exists"
                  % (rpkm_output_filename))
            continue
        # Directory where the BAM containing the mapping to constitutive
        # exons will be stored
        bam2gff_outdir = os.path.join(output_dir,
                                      "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons
        # Use the rRNA subtracted BAM file
        print("Using constitutive exons GFF -> %s"
              % (const_exons.gff_filename))
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample
        num_mapped = int(sample.qc.qc_results["num_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 mapped reads."
                            % (sample.label))
            print("Error: Cannot compute RPKMs since sample %s has 0 mapped reads."
                  % (sample.label))
            sys.exit(1)
        print("Sample %s has %s mapped reads" % (sample.label, num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)"
                    % (table_name))
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
    if not rpkm_tables:
        # Nothing was iterated: there is no filename to report or return
        logger.info("No constitutive exon tables found for %s, no RPKM output."
                    % (sample.label))
        return None
    logger.info("Finished outputting RPKM for %s to %s" % (sample.label,
                                                          rpkm_output_filename))
    return rpkm_output_filename
Example #2
0
def output_rpkm(sample,
                output_dir,
                settings_info,
                rna_base,
                logger):
    """
    Output RPKM tables for the sample.

    Takes as input:

    - sample: a sample object; its label, ribosub_bam_filename and
      qc.qc_results["num_ribosub_mapped"] are read
    - output_dir: output directory
    - settings_info: settings information; "readlen" is read from it
    - rna_base: an RNABase object; its tables_to_const_exons mapping
      (table name -> constitutive exons object) is iterated
    - logger: logger used for info/critical messages

    For every table, the target file is <output_dir>/<table_name>.rpkm.
    Tables whose target file already exists are skipped.

    Returns the RPKM filename of the last table iterated (not all of
    them), or None if rna_base has no constitutive exon tables.

    Calls sys.exit(1) if a table needs computing and the sample has
    0 ribosub-mapped reads.
    """
    # Output RPKM information for all constitutive exon tables
    # in the RNA Base
    print("Outputting RPKM for: %s" % (sample.label))
    # NOTE(review): rpkm_tables collects every table's filename, but only
    # the last filename is returned to the caller.
    rpkm_tables = {}
    # Bound up front so the code after the loop never sees an
    # unbound name when there are no tables to iterate.
    rpkm_output_filename = None
    # items() (rather than iteritems()) is valid on Python 2 and 3
    for table_name, const_exons in rna_base.tables_to_const_exons.items():
        rpkm_output_filename = "%s.rpkm" % (os.path.join(output_dir,
                                                         table_name))
        rpkm_tables[table_name] = rpkm_output_filename
        if os.path.isfile(rpkm_output_filename):
            logger.info("  - Skipping RPKM output, found %s"
                        % (rpkm_output_filename))
            continue
        # Directory where the BAM containing the mapping to constitutive
        # exons will be stored
        bam2gff_outdir = os.path.join(output_dir,
                                      "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons
        # Use the rRNA subtracted BAM file
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample: use number of ribosub mapped reads
        num_mapped = int(sample.qc.qc_results["num_ribosub_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 "
                            "mapped reads." % (sample.label))
            sys.exit(1)
        logger.info("Sample %s has %s mapped reads"
                    % (sample.label, num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)"
                    % (table_name))
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
    if not rpkm_tables:
        # Nothing was iterated: there is no filename to report or return
        logger.info("No constitutive exon tables found for %s, no RPKM output."
                    % (sample.label))
        return None
    logger.info("Finished outputting RPKM for %s to %s" % (sample.label,
                                                          rpkm_output_filename))
    return rpkm_output_filename
Example #3
0
def compute_insert_len(bams_to_process,
                       const_exons_gff_filename,
                       output_dir,
                       min_exon_size,
                       no_bam_filter=False,
                       sd_max=2):
    """
    Compute insert length distribution and output it to the given
    directory.

    Arguments:

    - bams_to_process: a list of BAM files to process
    - const_exons_gff_filename: GFF with constitutive exons
    - output_dir: output directory (created if it does not exist); for
      each BAM the result filename is <output_dir>/<BAM basename>.insert_len
    - min_exon_size: minimum exon size, passed as min_size to
      exon_utils.get_const_exons_by_gene
    - no_bam_filter: if True, reads are paired with filter_reads=False
    - sd_max: passed through to summarize_insert_len_dist

    Calls sys.exit(1) if a BAM file does not exist and raises Exception
    if mapping a BAM to the GFF returns None. BAM files that yield no
    paired mates are skipped with a warning.
    """
    bams_str = "\n  ".join(bams_to_process)
    num_bams = len(bams_to_process)
    print("Computing insert length distribution of %d files:\n  %s"
          % (num_bams, bams_str))
    print("  - Using const. exons from: %s" % (const_exons_gff_filename))
    print("  - Outputting to: %s" % (output_dir))
    print("  - Minimum exon size used: %d" % (min_exon_size))

    if not os.path.isdir(output_dir):
        print("Making directory: %s" % (output_dir))
        os.makedirs(output_dir)

    # The return value is not used below. The call is kept because it
    # receives output_dir and presumably writes files there -- TODO confirm.
    exon_utils.get_const_exons_by_gene(const_exons_gff_filename,
                                       output_dir,
                                       # Treat all exons as constitutive
                                       all_constitutive=True,
                                       min_size=min_exon_size)
    filter_reads = not no_bam_filter

    if filter_reads:
        print("Filtering BAM reads")
    else:
        print("Turning off filtering of BAM reads")

    for bam_filename in bams_to_process:
        t1 = time.time()
        output_filename = os.path.join(output_dir,
                                       "%s.insert_len"
                                       % (os.path.basename(bam_filename)))
        if not os.path.isfile(bam_filename):
            print("Cannot find BAM file %s" % (bam_filename))
            print("Quitting...")
            sys.exit(1)
        print("Fetching reads in constitutive exons")
        mapped_bam_filename = exon_utils.map_bam2gff(bam_filename,
                                                     const_exons_gff_filename,
                                                     output_dir)
        if mapped_bam_filename is None:
            raise Exception("Error: Insert length computation failed.")

        # Load mapped BAM filename
        mapped_bam = pysam.Samfile(mapped_bam_filename, "rb")
        # Keep the handle open while the reads are in use and always
        # release it afterwards, including on the skip path and on errors.
        try:
            ###
            ### TODO: Rewrite this so that you only pair reads within an interval
            ###
            paired_reads = sam_utils.pair_sam_reads(mapped_bam,
                                                    filter_reads=filter_reads)
            num_paired_reads = len(paired_reads)

            if num_paired_reads == 0:
                print("WARNING: no paired mates in %s. Skipping...\n"
                      "Are you sure the read IDs match? If your BAM paired flags are "
                      "unset, try using --no-bam-filter."
                      % (bam_filename))
                continue
            print("Using %d paired mates" % (num_paired_reads))
            interval_to_paired_dists = \
                compute_inserts_from_paired_mates(paired_reads)
            summarize_insert_len_dist(interval_to_paired_dists,
                                      output_filename,
                                      sd_max=sd_max)
        finally:
            mapped_bam.close()
        t2 = time.time()
        print("Insert length computation took %.2f seconds." % (t2 - t1))
Example #4
0
def compute_insert_len(bams_to_process,
                       const_exons_gff_filename,
                       output_dir,
                       min_exon_size,
                       no_bam_filter=False,
                       sd_max=2):
    """
    Compute insert length distribution and output it to the given
    directory.

    Arguments:

    - bams_to_process: a list of BAM files to process
    - const_exons_gff_filename: GFF with constitutive exons
    - output_dir: output directory (created if it does not exist); for
      each BAM the result filename is <output_dir>/<BAM basename>.insert_len
    - min_exon_size: minimum exon size, passed as min_size to
      exon_utils.get_const_exons_by_gene
    - no_bam_filter: if True, reads are paired with filter_reads=False
    - sd_max: passed through to summarize_insert_len_dist

    Calls sys.exit(1) if a BAM file does not exist and raises Exception
    if mapping a BAM to the GFF returns None. BAM files that yield no
    paired mates are skipped with a warning.
    """
    bams_str = "\n  ".join(bams_to_process)
    num_bams = len(bams_to_process)
    print("Computing insert length distribution of %d files:\n  %s"
          % (num_bams, bams_str))
    print("  - Using const. exons from: %s" % (const_exons_gff_filename))
    print("  - Outputting to: %s" % (output_dir))
    print("  - Minimum exon size used: %d" % (min_exon_size))

    if not os.path.isdir(output_dir):
        print("Making directory: %s" % (output_dir))
        os.makedirs(output_dir)

    # The return value is not used below. The call is kept because it
    # receives output_dir and presumably writes files there -- TODO confirm.
    exon_utils.get_const_exons_by_gene(const_exons_gff_filename,
                                       output_dir,
                                       # Treat all exons as constitutive
                                       all_constitutive=True,
                                       min_size=min_exon_size)
    filter_reads = not no_bam_filter

    if filter_reads:
        print("Filtering BAM reads")
    else:
        print("Turning off filtering of BAM reads")

    for bam_filename in bams_to_process:
        t1 = time.time()
        output_filename = os.path.join(output_dir,
                                       "%s.insert_len"
                                       % (os.path.basename(bam_filename)))
        if not os.path.isfile(bam_filename):
            print("Cannot find BAM file %s" % (bam_filename))
            print("Quitting...")
            sys.exit(1)
        print("Fetching reads in constitutive exons")
        mapped_bam_filename = exon_utils.map_bam2gff(bam_filename,
                                                     const_exons_gff_filename,
                                                     output_dir)
        if mapped_bam_filename is None:
            raise Exception("Error: Insert length computation failed.")

        # Load mapped BAM filename
        mapped_bam = pysam.Samfile(mapped_bam_filename, "rb")
        # Keep the handle open while the reads are in use and always
        # release it afterwards, including on the skip path and on errors.
        try:
            ###
            ### TODO: Rewrite this so that you only pair reads within an interval
            ###
            paired_reads = sam_utils.pair_sam_reads(mapped_bam,
                                                    filter_reads=filter_reads)
            num_paired_reads = len(paired_reads)

            if num_paired_reads == 0:
                print("WARNING: no paired mates in %s. Skipping...\n"
                      "Are you sure the read IDs match? If your BAM paired flags are "
                      "unset, try using --no-bam-filter."
                      % (bam_filename))
                continue
            print("Using %d paired mates" % (num_paired_reads))
            interval_to_paired_dists = compute_inserts_from_paired_mates(
                paired_reads)
            summarize_insert_len_dist(interval_to_paired_dists,
                                      output_filename,
                                      sd_max=sd_max)
        finally:
            mapped_bam.close()
        t2 = time.time()
        print("Insert length computation took %.2f seconds." % (t2 - t1))