Example #1
0
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    '''
    Filter candidate chimeras and keep the best isoform of each.

    Runs in two passes.  The first pass reads chimeras from `input_file`
    and writes those that survive every filter to a temporary file next
    to `output_file`.  The second pass asks
    get_highest_coverage_isoforms() which of the survivors to keep and
    writes only those, as tab-delimited rows, to `output_file`.

    input_file: path of the chimera file read with Chimera.parse()
    output_file: path of the filtered chimera file to write
    index_dir: directory containing config.GENE_FEATURE_FILE
    bam_file: BAM file opened with pysam and handed to
        filter_chimeric_isoform_fraction()
    weighted_unique_frags: threshold handed to filter_weighted_frags()
    median_isize: value handed to filter_chimeric_isoform_fraction()
    max_isize: threshold handed to filter_inner_dist()
    isoform_fraction: threshold handed to
        filter_chimeric_isoform_fraction()
    false_pos_file: path of a false positive chimera list, or None / ""
        to skip the false positive check

    Returns config.JOB_SUCCESS.  Exceptions raised by the helpers
    propagate after the BAM file, the text files and the temporary file
    have been released.
    '''
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f", weighted_unique_frags)
    logging.debug("\tmedian insert size: %d", median_isize)
    logging.debug("\tmax insert size allowed: %d", max_isize)
    logging.debug("\tfraction of wild-type isoform: %f", isoform_fraction)
    logging.debug("\tfalse positive chimeras file: %s", false_pos_file)
    # get false positive chimera list; a truthiness test covers both None
    # and "" without relying on string identity ('is not ""'), which is
    # not guaranteed to hold for equal strings
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            # filter chimeras
            logging.debug("Checking chimeras")
            with open(input_file) as infh:
                with open(tmp_file, "w") as tmpfh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name,
                                         c.partner5p.end,
                                         c.partner3p.tx_name,
                                         c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        # runs last, as in the original filter order; it
                        # is the only filter that reads the BAM file
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
                        num_filtered_chimeras += 1
        finally:
            bamfh.close()
        logging.debug("Total chimeras: %d", num_chimeras)
        logging.debug("Filtered chimeras: %d", num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(tmp_file) as infh:
            with open(output_file, "w") as outfh:
                for c in Chimera.parse(infh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("\tAfter choosing best isoform: %d",
                      num_filtered_chimeras)
    finally:
        # remove the temporary file on failure as well; the existence
        # check keeps a cleanup error from masking the real exception
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    '''
    processes chimera isoforms and chooses the one with the
    highest coverage and omits the rest

    Spanning chimeras read from `input_file` that pass
    filter_insert_size() are first written to a temporary file next to
    `output_file`; the chimeras then yielded by
    choose_highest_coverage_chimeras() are written, as tab-delimited
    rows, to `output_file`.

    input_file: path of the file read with SpanningChimera.parse()
    output_file: path of the filtered file to write
    gene_file: path of the file given to build_gene_to_genome_map()
    mate_pval: accepted for interface compatibility; not used here
    max_isize: threshold handed to filter_insert_size()

    All file handles are closed and the temporary file is removed even
    when a helper raises.
    '''
    # apply more filters
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        with open(input_file) as infh:
            with open(tmpfile, "w") as tmpfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # the gene file stays open while the map is in use, in case the
        # map reads from the handle lazily
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
            logging.debug("Choosing highest coverage chimeras")
            with open(output_file, "w") as outfh:
                for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file; the existence check keeps a cleanup
        # error from masking the real exception
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_spanning_chimeras(input_file, output_file, gene_file, mate_pval,
                             max_isize):
    '''
    processes chimera isoforms and chooses the one with the
    highest coverage and omits the rest

    Spanning chimeras read from `input_file` that pass
    filter_insert_size() are first written to a temporary file next to
    `output_file`; the chimeras then yielded by
    choose_highest_coverage_chimeras() are written, as tab-delimited
    rows, to `output_file`.

    input_file: path of the file read with SpanningChimera.parse()
    output_file: path of the filtered file to write
    gene_file: path of the file given to build_gene_to_genome_map()
    mate_pval: accepted for interface compatibility; not used here
    max_isize: threshold handed to filter_insert_size()

    All file handles are closed and the temporary file is removed even
    when a helper raises.
    '''
    # apply more filters
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        with open(input_file) as infh:
            with open(tmpfile, "w") as tmpfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # the gene file stays open while the map is in use, in case the
        # map reads from the handle lazily
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
            logging.debug("Choosing highest coverage chimeras")
            with open(output_file, "w") as outfh:
                for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file; the existence check keeps a cleanup
        # error from masking the real exception
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_chimeras(input_file, output_file,
                    index_dir, bam_file,
                    weighted_unique_frags,
                    median_isize,
                    max_isize,
                    isoform_fraction,
                    false_pos_file):
    '''
    Filter candidate chimeras and keep the best isoform of each.

    Runs in two passes.  The first pass reads chimeras from `input_file`
    and writes those that survive every filter to a temporary file next
    to `output_file`.  The second pass asks
    get_highest_coverage_isoforms() which of the survivors to keep and
    writes only those, as tab-delimited rows, to `output_file`.

    input_file: path of the chimera file read with Chimera.parse()
    output_file: path of the filtered chimera file to write
    index_dir: directory containing config.GENE_FEATURE_FILE
    bam_file: BAM file opened with pysam and handed to
        filter_chimeric_isoform_fraction()
    weighted_unique_frags: threshold handed to filter_weighted_frags()
    median_isize: value handed to filter_chimeric_isoform_fraction()
    max_isize: threshold handed to filter_inner_dist()
    isoform_fraction: threshold handed to
        filter_chimeric_isoform_fraction()
    false_pos_file: path of a false positive chimera list, or None / ""
        to skip the false positive check

    Returns config.JOB_SUCCESS.  Exceptions raised by the helpers
    propagate after the BAM file, the text files and the temporary file
    have been released.
    '''
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f", weighted_unique_frags)
    logging.debug("\tmedian insert size: %d", median_isize)
    logging.debug("\tmax insert size allowed: %d", max_isize)
    logging.debug("\tfraction of wild-type isoform: %f", isoform_fraction)
    logging.debug("\tfalse positive chimeras file: %s", false_pos_file)
    # get false positive chimera list; a truthiness test covers both None
    # and "" without relying on string identity ('is not ""'), which is
    # not guaranteed to hold for equal strings
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            # filter chimeras
            logging.debug("Checking chimeras")
            with open(input_file) as infh:
                with open(tmp_file, "w") as tmpfh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name,
                                         c.partner5p.end,
                                         c.partner3p.tx_name,
                                         c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        # runs last, as in the original filter order; it
                        # is the only filter that reads the BAM file
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
                        num_filtered_chimeras += 1
        finally:
            bamfh.close()
        logging.debug("Total chimeras: %d", num_chimeras)
        logging.debug("Filtered chimeras: %d", num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(tmp_file) as infh:
            with open(output_file, "w") as outfh:
                for c in Chimera.parse(infh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("\tAfter choosing best isoform: %d",
                      num_filtered_chimeras)
    finally:
        # remove the temporary file on failure as well; the existence
        # check keeps a cleanup error from masking the real exception
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS