Ejemplo n.º 1
0
def filter_strand_conflicts(bedfile):
    trees = collections.defaultdict(lambda: IntervalTree())    
    genes = []     
    for g in BedGene.parse(open(bedfile)):
        for start, end in g.exons:   
            trees[g.chrom].insert_interval(Interval(start, end, strand=g.strand, chrom=g.chrom))
        genes.append(g)
    exon_count = 0
    new_exon_count = 0
    for g in genes:
        new_exons = []
        for start, end in g.exons:
            exon_count += 1
            ambig = False
            for hit in trees[g.chrom].find(start,end):
                if hit.strand != g.strand:
                    ambig = True
                    logging.debug("Filtering ambiguous exon %s:%d-%d" % (g.chrom, start, end))
                    break
            if not ambig:
                new_exon_count += 1
                new_exons.append((start,end))
        g.exons = new_exons
    logging.info("%d exons" % (exon_count))
    logging.info("%d unambiguous exons" % (new_exon_count))         
    return genes
Ejemplo n.º 2
0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--stranded", dest="stranded", action="store_true", default=False)
    parser.add_argument("--ambiguous", dest="ambiguous", action="store_true", default=False)
    parser.add_argument("--aliases", dest="alias_file", default=None)
    parser.add_argument("bed")
    parser.add_argument("track_files", nargs="+")
    options = parser.parse_args()
    
    alias_dict = {}
    alias_header = []
    if options.alias_file is not None:
        for line in open(options.alias_file):
            if line.startswith("#"):
                alias_header = line.strip()[1:].split('\t')
                continue    
            fields = line.strip().split('\t')
            alias_dict[fields[0]] = fields[1:]
    
    header_fields = alias_header + ["gene_name", "gene_interval", "gene_length"]
    tracks = []
    for track_path in options.track_files:
        name, path = track_path.split("@")
        file_path, h5_path = path.split(":")
        tf = TrackFactory(file_path, "r")
        t = tf.get_track(h5_path)
        tracks.append((name, tf, t, set(t.get_rnames())))
        if options.stranded:
            header_fields.append("%s_sense" % name)
            header_fields.append("%s_antisense" % name)
        else:
            header_fields.append(name)
    # output header
    print '\t'.join(map(str, header_fields))

    # read genes
    if options.ambiguous:
        genes = list(BedGene.parse(open(options.bed)))
    else:
        genes = filter_strand_conflicts(options.bed)
    # get counts
    for g in genes:
        alias_fields = alias_dict.get(g.name, ["None"] * len(alias_header))
        fields = ([g.name] + alias_fields +
                  ["%s[%s]:%d-%d" % (g.chrom, g.strand, g.tx_start, g.tx_end),
                   sum((end-start) for start,end in g.exons)])
        sense_strand = NEG_STRAND if g.strand == "+" else POS_STRAND
        antisense_strand = int(not sense_strand)
        rname_found = False
        for name, tf, t, rnames in tracks:
            if g.chrom not in rnames:
                continue
            rname_found = True        
            if options.stranded:
                sense_count = 0
                antisense_count = 0
                for start, end in g.exons:
                    sense_count += t.count((g.chrom, start, end, sense_strand))
                    antisense_count += t.count((g.chrom, start, end, antisense_strand))
                fields.append(sense_count)
                fields.append(antisense_count)
            else:
                count = 0
                for start, end in g.exons:
                    count += t.count((g.chrom, start, end))
                fields.append(count)
        if rname_found:
            print '\t'.join(map(str, fields))

    for name,tf,t,rnames in tracks:
        tf.close()