def add_gtf_file(gtf_file, outfh, is_ref, sample_id=None):
    """Copy the transcripts of a GTF file to ``outfh``, tagging each record.

    For every transcript the exon features are written first (sorted by
    start coordinate), followed by one 'transcript' feature.  When the
    input has no 'transcript' record for a transcript id, one is built
    from the transcript's exons.

    gtf_file:  path handed to _parse_gtf_by_chrom
    outfh:     writable file-like object receiving the GTF lines
    is_ref:    truthy -> GTFAttr.REF is set to '1', otherwise '0'
    sample_id: if not None, stored under GTFAttr.SAMPLE_ID on the
               transcript feature only (exons are not tagged with it)
    """
    if is_ref:
        ref_flag = '1'
    else:
        ref_flag = '0'
    by_start = operator.attrgetter('start')
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" % (chrom, len(exon_dict)))
        for t_id, exons in exon_dict.iteritems():
            # exons are emitted in ascending start order
            exons.sort(key=by_start)
            for exon in exons:
                exon.attrs[GTFAttr.REF] = ref_flag
                print >>outfh, str(exon)
            if t_id not in transcript_dict:
                # no explicit transcript record: synthesize one that spans
                # from the first exon's start to the last exon's end
                first_exon = exons[0]
                transcript = GTFFeature()
                transcript.seqid = first_exon.seqid
                transcript.source = first_exon.source
                transcript.feature_type = 'transcript'
                transcript.start = first_exon.start
                transcript.end = exons[-1].end
                transcript.score = first_exon.score
                transcript.strand = first_exon.strand
                transcript.phase = '.'
                transcript.attrs = first_exon.attrs.copy()
                # exon_number only makes sense on exon records
                if "exon_number" in transcript.attrs:
                    del transcript.attrs["exon_number"]
            else:
                transcript = transcript_dict[t_id]
            transcript.attrs[GTFAttr.REF] = ref_flag
            if sample_id is not None:
                transcript.attrs[GTFAttr.SAMPLE_ID] = sample_id
            print >>outfh, str(transcript)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--attr', dest='attrs', action='append')
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    gtf_file = args.gtf_file
    comparisons = []
    for attr in args.attrs:
        key, op, value = attr.split()
        if (op == '=') or (op == '=='):
            func = lambda a, b: (a == b)
        elif (op == '!') or (op == '!='):
            func = lambda a, b: (a != b)
        else:
            assert False
        comparisons.append((key, value, func))
    for f in GTFFeature.parse(open(gtf_file)):
        match = True
        for k, v, func in comparisons:
            if not func(v, f.attrs.get(k, None)):
                match = False
                break
        if match:
            print str(f)
Exemple #3
0
def get_gtf_metadata(gtf_file, gtf_attrs):
    """Summarize the exon features of a GTF file per transcript.

    gtf_file:  path of the GTF file to read
    gtf_attrs: list of GTF attribute names to copy onto each metadata
               object (taken from the first exon seen for the transcript,
               '' when absent), or None for no extra attributes.
               NOTE: 'transcript_id' is removed from this list in place
               (first occurrence), so the caller's list is modified.

    Returns a dict mapping transcript_id -> TranscriptMetadata carrying
    chrom, strand, start (min exon start), end (max exon end), length
    (sum of end - start over exons) and num_exons.
    """
    if gtf_attrs is None:
        gtf_attrs = []
    if 'transcript_id' in gtf_attrs:
        gtf_attrs.remove('transcript_id')
    metadata_dict = {}
    # 'with' guarantees the input handle is closed, even on a parse error
    with open(gtf_file) as fileh:
        for feature in GTFFeature.parse(fileh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs["transcript_id"]
            if t_id not in metadata_dict:
                # first exon of this transcript: instantiate new metadata
                m = TranscriptMetadata()
                m.chrom = feature.seqid
                m.strand = feature.strand
                m.start = feature.start
                m.end = feature.end
                for gtf_attr in gtf_attrs:
                    setattr(m, gtf_attr, feature.attrs.get(gtf_attr, ''))
                metadata_dict[t_id] = m
            else:
                m = metadata_dict[t_id]
            # widen the transcript span and accumulate exon totals
            # (assumes TranscriptMetadata() starts length/num_exons at a
            # numeric value -- TODO confirm against its definition)
            m.start = min(m.start, feature.start)
            m.end = max(m.end, feature.end)
            m.length += (feature.end - feature.start)
            m.num_exons += 1
    return metadata_dict
def get_gtf_metadata(gtf_file, gtf_attrs):
    """Build one TranscriptMetadata per transcript from a GTF file's exons.

    Only 'exon' features are considered.  ``gtf_attrs`` (a list, or None)
    names extra GTF attributes copied from the first exon of each
    transcript; 'transcript_id' is stripped from the list in place.
    Returns a dict keyed by transcript_id.
    """
    if gtf_attrs is None:
        gtf_attrs = []
    if 'transcript_id' in gtf_attrs:
        gtf_attrs.remove('transcript_id')
    metadata_dict = {}
    for exon in GTFFeature.parse(open(gtf_file)):
        if exon.feature_type != "exon":
            continue
        t_id = exon.attrs["transcript_id"]
        meta = metadata_dict.get(t_id)
        if meta is None:
            # first time this transcript is seen
            meta = TranscriptMetadata()
            meta.chrom = exon.seqid
            meta.strand = exon.strand
            meta.start = exon.start
            meta.end = exon.end
            for name in gtf_attrs:
                setattr(meta, name, exon.attrs.get(name, ''))
            metadata_dict[t_id] = meta
        # grow the span to cover this exon
        if exon.start < meta.start:
            meta.start = exon.start
        if exon.end > meta.end:
            meta.end = exon.end
        # running totals over all exons of the transcript
        meta.length += (exon.end - exon.start)
        meta.num_exons += 1
    return metadata_dict
Exemple #5
0
def read_gtf_file(filename, library_id):
    """Read a GTF file and rename its transcript/gene ids per library.

    Every distinct transcript id becomes "<library_id>.T<n>" and every
    distinct gene id "<library_id>.G<n>", with n counting from 1 in order
    of first appearance.  The features are modified in place.

    Returns an OrderedDict mapping the new transcript id to the list of
    its features, in file order.
    """
    transcripts = collections.OrderedDict()
    t_id_map = {}
    g_id_map = {}
    for feature in GTFFeature.parse(open(filename)):
        # transcript id: the next number is one past the ids issued so far
        old_t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        if old_t_id not in t_id_map:
            t_id_map[old_t_id] = "%s.T%d" % (library_id, len(t_id_map) + 1)
        new_t_id = t_id_map[old_t_id]
        # gene id: same scheme with its own counter
        old_g_id = feature.attrs[GTFAttr.GENE_ID]
        if old_g_id not in g_id_map:
            g_id_map[old_g_id] = "%s.G%d" % (library_id, len(g_id_map) + 1)
        new_g_id = g_id_map[old_g_id]
        # overwrite the ids on the feature itself
        feature.attrs[GTFAttr.TRANSCRIPT_ID] = new_t_id
        feature.attrs[GTFAttr.GENE_ID] = new_g_id
        transcripts.setdefault(new_t_id, []).append(feature)
    return transcripts
Exemple #6
0
def read_gtf_file(filename, library_id):
    """Load a GTF file, renaming transcript and gene ids with a library prefix.

    filename:   path of the GTF file
    library_id: prefix used for the new ids

    New ids have the form "<library_id>.T<n>" / "<library_id>.G<n>" where
    n starts at 1 and increases in order of first appearance.  Feature
    attributes are updated in place.

    Returns a collections.OrderedDict: new transcript id -> list of
    features (file order).
    """
    # read all transcripts
    t_dict = collections.OrderedDict()
    cur_t_id = 1
    cur_g_id = 1
    t_id_map = {}
    g_id_map = {}
    # 'with' closes the input handle even if parsing raises
    with open(filename) as fileh:
        for feature in GTFFeature.parse(fileh):
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            # rename transcript id
            if t_id not in t_id_map:
                new_t_id = "%s.T%d" % (library_id, cur_t_id)
                t_id_map[t_id] = new_t_id
                cur_t_id += 1
            else:
                new_t_id = t_id_map[t_id]
            # rename gene id
            g_id = feature.attrs[GTFAttr.GENE_ID]
            if g_id not in g_id_map:
                new_g_id = "%s.G%d" % (library_id, cur_g_id)
                g_id_map[g_id] = new_g_id
                cur_g_id += 1
            else:
                new_g_id = g_id_map[g_id]
            # update transcript attributes
            feature.attrs[GTFAttr.TRANSCRIPT_ID] = new_t_id
            feature.attrs[GTFAttr.GENE_ID] = new_g_id
            # store feature under its new transcript id
            t_dict.setdefault(new_t_id, []).append(feature)
    return t_dict
def main():
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--attr", dest="attrs", action="append")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    gtf_file = args.gtf_file
    comparisons = []
    for attr in args.attrs:
        key, op, value = attr.split()
        if (op == "=") or (op == "=="):
            func = lambda a, b: (a == b)
        elif (op == "!") or (op == "!="):
            func = lambda a, b: (a != b)
        else:
            assert False
        comparisons.append((key, value, func))
    for f in GTFFeature.parse(open(gtf_file)):
        match = True
        for k, v, func in comparisons:
            if not func(v, f.attrs.get(k, None)):
                match = False
                break
        if match:
            print str(f)
def read_reference_gtf(ref_gtf_file):
    """Index the genes of a reference GTF file by chromosome and locus.

    Features are grouped into Gene objects by their "gene_id" attribute.
    Genes are then clustered into loci per chromosome, and for each locus
    an IntervalTree of the (clustered) exons of its genes is built.

    Returns a defaultdict: chrom -> IntervalTree whose intervals span one
    locus each and carry that locus' exon IntervalTree as ``value``; the
    exon intervals carry their Gene as ``value``.
    """
    genes_by_id = {}
    for feat in GTFFeature.parse(open(ref_gtf_file)):
        gene_id = feat.attrs["gene_id"]
        if gene_id in genes_by_id:
            gene = genes_by_id[gene_id]
        else:
            # first feature of this gene seeds its coordinates
            gene = Gene()
            gene.gene_id = gene_id
            gene.chrom = feat.seqid
            gene.strand = feat.strand
            gene.gene_start = feat.start
            gene.gene_end = feat.end
            genes_by_id[gene_id] = gene
        # extend the gene span and record what this feature contributes
        gene.gene_start = min(gene.gene_start, feat.start)
        gene.gene_end = max(gene.gene_end, feat.end)
        if feat.feature_type == "exon":
            gene.exons.add((feat.start, feat.end))
        elif feat.feature_type == "CDS":
            gene.is_coding = True
        if "gene_name" in feat.attrs:
            gene.gene_names.add(feat.attrs["gene_name"])
        gene.annotation_sources.add(feat.source)
    logging.info("Sorting genes")
    sort_key = operator.attrgetter('chrom', 'gene_start')
    genes = sorted(genes_by_id.values(), key=sort_key)
    del genes_by_id
    # cluster genes into loci, one cluster tree per chromosome
    logging.debug("Building interval index")
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    locus_trees = collections.defaultdict(IntervalTree)
    for gene_index, gene in enumerate(genes):
        locus_cluster_trees[gene.chrom].insert(gene.gene_start,
                                               gene.gene_end,
                                               gene_index)
    for chrom, locus_clusters in locus_cluster_trees.iteritems():
        for locus_start, locus_end, gene_indexes in locus_clusters.getregions():
            exon_tree = IntervalTree()
            for gene_index in gene_indexes:
                gene = genes[gene_index]
                # merge the gene's exons into clusters
                exon_clusterer = ClusterTree(0, 1)
                for start, end in gene.exons:
                    exon_clusterer.insert(start, end, 1)
                # the gene's exon set is replaced by the clustered list
                gene.exons = [(start, end)
                              for start, end, _ in exon_clusterer.getregions()]
                del exon_clusterer
                for start, end in gene.exons:
                    exon_tree.insert_interval(Interval(start, end, value=gene))
            # one interval per locus, carrying the locus' exon tree
            locus_interval = Interval(locus_start, locus_end, value=exon_tree)
            locus_trees[chrom].insert_interval(locus_interval)
    logging.debug("Done indexing reference GTF file")
    return locus_trees
def read_reference_gtf(ref_gtf_file):
    """Build a per-chromosome locus index from a reference GTF file.

    ref_gtf_file: path of the reference GTF file

    Features are collected into Gene objects keyed by "gene_id"; each
    gene's span, exon coordinates, coding flag (any CDS feature), gene
    names and annotation sources are accumulated.  Genes are clustered
    into loci per chromosome and each gene's exons are clustered and
    stored back on the gene as a list of (start, end) tuples.

    Returns a defaultdict mapping chrom -> IntervalTree.  Each interval
    covers one locus and its ``value`` is an IntervalTree of exon
    intervals whose ``value`` is the owning Gene.
    """
    gene_map = {}
    # 'with' closes the reference file even if parsing raises
    with open(ref_gtf_file) as fileh:
        for f in GTFFeature.parse(fileh):
            # get gene by id
            gene_id = f.attrs["gene_id"]
            if gene_id not in gene_map:
                g = Gene()
                g.gene_id = gene_id
                g.chrom = f.seqid
                g.strand = f.strand
                g.gene_start = f.start
                g.gene_end = f.end
                gene_map[gene_id] = g
            else:
                g = gene_map[gene_id]
            # update gene
            g.gene_start = min(g.gene_start, f.start)
            g.gene_end = max(g.gene_end, f.end)
            if f.feature_type == "exon":
                g.exons.add((f.start, f.end))
            elif f.feature_type == "CDS":
                g.is_coding = True
            if "gene_name" in f.attrs:
                g.gene_names.add(f.attrs["gene_name"])
            g.annotation_sources.add(f.source)
    logging.info("Sorting genes")
    genes = sorted(gene_map.values(), key=operator.attrgetter('chrom', 'gene_start'))
    del gene_map
    # cluster loci
    logging.debug("Building interval index")
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for i, g in enumerate(genes):
        locus_cluster_trees[g.chrom].insert(g.gene_start, g.gene_end, i)
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, locus_cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, gene_indexes in locus_cluster_tree.getregions():
            # cluster gene exons and add to interval tree
            exon_tree = IntervalTree()
            for i in gene_indexes:
                g = genes[i]
                # distinct names here: the loop variables of the enclosing
                # loops must not be rebound while those loops are running
                exon_cluster_tree = ClusterTree(0, 1)
                for start, end in g.exons:
                    exon_cluster_tree.insert(start, end, 1)
                # update exons
                g.exons = [(start, end)
                           for start, end, _ in exon_cluster_tree.getregions()]
                del exon_cluster_tree
                for start, end in g.exons:
                    exon_tree.insert_interval(Interval(start, end, value=g))
            # add to locus interval tree
            locus_trees[chrom].insert_interval(Interval(locus_start, locus_end, value=exon_tree))
    logging.debug("Done indexing reference GTF file")
    return locus_trees
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true", 
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")   
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:       %s" % (args.verbose))
    logging.info("ref gtf file:          %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file:     %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end), '%s|%s|%s' % (m,t,c), '0', strand_int_to_str(strand)]
            print '\t'.join(fields)    
    return 0
Exemple #11
0
def get_gtf_metadata(gtf_file,
                     omit_attrs=None,
                     group_by="gene_id",
                     feature_type="exon"):
    """Tabulate metadata for groups of GTF features.

    Features of type ``feature_type`` are grouped by the value of their
    ``group_by`` attribute.  For each group the features are clustered
    and a row is produced holding the group id, a "chrom:start-end"
    locus string, the strand of the first feature, the number of
    clusters, their summed length (end - start), and one column per GTF
    attribute seen in the file (except those in ``omit_attrs``).
    Attribute values are split on ',' and joined back sorted and
    de-duplicated; 'NA' marks attributes absent from a group.

    Returns (metadata_fields, metadata_dict): the column names and a dict
    mapping group id -> row (list).
    """
    if omit_attrs is None:
        omit_attrs = []
    # group the selected features and gather every attribute name
    features_by_id = collections.defaultdict(list)
    attr_names = set()
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != feature_type:
            continue
        group_id = feature.attrs[group_by]
        features_by_id[group_id].append(feature)
        attr_names.update(feature.attrs.keys())
    attr_names.difference_update(omit_attrs)
    sorted_attr_names = sorted(attr_names)
    metadata_fields = ["tracking_id", "locus", "strand", "num_exons", "transcript_length"]
    metadata_fields = metadata_fields + sorted_attr_names
    column_of = dict((name, pos) for pos, name in enumerate(metadata_fields))
    metadata_dict = {}
    for feature_id, features in features_by_id.iteritems():
        values_by_attr = collections.defaultdict(set)
        cluster_tree = ClusterTree(0, 1)
        for i, f in enumerate(features):
            cluster_tree.insert(f.start, f.end, i)
            for k, v in f.attrs.iteritems():
                if k in attr_names:
                    # some attributes have multiple values separated by a comma
                    values_by_attr[k].update(v.split(','))
        # clustered regions stand in for the group's exons
        exon_clusters = [(start, end)
                         for start, end, _ in cluster_tree.getregions()]
        transcript_length = sum(end - start for start, end in exon_clusters)
        del cluster_tree
        first = features[0]
        locus_start = min(region[0] for region in exon_clusters)
        locus_end = max(region[1] for region in exon_clusters)
        locus_string = "%s:%d-%d" % (first.seqid, locus_start, locus_end)
        # fixed columns first, then one 'NA' placeholder per attribute
        row = [feature_id, locus_string, first.strand,
               len(exon_clusters), transcript_length]
        row.extend(['NA'] * len(sorted_attr_names))
        for k, vals in values_by_attr.iteritems():
            row[column_of[k]] = ','.join(map(str, sorted(vals)))
        metadata_dict[row[0]] = row
    return metadata_fields, metadata_dict
def main():
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("gtf file %s not found" % (args.gtf_file))
    cur_t_id = 1
    cur_g_id = 1
    cur_tss_id = 1
    t_id_map = {}
    g_id_map = {}
    tss_id_map = {}
    for feature in GTFFeature.parse(open(args.gtf_file)):
        t_id = feature.attrs['transcript_id']
        g_id = feature.attrs['gene_id']
        tss_id = feature.attrs['tss_id']
        if t_id not in t_id_map:
            new_t_id = "T%06d" % (cur_t_id)
            t_id_map[t_id] = new_t_id
            cur_t_id += 1
        else:
            new_t_id = t_id_map[t_id]
        if g_id not in g_id_map:
            new_g_id = "G%06d" % (cur_g_id)
            g_id_map[g_id] = new_g_id
            cur_g_id += 1
        else:
            new_g_id = g_id_map[g_id]
        if tss_id not in tss_id_map:
            new_tss_id = "TSS%06d" % (cur_tss_id)
            tss_id_map[tss_id] = new_tss_id
            cur_tss_id += 1
        else:
            new_tss_id = tss_id_map[tss_id]
        # update transcript attributes
        new_attrs = {
            'transcript_id': new_t_id,
            'gene_id': new_g_id,
            'tss_id': new_tss_id
        }
        if 'exon_number' in feature.attrs:
            new_attrs['exon_number'] = feature.attrs['exon_number']
        feature.attrs = new_attrs
        print feature
    return 0
def main():
    # setup logging
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("gtf file %s not found" % (args.gtf_file))
    cur_t_id = 1
    cur_g_id = 1
    cur_tss_id = 1
    t_id_map = {}
    g_id_map = {}
    tss_id_map = {}
    for feature in GTFFeature.parse(open(args.gtf_file)):
        t_id = feature.attrs['transcript_id']
        g_id = feature.attrs['gene_id']
        tss_id = feature.attrs['tss_id']
        if t_id not in t_id_map:
            new_t_id = "T%06d" % (cur_t_id)
            t_id_map[t_id] = new_t_id
            cur_t_id += 1
        else:
            new_t_id = t_id_map[t_id]
        if g_id not in g_id_map:
            new_g_id = "G%06d" % (cur_g_id)
            g_id_map[g_id] = new_g_id
            cur_g_id += 1
        else:
            new_g_id = g_id_map[g_id]
        if tss_id not in tss_id_map:
            new_tss_id = "TSS%06d" % (cur_tss_id)
            tss_id_map[tss_id] = new_tss_id
            cur_tss_id += 1
        else:
            new_tss_id = tss_id_map[tss_id]
        # update transcript attributes
        new_attrs = {'transcript_id': new_t_id,
                     'gene_id': new_g_id,
                     'tss_id': new_tss_id}
        if 'exon_number' in feature.attrs:
            new_attrs['exon_number'] = feature.attrs['exon_number']
        feature.attrs = new_attrs
        print feature
    return 0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--group-by", dest="group_by", default="gene_id")
    parser.add_argument("gtf_file")
    parser.add_argument("gtf_attr")
    args = parser.parse_args()
    d = collections.defaultdict(lambda: set())
    for f in GTFFeature.parse(open(args.gtf_file)):
        if args.gtf_attr in f.attrs:
            v = f.attrs[args.gtf_attr]
            f_id = f.attrs[args.group_by]
            d[v].add(f_id)
    for k in sorted(d):
        print k, len(d[k])
Exemple #15
0
 def parse_gtf(filename):
     """Yield one Feature per transcript found in a GTF file.

     The whole file is read first; only 'exon' features are kept and
     grouped by their 'transcript_id' attribute.  Each group is then
     converted with Feature.from_gtf_features and yielded.
     """
     exons_by_transcript = collections.defaultdict(list)
     for line_num, feat in enumerate(GTFFeature.parse(open(filename)), 1):
         # progress message every 100000 features
         if (line_num % 100000) == 0:
             logging.debug('parse_gtf read %d lines' % (line_num))
         if feat.feature_type == 'exon':
             exons_by_transcript[feat.attrs['transcript_id']].append(feat)
     num_transcripts = 0
     for exons in exons_by_transcript.itervalues():
         yield Feature.from_gtf_features(exons)
         num_transcripts += 1
     logging.debug('Parsed %d transcripts' % (num_transcripts))
Exemple #16
0
 def parse_gtf(filename):
     """Generate a Feature for every transcript in a GTF file.

     filename: path of the GTF file

     All 'exon' features are read and grouped by 'transcript_id' before
     anything is yielded; other feature types are ignored.  Each group is
     turned into a Feature with Feature.from_gtf_features.
     """
     # read all transcripts
     t_dict = collections.defaultdict(lambda: [])
     i = 0
     # the file is fully consumed before the first yield, so it can be
     # closed right after the read loop
     with open(filename) as fileh:
         for f in GTFFeature.parse(fileh):
             i += 1
             if (i % 100000) == 0:
                 logging.debug('parse_gtf read %d lines' % (i))
             if f.feature_type != 'exon':
                 continue
             t_id = f.attrs['transcript_id']
             t_dict[t_id].append(f)
     i = 0
     for features in t_dict.itervalues():
         yield Feature.from_gtf_features(features)
         i += 1
     logging.debug('Parsed %d transcripts' % (i))
Exemple #17
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--group-by", dest="group_by", default="gene_id")
    parser.add_argument("gtf_file")
    parser.add_argument("gtf_attr")
    args = parser.parse_args()
    d = collections.defaultdict(lambda: set())
    for f in GTFFeature.parse(open(args.gtf_file)):
        if args.gtf_attr in f.attrs:
            v = f.attrs[args.gtf_attr]
            f_id = f.attrs[args.group_by]
            d[v].add(f_id)
    for k in sorted(d):
        print k, len(d[k])
def _parse_gtf_by_chrom(gtf_file):
    """Yield (chrom, transcript_dict, exon_dict) for each run of a chromosome.

    Features are read in file order; a new batch starts whenever the
    seqid changes.  ``transcript_dict`` maps transcript id -> 'transcript'
    feature and ``exon_dict`` maps transcript id -> list of 'exon'
    features.  A batch is yielded only if it contains at least one exon.
    """
    chrom = None
    exons_by_transcript = collections.defaultdict(list)
    transcripts = {}
    for feat in GTFFeature.parse(open(gtf_file)):
        if feat.seqid != chrom:
            # chromosome boundary: hand over what was collected so far
            if exons_by_transcript:
                yield chrom, transcripts, exons_by_transcript
                exons_by_transcript = collections.defaultdict(list)
                transcripts = {}
            chrom = feat.seqid
        t_id = feat.attrs[GTFAttr.TRANSCRIPT_ID]
        kind = feat.feature_type
        if kind == "transcript":
            transcripts[t_id] = feat
        elif kind == "exon":
            exons_by_transcript[t_id].append(feat)
    # last chromosome in the file
    if exons_by_transcript:
        yield chrom, transcripts, exons_by_transcript
def _parse_gtf_by_chrom(gtf_file):
    """Group the features of a GTF file by consecutive chromosome.

    gtf_file: path of the GTF file; features of one chromosome are
              expected to be contiguous (a batch ends when seqid changes)

    Yields (chrom, transcript_dict, exon_dict) tuples where
    transcript_dict maps transcript id -> 'transcript' feature and
    exon_dict maps transcript id -> list of 'exon' features.  Batches
    without any exon are not yielded.
    """
    current_chrom = None
    exon_dict = collections.defaultdict(lambda: [])
    transcript_dict = {}
    # 'with' closes the file when the generator finishes or is closed
    with open(gtf_file) as fileh:
        for feature in GTFFeature.parse(fileh):
            if (current_chrom != feature.seqid):
                if len(exon_dict) > 0:
                    yield current_chrom, transcript_dict, exon_dict
                # always start the new chromosome from empty containers so
                # transcript records of an exon-less chromosome cannot
                # leak into the next batch
                exon_dict = collections.defaultdict(lambda: [])
                transcript_dict = {}
                current_chrom = feature.seqid
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            if feature.feature_type == "transcript":
                transcript_dict[t_id] = feature
            elif feature.feature_type == "exon":
                exon_dict[t_id].append(feature)
    # flush the final chromosome
    if len(exon_dict) > 0:
        yield current_chrom, transcript_dict, exon_dict
def split_gtf_file(gtf_file,
                   split_dir,
                   ref_gtf_file,
                   category_stats_file,
                   bufsize=(1 << 30)):
    """Split a merged GTF file into reference and per-library files.

    Lines whose GTFAttr.REF attribute is non-zero are written to
    ``ref_gtf_file``.  Every other line goes, unchanged, to
    "<split_dir>/<library_id>.gtf" through a BufferedFileSplitter built
    with ``bufsize``.  For 'transcript' features, per-library counts and
    summed scores are kept per category and finally written as a
    tab-separated table to ``category_stats_file``.
    """
    def library_path(myid):
        return os.path.join(split_dir, "%s.gtf" % (myid))

    splitter = BufferedFileSplitter(library_path, bufsize)
    ref_fileh = open(ref_gtf_file, 'w')
    stats_dict = collections.defaultdict(CategoryStats)
    logging.info("Splitting transcripts by library")
    for line in open(gtf_file):
        f = GTFFeature.from_string(line)
        if bool(int(f.attrs[GTFAttr.REF])):
            # reference features are kept apart from the libraries
            print >> ref_fileh, str(f)
            continue
        library_id = f.attrs[GTFAttr.LIBRARY_ID]
        if f.feature_type == 'transcript':
            # test transcripts are counted under Category.SAME_STRAND
            if bool(int(f.attrs[GTFAttr.TEST])):
                category = Category.SAME_STRAND
            else:
                category = int(f.attrs[GTFAttr.CATEGORY])
            score = float(f.attrs[GTFAttr.SCORE])
            library_stats = stats_dict[library_id]
            library_stats.library_id = library_id
            library_stats.counts[category] += 1
            library_stats.signal[category] += score
        # the original line is forwarded untouched
        splitter.write(library_id, line)
    ref_fileh.close()
    splitter.close()
    logging.debug("Buffer flushes: %d" % (splitter.flushes))
    # one header line, then one line per library
    logging.info("Writing category statistics")
    stats_fileh = open(category_stats_file, "w")
    print >> stats_fileh, '\t'.join(CategoryStats.header_fields())
    for library_stats in stats_dict.itervalues():
        print >> stats_fileh, '\t'.join(map(str, library_stats.to_fields()))
    stats_fileh.close()
def classify_library_transcripts(args):
    """Classify one library's transcripts as expressed or background.

    args: (library_id, output_dir) tuple.  All files share the prefix
          "<output_dir>/<library_id>".

    Writes the observation table (write_transcript_table), runs the R
    classification script with output captured in "<prefix>.log", then
    copies each feature of "<prefix>.gtf" to "<prefix>.expr.gtf" or
    "<prefix>.bkgd.gtf" according to its prediction, adding the
    GTFAttr.LOG10LR attribute.

    Returns (retcode, library_id); a non-zero retcode means the R script
    failed and no partitioning was done.
    """
    library_id, output_dir = args
    prefix = os.path.join(output_dir, library_id)
    # file names derived from the prefix
    input_gtf_file = prefix + ".gtf"
    logfile = prefix + ".log"
    tablefile = prefix + '.inp.txt'
    output_res_file = prefix + ".out.txt"
    expr_gtf_file = prefix + ".expr.gtf"
    bkgd_gtf_file = prefix + ".bkgd.gtf"
    logging.debug("[STARTED]  library_id='%s'" % (library_id))
    write_transcript_table(input_gtf_file, tablefile)
    # stdout and stderr of the R script both go to the log file
    logfh = open(logfile, "w")
    command = ["Rscript", "--vanilla", CLASSIFY_R_SCRIPT, prefix]
    retcode = subprocess.call(command, stdout=logfh, stderr=logfh)
    logfh.close()
    if retcode != 0:
        logging.error("[FAILED]   library_id='%s'" % (library_id))
        return retcode, library_id
    decision_dict = read_classify_decisions(output_res_file)
    expr_fileh = open(expr_gtf_file, 'w')
    bkgd_fileh = open(bkgd_gtf_file, 'w')
    # indexed by int(pred): 0 -> background, 1 -> expressed
    output_file_handles = [bkgd_fileh, expr_fileh]
    for feature in GTFFeature.parse(open(input_gtf_file)):
        decision = decision_dict[feature.attrs[GTFAttr.TRANSCRIPT_ID]]
        feature.attrs[GTFAttr.LOG10LR] = decision.log10lr
        print >> output_file_handles[int(decision.pred)], str(feature)
    for fileh in output_file_handles:
        fileh.close()
    logging.debug("[FINISHED] library_id='%s'" % (library_id))
    return retcode, library_id
Exemple #22
0
def get_gtf_metadata(gtf_file, metadata_file):
    """Write a per-transcript metadata table for a GTF file.

    gtf_file:      path of the GTF file to read ('exon' features only)
    metadata_file: path of the tab-separated table to write

    Each row holds the transcript id, a "chrom:start-end" locus string,
    strand, exon count, summed exon length (end - start) and the values
    of the attributes listed in the module-level GTF_ATTRS (taken from
    the first exon of the transcript, '' when absent).  Rows are sorted
    by transcript id.

    Returns the sorted list of transcript ids.
    """
    # read gtf file and group by transcript
    metadata_dict = {}
    # 'with' closes the input handle even if parsing raises
    with open(gtf_file) as gtf_fileh:
        for feature in GTFFeature.parse(gtf_fileh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs["transcript_id"]
            if t_id not in metadata_dict:
                # instantiate new metadata
                m = TranscriptMetadata()
                m.chrom = feature.seqid
                m.strand = feature.strand
                m.start = feature.start
                m.end = feature.end
                for attr in GTF_ATTRS_SET:
                    setattr(m, attr, feature.attrs.get(attr, ''))
                metadata_dict[t_id] = m
            else:
                m = metadata_dict[t_id]
            # update metadata
            m.start = min(m.start, feature.start)
            m.end = max(m.end, feature.end)
            m.length += (feature.end - feature.start)
            m.num_exons += 1
    header_fields = [
        'tracking_id', 'locus', 'strand', 'num_exons', 'transcript_length'
    ] + GTF_ATTRS
    tracking_ids = sorted(metadata_dict)
    # the output handle is released even if a row fails to format
    with open(metadata_file, 'w') as fileh:
        print >> fileh, '\t'.join(header_fields)
        for t_id in tracking_ids:
            m = metadata_dict[t_id]
            fields = [
                t_id,
                '%s:%d-%d' % (m.chrom, m.start, m.end), m.strand, m.num_exons,
                m.length
            ]
            for attr in GTF_ATTRS:
                fields.append(getattr(m, attr))
            print >> fileh, '\t'.join(map(str, fields))
    return tracking_ids
def classify_library_transcripts(args):
    """Run the R classifier on one library and partition its transcripts.

    args: (library_id, output_dir) tuple; every file used is named
          "<output_dir>/<library_id><suffix>".

    Steps: write the observation table "<prefix>.inp.txt" from
    "<prefix>.gtf", run CLASSIFY_R_SCRIPT through Rscript (stdout and
    stderr go to "<prefix>.log"), read the decisions from
    "<prefix>.out.txt" and write each feature, tagged with
    GTFAttr.LOG10LR, to "<prefix>.bkgd.gtf" or "<prefix>.expr.gtf".

    Returns (retcode, library_id).  When the R script exits non-zero the
    failure is logged and the GTF file is not partitioned.
    """
    library_id, output_dir = args
    prefix = os.path.join(output_dir, library_id)
    # input files
    input_gtf_file = prefix + ".gtf"
    logfile = prefix + ".log"
    tablefile = prefix + ".inp.txt"
    # output files
    output_res_file = prefix + ".out.txt"
    expr_gtf_file = prefix + ".expr.gtf"
    bkgd_gtf_file = prefix + ".bkgd.gtf"
    # write table of observations
    logging.debug("[STARTED]  library_id='%s'" % (library_id))
    write_transcript_table(input_gtf_file, tablefile)
    # run R script to do classification; the log handle is closed even if
    # Rscript cannot be started
    with open(logfile, "w") as logfh:
        retcode = subprocess.call(
            ["Rscript", "--vanilla", CLASSIFY_R_SCRIPT, prefix],
            stdout=logfh,
            stderr=logfh)
    if retcode != 0:
        logging.error("[FAILED]   library_id='%s'" % (library_id))
        return retcode, library_id
    # get transcript predictions
    decision_dict = read_classify_decisions(output_res_file)
    # partition input into expressed vs background
    expr_fileh = open(expr_gtf_file, "w")
    try:
        bkgd_fileh = open(bkgd_gtf_file, "w")
        try:
            # indexed by int(pred): 0 -> background, 1 -> expressed
            output_file_handles = [bkgd_fileh, expr_fileh]
            with open(input_gtf_file) as input_fileh:
                for feature in GTFFeature.parse(input_fileh):
                    t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
                    dinf = decision_dict[t_id]
                    feature.attrs[GTFAttr.LOG10LR] = dinf.log10lr
                    fileh = output_file_handles[int(dinf.pred)]
                    print >> fileh, str(feature)
        finally:
            bkgd_fileh.close()
    finally:
        expr_fileh.close()
    logging.debug("[FINISHED] library_id='%s'" % (library_id))
    return retcode, library_id
def split_gtf_file(gtf_file, split_dir, ref_gtf_file, category_stats_file, bufsize=(1 << 30)):
    """Split a merged GTF into a reference file plus one file per library.

    Features whose ``GTFAttr.REF`` attribute is non-zero are written to
    ``ref_gtf_file``.  All other lines are routed, unchanged, through a
    ``BufferedFileSplitter`` to ``<split_dir>/<library_id>.gtf``.  For every
    non-reference ``transcript`` feature the per-library category count and
    summed score are accumulated and finally written as a tab-delimited
    table to ``category_stats_file``.

    :param gtf_file: path of the merged input GTF
    :param split_dir: directory receiving the per-library GTF files
    :param ref_gtf_file: output path for reference features
    :param category_stats_file: output path for the statistics table
    :param bufsize: buffer size handed to ``BufferedFileSplitter``
    """
    # split input gtf by library and mark test ids
    keyfunc = lambda myid: os.path.join(split_dir, "%s.gtf" % (myid))
    bufobj = BufferedFileSplitter(keyfunc, bufsize)
    stats_dict = collections.defaultdict(lambda: CategoryStats())
    logging.info("Splitting transcripts by library")
    # context managers close both handles even if parsing fails midway
    with open(ref_gtf_file, "w") as ref_fileh, open(gtf_file) as gtf_fileh:
        for line in gtf_fileh:
            f = GTFFeature.from_string(line)
            is_ref = bool(int(f.attrs[GTFAttr.REF]))
            if is_ref:
                print >> ref_fileh, str(f)
                continue
            library_id = f.attrs[GTFAttr.LIBRARY_ID]
            # keep statistics
            if f.feature_type == "transcript":
                is_test = bool(int(f.attrs[GTFAttr.TEST]))
                if is_test:
                    # test transcripts are always counted as same-strand
                    category = Category.SAME_STRAND
                else:
                    category = int(f.attrs[GTFAttr.CATEGORY])
                score = float(f.attrs[GTFAttr.SCORE])
                statsobj = stats_dict[library_id]
                statsobj.library_id = library_id
                statsobj.counts[category] += 1
                statsobj.signal[category] += score
            # write features from each library to separate files
            bufobj.write(library_id, line)
    # flush and close the per-library files
    bufobj.close()
    logging.debug("Buffer flushes: %d" % (bufobj.flushes))
    # write library category statistics
    logging.info("Writing category statistics")
    with open(category_stats_file, "w") as fh:
        print >> fh, "\t".join(CategoryStats.header_fields())
        for statsobj in stats_dict.itervalues():
            fields = statsobj.to_fields()
            print >> fh, "\t".join(map(str, fields))
def get_gtf_metadata(gtf_file, metadata_file):
    """Summarise every transcript of a GTF file into a tab-delimited table.

    Only ``exon`` features are read.  For each ``transcript_id`` the
    chromosome and strand of its first exon, the minimum start, the maximum
    end, the summed ``end - start`` of its exons and the exon count are
    collected, together with the attributes named in ``GTF_ATTRS_SET``
    (taken from the first exon, '' when absent).

    :param gtf_file: path of the input GTF
    :param metadata_file: path of the table to write; one row per transcript,
        sorted by transcript id
    :returns: the sorted list of transcript ids
    """
    metadata_dict = {}
    # the context manager closes the input, which was previously leaked
    with open(gtf_file) as gtf_fileh:
        for feature in GTFFeature.parse(gtf_fileh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs["transcript_id"]
            m = metadata_dict.get(t_id)
            if m is None:
                # instantiate new metadata from the first exon seen
                m = TranscriptMetadata()
                m.chrom = feature.seqid
                m.strand = feature.strand
                m.start = feature.start
                m.end = feature.end
                for attr in GTF_ATTRS_SET:
                    setattr(m, attr, feature.attrs.get(attr, ''))
                metadata_dict[t_id] = m
            # widen the transcript span and accumulate exon statistics
            m.start = feature.start if feature.start < m.start else m.start
            m.end = feature.end if feature.end > m.end else m.end
            m.length += (feature.end - feature.start)
            m.num_exons += 1
    header_fields = ['tracking_id', 'locus', 'strand', 'num_exons', 'transcript_length'] + GTF_ATTRS
    tracking_ids = sorted(metadata_dict)
    with open(metadata_file, 'w') as fileh:
        print >>fileh, '\t'.join(header_fields)
        for t_id in tracking_ids:
            m = metadata_dict[t_id]
            fields = [t_id,
                      '%s:%d-%d' % (m.chrom, m.start, m.end),
                      m.strand,
                      m.num_exons,
                      m.length]
            for attr in GTF_ATTRS:
                fields.append(getattr(m, attr))
            print >>fileh, '\t'.join(map(str, fields))
    return tracking_ids
Exemple #26
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', dest='source', default=None)
    parser.add_argument('--remove', dest='remove', action='append', default=[])
    parser.add_argument('-f', dest='force', action='store_true')
    parser.add_argument('--add', dest='add', action='append', default=[])
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    source = args.source
    force = args.force
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    add_attrs = []
    for arg in args.add:
        k, v = arg.split(',')
        add_attrs.append((k, v))
    rm_attrs = []
    for arg in args.remove:
        rm_attrs.append(arg)
    i = 0
    for f in GTFFeature.parse(open(args.gtf_file)):
        if source is not None:
            f.source = source
        for k, v in add_attrs:
            if (k in f.attrs) and not force:
                parser.error('attribute %s already in feature' % (k))
            f.attrs[k] = v
        for k in rm_attrs:
            if k in f.attrs:
                del f.attrs[k]
        print str(f)
        i += 1
        if (i % 100000) == 0:
            logging.debug('finished %d lines' % (i))
    return 0
Exemple #27
0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--source', dest='source', default=None)
    parser.add_argument('--remove', dest='remove', action='append', default=[])
    parser.add_argument('-f', dest='force', action='store_true')
    parser.add_argument('--add', dest='add', action='append', default=[])
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    source = args.source
    force = args.force
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    add_attrs = []
    for arg in args.add:
        k,v = arg.split(',')
        add_attrs.append((k,v))
    rm_attrs = []
    for arg in args.remove:
        rm_attrs.append(arg)
    i = 0
    for f in GTFFeature.parse(open(args.gtf_file)):
        if source is not None:
            f.source = source
        for k,v in add_attrs:
            if (k in f.attrs) and not force:
                parser.error('attribute %s already in feature' % (k))
            f.attrs[k] = v
        for k in rm_attrs:
            if k in f.attrs:
                del f.attrs[k]
        print str(f)
        i += 1
        if (i % 100000) == 0:
            logging.debug('finished %d lines' % (i))
    return 0
Exemple #28
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--add', dest='add', action='append', default=[])
    parser.add_argument("metadata_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    add_attrs = set()
    for arg in args.add:
        add_attrs.add(arg)
    add_attrs = sorted(add_attrs)

    # read metadata
    meta_dict = {}
    with open(args.metadata_file) as f:
        header_fields = f.next().strip().split('\t')
        t_id_index = header_fields.index('transcript_id')
        header_indexes = [header_fields.index(x) for x in add_attrs]

        for line in f:
            fields = line.strip().split('\t')
            meta_dict[fields[t_id_index]] = [fields[i] for i in header_indexes]
            #print fields[t_id_index], meta_dict[fields[t_id_index]]

    # read GTF
    for f in GTFFeature.parse(open(args.gtf_file)):
        t_id = f.attrs['transcript_id']

        if t_id in meta_dict:
            for k, v in zip(add_attrs, meta_dict[t_id]):
                f.attrs[k] = v
        else:
            for k in add_attrs:
                f.attrs[k] = 'NA'
        print str(f)
    return 0
def read_gtf_file(library, gtf_score_attr):
    """Read a library's exons, renaming transcript/gene ids per library.

    Only ``exon`` features are kept.  Transcript ids are renamed to
    ``<library_id>.T<n>`` and gene ids to ``<library_id>.G<n>``, numbered
    from 1 in order of first appearance.  The attributes of every kept exon
    are REPLACED by the new ids, the library's sample and library id,
    ``GTFAttr.REF = '0'`` and ``GTFAttr.SCORE`` copied from the attribute
    named ``gtf_score_attr`` ('0.0' when absent).

    :param library: object providing ``gtf_file``, ``library_id`` and
        ``sample_id``
    :param gtf_score_attr: name of the attribute holding the score
    :returns: ``OrderedDict`` mapping new transcript id -> list of its exon
        features in file order
    """
    # read all transcripts
    t_dict = collections.OrderedDict()
    cur_t_id = 1
    cur_g_id = 1
    t_id_map = {}
    g_id_map = {}
    # context manager closes the input handle, which was previously leaked
    with open(library.gtf_file) as fileh:
        for feature in GTFFeature.parse(fileh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            # rename transcript id
            if t_id not in t_id_map:
                new_t_id = "%s.T%d" % (library.library_id, cur_t_id)
                t_id_map[t_id] = new_t_id
                cur_t_id += 1
            else:
                new_t_id = t_id_map[t_id]
            # rename gene id
            g_id = feature.attrs[GTFAttr.GENE_ID]
            if g_id not in g_id_map:
                new_g_id = "%s.G%d" % (library.library_id, cur_g_id)
                g_id_map[g_id] = new_g_id
                cur_g_id += 1
            else:
                new_g_id = g_id_map[g_id]
            # update transcript attributes (the score must be read before
            # the original attribute dict is discarded)
            newattrs = {GTFAttr.TRANSCRIPT_ID: new_t_id,
                        GTFAttr.GENE_ID: new_g_id,
                        GTFAttr.SAMPLE_ID: library.sample_id,
                        GTFAttr.LIBRARY_ID: library.library_id,
                        GTFAttr.REF: '0',
                        GTFAttr.SCORE: feature.attrs.get(gtf_score_attr, '0.0')}
            feature.attrs = newattrs
            # store feature
            if new_t_id not in t_dict:
                t_dict[new_t_id] = []
            t_dict[new_t_id].append(feature)
    return t_dict
def add_reference_gtf_file(ref_gtf_file, test_gene_ids, 
                           random_test_frac, outfh):
    """Write reference exons and transcripts, flagging test genes.

    Exons of ``ref_gtf_file`` are grouped by gene and then by transcript.
    Each gene is labelled a test gene either because its id is in
    ``test_gene_ids`` (when that collection is non-empty) or, otherwise,
    with probability ``random_test_frac``.  For every transcript the exons
    (sorted by start) are written with ``GTFAttr.REF = '1'`` and
    ``GTFAttr.TEST`` set to '1'/'0', followed by a transcript feature built
    by ``make_transcript_feature`` carrying the same two attributes.

    :param ref_gtf_file: path of the reference GTF
    :param test_gene_ids: collection of gene ids to mark as tests; when
        empty the random selection is used instead
    :param random_test_frac: probability of marking a gene as a test
    :param outfh: writable file object receiving the features
    """
    gene_dict = collections.defaultdict(lambda: [])
    user_defined_tests = len(test_gene_ids) > 0
    # group by gene id; the context manager closes the input handle,
    # which was previously leaked
    with open(ref_gtf_file) as ref_fileh:
        for feature in GTFFeature.parse(ref_fileh):
            if feature.feature_type != "exon":
                continue
            g_id = feature.attrs[GTFAttr.GENE_ID]
            gene_dict[g_id].append(feature)
    # output reference transcripts
    for g_id, g_features in gene_dict.iteritems():
        # label test transcripts (one decision per gene, shared by all of
        # its transcripts)
        if user_defined_tests:
            is_test = (g_id in test_gene_ids)
        else:
            is_test = (random.random() < random_test_frac)
        testval = '1' if is_test else '0'
        # group by transcript id
        transcript_dict = collections.defaultdict(lambda: [])
        for feature in g_features:
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            transcript_dict[t_id].append(feature)
        for t_id, t_features in transcript_dict.iteritems():
            # sort features (exons) by start position
            t_features.sort(key=operator.attrgetter('start'))
            # annotate exons as reference features
            for f in t_features:
                f.attrs[GTFAttr.REF] = '1'
                f.attrs[GTFAttr.TEST] = testval
                print >>outfh, str(f)
            f = make_transcript_feature(t_features)
            f.attrs[GTFAttr.REF] = '1'
            f.attrs[GTFAttr.TEST] = testval
            print >>outfh, str(f)
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--add', dest='add', action='append', default=[])
    parser.add_argument("metadata_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    add_attrs = set()
    for arg in args.add:
        add_attrs.add(arg)
    add_attrs = sorted(add_attrs)

    # read metadata
    meta_dict = {}
    with open(args.metadata_file) as f:
        header_fields = f.next().strip().split('\t')
        t_id_index = header_fields.index('transcript_id')
        header_indexes = [header_fields.index(x) for x in add_attrs]
        
        for line in f:
            fields = line.strip().split('\t')
            meta_dict[fields[t_id_index]] = [fields[i] for i in header_indexes]
            #print fields[t_id_index], meta_dict[fields[t_id_index]]
    
    # read GTF
    for f in GTFFeature.parse(open(args.gtf_file)):
        t_id = f.attrs['transcript_id']
        
        if t_id in meta_dict:
            for k,v in zip(add_attrs, meta_dict[t_id]):
                f.attrs[k] = v
        else:
            for k in add_attrs:
                f.attrs[k] = 'NA'
        print str(f)
    return 0
def annotate_gtf(gtf_file, bed_dbs):
    # read reference databases
    bed_trees = []
    for name,filename in bed_dbs:
        logging.debug("Loading BED db '%s' file '%s'" % (name,filename))
        trees = build_interval_tree_from_bed(filename)
        bed_trees.append((name, trees))
    # parse gtf file and annotate
    logging.debug("Annotating GTF")
    for lines in parse_loci(open(gtf_file)):
        features = []
        transcripts = []
        transcript_matches = collections.defaultdict(lambda: collections.defaultdict(lambda: set()))
        for line in lines:
            f = GTFFeature.from_string(line)
            features.append(f)
            t_id = f.attrs['transcript_id']
            if f.feature_type == 'transcript':
                transcripts.append(f)
            elif f.feature_type == 'exon':
                for dbname,dbtrees in bed_trees:
                    # intersect this exon with features
                    hits = dbtrees[f.seqid].find(f.start, f.end)                        
                    matches = set(hit.value for hit in hits if hit.strand == f.strand)
                    f.attrs[dbname] = ','.join(sorted(matches))
                    # update transcript level matches
                    transcript_matches[t_id][dbname].update(matches)
        # set transcript annotations
        for f in transcripts:
            t_id = f.attrs['transcript_id']
            for dbname,dbtrees in bed_trees:
                matches = transcript_matches[t_id][dbname]
                f.attrs[dbname] = ','.join(sorted(matches))
        # write features
        for f in features:
            print str(f)
    logging.debug("Done")
Exemple #33
0
def main():
    """Log the number of distinct genes, transcripts, introns and exons.

    Usage: ``gtf_file``.  Only ``exon`` features are read.  Exons and
    introns are counted as distinct ``(chrom, strand, start, end)`` tuples,
    so intervals shared by several transcripts count once.  Introns are
    derived from each transcript's sorted exons with ``iterintrons``.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    gtf_file = args.gtf_file
    genes = set()
    transcript_dict = {}
    exon_dict = {}
    # context manager closes the input handle, which was previously leaked
    with open(gtf_file) as fileh:
        for f in GTFFeature.parse(fileh):
            if f.feature_type != 'exon':
                continue
            genes.add(f.attrs["gene_id"])
            t_id = f.attrs["transcript_id"]
            if t_id not in transcript_dict:
                # chrom/strand are taken from the transcript's first exon
                transcript_dict[t_id] = (f.seqid, f.strand)
                exon_dict[t_id] = []
            exon_dict[t_id].append((f.start, f.end))

    introns = set()
    exons = set()
    for t_id in transcript_dict:
        chrom, strand = transcript_dict[t_id]
        t_exons = exon_dict[t_id]
        t_exons.sort()
        for start, end in t_exons:
            exons.add((chrom, strand, start, end))
        for start, end in iterintrons(t_exons):
            introns.add((chrom, strand, start, end))

    logging.debug("Genes: %d" % (len(genes)))
    logging.debug("Transcripts: %d" % (len(transcript_dict)))
    logging.debug("Introns: %d" % (len(introns)))
    logging.debug("Exons: %d" % (len(exons)))
Exemple #34
0
def main():
    """Split a GTF file into one file per value of a chosen attribute.

    Usage: ``[--gtf-split-attr ATTR] gtf_file`` (ATTR defaults to
    ``library_id``).  Every feature is appended to ``<value>.gtf`` in the
    current directory, where ``<value>`` is the feature's ATTR value, or
    ``na_missing`` when the feature lacks the attribute.

    Output files are opened once per run in write mode, so a file left over
    from an earlier run is overwritten.  (Previously the decision to open a
    file was based on whether it existed on disk, which raised ``KeyError``
    whenever a stale output file was present.)

    :returns: 0 on success
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--gtf-split-attr", dest="gtf_split_attr", default="library_id")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # attribute value -> open output handle
    fhdict = {}
    gtf_split_attr = args.gtf_split_attr
    try:
        with open(args.gtf_file) as gtf_fileh:
            for f in GTFFeature.parse(gtf_fileh):
                if gtf_split_attr not in f.attrs:
                    val = "na_missing"
                else:
                    val = f.attrs[gtf_split_attr]
                # the handle cache, not the filesystem, decides whether
                # this value has been seen before
                fh = fhdict.get(val)
                if fh is None:
                    fh = open("%s.gtf" % (val), "w")
                    fhdict[val] = fh
                print >>fh, str(f)
    finally:
        # close every output even if parsing failed midway
        for fh in fhdict.itervalues():
            fh.close()
    return 0
def run_filter(cinfo, cutoff_dict):
    """Partition GTF features into per-decision files using classifier results.

    Streams ``cinfo.output_gtf_file`` and ``cinfo.sorted_ctree_file`` side by
    side (the first line of the result file is skipped -- presumably a
    header, confirm).  Each transcript feature is assigned one of the
    decisions SKIPPED, ANN_EXPR, ANN_BKGD, UNANN_EXPR or UNANN_BKGD; every
    non-transcript feature inherits the decision stored for its transcript
    id.  Features are written to the file opened for their decision.

    NOTE(review): the matching logic appears to assume both inputs are
    sorted by chromosome and start position -- confirm against the caller.

    :param cinfo: object providing ``sorted_ctree_file``,
        ``output_gtf_file``, ``decision_file_dict`` (decision -> file name)
        and ``decision_fh_dict`` (filled here with the opened handles)
    :param cutoff_dict: maps library id -> cutoff compared against
        ``result.pred``
    :returns: defaultdict mapping decision -> number of features written
        (transcript and non-transcript features are both counted)
    """
    # maintain decisions in a dictionary where key is transcript id and
    # value is the decision assigned to the transcript.
    t_id_decisions = {}
    # maintain result objects in dictionary keyed by transcript id
    t_id_results = {}
    # maintain heap queues that keeps track of the last transcript position 
    # on each chromosome. prediction decisions only need to be remembered
    # until the parsing goes past the end of the transcript (all exons 
    # accounted for)
    decision_heapqs = collections.defaultdict(lambda: [])
    # maintain heapq that keeps track of transcript position of
    # result objects.  results only need to be remembered until parsing
    # goes past the chrom/start position of the transcript (all transcripts
    # accounted for)
    result_heapqs = collections.defaultdict(lambda: [])
    # read result file and gtf file in sync
    result_fh = open(cinfo.sorted_ctree_file)
    # skip the first line of the result file
    result_fh.next()
    gtf_fh = open(cinfo.output_gtf_file)
    # open output files
    for decision,filename in cinfo.decision_file_dict.iteritems():
        cinfo.decision_fh_dict[decision] = open(filename, "w")
    # keep track of prediction statistics
    decision_stats = collections.defaultdict(lambda: 0)
    for feature in GTFFeature.parse(gtf_fh):
        # get transcript id used to lookup expressed/background 
        # prediction decision
        t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
        # check top of the decision heapq and pop transcripts when parsing 
        # has gone past the end
        decision_heapq = decision_heapqs[feature.seqid]
        while (len(decision_heapq) > 0) and (feature.start > decision_heapq[0][0]):
            smallest_end, smallest_t_id = heapq.heappop(decision_heapq)
            del t_id_decisions[smallest_t_id]
        # check top of result heapq and pop results when parsing has gone
        # past the start position of the result
        result_heapq = result_heapqs[feature.seqid]
        while (len(result_heapq) > 0) and (feature.start > result_heapq[0][0]):
            result_start, result_t_id = heapq.heappop(result_heapq)
            del t_id_results[result_t_id]
        # parse transcript/exon features differently
        if feature.feature_type == "transcript":
            # parse results until this t_id is found (all results 
            # must stay valid until past this chrom/start location)
            while t_id not in t_id_results:
                result = ClassificationResult.from_line(result_fh.next())
                # add to heapq to remove results that are no longer useful
                heapq.heappush(result_heapqs[result.chrom], 
                               (result.start, result.t_id))
                # add to result dictionary
                t_id_results[result.t_id] = result
                # if current result position is beyond current transcript
                # position then we know that we are missing results for this
                # transcript and need to skip it
                if ((result.chrom != feature.seqid) or 
                    (result.start > feature.start)):
                    break
            if t_id not in t_id_results:
                #logging.warning("Skipping: library_id=%s t_id=%s "
                #                "chrom=%s start=%d " %
                #                (feature.attrs[GTFAttr.LIBRARY_ID], t_id,
                #                 feature.seqid, feature.start))
                decision = SKIPPED
            else:
                # lookup classification result and ensure that transcript_id 
                # attribute matches result id
                result = t_id_results[t_id]
                # lookup cutoff value for classification
                library_id = feature.attrs[GTFAttr.LIBRARY_ID]
                cutoff = cutoff_dict[library_id]
                feature.attrs["cutoff"] = cutoff
                is_expr = (result.pred >= cutoff)
                # retain certain results as transcript attributes
                for attr_name in GTF_ATTRS_TO_RETAIN:
                    feature.attrs[attr_name] = getattr(result, attr_name)
                # keep track of prediction decision and statistics
                # remember decision in dict so that it can be 
                # applied to the transcript exons as well
                if result.annotated:
                    if is_expr:
                        decision = ANN_EXPR
                    else:
                        decision = ANN_BKGD
                else:
                    if is_expr:
                        decision = UNANN_EXPR
                    else:
                        decision = UNANN_BKGD
            # push transcript end onto decision heap queue 
            # (decision must stay valid until past the end)
            heapq.heappush(decision_heapq, (feature.end, t_id))
            # keep track of decision to apply it to exon features
            t_id_decisions[t_id] = decision
        else:
            # non-transcript features reuse the decision recorded for their
            # transcript; raises KeyError if none is currently stored
            decision = t_id_decisions[t_id]
        # keep track of stats
        decision_stats[decision] += 1
        # output to separate files
        out_fh = cinfo.decision_fh_dict[decision]
        print >>out_fh, str(feature)
    # cleanup
    gtf_fh.close()
    result_fh.close()
    for fh in cinfo.decision_fh_dict.itervalues():
        fh.close()
    return decision_stats
Exemple #36
0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    for x in BEDFeature.parse(open(bed_file)):
        f = GTFFeature()
        f.seqid = x.chrom
        f.source = source
        f.feature_type = 'transcript'
        f.start = x.tx_start
        f.end = x.tx_end
        f.score = x.score
        f.strand = x.strand
        f.phase = '.'
        f.attrs = {'transcript_id': x.name, 'gene_id': x.name}
        features = [f]
        for i, e in enumerate(x.exons):
            start, end = e
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'exon'
            f.start = start
            f.end = end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = dict(features[0].attrs)
            f.attrs["exon_number"] = i
            features.append(f)
        for f in features:
            print str(f)
def gtf_add_transcript_features(gtf_file, outfh):
    """Write a GTF's exons with a synthesized transcript feature for each.

    Only ``exon`` features of ``gtf_file`` are read and grouped by
    ``GTFAttr.TRANSCRIPT_ID``.  For each transcript a ``transcript``
    feature is built from its exons sorted by start: seqid, source, score,
    strand and attributes (minus ``exon_number``) come from the first exon,
    the start from the first exon and the end from the last exon.  The
    transcript feature is written first, followed by its exons.

    :param gtf_file: path of the input GTF
    :param outfh: writable file object receiving the features
    """
    transcript_dict = collections.defaultdict(lambda: [])
    # context manager closes the input handle, which was previously leaked
    with open(gtf_file) as gtf_fileh:
        for feature in GTFFeature.parse(gtf_fileh):
            if feature.feature_type != "exon":
                continue
            t_id = feature.attrs[GTFAttr.TRANSCRIPT_ID]
            transcript_dict[t_id].append(feature)
    # output transcripts
    for t_id, features in transcript_dict.iteritems():
        # sort features (exons) by start position
        features.sort(key=operator.attrgetter('start'))
        # transcript feature
        f = GTFFeature()
        f.seqid = features[0].seqid
        f.source = features[0].source
        f.feature_type = 'transcript'
        f.start = features[0].start
        f.end = features[-1].end
        f.score = features[0].score
        f.strand = features[0].strand
        f.phase = '.'
        # copy so that deleting exon_number does not touch the exon itself
        f.attrs = features[0].attrs.copy()
        if "exon_number" in f.attrs:
            del f.attrs["exon_number"]
        print >>outfh, str(f)
        # exons follow their transcript
        for f in features:
            print >>outfh, str(f)
def add_gtf_file(gtf_file, outfh, is_ref):
    """Write a GTF's exons and transcripts tagged with a reference flag.

    Iterates the per-chromosome ``(chrom, transcript_dict, exon_dict)``
    triples produced by ``_parse_gtf_by_chrom``.  For each transcript the
    exons are sorted by start, tagged with ``GTFAttr.REF`` and written,
    followed by the transcript feature.  When the input provides no
    transcript feature one is synthesized from the exons.

    :param gtf_file: path handed to ``_parse_gtf_by_chrom``
    :param outfh: writable file object receiving the features
    :param is_ref: truthy to tag features with '1', otherwise '0'
    """
    ref_flag = '0'
    if is_ref:
        ref_flag = '1'
    by_start = operator.attrgetter('start')
    for chrom, transcript_dict, exon_dict in _parse_gtf_by_chrom(gtf_file):
        logging.debug("\tfinished chrom %s %d features" %
                      (chrom, len(exon_dict)))
        for t_id, exons in exon_dict.iteritems():
            # exons are emitted in ascending start order, flagged first
            exons.sort(key=by_start)
            for exon in exons:
                exon.attrs[GTFAttr.REF] = ref_flag
                print >> outfh, str(exon)
            if t_id not in transcript_dict:
                # no transcript line in the input: derive one from the exons
                first_exon = exons[0]
                transcript = GTFFeature()
                transcript.seqid = first_exon.seqid
                transcript.source = first_exon.source
                transcript.feature_type = 'transcript'
                transcript.start = first_exon.start
                transcript.end = exons[-1].end
                transcript.score = first_exon.score
                transcript.strand = first_exon.strand
                transcript.phase = '.'
                transcript.attrs = first_exon.attrs.copy()
                if "exon_number" in transcript.attrs:
                    del transcript.attrs["exon_number"]
            else:
                transcript = transcript_dict[t_id]
            transcript.attrs[GTFAttr.REF] = ref_flag
            print >> outfh, str(transcript)
Exemple #39
0
 def to_gtf_features(self, source=None):
     """Return this object as a list of GTF features.

     The first element is a ``transcript`` feature spanning
     ``self.start``-``self.end``; it is followed by one ``exon`` feature
     per ``(start, end)`` pair in ``self.exons``.  Every feature gets its
     own copy of ``self.attrs`` and a score of 1000.0; exons additionally
     carry a zero-based ``exon_number``.

     :param source: value for the GTF source column ('source' when None)
     :returns: list of ``GTFFeature`` objects, transcript first
     """
     src = 'source' if source is None else source
     # transcript feature
     transcript = GTFFeature()
     transcript.seqid = self.chrom
     transcript.source = src
     transcript.feature_type = 'transcript'
     transcript.start = self.start
     transcript.end = self.end
     transcript.score = 1000.0
     transcript.strand = self.strand
     transcript.phase = '.'
     transcript.attrs = self.attrs.copy()
     result = [transcript]
     # exon features
     for exon_index, (exon_start, exon_end) in enumerate(self.exons):
         exon = GTFFeature()
         exon.seqid = self.chrom
         exon.source = src
         exon.feature_type = 'exon'
         exon.start = exon_start
         exon.end = exon_end
         exon.score = 1000.0
         exon.strand = self.strand
         exon.phase = '.'
         exon.attrs = self.attrs.copy()
         exon.attrs["exon_number"] = exon_index
         result.append(exon)
     return result
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id, 
                     transcript_id, score, frac):
    """Yield a transcript feature followed by its exon features.

    The transcript spans ``exons[0].start`` to ``exons[-1].end`` and has
    the GTF score ``1000.0 * int(round(frac))``; its attributes hold
    ``score`` and ``frac`` formatted to three decimals plus the four ids.
    Each exon has the GTF score ``int(round(frac))`` and carries a
    one-based ``exon_number`` plus the four ids.

    :param strand: strand value converted with ``strand_int_to_str``
    :param exons: non-empty sequence of objects with ``start``/``end``
    :returns: generator of ``GTFFeature`` objects, transcript first
    """
    span_start = exons[0].start
    span_end = exons[-1].end
    strand_symbol = strand_int_to_str(strand)
    # identifiers shared by the transcript and all of its exons
    shared_ids = {'locus_id': locus_id,
                  'gene_id': gene_id,
                  'tss_id': tss_id,
                  'transcript_id': transcript_id}
    transcript = GTFFeature()
    transcript.seqid = chrom
    transcript.source = 'assemblyline'
    transcript.feature_type = 'transcript'
    transcript.start = span_start
    transcript.end = span_end
    transcript.score = 1000.0 * int(round(frac))
    transcript.strand = strand_symbol
    transcript.phase = '.'
    transcript.attrs = {'score': '%.3f' % score,
                        'frac': '%.3f' % frac}
    transcript.attrs.update(shared_ids)
    yield transcript
    for index, exon in enumerate(exons):
        exon_feature = GTFFeature()
        exon_feature.seqid = chrom
        exon_feature.source = 'assemblyline'
        exon_feature.feature_type = 'exon'
        exon_feature.start = exon.start
        exon_feature.end = exon.end
        exon_feature.score = int(round(frac))
        exon_feature.strand = strand_symbol
        exon_feature.phase = '.'
        exon_feature.attrs = {'exon_number': index + 1}
        exon_feature.attrs.update(shared_ids)
        yield exon_feature
Exemple #41
0
 def to_gtf_features(self, source=None):
     """Convert this object into GTF features (transcript, then exons).

     One ``transcript`` feature covering ``self.start``-``self.end`` is
     produced, then one ``exon`` feature for each ``(start, end)`` pair of
     ``self.exons``.  All features use score 1000.0, phase '.', and a
     private copy of ``self.attrs``; exons also get ``exon_number`` set to
     their zero-based position.

     :param source: GTF source column value; defaults to 'source'
     :returns: list of ``GTFFeature`` objects
     """
     if source is None:
         source = 'source'

     def build(feature_type, start, end):
         # fields common to the transcript and to every exon
         feat = GTFFeature()
         feat.seqid = self.chrom
         feat.source = source
         feat.feature_type = feature_type
         feat.start = start
         feat.end = end
         feat.score = 1000.0
         feat.strand = self.strand
         feat.phase = '.'
         feat.attrs = self.attrs.copy()
         return feat

     features = [build('transcript', self.start, self.end)]
     for number, span in enumerate(self.exons):
         exon_start, exon_end = span
         exon = build('exon', exon_start, exon_end)
         exon.attrs["exon_number"] = number
         features.append(exon)
     return features
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id, transcript_id, score, frac):
    """Generate the GTF features of one assembled transcript.

    Yields first a ``transcript`` feature (from ``exons[0].start`` to
    ``exons[-1].end``, GTF score ``1000.0 * int(round(frac))``, attributes
    ``score``/``frac`` with three decimals plus the ids) and then one
    ``exon`` feature per exon (GTF score ``int(round(frac))``, one-based
    ``exon_number`` plus the ids).

    :param strand: strand value passed through ``strand_int_to_str``
    :param exons: non-empty sequence of objects with ``start``/``end``
    :returns: generator of ``GTFFeature`` objects
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    attr_dict = {"locus_id": locus_id, "gene_id": gene_id, "tss_id": tss_id, "transcript_id": transcript_id}

    def make_feature(feature_type, start, end, gtf_score, own_attrs):
        # fill in the columns, then merge the shared ids into own_attrs
        feat = GTFFeature()
        feat.seqid = chrom
        feat.source = "assemblyline"
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = gtf_score
        feat.strand = strand_str
        feat.phase = "."
        feat.attrs = own_attrs
        feat.attrs.update(attr_dict)
        return feat

    yield make_feature("transcript", tx_start, tx_end,
                       1000.0 * int(round(frac)),
                       {"score": "%.3f" % score, "frac": "%.3f" % frac})
    for i, e in enumerate(exons):
        yield make_feature("exon", e.start, e.end, int(round(frac)),
                           {"exon_number": i + 1})
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id, 
                     transcript_id, score, frac):
    """Yield the transcript feature and then the exon features.

    The transcript feature covers the first exon's start through the last
    exon's end, has GTF score ``1000.0 * int(round(frac))`` and attributes
    ``score`` and ``frac`` (three decimals) plus locus/gene/tss/transcript
    ids.  Exon features have GTF score ``int(round(frac))`` and attributes
    ``exon_number`` (counted from 1) plus the same ids.

    :param strand: strand value converted by ``strand_int_to_str``
    :param exons: non-empty sequence of objects with ``start``/``end``
    :returns: generator of ``GTFFeature`` objects
    """
    leftmost = exons[0].start
    rightmost = exons[-1].end
    strand_char = strand_int_to_str(strand)
    id_attrs = {'locus_id': locus_id,
                'gene_id': gene_id,
                'tss_id': tss_id,
                'transcript_id': transcript_id}
    # transcript-level record
    tx = GTFFeature()
    tx.seqid = chrom
    tx.source = 'assemblyline'
    tx.feature_type = 'transcript'
    tx.start = leftmost
    tx.end = rightmost
    tx.score = 1000.0 * int(round(frac))
    tx.strand = strand_char
    tx.phase = '.'
    tx.attrs = {'score': '%.3f' % score,
                'frac': '%.3f' % frac}
    tx.attrs.update(id_attrs)
    yield tx
    # exon-level records, numbered from 1
    for exon_number, exon in enumerate(exons, 1):
        ex = GTFFeature()
        ex.seqid = chrom
        ex.source = 'assemblyline'
        ex.feature_type = 'exon'
        ex.start = exon.start
        ex.end = exon.end
        ex.score = int(round(frac))
        ex.strand = strand_char
        ex.phase = '.'
        ex.attrs = {'exon_number': exon_number}
        ex.attrs.update(id_attrs)
        yield ex
def get_cds_features(gtf_file):
    """Yield one single-exon pseudo-transcript per distinct CDS interval.

    All ``CDS`` features of ``gtf_file`` are collected as distinct
    ``(start, end, strand)`` tuples per chromosome.  For every interval, in
    sorted chromosome/interval order, a ``transcript`` feature and an
    ``exon`` feature with identical coordinates are yielded.  Both carry
    the attributes ``cds = 1`` and ``transcript_id = 'CDS%08d'`` with a
    counter starting at 1.

    :param gtf_file: path of the reference GTF
    :returns: generator of ``GTFFeature`` objects
    """
    cds = collections.defaultdict(lambda: set())
    i = 0
    # context manager closes the input handle, which was previously leaked
    with open(gtf_file) as gtf_fileh:
        for f in GTFFeature.parse(gtf_fileh):
            if f.feature_type == "CDS":
                cds[f.seqid].add((f.start, f.end, f.strand))
            i += 1
            if (i % 100000) == 0:
                logging.debug("Parsed %d features" % (i))
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start, end, strand in sorted(cds[chrom]):
            # the transcript and its only exon share coordinates and id
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1, 'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
def main():
    """Categorize assembled transcripts against reference CDS regions.

    Command line: ``[-v] ref_gtf_file gtf_file``.  Unless a file named
    'tmp.srt.gtf' already exists in the working directory, the CDS
    features of the reference GTF and all features of the assembly GTF
    are written to 'tmp.gtf', which is then sorted into 'tmp.srt.gtf'.
    The sorted file is read locus by locus and one tab-separated line
    (chrom, start, end, 'm|t|c', '0', strand) is printed to stdout for
    every tuple produced by ``categorize``.

    Returns 0.
    """
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        dest="verbose",
                        default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:       %s" % (args.verbose))
    logging.info("ref gtf file:          %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file:     %s" % (args.gtf_file))
    # find CDS regions
    # NOTE: an existing 'tmp.srt.gtf' in the working directory is reused
    # as-is, whatever inputs it was built from
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >> outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            # close the assembly GTF once it has been copied
            with open(args.gtf_file) as infileh:
                for i, f in enumerate(GTFFeature.parse(infileh), 1):
                    print >> outfileh, str(f)
                    if i % 100000 == 0:
                        logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    # close the sorted GTF when all loci have been processed
    with open('tmp.srt.gtf') as sortedfh:
        for locus_transcripts in parse_gtf(sortedfh):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug(
                "[LOCUS] %s:%d-%d %d transcripts" %
                (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
            for start, end, strand, m, t, c in categorize(locus_transcripts):
                fields = [
                    locus_chrom,
                    str(start),
                    str(end),
                    '%s|%s|%s' % (m, t, c), '0',
                    strand_int_to_str(strand)
                ]
                print('\t'.join(fields))
    return 0
'''
Created on Feb 13, 2013

@author: mkiyer
'''
import argparse

from assemblyline.lib.gtf import GTFFeature

if __name__ == '__main__':
    # Print, one per line and in sorted order, the transcript_id of every
    # exon feature in the given GTF file whose gene_biotype is one of the
    # lncRNA biotypes listed below.
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    lncrna_biotypes = set(['3prime_overlapping_ncrna',
                           'antisense', 
                           'lincRNA',
                           'sense_intronic',
                           'sense_overlapping'])
    transcript_ids = set()
    # use a context manager so the GTF handle is closed after parsing
    with open(args.gtf_file) as fileh:
        for feature in GTFFeature.parse(fileh):
            if feature.feature_type != "exon":
                continue
            # every exon is expected to carry a gene_biotype attribute;
            # a missing one raises KeyError
            biotype = feature.attrs["gene_biotype"]
            if biotype in lncrna_biotypes:
                transcript_ids.add(feature.attrs['transcript_id'])
    for t_id in sorted(transcript_ids):
        print(t_id)
Exemple #47
0
def main():
    """Convert a BED file to GTF and print the result to stdout.

    Command line: ``[--source SOURCE] bed_file`` (SOURCE defaults to
    'bed_to_gtf').  For every BED record a 'transcript' feature is
    printed, followed by one 'exon' feature per entry of the record's
    ``exons``.  All features of a record use the record name as both
    ``transcript_id`` and ``gene_id`` and inherit its score and strand.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", dest="source", default='bed_to_gtf')
    parser.add_argument("bed_file")
    args = parser.parse_args()
    bed_file = args.bed_file
    source = args.source
    # use a context manager so the BED handle is closed after conversion
    with open(bed_file) as fileh:
        for x in BEDFeature.parse(fileh):
            f = GTFFeature()
            f.seqid = x.chrom
            f.source = source
            f.feature_type = 'transcript'
            f.start = x.tx_start
            f.end = x.tx_end
            f.score = x.score
            f.strand = x.strand
            f.phase = '.'
            f.attrs = {'transcript_id': x.name,
                       'gene_id': x.name}
            features = [f]
            for i,e in enumerate(x.exons):
                start, end = e
                f = GTFFeature()
                f.seqid = x.chrom
                f.source = source
                f.feature_type = 'exon'
                f.start = start
                f.end = end
                f.score = x.score
                f.strand = x.strand
                f.phase = '.'
                # exons get their own copy of the transcript attributes
                f.attrs = dict(features[0].attrs)
                # NOTE(review): exon_number starts at 0 here -- confirm
                # downstream consumers do not expect 1-based numbering
                f.attrs["exon_number"] = i
                features.append(f)
            # the record is printed only after all its features were built
            for f in features:
                print(str(f))
def make_transcript_feature(exon_features):
    """Build a 'transcript' GTFFeature from a transcript's exon features.

    seqid, source, start, score, strand and a copy of the attributes are
    taken from the first exon; the end coordinate is taken from the last
    exon.  Any "exon_number" attribute is dropped from the copy.

    NOTE(review): presumably ``exon_features`` is non-empty and sorted by
    position -- verify against callers.
    """
    transcript = GTFFeature()
    first_exon = exon_features[0]
    last_exon = exon_features[-1]
    transcript.seqid = first_exon.seqid
    transcript.source = first_exon.source
    transcript.feature_type = 'transcript'
    transcript.start = first_exon.start
    transcript.end = last_exon.end
    transcript.score = first_exon.score
    transcript.strand = first_exon.strand
    transcript.phase = '.'
    # work on a copy so the exon's own attributes are left untouched
    transcript.attrs = first_exon.attrs.copy()
    transcript.attrs.pop("exon_number", None)
    return transcript
def get_cds_features(gtf_file):
    """Yield single-exon pseudo-transcripts for the CDS intervals of a GTF.

    The whole of ``gtf_file`` is parsed first; the distinct
    ``(start, end, strand)`` triples of its "CDS" features are gathered
    per ``seqid``.  Afterwards, iterating seqids and triples in sorted
    order, a 'transcript' and an 'exon' ``GTFFeature`` are yielded for
    each triple.  Both carry the attribute ``cds`` = 1 and the same
    ``transcript_id`` (``CDS%08d``, numbered from 1 across all seqids).
    """
    cds = collections.defaultdict(set)
    # the context manager guarantees the input handle is closed once
    # parsing is done
    with open(gtf_file) as fileh:
        for i, f in enumerate(GTFFeature.parse(fileh), 1):
            if f.feature_type == "CDS":
                # identical intervals from different transcripts collapse
                cds[f.seqid].add((f.start, f.end, f.strand))
            if (i % 100000) == 0:
                logging.debug("Parsed %d features" % (i))
    logging.debug("Returning CDS transcripts")
    t_id = 1
    for chrom in sorted(cds):
        for start,end,strand in sorted(cds[chrom]):
            for feature_type in ('transcript', 'exon'):
                f = GTFFeature()
                f.seqid = chrom
                f.source = 'cds'
                f.feature_type = feature_type
                f.start = start
                f.end = end
                f.score = 0
                f.strand = strand
                f.phase = '.'
                f.attrs = {'cds': 1, 
                           'transcript_id': 'CDS%08d' % (t_id)}
                yield f
            t_id += 1
Exemple #50
0
'''
Created on Feb 23, 2013

@author: mkiyer
'''
import argparse

from assemblyline.lib.gtf import GTFFeature

if __name__ == '__main__':
    # Print, one per line and in sorted order, the distinct transcript_id
    # values found on the exon features of the given GTF file.
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    transcript_ids = set()
    # use a context manager so the GTF handle is closed after parsing
    with open(args.gtf_file) as fileh:
        for feature in GTFFeature.parse(fileh):
            if feature.feature_type != "exon":
                continue
            transcript_ids.add(feature.attrs['transcript_id'])
    for t_id in sorted(transcript_ids):
        print(t_id)