import collections
import logging
import os

# GTFFeature, GTFAttr, Category, CategoryStats, and BufferedFileSplitter are
# project-specific helpers assumed to be importable from the surrounding
# package.


def split_gtf_file(gtf_file,
                   split_dir,
                   ref_gtf_file,
                   category_stats_file,
                   bufsize=(1 << 30)):
    # split the input gtf by library, routing reference transcripts to their
    # own file and collecting per-library category statistics
    keyfunc = lambda myid: os.path.join(split_dir, "%s.gtf" % (myid))
    bufobj = BufferedFileSplitter(keyfunc, bufsize)
    stats_dict = collections.defaultdict(CategoryStats)
    logging.info("Splitting transcripts by library")
    with open(gtf_file) as infileh, open(ref_gtf_file, 'w') as ref_fileh:
        for line in infileh:
            f = GTFFeature.from_string(line)
            is_ref = bool(int(f.attrs[GTFAttr.REF]))
            if is_ref:
                print(str(f), file=ref_fileh)
                continue
            library_id = f.attrs[GTFAttr.LIBRARY_ID]
            # keep statistics (transcript features only)
            if f.feature_type == 'transcript':
                is_test = bool(int(f.attrs[GTFAttr.TEST]))
                if is_test:
                    category = Category.SAME_STRAND
                else:
                    category = int(f.attrs[GTFAttr.CATEGORY])
                score = float(f.attrs[GTFAttr.SCORE])
                statsobj = stats_dict[library_id]
                statsobj.library_id = library_id
                statsobj.counts[category] += 1
                statsobj.signal[category] += score
            # write features from each library to separate files
            bufobj.write(library_id, line)
    bufobj.close()
    logging.debug("Buffer flushes: %d", bufobj.flushes)
    # write library category statistics
    logging.info("Writing category statistics")
    with open(category_stats_file, 'w') as fh:
        print('\t'.join(CategoryStats.header_fields()), file=fh)
        for statsobj in stats_dict.values():
            fields = statsobj.to_fields()
            print('\t'.join(map(str, fields)), file=fh)
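split_gtf_file leans on BufferedFileSplitter, which is not shown here. Below is a minimal sketch of what such a helper might look like, assuming its contract is only write(key, line), close(), and a flushes counter: lines are buffered in memory per key and appended to the per-key file named by keyfunc once the total buffered size crosses bufsize, so a GTF spanning many libraries can be split without holding one open file handle per library.

# Hypothetical sketch of the BufferedFileSplitter used above; the project's
# real class may differ. Assumed contract: write(key, line), close(), and a
# `flushes` counter. Output files are opened in append mode on each flush,
# so split_dir is assumed to start empty.
class BufferedFileSplitter(object):
    def __init__(self, keyfunc, bufsize):
        self.keyfunc = keyfunc    # maps a key to its output file path
        self.bufsize = bufsize    # flush threshold in bytes
        self.buffers = collections.defaultdict(list)
        self.cursize = 0
        self.flushes = 0

    def _flush(self):
        # append each key's buffered lines to its own file, then reset
        for key, lines in self.buffers.items():
            with open(self.keyfunc(key), 'a') as fileh:
                fileh.writelines(lines)
        self.buffers.clear()
        self.cursize = 0
        self.flushes += 1

    def write(self, key, line):
        self.buffers[key].append(line)
        self.cursize += len(line)
        if self.cursize >= self.bufsize:
            self._flush()

    def close(self):
        if self.cursize > 0:
            self._flush()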
def annotate_gtf(gtf_file, bed_dbs):
    # read reference databases; bed_dbs is a list of (name, filename) pairs
    bed_trees = []
    for name, filename in bed_dbs:
        logging.debug("Loading BED db '%s' file '%s'" % (name, filename))
        trees = build_interval_tree_from_bed(filename)
        bed_trees.append((name, trees))
    # parse gtf file locus by locus and annotate
    logging.debug("Annotating GTF")
    for lines in parse_loci(open(gtf_file)):
        features = []
        transcripts = []
        # transcript_id -> db name -> set of matching BED feature names
        transcript_matches = collections.defaultdict(
            lambda: collections.defaultdict(set))
        for line in lines:
            f = GTFFeature.from_string(line)
            features.append(f)
            t_id = f.attrs['transcript_id']
            if f.feature_type == 'transcript':
                transcripts.append(f)
            elif f.feature_type == 'exon':
                for dbname, dbtrees in bed_trees:
                    # intersect this exon with same-strand BED features
                    hits = dbtrees[f.seqid].find(f.start, f.end)
                    matches = set(hit.value for hit in hits
                                  if hit.strand == f.strand)
                    f.attrs[dbname] = ','.join(sorted(matches))
                    # accumulate exon-level hits per transcript
                    transcript_matches[t_id][dbname].update(matches)
        # annotate each transcript with the union of its exons' hits
        for f in transcripts:
            t_id = f.attrs['transcript_id']
            for dbname, dbtrees in bed_trees:
                matches = transcript_matches[t_id][dbname]
                f.attrs[dbname] = ','.join(sorted(matches))
        # write annotated features to stdout
        for f in features:
            print(str(f))
    logging.debug("Done")