Example #1
0
def transcripts_from_gtf_lines(lines, attr_defs=None):
    """Build Transcript objects from GTF lines.

    The lines are parsed with ``GTFFeature.parse(lines, attr_defs)``.
    Features without a ``transcript_id`` attribute (e.g. gene records) are
    skipped. The first feature seen for each transcript id must have the
    feature type ``"transcript"``; it supplies the transcript's chrom,
    start, end, strand and attrs. Every ``"exon"`` feature carrying a known
    transcript id is appended to that transcript as ``Exon(start, end)``.

    Args:
        lines: GTF lines, passed unchanged to ``GTFFeature.parse``.
        attr_defs: optional attribute definitions, passed unchanged to
            ``GTFFeature.parse``.

    Returns:
        A list of Transcript objects in order of first appearance, each
        with its exons sorted.

    Raises:
        GTFError: if the first feature seen for a transcript id is not a
            ``"transcript"`` record.
    """
    transcripts = collections.OrderedDict()
    for feature in GTFFeature.parse(lines, attr_defs):
        # skip gene annotation in gtf files
        if "transcript_id" not in feature.attrs:
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            if feature.feature_type != "transcript":
                raise GTFError(
                    "Feature type '%s' found before 'transcript' record: %s" %
                    (feature.feature_type, str(feature)))
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            # convert from string strand notation ("+", "-", ".")
            # to integer (0, 1)
            t.strand = strand_str_to_int(feature.strand)
            t.exons = []
            # NOTE: the transcript shares the feature's attrs dict (no copy)
            t.attrs = feature.attrs
            transcripts[t_id] = t
        else:
            t = transcripts[t_id]
        if feature.feature_type == "exon":
            t.exons.append(Exon(feature.start, feature.end))
    # sort transcript exons by genomic position
    # values() (not itervalues()) so this runs on both Python 2 and 3
    for t in transcripts.values():
        t.exons.sort()
    # list(...) keeps the return type a list on Python 3 as well
    return list(transcripts.values())
Example #2
0
def to_formatted_gtf(lines, gtf_file, attr_defs=None):
    """Rebuild transcripts from exon records and write them out as GTF.

    Each line is parsed with ``GTFFeature.from_string(line, attr_defs)``.
    Only ``"exon"`` features that carry a ``transcript_id`` attribute are
    used; gene records and all other feature types are ignored. For every
    transcript id a Transcript is created from its first exon and then
    widened (start/end) and extended (exons) by each further exon of the
    same id. Transcript attributes are taken from the first exon, dropping
    every attribute whose name contains ``exon`` (case-insensitive).

    The transcripts are written to ``gtf_file`` in order of first
    appearance, one line per feature returned by
    ``Transcript.to_gtf_features()``.

    Args:
        lines: iterable of GTF lines.
        gtf_file: path of the output file; opened in write mode.
        attr_defs: optional attribute definitions, passed unchanged to
            ``GTFFeature.from_string``.
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        # skip gene annotation in gtf files
        if "transcript_id" not in feature.attrs:
            continue
        # only exon records describe transcript structure; without this
        # guard CDS/start_codon/transcript records were stored as exons
        if feature.feature_type != "exon":
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            t.strand = strand_str_to_int(feature.strand)
            t.exons = [Exon(feature.start, feature.end)]
            # keep transcript-level attributes only (drop exon_number etc.)
            t.attrs = {
                name: value
                for name, value in feature.attrs.items()
                if 'exon' not in name.lower()
            }
            transcripts[t_id] = t
        else:
            # look the transcript up by id: relying on the previous loop
            # iteration's variable breaks when transcripts are interleaved
            t = transcripts[t_id]
            t.start = min(t.start, feature.start)
            t.end = max(t.end, feature.end)
            t.exons.append(Exon(feature.start, feature.end))
    with open(gtf_file, 'w') as gtf_output:
        for each_tr_obj in transcripts.values():
            for each_feature in each_tr_obj.to_gtf_features():
                gtf_output.write(
                    '{gtf_line}\n'.format(gtf_line=str(each_feature)))
Example #3
0
def transcripts_from_gtf_lines(lines, attr_defs=None):
    """Build Transcript objects from GTF lines.

    Each line is parsed with ``GTFFeature.from_string(line, attr_defs)``.
    Features without a ``transcript_id`` attribute (e.g. gene records) are
    skipped. The first feature seen for each transcript id must have the
    feature type ``"transcript"``; it supplies the transcript's chrom,
    start, end, strand and attrs. Every ``"exon"`` feature carrying a known
    transcript id is appended to that transcript as ``Exon(start, end)``.

    Args:
        lines: iterable of GTF lines.
        attr_defs: optional attribute definitions, passed unchanged to
            ``GTFFeature.from_string``.

    Returns:
        A list of Transcript objects in order of first appearance, each
        with its exons sorted.

    Raises:
        GTFError: if the first feature seen for a transcript id is not a
            ``"transcript"`` record.
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        # skip records that belong to no transcript (gene annotation)
        # instead of failing with a bare KeyError
        if "transcript_id" not in feature.attrs:
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            if feature.feature_type != "transcript":
                raise GTFError("Feature type '%s' found before 'transcript' record: %s" %
                               (feature.feature_type, str(feature)))
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            # convert from string strand notation ("+", "-", ".")
            # to integer (0, 1)
            t.strand = strand_str_to_int(feature.strand)
            t.exons = []
            # NOTE: the transcript shares the feature's attrs dict (no copy)
            t.attrs = feature.attrs
            transcripts[t_id] = t
        else:
            t = transcripts[t_id]
        if feature.feature_type == "exon":
            t.exons.append(Exon(feature.start, feature.end))
    # sort transcript exons by genomic position
    # values() (not itervalues()) so this runs on both Python 2 and 3
    for t in transcripts.values():
        t.exons.sort()
    # list(...) keeps the return type a list on Python 3 as well
    return list(transcripts.values())
Example #4
0
 def to_gtf_features(self, source=None, score=1000):
     """Render this transcript as a list of GTFFeature records.

     The first record is a 'transcript' feature spanning self.start to
     self.end. It is followed by one 'exon' feature per entry of
     self.exons, in list order.

     source: value for each record's source field; 'assemblyline' is
         used when None.
     score: value stored in each record's score field.

     The transcript record's attrs is self.attrs itself (not a copy).
     Each exon record gets its own copy of self.attrs with "exon_number"
     set to the exon's zero-based position in self.exons.
     """
     if source is None:
         source = 'assemblyline'

     def make_record(kind, start, end, attrs):
         # fill in one record; everything but type, span and attrs is
         # shared between the transcript and its exons
         rec = GTFFeature()
         rec.seqid = self.chrom
         rec.source = source
         rec.feature_type = kind
         rec.start = start
         rec.end = end
         rec.score = score
         rec.strand = strand_int_to_str(self.strand)
         rec.phase = '.'
         rec.attrs = attrs
         return rec

     # transcript record comes first
     records = [make_record('transcript', self.start, self.end, self.attrs)]
     # then one record per exon
     for index, exon in enumerate(self.exons):
         exon_attrs = self.attrs.copy()
         exon_attrs["exon_number"] = index
         records.append(make_record('exon', exon.start, exon.end, exon_attrs))
     return records
Example #5
0
 def to_gtf_features(self, source=None, score=1000):
     """Return GTFFeature records describing this transcript.

     The result starts with one 'transcript' record covering self.start
     to self.end, followed by an 'exon' record for every element of
     self.exons in list order.

     source: stored in every record's source field; defaults to
         'assemblyline' when None.
     score: stored in every record's score field.

     The transcript record references self.attrs directly. Exon records
     hold a copy of self.attrs extended with "exon_number", the exon's
     zero-based index in self.exons.
     """
     label = 'assemblyline' if source is None else source
     # (feature type, start, end): the transcript first, then each exon
     spans = [('transcript', self.start, self.end)]
     spans.extend(('exon', ex.start, ex.end) for ex in self.exons)
     out = []
     for pos, (kind, lo, hi) in enumerate(spans):
         rec = GTFFeature()
         rec.seqid = self.chrom
         rec.source = label
         rec.feature_type = kind
         rec.start = lo
         rec.end = hi
         rec.score = score
         rec.strand = strand_int_to_str(self.strand)
         rec.phase = '.'
         if pos == 0:
             # transcript record shares the transcript's own attrs dict
             rec.attrs = self.attrs
         else:
             # exon records get a private copy; pos is offset by the
             # leading transcript entry
             rec.attrs = self.attrs.copy()
             rec.attrs["exon_number"] = pos - 1
         out.append(rec)
     return out