def transcripts_from_gtf_lines(lines, attr_defs=None):
    """Assemble Transcript objects from GTF lines.

    Features are obtained from ``GTFFeature.parse(lines, attr_defs)``.
    Features without a ``transcript_id`` attribute are skipped. The first
    feature seen for each transcript id must be a 'transcript' record; it
    provides the chromosome, coordinates, strand and attributes of the
    Transcript. Later 'exon' features with the same id are appended to the
    transcript's exon list.

    Args:
        lines: GTF lines, handed unchanged to ``GTFFeature.parse``.
        attr_defs: forwarded unchanged to ``GTFFeature.parse``.

    Returns:
        A list of Transcript objects, ordered by first appearance of their
        transcript id, each with its exons sorted.

    Raises:
        GTFError: if a feature carrying a new transcript id is not a
            'transcript' record.
    """
    transcripts = collections.OrderedDict()
    for feature in GTFFeature.parse(lines, attr_defs):
        # skip gene annotation in gtf files
        if "transcript_id" not in feature.attrs:
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            if feature.feature_type != "transcript":
                raise GTFError(
                    "Feature type '%s' found before 'transcript' record: %s" %
                    (feature.feature_type, str(feature)))
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            # convert from string strand notation ("+", "-", ".")
            # to integer (0, 1)
            t.strand = strand_str_to_int(feature.strand)
            t.exons = []
            # NOTE: the transcript shares the attrs dict of its feature
            t.attrs = feature.attrs
            transcripts[t_id] = t
        else:
            t = transcripts[t_id]
        if feature.feature_type == "exon":
            t.exons.append(Exon(feature.start, feature.end))
    # sort transcript exons by genomic position
    # (.values() instead of .itervalues() so this also runs on Python 3)
    for t in transcripts.values():
        t.exons.sort()
    # always hand back a real list (Python 3 .values() is only a view)
    return list(transcripts.values())
def to_formatted_gtf(lines, gtf_file, attr_defs=None):
    """Rebuild transcripts from exon records and write them as GTF.

    Each line is parsed with ``GTFFeature.from_string``. Exon records are
    grouped by their ``transcript_id`` attribute; the transcript's start
    and end are the minimum start and maximum end over its exons. Every
    transcript is then written to ``gtf_file`` as the features returned by
    its ``to_gtf_features()`` method, one per line.

    Args:
        lines: iterable of GTF lines.
        gtf_file: path of the output file; it is opened in 'w' mode, so an
            existing file is overwritten.
        attr_defs: forwarded unchanged to ``GTFFeature.from_string``.

    Returns:
        None. The result is the file written at ``gtf_file``.
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        # skip gene annotation in gtf files
        if "transcript_id" not in feature.attrs:
            continue
        # Only exon records contribute. Without this check, non-exon
        # records (CDS, start_codon, ...) of an already known transcript
        # would be appended to its exon list.
        if feature.feature_type != "exon":
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            # first exon of this transcript: create the transcript from it
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            t.strand = strand_str_to_int(feature.strand)
            t.exons = [Exon(feature.start, feature.end)]
            # keep every attribute whose name does not mention 'exon'
            t.attrs = dict(
                (name, value) for name, value in feature.attrs.items()
                if 'exon' not in name.lower())
            transcripts[t_id] = t
        else:
            # Look the transcript up by id rather than reusing whichever
            # one was created last, so interleaved transcripts stay intact.
            t = transcripts[t_id]
            t.start = min(t.start, feature.start)
            t.end = max(t.end, feature.end)
            t.exons.append(Exon(feature.start, feature.end))
    with open(gtf_file, 'w') as gtf_output:
        for each_tr_obj in transcripts.values():
            for each_feature in each_tr_obj.to_gtf_features():
                gtf_output.write(
                    '{gtf_line}\n'.format(gtf_line=str(each_feature)))
def transcripts_from_gtf_lines(lines, attr_defs=None):
    """Assemble Transcript objects from GTF lines.

    Each line is parsed with ``GTFFeature.from_string``. Features without a
    ``transcript_id`` attribute are skipped. The first feature seen for
    each transcript id must be a 'transcript' record; it provides the
    chromosome, coordinates, strand and attributes of the Transcript.
    Later 'exon' features with the same id are appended to the
    transcript's exon list.

    Args:
        lines: iterable of GTF lines.
        attr_defs: forwarded unchanged to ``GTFFeature.from_string``.

    Returns:
        A list of Transcript objects, ordered by first appearance of their
        transcript id, each with its exons sorted.

    Raises:
        GTFError: if a feature carrying a new transcript id is not a
            'transcript' record.
    """
    transcripts = collections.OrderedDict()
    for line in lines:
        feature = GTFFeature.from_string(line, attr_defs)
        # skip records that carry no transcript_id (e.g. gene annotation)
        # instead of failing with a KeyError on the lookup below
        if "transcript_id" not in feature.attrs:
            continue
        t_id = feature.attrs["transcript_id"]
        if t_id not in transcripts:
            if feature.feature_type != "transcript":
                raise GTFError("Feature type '%s' found before 'transcript' record: %s" %
                               (feature.feature_type, str(feature)))
            t = Transcript()
            t.chrom = feature.seqid
            t.start = feature.start
            t.end = feature.end
            # convert from string strand notation ("+", "-", ".")
            # to integer (0, 1)
            t.strand = strand_str_to_int(feature.strand)
            t.exons = []
            # NOTE: the transcript shares the attrs dict of its feature
            t.attrs = feature.attrs
            transcripts[t_id] = t
        else:
            t = transcripts[t_id]
        if feature.feature_type == "exon":
            t.exons.append(Exon(feature.start, feature.end))
    # sort transcript exons by genomic position
    # (.values() instead of .itervalues() so this also runs on Python 3)
    for t in transcripts.values():
        t.exons.sort()
    # always hand back a real list (Python 3 .values() is only a view)
    return list(transcripts.values())
def to_gtf_features(self, source=None, score=1000):
    """Return this transcript as a list of GTFFeature objects.

    The first element is a 'transcript' feature spanning ``self.start`` to
    ``self.end``; it is followed by one 'exon' feature per entry of
    ``self.exons``, in list order.

    Args:
        source: value for the feature ``source`` field; 'assemblyline' is
            used when None.
        score: value for the feature ``score`` field.

    Returns:
        list of GTFFeature: the transcript feature, then the exon features.
        The transcript feature shares ``self.attrs``; every exon feature
        gets its own copy with an added "exon_number" (counted from 0).
    """
    source_name = 'assemblyline' if source is None else source

    def _build(feature_type, start, end, attrs):
        # fill in the fields common to transcript and exon records
        feat = GTFFeature()
        feat.seqid = self.chrom
        feat.source = source_name
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_int_to_str(self.strand)
        feat.phase = '.'
        feat.attrs = attrs
        return feat

    # transcript feature
    features = [_build('transcript', self.start, self.end, self.attrs)]
    # exon features
    for exon_index, exon in enumerate(self.exons):
        exon_attrs = self.attrs.copy()
        exon_attrs["exon_number"] = exon_index
        features.append(_build('exon', exon.start, exon.end, exon_attrs))
    return features
def to_gtf_features(self, source=None, score=1000):
    """Convert this transcript into GTFFeature records.

    Produces one 'transcript' feature covering ``self.start``..``self.end``
    followed by an 'exon' feature for each element of ``self.exons``, in
    list order.

    Args:
        source: written to each feature's ``source`` field; defaults to
            'assemblyline' when None.
        score: written to each feature's ``score`` field.

    Returns:
        list of GTFFeature, transcript feature first. The transcript
        feature references ``self.attrs`` directly, while each exon feature
        holds a copy extended with "exon_number" (0-based index).
    """
    if source is None:
        source = 'assemblyline'
    # (feature type, start, end, exon index); the index is None for the
    # transcript record itself
    records = [('transcript', self.start, self.end, None)]
    for index, exon in enumerate(self.exons):
        records.append(('exon', exon.start, exon.end, index))
    features = []
    for feature_type, start, end, exon_index in records:
        feat = GTFFeature()
        feat.seqid = self.chrom
        feat.source = source
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_int_to_str(self.strand)
        feat.phase = '.'
        if exon_index is None:
            feat.attrs = self.attrs
        else:
            feat.attrs = self.attrs.copy()
            feat.attrs["exon_number"] = exon_index
        features.append(feat)
    return features