def from_gtf(f):
    '''
    Build a Transfrag from a single GTF.Feature object.

    f: feature object exposing `seqid`, `strand` and an `attrs` mapping

    The transcript id attribute is required (looked up by index). The
    sample id falls back to None, the expression to 0.0 and the reference
    flag to '0' when the corresponding attribute is absent. The reference
    flag is parsed as an integer and then converted to bool. `exons` is
    always passed as None.

    returns the newly constructed Transfrag
    '''
    chrom = f.seqid
    strand = Strand.from_gtf(f.strand)
    attrs = f.attrs
    transcript_id = attrs[GTF.Attr.TRANSCRIPT_ID]
    sample_id = attrs.get(GTF.Attr.SAMPLE_ID, None)
    expr = float(attrs.get(GTF.Attr.EXPR, 0.0))
    ref_flag = int(attrs.get(GTF.Attr.REF, '0'))
    return Transfrag(
        chrom=chrom,
        strand=strand,
        _id=transcript_id,
        sample_id=sample_id,
        expr=expr,
        is_ref=bool(ref_flag),
        exons=None,
    )
def parse_gtf(gtf_iter, sample_id, gtf_expr_attr, is_ref):
    '''
    Parse GTF lines into Transfrag objects.

    Empty or whitespace-only lines and lines starting with "#" are
    skipped. Every 'transcript' feature creates a Transfrag whose id is
    renamed to "<sample_id>.<n>", where n counts transcript features from
    1 in input order. Every 'exon' feature is appended to the Transfrag
    of its transcript id, which must already have been seen. Features of
    any other type are ignored.

    gtf_iter: iterable of GTF-formatted line strings
    sample_id: prefix used when building the new transcript ids
    gtf_expr_attr: name of the GTF attribute holding the expression
        value; only read when is_ref is false
    is_ref: when true, every transcript gets expression 0.0 and the
        expression attribute is not required

    returns a tuple (transfrags, total_expr): transfrags is a list of
    Transfrag objects in order of first appearance, total_expr is the sum
    of the parsed expression values

    raises GTFError when a transcript id appears twice, when the
    expression attribute is missing on a non-reference transcript, or
    when an exon feature precedes its transcript feature
    '''
    # keyed by the ORIGINAL transcript id so exon features can find their
    # transcript even though the Transfrag itself carries the renamed id
    t_dict = collections.OrderedDict()
    total_expr = 0.0
    cur_t_id = 1
    for gtf_line in gtf_iter:
        # skip empty / whitespace-only lines
        if not gtf_line or not gtf_line.strip():
            continue
        # skip comment lines
        if gtf_line.startswith("#"):
            continue
        f = GTF.Feature.from_str(gtf_line)
        if f.feature == 'transcript':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id in t_dict:
                raise GTFError("Transcript '%s' duplicate detected" % t_id)
            # rename transcript id
            new_t_id = "%s.%d" % (sample_id, cur_t_id)
            cur_t_id += 1
            # parse expression
            if is_ref:
                expr = 0.0
            else:
                if gtf_expr_attr not in f.attrs:
                    raise GTFError("GTF expression attribute '%s' not found" %
                                   (gtf_expr_attr))
                expr = float(f.attrs[gtf_expr_attr])
                total_expr += expr
            # create transfrag
            t = Transfrag(chrom=f.seqid,
                          strand=Strand.from_gtf(f.strand),
                          _id=new_t_id,
                          expr=expr,
                          is_ref=is_ref,
                          exons=None)
            t_dict[t_id] = t
        elif f.feature == 'exon':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id not in t_dict:
                logging.error('Feature: "%s"', f)
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" %
                               t_id)
            t = t_dict[t_id]
            # NOTE(review): assumes Transfrag turns exons=None into an
            # empty list -- confirm against the Transfrag constructor
            t.exons.append(Exon(f.start, f.end))
    # materialize a real list: on Python 3 dict.values() is only a view,
    # which is not indexable/sortable and is tied to the internal dict
    return list(t_dict.values()), total_expr