Ejemplo n.º 1
0
 def to_gtf(self):
     """Yield GTF features describing this transcript.

     This is a generator.  The first feature yielded is a 'transcript'
     record spanning ``self.start``..``self.end`` that carries the
     transcript id, sample id, expression and reference-flag attributes.
     It is followed by one 'exon' record per entry of ``self.exons``, in
     iteration order, each carrying only the transcript id.
     """
     strand_str = Strand.to_gtf(self.strand)

     def build(kind, start, end, attrs):
         # every record shares seqid/source/score/strand/phase; only the
         # feature type, coordinates and attributes differ
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = kind
         feat.start = start
         feat.end = end
         feat.score = 0.0
         feat.strand = strand_str
         feat.phase = '.'
         feat.attrs = attrs
         return feat

     transcript_attrs = {
         GTF.Attr.TRANSCRIPT_ID: self._id,
         GTF.Attr.SAMPLE_ID: self.sample_id,
         GTF.Attr.EXPR: str(self.expr),
         GTF.Attr.REF: str(int(self.is_ref))
     }
     yield build('transcript', self.start, self.end, transcript_attrs)

     for exon in self.exons:
         yield build('exon', exon.start, exon.end,
                     {GTF.Attr.TRANSCRIPT_ID: self._id})
Ejemplo n.º 2
0
 def get_change_point_gtf(self, cp):
     """Build the pair of GTF features describing change point *cp*.

     Returns a two-element list: a 'changept' feature covering
     ``cp.pos``..``cp.pos + 1`` followed by a 'changeinterval' feature
     covering ``cp.start``..``cp.end``.  Both features carry the same
     'graph_id', 'sign', 'pvalue' and 'foldchange' attributes.
     """
     strand_str = Strand.to_gtf(self.strand)
     graph_id = ('G_%s_%d_%d_%s' %
                 (self.chrom, self.start, self.end, strand_str))
     # (feature type, start, end) for each record, in output order
     specs = (
         ('changept', cp.pos, cp.pos + 1),
         ('changeinterval', cp.start, cp.end),
     )
     features = []
     for kind, start, end in specs:
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = kind
         feat.start = start
         feat.end = end
         feat.score = 0
         feat.strand = strand_str
         feat.phase = '.'
         # a fresh dict per feature so the two records never share state
         feat.attrs = {'graph_id': graph_id,
                       'sign': str(cp.sign),
                       'pvalue': str(cp.pvalue),
                       'foldchange': str(cp.foldchange)}
         features.append(feat)
     return features
Ejemplo n.º 3
0
 def get_node_gtf(self):
     """Yield one GTF 'node' feature for every node id in ``self.G``.

     This is a generator.  Each feature spans the interval returned by
     ``self.get_node_interval`` for the node and records, as strings, the
     min, max and mean of ``self.get_expr_data`` over that interval plus
     comma-joined lists of whatever ``_array_subset`` selects from
     ``self.ref_start_sites`` and ``self.ref_stop_sites`` for it.
     """
     strand_str = Strand.to_gtf(self.strand)
     graph_id = ('G_%s_%d_%d_%s' %
                 (self.chrom, self.start, self.end, strand_str))
     for node_id in self.G:
         interval = self.get_node_interval(node_id)
         expr = self.get_expr_data(*interval)
         starts = _array_subset(self.ref_start_sites, *interval)
         stops = _array_subset(self.ref_stop_sites, *interval)

         # assemble the GTF record for this node
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = 'node'
         feat.start = interval[0]
         feat.end = interval[1]
         feat.score = 0
         feat.strand = strand_str
         feat.phase = '.'
         feat.attrs = {
             'graph_id': graph_id,
             'expr_min': str(expr.min()),
             'expr_max': str(expr.max()),
             'expr_mean': str(expr.mean()),
             'ref_starts': ','.join(str(pos) for pos in starts),
             'ref_stops': ','.join(str(pos) for pos in stops)
         }
         yield feat
Ejemplo n.º 4
0
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Yield the GTF features for one assembled transcript.

    This is a generator.  A 'transcript' feature spanning the first exon's
    start to the last exon's end is yielded first, followed by one 'exon'
    feature per element of *exons*.  Every feature carries the locus, gene,
    TSS and transcript ids and a score of ``1000 * rel_frac`` rounded to an
    integer; the transcript feature additionally carries formatted 'expr',
    'rel_frac' and 'abs_frac' attributes.

    *exons* must be non-empty (it is indexed at ``[0]`` and ``[-1]``).
    """
    span_start = exons[0].start
    span_end = exons[-1].end
    strand_str = Strand.to_gtf(strand)
    score = int(round(1000.0 * rel_frac))
    # identifiers attached to the transcript and to each of its exons
    id_attrs = {
        'locus_id': locus_id,
        'gene_id': gene_id,
        'tss_id': tss_id,
        'transcript_id': transcript_id
    }

    def build(kind, start, end, attrs):
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = kind
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        return feat

    transcript_attrs = {
        'expr': '%.3f' % expr,
        'rel_frac': '%.5f' % rel_frac,
        'abs_frac': '%.5f' % abs_frac
    }
    transcript_attrs.update(id_attrs)
    yield build('transcript', span_start, span_end, transcript_attrs)

    for exon in exons:
        # copy so each exon feature owns an independent attribute dict
        yield build('exon', exon.start, exon.end, dict(id_attrs))
Ejemplo n.º 5
0
def _make_transcript_feature(exon_features):
    """Derive a 'transcript' feature from a sequence of exon features.

    seqid, source, score and strand are copied from the first exon.  The
    span runs from the first exon's start to the last exon's end.  The
    attributes are a copy of the first exon's attributes with any
    'exon_number' entry removed, so the input features are not modified.

    NOTE(review): start/end are taken positionally, so this presumably
    expects *exon_features* to be ordered by coordinate -- confirm with
    the callers.
    """
    tx = GTF.Feature()
    first = exon_features[0]
    last = exon_features[-1]

    tx.seqid = first.seqid
    tx.source = first.source
    tx.feature = 'transcript'
    tx.start = first.start
    tx.end = last.end
    tx.score = first.score
    tx.strand = first.strand
    tx.phase = '.'

    attrs = first.attrs.copy()
    # exon_number only makes sense on exon records
    attrs.pop('exon_number', None)
    tx.attrs = attrs
    return tx
Ejemplo n.º 6
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from a splice graph, grouped by gene cluster.

    Steps, in order:

    1. Build a path graph from *sgraph* with ``create_optimal_path_graph``.
    2. If ``config.assembly_loss_gtf_fh`` is set, write one 'lost_node' GTF
       line to it for each node reported by ``get_lost_nodes``.
    3. Smooth the path graph, enumerate paths with ``find_paths`` and map
       each back onto *sgraph* with ``reconstruct_path``.
    4. Cluster the paths with ``Cluster.build`` and wrap every clustered
       path in an ``Isoform``.

    Returns a list with one entry per cluster; each entry is the list of
    ``Isoform`` objects for that cluster, truncated to
    ``config.max_isoforms`` when that value is greater than zero.  An empty
    list is returned when the path graph is ``None`` or has no nodes.
    """
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # loop-invariant; used for both the loss report and the debug log
    strand_str = Strand.to_gtf(sgraph.strand)

    # report lost nodes
    loss_fh = config.assembly_loss_gtf_fh
    if loss_fh is not None:
        graph_id = ('L_%s:%d-%d[%s]' %
                    (sgraph.chrom, sgraph.start, sgraph.end, strand_str))
        for n_id in get_lost_nodes(sgraph, K):
            n = sgraph.get_node_interval(n_id)
            expr_data = sgraph.get_node_expr_data(n_id)
            # one gtf feature per lost node
            f = GTF.Feature()
            f.seqid = sgraph.chrom
            f.source = 'taco'
            f.feature = 'lost_node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0.0
            f.strand = strand_str
            f.phase = '.'
            f.attrs = {'graph_id': graph_id, 'expr': str(expr_data.mean())}
            # write() rather than the Python 2-only ``print >> fh``
            # statement, which raises TypeError under Python 3; the output
            # (text plus a trailing newline) is the same
            loss_fh.write(str(f) + '\n')

    # smooth kmer graph
    smooth_graph(K)

    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f',
                  sgraph.chrom, sgraph.start, sgraph.end,
                  strand_str, k, len(K), source_expr)
    id_kmer_map = K.graph['id_kmer_map']

    paths = []
    for kmer_path, expr in find_paths(K, KMER_EXPR, config.path_frac,
                                      config.max_paths):
        path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d", expr, len(path))
        paths.append((path, expr))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d',
                  len(clusters), len(filtered))

    gene_isoforms = []
    for cluster in clusters:
        isoforms = [
            Isoform(path=path, expr=expr, rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms