Example #1
0
 def get_node_gtf(self):
     '''
     generator producing one GTF 'node' feature per node id in self.G

     every feature covers the interval reported by get_node_interval()
     and is annotated with the graph id, the min/max/mean of the values
     returned by get_expr_data() for that interval, and comma-separated
     lists of the reference start/stop sites picked by _array_subset()
     '''
     strand_str = Strand.to_gtf(self.strand)
     graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                   strand_str)
     for node_id in self.G:
         interval = self.get_node_interval(node_id)
         expr = self.get_expr_data(*interval)
         starts = _array_subset(self.ref_start_sites, *interval)
         stops = _array_subset(self.ref_stop_sites, *interval)
         # build the gtf record describing this node
         node = GTF.Feature()
         node.seqid = self.chrom
         node.source = 'taco'
         node.feature = 'node'
         node.start = interval[0]
         node.end = interval[1]
         node.score = 0
         node.strand = strand_str
         node.phase = '.'
         node.attrs = {
             'graph_id': graph_id,
             'expr_min': str(expr.min()),
             'expr_max': str(expr.max()),
             'expr_mean': str(expr.mean()),
             'ref_starts': ','.join(map(str, starts)),
             'ref_stops': ','.join(map(str, stops))
         }
         yield node
Example #2
0
 def get_change_point_gtf(self, cp):
     '''
     describe change point 'cp' as a list of two GTF features

     the first ('changept') spans cp.pos to cp.pos + 1 and the second
     ('changeinterval') spans cp.start to cp.end; both carry the graph
     id together with the sign, pvalue and foldchange of 'cp'
     '''
     strand_str = Strand.to_gtf(self.strand)
     graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                   strand_str)
     # (feature type, start, end) of the records to emit, in output order
     spans = (('changept', cp.pos, cp.pos + 1),
              ('changeinterval', cp.start, cp.end))
     features = []
     for feature_type, start, end in spans:
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = feature_type
         feat.start = start
         feat.end = end
         feat.score = 0
         feat.strand = strand_str
         feat.phase = '.'
         # a fresh dict per feature so the records do not share state
         feat.attrs = {'graph_id': graph_id,
                       'sign': str(cp.sign),
                       'pvalue': str(cp.pvalue),
                       'foldchange': str(cp.foldchange)}
         features.append(feat)
     return features
Example #3
0
 def get_change_point_gtf(self, cp):
     '''
     return the two GTF features that describe change point 'cp'

     a 'changept' feature covering cp.pos to cp.pos + 1 comes first,
     followed by a 'changeinterval' feature covering cp.start to cp.end;
     each is tagged with the graph id and cp's sign, pvalue, foldchange
     '''
     strand_str = Strand.to_gtf(self.strand)
     graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                   strand_str)
     # (feature type, start, end) of the records to emit, in output order
     spans = (('changept', cp.pos, cp.pos + 1),
              ('changeinterval', cp.start, cp.end))
     features = []
     for feature_type, start, end in spans:
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = feature_type
         feat.start = start
         feat.end = end
         feat.score = 0
         feat.strand = strand_str
         feat.phase = '.'
         # a fresh dict per feature so the records do not share state
         feat.attrs = {'graph_id': graph_id,
                       'sign': str(cp.sign),
                       'pvalue': str(cp.pvalue),
                       'foldchange': str(cp.foldchange)}
         features.append(feat)
     return features
Example #4
0
 def get_node_gtf(self):
     '''
     generator producing one GTF 'node' feature per id returned by
     self.G.node_ids_iter()

     every feature covers the interval reported by get_node_interval()
     and is annotated with the graph id, the min/max/mean of the values
     returned by get_expr_data() for that interval, and comma-separated
     lists of the reference start/stop sites picked by _array_subset()
     '''
     strand_str = Strand.to_gtf(self.strand)
     graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                   strand_str)
     for node_id in self.G.node_ids_iter():
         interval = self.get_node_interval(node_id)
         expr = self.get_expr_data(*interval)
         starts = _array_subset(self.ref_start_sites, *interval)
         stops = _array_subset(self.ref_stop_sites, *interval)
         # build the gtf record describing this node
         node = GTF.Feature()
         node.seqid = self.chrom
         node.source = 'taco'
         node.feature = 'node'
         node.start = interval[0]
         node.end = interval[1]
         node.score = 0
         node.strand = strand_str
         node.phase = '.'
         node.attrs = {'graph_id': graph_id,
                       'expr_min': str(expr.min()),
                       'expr_max': str(expr.max()),
                       'expr_mean': str(expr.mean()),
                       'ref_starts': ','.join(map(str, starts)),
                       'ref_stops': ','.join(map(str, stops))}
         yield node
Example #5
0
def assemble_gene(sgraph, locus_id_str, config):
    '''
    assemble the isoforms of splice graph 'sgraph' and write all output

    steps, in order:
    - every node feature of the splice graph is printed to
      config.splice_graph_gtf_fh
    - when config.change_point is set, change points are detected,
      each one is applied to the graph (using config.change_point_trim)
      and printed to config.splice_graph_gtf_fh; the graph is recreated
      if at least one change point was found
    - each group of isoforms returned by assemble_isoforms() gets its
      gene/tss ids from assign_ids(); every isoform receives the next
      transcript id from config.t_id_value_obj and is printed as GTF to
      config.assembly_gtf_fh and as one BED line to
      config.assembly_bed_fh

    'locus_id_str' is stored as the locus_id of the GTF output
    '''
    logging.debug('%s:%d-%d[%s] nodes=%d' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), len(sgraph.G)))
    # dump the splice graph nodes
    for node_feature in sgraph.get_node_gtf():
        print >> config.splice_graph_gtf_fh, str(node_feature)

    if config.change_point:
        changepts = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s:%d-%d[%s] change points: %d' %
                      (sgraph.chrom, sgraph.start, sgraph.end,
                       Strand.to_gtf(sgraph.strand), len(changepts)))
        for changept in changepts:
            # the change point is applied before it is reported
            sgraph.apply_change_point(changept, config.change_point_trim)
            for cp_feature in sgraph.get_change_point_gtf(changept):
                print >> config.splice_graph_gtf_fh, str(cp_feature)
        # applied change points only take effect once the graph is rebuilt
        if len(changepts) > 0:
            sgraph.recreate()

    # path finding, filtering and grouping of isoforms into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        assign_ids(gene_isoforms, sgraph.strand, config.gene_id_value_obj,
                   config.tss_id_value_obj)
        for isoform in gene_isoforms:
            # string versions of the transcript, tss and gene ids
            t_id_str = "TU%d" % config.t_id_value_obj.next()
            tss_id_str = "TSS%d" % (isoform.tss_id)
            gene_id_str = "G%d" % (isoform.gene_id)
            # GTF output
            gtf_features = get_gtf_features(chrom=sgraph.chrom,
                                            strand=sgraph.strand,
                                            exons=isoform.path,
                                            locus_id=locus_id_str,
                                            gene_id=gene_id_str,
                                            tss_id=tss_id_str,
                                            transcript_id=t_id_str,
                                            expr=isoform.expr,
                                            rel_frac=isoform.rel_frac,
                                            abs_frac=isoform.abs_frac)
            for gtf_feature in gtf_features:
                print >> config.assembly_gtf_fh, str(gtf_feature)
            # BED output
            bed_name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
            bed_score = int(round(1000.0 * isoform.rel_frac))
            bed_fields = write_bed(sgraph.chrom, bed_name, sgraph.strand,
                                   bed_score, isoform.path)
            print >> config.assembly_bed_fh, '\t'.join(bed_fields)
Example #6
0
def assemble_gene(sgraph, locus_id_str, config):
    '''
    assemble the isoforms of splice graph 'sgraph' and write all output

    steps, in order:
    - every node feature of the splice graph is printed to
      config.splice_graph_gtf_fh
    - when config.change_point is set, change points are detected,
      each one is applied to the graph (using config.change_point_trim)
      and printed to config.splice_graph_gtf_fh; the graph is recreated
      if at least one change point was found
    - each group of isoforms returned by assemble_isoforms() gets its
      gene/tss ids from assign_ids(); every isoform receives the next
      transcript id from config.t_id_value_obj and is printed as GTF to
      config.assembly_gtf_fh and as one BED line to
      config.assembly_bed_fh

    'locus_id_str' is stored as the locus_id of the GTF output
    '''
    logging.debug('%s:%d-%d[%s] nodes=%d' %
                  (sgraph.chrom, sgraph.start, sgraph.end,
                   Strand.to_gtf(sgraph.strand), len(sgraph.G)))
    # dump the splice graph nodes
    for node_feature in sgraph.get_node_gtf():
        print >>config.splice_graph_gtf_fh, str(node_feature)

    if config.change_point:
        changepts = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s:%d-%d[%s] change points: %d' %
                      (sgraph.chrom, sgraph.start, sgraph.end,
                       Strand.to_gtf(sgraph.strand), len(changepts)))
        for changept in changepts:
            # the change point is applied before it is reported
            sgraph.apply_change_point(changept, config.change_point_trim)
            for cp_feature in sgraph.get_change_point_gtf(changept):
                print >>config.splice_graph_gtf_fh, str(cp_feature)
        # applied change points only take effect once the graph is rebuilt
        if len(changepts) > 0:
            sgraph.recreate()

    # path finding, filtering and grouping of isoforms into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        assign_ids(gene_isoforms, sgraph.strand, config.gene_id_value_obj,
                   config.tss_id_value_obj)
        for isoform in gene_isoforms:
            # string versions of the transcript, tss and gene ids
            t_id_str = "TU%d" % config.t_id_value_obj.next()
            tss_id_str = "TSS%d" % (isoform.tss_id)
            gene_id_str = "G%d" % (isoform.gene_id)
            # GTF output
            gtf_features = get_gtf_features(chrom=sgraph.chrom,
                                            strand=sgraph.strand,
                                            exons=isoform.path,
                                            locus_id=locus_id_str,
                                            gene_id=gene_id_str,
                                            tss_id=tss_id_str,
                                            transcript_id=t_id_str,
                                            expr=isoform.expr,
                                            rel_frac=isoform.rel_frac,
                                            abs_frac=isoform.abs_frac)
            for gtf_feature in gtf_features:
                print >>config.assembly_gtf_fh, str(gtf_feature)
            # BED output
            bed_name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
            bed_score = int(round(1000.0 * isoform.rel_frac))
            bed_fields = write_bed(sgraph.chrom, bed_name, sgraph.strand,
                                   bed_score, isoform.path)
            print >>config.assembly_bed_fh, '\t'.join(bed_fields)
Example #7
0
 def to_gtf(self):
     '''
     generator yielding this object as GTF features: one 'transcript'
     feature spanning self.start to self.end, followed by one 'exon'
     feature for every entry of self.exons

     the transcript carries the transcript id, sample id, expression
     and reference flag; exons only carry the transcript id
     '''
     strand_str = Strand.to_gtf(self.strand)

     def new_feature(feature_type, start, end, attrs):
         # all records differ only in type, coordinates and attributes
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = feature_type
         feat.start = start
         feat.end = end
         feat.score = 0.0
         feat.strand = strand_str
         feat.phase = '.'
         feat.attrs = attrs
         return feat

     yield new_feature('transcript', self.start, self.end,
                       {GTF.Attr.TRANSCRIPT_ID: self._id,
                        GTF.Attr.SAMPLE_ID: self.sample_id,
                        GTF.Attr.EXPR: str(self.expr),
                        GTF.Attr.REF: str(int(self.is_ref))})
     for exon in self.exons:
         yield new_feature('exon', exon.start, exon.end,
                           {GTF.Attr.TRANSCRIPT_ID: self._id})
Example #8
0
 def to_gtf(self):
     '''
     generator yielding this object as GTF features: one 'transcript'
     feature spanning self.start to self.end, followed by one 'exon'
     feature for every entry of self.exons

     the transcript carries the transcript id, sample id, expression
     and reference flag; exons only carry the transcript id
     '''
     strand_str = Strand.to_gtf(self.strand)

     def new_feature(feature_type, start, end, attrs):
         # all records differ only in type, coordinates and attributes
         feat = GTF.Feature()
         feat.seqid = self.chrom
         feat.source = 'taco'
         feat.feature = feature_type
         feat.start = start
         feat.end = end
         feat.score = 0.0
         feat.strand = strand_str
         feat.phase = '.'
         feat.attrs = attrs
         return feat

     yield new_feature('transcript', self.start, self.end, {
         GTF.Attr.TRANSCRIPT_ID: self._id,
         GTF.Attr.SAMPLE_ID: self.sample_id,
         GTF.Attr.EXPR: str(self.expr),
         GTF.Attr.REF: str(int(self.is_ref))
     })
     for exon in self.exons:
         yield new_feature('exon', exon.start, exon.end,
                           {GTF.Attr.TRANSCRIPT_ID: self._id})
Example #9
0
def create_optimal_path_graph(sgraph,
                              kmax=0,
                              loss_threshold=0.10,
                              stats_fh=None):
    '''
    build a path graph from splice graph 'sgraph' using paths of length
    'k', where 'k' is picked by maximize_bisect() between 1 and an upper
    bound. each candidate 'k' is scored with the number of k-mers of its
    path graph, or with -k when that path graph is flagged invalid.

    the upper bound is find_longest_path(sgraph); a positive 'kmax' can
    lower it (for debugging/testing purposes). when 'stats_fh' is given
    one tab-delimited line of statistics is printed per scored 'k'.

    NOTE: 'loss_threshold' is accepted but currently has no effect; the
    cutoff on the lost expression fraction is disabled.

    returns tuple (path graph, k); short transfrags have been rescued
    in the returned graph
    '''
    # upper bound for k, optionally capped by the caller
    longest = find_longest_path(sgraph)
    kmax = min(kmax, longest) if kmax > 0 else longest
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        path_graph = create_path_graph(sgraph, k)
        is_valid = path_graph.graph['valid']
        num_short = len(path_graph.graph['short_transfrags'])
        num_lost_kmers = path_graph.graph['num_lost_kmers']
        lost_expr = sum(sgraph.get_node_expr_data(n).mean()
                        for n in get_lost_nodes(sgraph, path_graph))
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / tot_expr
        # same values feed both the debug message and the stats file
        stats = (sgraph_id_str, k, kmax, len(sgraph.transfrags),
                 len(path_graph), num_short, num_lost_kmers, tot_expr,
                 lost_expr, lost_expr_frac, int(is_valid))
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d' % stats)
        if stats_fh:
            print >> stats_fh, '\t'.join(map(str, stats))
        if not is_valid:
            return -k
        # NOTE: rejecting k when lost_expr_frac > loss_threshold is
        # intentionally not done here (check disabled)
        return len(path_graph)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d' % (k, num_kmers))
    K = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(K)
    return K, k
Example #10
0
 def from_gtf(f):
     '''
     convert GTF.Feature 'f' into a Transfrag without exons

     the transcript id attribute must be present; a missing sample id
     becomes None, a missing expression 0.0 and a missing reference
     flag False
     '''
     chrom = f.seqid
     strand = Strand.from_gtf(f.strand)
     attrs = f.attrs
     transcript_id = attrs[GTF.Attr.TRANSCRIPT_ID]
     sample_id = attrs.get(GTF.Attr.SAMPLE_ID, None)
     expr = float(attrs.get(GTF.Attr.EXPR, 0.0))
     # the reference flag is stored as the string '0' or '1'
     is_ref = bool(int(attrs.get(GTF.Attr.REF, '0')))
     return Transfrag(chrom=chrom, strand=strand, _id=transcript_id,
                      sample_id=sample_id, expr=expr, is_ref=is_ref,
                      exons=None)
Example #11
0
 def from_gtf(f):
     '''
     convert GTF.Feature 'f' into a Transfrag without exons

     the transcript id attribute must be present; a missing sample id
     becomes None, a missing expression 0.0 and a missing reference
     flag False
     '''
     chrom = f.seqid
     strand = Strand.from_gtf(f.strand)
     attrs = f.attrs
     transcript_id = attrs[GTF.Attr.TRANSCRIPT_ID]
     sample_id = attrs.get(GTF.Attr.SAMPLE_ID, None)
     expr = float(attrs.get(GTF.Attr.EXPR, 0.0))
     # the reference flag is stored as the string '0' or '1'
     is_ref = bool(int(attrs.get(GTF.Attr.REF, '0')))
     return Transfrag(chrom=chrom, strand=strand, _id=transcript_id,
                      sample_id=sample_id, expr=expr, is_ref=is_ref,
                      exons=None)
Example #12
0
 def get_stats(self, K, kmax=None, lost_short=0, lost_short_expr=0.0, is_opt=0):
     '''
     return the statistics of path graph 'K' as a list, in this order:
     chrom, start, end, strand, K.k, kmax, number of paths, number of
     short transfrags, K.short_expr, lost_short, lost_short_expr,
     len(K), K.num_lost_kmers, total_expr, K.graph_expr, expr_frac,
     int(K.valid), opt, is_opt

     expr_frac is K.graph_expr / self.total_expr (0.0 when total_expr
     is 0) and opt is expr_frac * len(K) rounded to an integer
     '''
     if kmax is None:
         # NOTE(review): used without calling it; confirm that
         # longest_path_length is an attribute/property on this class
         kmax = self.longest_path_length
     if self.total_expr == 0:
         expr_frac = 0.0
     else:
         expr_frac = K.graph_expr / self.total_expr
     num_kmers = len(K)
     opt = int(round(expr_frac * num_kmers))
     return [self.chrom, self.start, self.end, Strand.to_gtf(self.strand),
             K.k, kmax, len(self.paths), len(K.short_transfrags),
             K.short_expr, lost_short, lost_short_expr,
             num_kmers, K.num_lost_kmers, self.total_expr, K.graph_expr,
             expr_frac, int(K.valid), opt, is_opt]
Example #13
0
    def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
        '''
        create a graph whose nodes are paths of length 'k', with 'k'
        picked by maximize_bisect() between 1 and an upper bound. each
        candidate 'k' is scored with the number of k-mers of the graph
        built by self.create(k), or with -k when that graph is invalid.

        the upper bound is self.longest_path_length(); a positive 'kmax'
        can lower it (for debugging/testing purposes). when 'stats_fh'
        is given one tab-delimited line of statistics is printed per
        scored 'k'.

        NOTE: 'loss_threshold' is accepted but currently has no effect;
        the cutoff on the lost expression fraction is disabled.

        returns tuple (graph, k), or (None, 0) when there are no paths
        '''
        if len(self.paths) == 0:
            return None, 0

        # upper bound for k, optionally capped by the caller
        longest = self.longest_path_length()
        kmax = min(kmax, longest) if kmax > 0 else longest
        id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                                   Strand.to_gtf(self.strand))

        def compute_kmers(k):
            graph = self.create(k)
            tot_expr = sum(graph.exprs[i] for i in graph.node_ids_iter())
            lost_expr = graph.lost_kmer_expr
            if tot_expr == 0:
                lost_expr_frac = 0.0
            else:
                lost_expr_frac = lost_expr / tot_expr
            if stats_fh:
                stats = [self.chrom, self.start, self.end,
                         Strand.to_gtf(self.strand), k, kmax,
                         len(self.paths), graph.n,
                         len(graph.short_transfrags), graph.num_lost_kmers,
                         tot_expr, lost_expr, lost_expr_frac,
                         int(graph.valid)]
                print >> stats_fh, '\t'.join(map(str, stats))
            # NOTE: rejecting k when lost_expr_frac > loss_threshold is
            # intentionally not done here (check disabled)
            return len(graph) if graph.valid else -k

        k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
        logging.debug('%s creating path graph k=%d num_kmers=%d' %
                      (id_str, k, num_kmers))
        K = self.create(k)
        logging.debug('%s rescuing short transfrags kmers=%d' %
                      (id_str, len(K)))
        num_lost = self.rescue_short_transfrags(K, K.short_transfrags)
        logging.debug('%s lost %d of %d short transfrags' %
                      (id_str, num_lost, len(K.short_transfrags)))
        return K, k
Example #14
0
def parse_gtf(gtf_iter, sample_id, gtf_expr_attr, is_ref):
    '''
    parse the GTF lines of 'gtf_iter' into Transfrag objects

    empty/blank lines and lines starting with '#' are skipped. every
    'transcript' feature creates a Transfrag that is renamed to
    '<sample_id>.<n>', where n counts the transcripts starting at 1.
    every 'exon' feature is appended to the Transfrag of its transcript
    id, which must have appeared earlier. other features are ignored.

    when 'is_ref' is set the expression is 0.0, otherwise it is read
    from attribute 'gtf_expr_attr' of the transcript feature

    returns tuple (values of the ordered transcript dict, i.e. the
    Transfrag objects in order of first appearance; sum of the parsed
    expression values)

    raises GTFError for a duplicate transcript id, a missing expression
    attribute, or an exon that precedes its transcript
    '''
    transfrags = collections.OrderedDict()
    total_expr = 0.0
    num_transcripts = 0
    for line in gtf_iter:
        if not line or not line.strip() or line.startswith("#"):
            continue
        f = GTF.Feature.from_str(line)
        if f.feature == 'transcript':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id in transfrags:
                raise GTFError("Transcript '%s' duplicate detected" % t_id)
            # transcripts are renamed after the sample they belong to
            num_transcripts += 1
            new_t_id = "%s.%d" % (sample_id, num_transcripts)
            if is_ref:
                expr = 0.0
            else:
                if gtf_expr_attr not in f.attrs:
                    raise GTFError("GTF expression attribute '%s' not found" %
                                   (gtf_expr_attr))
                expr = float(f.attrs[gtf_expr_attr])
                total_expr += expr
            # the dict stays keyed by the original id so that exon
            # features can find their transcript
            # NOTE(review): exons=None presumably becomes an empty list
            # inside Transfrag, since exons are appended below - confirm
            transfrags[t_id] = Transfrag(chrom=f.seqid,
                                         strand=Strand.from_gtf(f.strand),
                                         _id=new_t_id,
                                         expr=expr,
                                         is_ref=is_ref,
                                         exons=None)
        elif f.feature == 'exon':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id not in transfrags:
                logging.error('Feature: "%s"' % str(f))
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" %
                               t_id)
            transfrags[t_id].exons.append(Exon(f.start, f.end))
    return transfrags.values(), total_expr
Example #15
0
def create_optimal_path_graph(sgraph, kmax=0, loss_threshold=0.10,
                              stats_fh=None):
    '''
    build a path graph from splice graph 'sgraph' using paths of length
    'k', where 'k' is picked by maximize_bisect() between 1 and an upper
    bound. each candidate 'k' is scored with the number of k-mers of its
    path graph, or with -k when that path graph is flagged invalid.

    the upper bound is find_longest_path(sgraph); a positive 'kmax' can
    lower it (for debugging/testing purposes). when 'stats_fh' is given
    one tab-delimited line of statistics is printed per scored 'k'.

    NOTE: 'loss_threshold' is accepted but currently has no effect; the
    cutoff on the lost expression fraction is disabled.

    returns tuple (path graph, k); short transfrags have been rescued
    in the returned graph
    '''
    # upper bound for k, optionally capped by the caller
    longest = find_longest_path(sgraph)
    kmax = min(kmax, longest) if kmax > 0 else longest
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        path_graph = create_path_graph(sgraph, k)
        is_valid = path_graph.graph['valid']
        num_short = len(path_graph.graph['short_transfrags'])
        num_lost_kmers = path_graph.graph['num_lost_kmers']
        lost_expr = sum(sgraph.get_node_expr_data(n).mean()
                        for n in get_lost_nodes(sgraph, path_graph))
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / tot_expr
        # same values feed both the debug message and the stats file
        stats = (sgraph_id_str, k, kmax, len(sgraph.transfrags),
                 len(path_graph), num_short, num_lost_kmers, tot_expr,
                 lost_expr, lost_expr_frac, int(is_valid))
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d' % stats)
        if stats_fh:
            print >>stats_fh, '\t'.join(map(str, stats))
        if not is_valid:
            return -k
        # NOTE: rejecting k when lost_expr_frac > loss_threshold is
        # intentionally not done here (check disabled)
        return len(path_graph)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d' % (k, num_kmers))
    K = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(K)
    return K, k
Example #16
0
 def write_splice_bed(self, fh):
     '''
     print one 12-column BED line per distinct intron to 'fh'

     the expression of all non-reference transfrags on the positive and
     negative strand is summed per (start, end, strand) intron. each
     line is named 'JUNC', spans start - 1 to end + 1 and describes the
     intron as two blocks of size 1 at both ends of that span
     '''
     junction_expr = collections.defaultdict(float)
     for strand in (Strand.POS, Strand.NEG):
         for t in self.strand_transfrags[strand]:
             if not t.is_ref:
                 for start, end in t.iterintrons():
                     junction_expr[(start, end, strand)] += t.expr
     for (start, end, strand), expr in junction_expr.iteritems():
         # the line extends one base beyond each side of the intron
         left = str(start - 1)
         right = str(end + 1)
         fields = [self.chrom, left, right, 'JUNC', str(expr),
                   Strand.to_gtf(strand), left, right, '255,0,0',
                   '2', '1,1', '0,%d' % (end + 1 - start)]
         print >>fh, '\t'.join(fields)
Example #17
0
 def write_splice_bed(self, fh):
     '''
     print one 12-column BED line per distinct intron to 'fh'

     the expression of all non-reference transfrags on the positive and
     negative strand is summed per (start, end, strand) intron. each
     line is named 'JUNC', spans start - 1 to end + 1 and describes the
     intron as two blocks of size 1 at both ends of that span
     '''
     junction_expr = collections.defaultdict(float)
     for strand in (Strand.POS, Strand.NEG):
         for t in self.strand_transfrags[strand]:
             if not t.is_ref:
                 for start, end in t.iterintrons():
                     junction_expr[(start, end, strand)] += t.expr
     for (start, end, strand), expr in junction_expr.iteritems():
         # the line extends one base beyond each side of the intron
         left = str(start - 1)
         right = str(end + 1)
         fields = [self.chrom, left, right, 'JUNC', str(expr),
                   Strand.to_gtf(strand), left, right, '255,0,0',
                   '2', '1,1', '0,%d' % (end + 1 - start)]
         print >>fh, '\t'.join(fields)
Example #18
0
def parse_gtf(gtf_iter, sample_id, gtf_expr_attr, is_ref):
    '''
    parse the GTF lines of 'gtf_iter' into Transfrag objects

    empty/blank lines and lines starting with '#' are skipped. every
    'transcript' feature creates a Transfrag that is renamed to
    '<sample_id>.<n>', where n counts the transcripts starting at 1.
    every 'exon' feature is appended to the Transfrag of its transcript
    id, which must have appeared earlier. other features are ignored.

    when 'is_ref' is set the expression is 0.0, otherwise it is read
    from attribute 'gtf_expr_attr' of the transcript feature

    returns tuple (values of the ordered transcript dict, i.e. the
    Transfrag objects in order of first appearance; sum of the parsed
    expression values)

    raises GTFError for a duplicate transcript id, a missing expression
    attribute, or an exon that precedes its transcript
    '''
    transfrags = collections.OrderedDict()
    total_expr = 0.0
    num_transcripts = 0
    for line in gtf_iter:
        if not line or not line.strip() or line.startswith("#"):
            continue
        f = GTF.Feature.from_str(line)
        if f.feature == 'transcript':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id in transfrags:
                raise GTFError("Transcript '%s' duplicate detected" % t_id)
            # transcripts are renamed after the sample they belong to
            num_transcripts += 1
            new_t_id = "%s.%d" % (sample_id, num_transcripts)
            if is_ref:
                expr = 0.0
            else:
                if gtf_expr_attr not in f.attrs:
                    raise GTFError("GTF expression attribute '%s' not found" %
                                   (gtf_expr_attr))
                expr = float(f.attrs[gtf_expr_attr])
                total_expr += expr
            # the dict stays keyed by the original id so that exon
            # features can find their transcript
            # NOTE(review): exons=None presumably becomes an empty list
            # inside Transfrag, since exons are appended below - confirm
            transfrags[t_id] = Transfrag(chrom=f.seqid,
                                         strand=Strand.from_gtf(f.strand),
                                         _id=new_t_id,
                                         expr=expr,
                                         is_ref=is_ref,
                                         exons=None)
        elif f.feature == 'exon':
            t_id = f.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id not in transfrags:
                logging.error('Feature: "%s"' % str(f))
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" % t_id)
            transfrags[t_id].exons.append(Exon(f.start, f.end))
    return transfrags.values(), total_expr
Example #19
0
    def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
        '''
        create a graph whose nodes are paths of length 'k', with 'k'
        picked by maximize_bisect() between 1 and an upper bound. each
        candidate 'k' is scored with the number of k-mers of the graph
        built by self.create(k), or with -k when that graph is invalid.

        the upper bound is self.longest_path_length(); a positive 'kmax'
        can lower it (for debugging/testing purposes). when 'stats_fh'
        is given one tab-delimited line of statistics is printed per
        scored 'k'.

        NOTE: 'loss_threshold' is accepted but currently has no effect;
        the cutoff on the lost expression fraction is disabled.

        returns tuple (graph, k), or (None, 0) when there are no paths
        '''
        if len(self.paths) == 0:
            return None, 0

        # upper bound for k, optionally capped by the caller
        longest = self.longest_path_length()
        kmax = min(kmax, longest) if kmax > 0 else longest
        id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                                   Strand.to_gtf(self.strand))

        def compute_kmers(k):
            graph = self.create(k)
            tot_expr = sum(graph.exprs[i] for i in graph.node_ids_iter())
            lost_expr = graph.lost_kmer_expr
            if tot_expr == 0:
                lost_expr_frac = 0.0
            else:
                lost_expr_frac = lost_expr / tot_expr
            if stats_fh:
                stats = [self.chrom, self.start, self.end,
                         Strand.to_gtf(self.strand), k, kmax,
                         len(self.paths), graph.n,
                         len(graph.short_transfrags), graph.num_lost_kmers,
                         tot_expr, lost_expr, lost_expr_frac,
                         int(graph.valid)]
                print >>stats_fh, '\t'.join(map(str, stats))
            # NOTE: rejecting k when lost_expr_frac > loss_threshold is
            # intentionally not done here (check disabled)
            return len(graph) if graph.valid else -k

        k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
        logging.debug('%s creating path graph k=%d num_kmers=%d' %
                      (id_str, k, num_kmers))
        K = self.create(k)
        logging.debug('%s rescuing short transfrags kmers=%d' %
                      (id_str, len(K)))
        num_lost = self.rescue_short_transfrags(K, K.short_transfrags)
        logging.debug('%s lost %d of %d short transfrags' %
                      (id_str, num_lost, len(K.short_transfrags)))
        return K, k
Example #20
0
 def compute_kmers(k):
     '''
     score path length 'k': returns the number of k-mers of the graph
     built by self.create(k), or -k when that graph is not valid

     when 'stats_fh' (from the enclosing scope) is set, one
     tab-delimited line of statistics is printed to it
     '''
     graph = self.create(k)
     tot_expr = sum(graph.exprs[i] for i in graph.node_ids_iter())
     lost_expr = graph.lost_kmer_expr
     if tot_expr == 0:
         lost_expr_frac = 0.0
     else:
         lost_expr_frac = lost_expr / tot_expr
     if stats_fh:
         stats = [self.chrom, self.start, self.end,
                  Strand.to_gtf(self.strand), k, kmax,
                  len(self.paths), graph.n,
                  len(graph.short_transfrags), graph.num_lost_kmers,
                  tot_expr, lost_expr, lost_expr_frac,
                  int(graph.valid)]
         print >>stats_fh, '\t'.join(map(str, stats))
     # NOTE: rejecting k when lost_expr_frac exceeds a loss threshold
     # is intentionally not done here (check disabled)
     return len(graph) if graph.valid else -k
Example #21
0
 def get_stats(self,
               K,
               kmax=None,
               lost_short=0,
               lost_short_expr=0.0,
               is_opt=0):
     '''
     return the statistics of path graph 'K' as a list, in this order:
     chrom, start, end, strand, K.k, kmax, number of paths, number of
     short transfrags, K.short_expr, lost_short, lost_short_expr,
     len(K), K.num_lost_kmers, total_expr, K.graph_expr, expr_frac,
     int(K.valid), opt, is_opt

     expr_frac is K.graph_expr / self.total_expr (0.0 when total_expr
     is 0) and opt is expr_frac * len(K) rounded to an integer
     '''
     if kmax is None:
         # NOTE(review): used without calling it; confirm that
         # longest_path_length is an attribute/property on this class
         kmax = self.longest_path_length
     if self.total_expr == 0:
         expr_frac = 0.0
     else:
         expr_frac = K.graph_expr / self.total_expr
     num_kmers = len(K)
     opt = int(round(expr_frac * num_kmers))
     return [self.chrom, self.start, self.end, Strand.to_gtf(self.strand),
             K.k, kmax, len(self.paths), len(K.short_transfrags),
             K.short_expr, lost_short, lost_short_expr,
             num_kmers, K.num_lost_kmers, self.total_expr, K.graph_expr,
             expr_frac, int(K.valid), opt, is_opt]
Example #22
0
 def to_bed(self):
     '''
     return the 12 BED fields of this object as a list

     the span runs from the start of the first exon to the end of the
     last one; block sizes are the exon lengths and block starts are
     the exon starts relative to the start of the span
     '''
     exons = self.exons
     tx_start = exons[0].start
     tx_end = exons[-1].end
     block_starts = [str(e.start - tx_start) for e in exons]
     block_sizes = [str(e.end - e.start) for e in exons]
     return [self.chrom, str(tx_start), str(tx_end), self._id,
             str(self.expr), Strand.to_gtf(self.strand), '0', '0', '0',
             str(len(exons)), ','.join(block_sizes),
             ','.join(block_starts)]
Example #23
0
 def compute_kmers(k):
     '''
     score path length 'k': returns the number of k-mers of the graph
     built by self.create(k), or -k when that graph is not valid

     when 'stats_fh' (from the enclosing scope) is set, one
     tab-delimited line of statistics is printed to it
     '''
     graph = self.create(k)
     tot_expr = sum(graph.exprs[i] for i in graph.node_ids_iter())
     lost_expr = graph.lost_kmer_expr
     if tot_expr == 0:
         lost_expr_frac = 0.0
     else:
         lost_expr_frac = lost_expr / tot_expr
     if stats_fh:
         stats = [self.chrom, self.start, self.end,
                  Strand.to_gtf(self.strand), k, kmax,
                  len(self.paths), graph.n,
                  len(graph.short_transfrags), graph.num_lost_kmers,
                  tot_expr, lost_expr, lost_expr_frac,
                  int(graph.valid)]
         print >> stats_fh, '\t'.join(map(str, stats))
     # NOTE: rejecting k when lost_expr_frac exceeds a loss threshold
     # is intentionally not done here (check disabled)
     return len(graph) if graph.valid else -k
Example #24
0
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Yield GTF features for one transcript.

    The first feature yielded is a 'transcript' record spanning from the
    start of the first exon to the end of the last exon; it carries the
    formatted expr, rel_frac and abs_frac attributes.  It is followed by
    one 'exon' record per entry in exons.  Every record shares the
    locus/gene/tss/transcript id attributes and a score equal to
    rel_frac scaled by 1000 and rounded to an integer.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    gtf_strand = Strand.to_gtf(strand)
    shared_attrs = {
        'locus_id': locus_id,
        'gene_id': gene_id,
        'tss_id': tss_id,
        'transcript_id': transcript_id
    }
    scaled_score = int(round(1000.0 * rel_frac))

    def build(kind, begin, stop, own_attrs):
        # populate the fields common to transcript and exon records
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = kind
        feat.start = begin
        feat.end = stop
        feat.score = scaled_score
        feat.strand = gtf_strand
        feat.phase = '.'
        feat.attrs = own_attrs
        feat.attrs.update(shared_attrs)
        return feat

    yield build('transcript', tx_start, tx_end, {
        'expr': '%.3f' % expr,
        'rel_frac': '%.5f' % rel_frac,
        'abs_frac': '%.5f' % abs_frac
    })
    for exon in exons:
        yield build('exon', exon.start, exon.end, {})
Example #25
0
 def from_bed(line):
     """Parse one tab-delimited BED line into a Transfrag.

     Columns used (0-based): 0 chrom, 1 start, 3 id, 4 expression,
     5 strand, 9 exon count, 10 block sizes, 11 block starts.  Block
     starts are relative to column 1.  The transfrag is flagged as a
     reference when the id text before the first '.' equals
     Sample.REF_ID.
     """
     cols = line.strip().split('\t')
     chrom = cols[0]
     tx_start = int(cols[1])
     _id = cols[3]
     id_prefix = _id.split('.')[0]
     is_ref = (id_prefix == Sample.REF_ID)
     expr = float(cols[4])
     strand = Strand.from_bed(cols[5])
     num_exons = int(cols[9])
     sizes = cols[10].split(',')
     offsets = cols[11].split(',')
     exons = []
     # index explicitly so a block list shorter than num_exons raises
     for idx in xrange(num_exons):
         exon_start = tx_start + int(offsets[idx])
         exon_end = exon_start + int(sizes[idx])
         exons.append(Exon(exon_start, exon_end))
     return Transfrag(chrom=chrom, strand=strand, _id=_id, expr=expr,
                      is_ref=is_ref, exons=exons)
Example #26
0
    def detect_change_points(self, *args, **kwargs):
        '''
        Run change point detection on the expression data of every node
        in self.G.

        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples whose pos, start and end have
        been shifted by the start of the node they were found in
        '''
        # the strand label does not change per node or change point, so
        # convert it once instead of once per change point
        strand_str = Strand.to_gtf(self.strand)
        changepts = []
        for n_id in self.G:
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(n.start, n.end)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # pass the arguments separately so the message is only
                # formatted when debug logging is actually enabled
                logging.debug(
                    '\t%s:%d-%d[%s] node: %s-%s cp:%d(%d-%d) '
                    'p=%.3f fc=%.3f',
                    self.chrom, self.start, self.end, strand_str,
                    n.start, n.end, cp.pos, cp.start, cp.end,
                    cp.pvalue, cp.foldchange)
        return changepts
Example #27
0
    def detect_change_points(self, *args, **kwargs):
        '''
        Run change point detection on the expression data of every node
        yielded by self.G.nodes_iter().

        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples whose pos, start and end have
        been shifted by the start of the node they were found in
        '''
        genome_id_str = ('%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                         Strand.to_gtf(self.strand)))
        changepts = []
        for n in self.G.nodes_iter():
            expr_data = self.get_expr_data(*n)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # pass the arguments separately so the message is only
                # formatted when debug logging is actually enabled
                logging.debug('%s changepoint node=(%s-%s) '
                              'pos=%d interval=(%d-%d) p=%.3f fc=%.3f',
                              genome_id_str, n.start, n.end,
                              cp.pos, cp.start, cp.end,
                              cp.pvalue, cp.foldchange)
        return changepts
Example #28
0
    def detect_change_points(self, *args, **kwargs):
        '''
        Run change point detection on the expression data of every node
        yielded by self.G.nodes_iter().

        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples whose pos, start and end have
        been shifted by the start of the node they were found in
        '''
        genome_id_str = (
            '%s:%d-%d[%s]' %
            (self.chrom, self.start, self.end, Strand.to_gtf(self.strand)))
        changepts = []
        for n in self.G.nodes_iter():
            expr_data = self.get_expr_data(*n)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # arguments are handed to logging unformatted so the
                # string is only built when debug output is enabled
                logging.debug(
                    '%s changepoint node=(%s-%s) '
                    'pos=%d interval=(%d-%d) p=%.3f fc=%.3f',
                    genome_id_str, n.start, n.end, cp.pos, cp.start,
                    cp.end, cp.pvalue, cp.foldchange)
        return changepts
Example #29
0
    def detect_change_points(self, *args, **kwargs):
        '''
        Run change point detection on the expression data of every node
        in self.G.

        *args, **kwargs: passed directly to 'run_changepoint'

        returns list of ChangePoint tuples whose pos, start and end have
        been shifted by the start of the node they were found in
        '''
        # loop invariant: convert the strand once rather than for every
        # change point that gets logged
        strand_str = Strand.to_gtf(self.strand)
        changepts = []
        for n_id in self.G:
            n = self.get_node_interval(n_id)
            expr_data = self.get_expr_data(n.start, n.end)
            for cp in run_changepoint(expr_data, *args, **kwargs):
                # add offset from start of node to change point positions
                cp = cp._replace(pos=n.start + cp.pos,
                                 start=n.start + cp.start,
                                 end=n.start + cp.end)
                changepts.append(cp)
                # arguments are handed to logging unformatted so the
                # string is only built when debug output is enabled
                logging.debug('\t%s:%d-%d[%s] node: %s-%s cp:%d(%d-%d) '
                              'p=%.3f fc=%.3f',
                              self.chrom, self.start, self.end,
                              strand_str, n.start,
                              n.end, cp.pos, cp.start, cp.end,
                              cp.pvalue, cp.foldchange)
        return changepts
Example #30
0
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Generate the GTF records describing a single transcript.

    Yields a 'transcript' feature covering exons[0].start through
    exons[-1].end (with formatted expr / rel_frac / abs_frac
    attributes), then an 'exon' feature for each element of exons.  All
    features carry the locus_id, gene_id, tss_id and transcript_id
    attributes and use round(1000 * rel_frac) as their integer score.
    """
    span_start = exons[0].start
    span_end = exons[-1].end
    strand_label = Strand.to_gtf(strand)
    id_attrs = {'locus_id': locus_id,
                'gene_id': gene_id,
                'tss_id': tss_id,
                'transcript_id': transcript_id}

    def _new_feature(feature_type, start, end):
        # create a record with every field except attrs filled in
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = feature_type
        feat.start = start
        feat.end = end
        feat.score = int(round(1000.0 * rel_frac))
        feat.strand = strand_label
        feat.phase = '.'
        return feat

    transcript = _new_feature('transcript', span_start, span_end)
    transcript.attrs = {'expr': '%.3f' % expr,
                        'rel_frac': '%.5f' % rel_frac,
                        'abs_frac': '%.5f' % abs_frac}
    transcript.attrs.update(id_attrs)
    yield transcript
    for exon in exons:
        exon_feat = _new_feature('exon', exon.start, exon.end)
        exon_feat.attrs = {}
        exon_feat.attrs.update(id_attrs)
        yield exon_feat
Example #31
0
def write_bed(chrom, name, strand, score, exons):
    """Format a transcript as a list of 12 BED string fields.

    Assertions require the first exon to have the strictly smallest
    start and the last exon the strictly largest end.  Columns 7 and 8
    both repeat the transcript start and column 9 is '0'.  The block
    size and block start lists each end with a trailing comma.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    assert all(tx_start < other.start for other in exons[1:])
    assert all(tx_end > other.end for other in exons[:-1])
    sizes = [str(ex.end - ex.start) for ex in exons]
    offsets = [str(ex.start - tx_start) for ex in exons]
    # make bed fields
    return [
        chrom,
        str(tx_start),
        str(tx_end),
        str(name),
        str(score),
        Strand.to_gtf(strand),
        str(tx_start),
        str(tx_start),
        '0',
        str(len(exons)),
        ','.join(sizes) + ',',
        ','.join(offsets) + ','
    ]
Example #32
0
def write_bed(chrom, name, strand, score, exons):
    """Return the 12 BED columns for a transcript as a list of strings.

    exons is asserted to begin with the exon having the strictly
    smallest start and to finish with the exon having the strictly
    largest end.  Block starts are offsets from the first exon's start;
    both block lists are comma-joined with a trailing comma.  The two
    columns after the strand repeat the transcript start, followed by a
    constant '0' column.
    """
    leftmost = exons[0].start
    rightmost = exons[-1].end
    assert all(leftmost < ex.start for ex in exons[1:])
    assert all(rightmost > ex.end for ex in exons[:-1])
    size_col = ','.join(str(ex.end - ex.start) for ex in exons) + ','
    start_col = ','.join(str(ex.start - leftmost) for ex in exons) + ','
    start_str = str(leftmost)
    # columns in BED order
    bed_row = [chrom, start_str, str(rightmost), str(name), str(score)]
    bed_row.append(Strand.to_gtf(strand))
    bed_row.extend([start_str, start_str, '0'])
    bed_row.extend([str(len(exons)), size_col, start_col])
    return bed_row
Example #33
0
 def __str__(self):
     """Return 'PathGraphFactory' followed by chrom:start-end[strand]."""
     locus = (self.chrom, self.start, self.end, Strand.to_gtf(self.strand))
     return 'PathGraphFactory %s:%d-%d[%s]' % locus
Example #34
0
 def __str__(self):
     """Return 'SpliceGraph', the chrom:start-end[strand] locus and the
     number of transfrags."""
     strand_label = Strand.to_gtf(self.strand)
     num_transfrags = len(self.transfrags)
     values = (self.chrom, self.start, self.end, strand_label,
               num_transfrags)
     return 'SpliceGraph %s:%d-%d[%s] transfrags: %d' % values
Example #35
0
 def __str__(self):
     """Summarize the graph: its locus, strand and transfrag count."""
     template = 'SpliceGraph %s:%d-%d[%s] transfrags: %d'
     fill = (self.chrom, self.start, self.end)
     fill += (Strand.to_gtf(self.strand), len(self.transfrags))
     return template % fill
Example #36
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from one splice graph.

    Creates a path graph from sgraph, optionally writes a 'lost_node'
    GTF feature for each node returned by get_lost_nodes, smooths the
    path graph, enumerates paths through it, and groups the paths into
    clusters.

    sgraph: splice graph to assemble
    config: settings object; reads path_graph_kmax,
        path_graph_loss_threshold, path_graph_stats_fh,
        assembly_loss_gtf_fh (lost nodes are written only when this is
        not None), path_frac, max_paths, isoform_frac and max_isoforms
        (applied per cluster when greater than zero)

    returns a list holding one list of Isoform objects per cluster, or
    an empty list when no path graph was created or it is empty
    """
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # the strand label is needed for every lost node and for logging;
    # convert it once instead of on each use
    strand_str = Strand.to_gtf(sgraph.strand)

    # report lost nodes
    if config.assembly_loss_gtf_fh is not None:
        graph_id = ('L_%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                        sgraph.end, strand_str))
        for n_id in get_lost_nodes(sgraph, K):
            n = sgraph.get_node_interval(n_id)
            expr_data = sgraph.get_node_expr_data(n_id)
            # write a gtf feature for each lost node
            f = GTF.Feature()
            f.seqid = sgraph.chrom
            f.source = 'taco'
            f.feature = 'lost_node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0.0
            f.strand = strand_str
            f.phase = '.'
            f.attrs = {'graph_id': graph_id, 'expr': str(expr_data.mean())}
            print >> config.assembly_loss_gtf_fh, str(f)

    # smooth kmer graph
    smooth_graph(K)

    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    # logging arguments are passed unformatted so the message strings
    # are only built when debug output is enabled
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f',
                  sgraph.chrom, sgraph.start, sgraph.end,
                  strand_str, k, len(K), source_expr)
    id_kmer_map = K.graph['id_kmer_map']

    paths = []
    for kmer_path, expr in find_paths(K, KMER_EXPR, config.path_frac,
                                      config.max_paths):
        path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d", expr, len(path))
        paths.append((path, expr))
    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d',
                  len(clusters), len(filtered))
    gene_isoforms = []
    for cluster in clusters:
        isoforms = [
            Isoform(path=path,
                    expr=expr,
                    rel_frac=rel_frac,
                    abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
Example #37
0
 def __str__(self):
     """Identify the factory by its chrom:start-end[strand] locus."""
     strand_label = Strand.to_gtf(self.strand)
     return ('PathGraphFactory %s:%d-%d[%s]' %
             (self.chrom, self.start, self.end, strand_label))
Example #38
0
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from one splice graph.

    Steps: create a path graph from sgraph; when
    config.assembly_loss_gtf_fh is not None, write a 'lost_node' GTF
    feature (with the mean of the node's expression data) for each node
    returned by get_lost_nodes; smooth the path graph; enumerate paths
    with find_paths; cluster the paths with Cluster.build.

    sgraph: splice graph to assemble
    config: settings object; reads path_graph_kmax,
        path_graph_loss_threshold, path_graph_stats_fh,
        assembly_loss_gtf_fh, path_frac, max_paths, isoform_frac and
        max_isoforms (a per-cluster cap, applied when greater than zero)

    returns a list holding one list of Isoform objects per cluster, or
    an empty list when no path graph was created or it is empty
    """
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # convert the strand once; it is reused for each lost node and for
    # the debug message below
    strand_str = Strand.to_gtf(sgraph.strand)

    # report lost nodes
    if config.assembly_loss_gtf_fh is not None:
        graph_id = ('L_%s:%d-%d[%s]' %
                    (sgraph.chrom, sgraph.start, sgraph.end, strand_str))
        for n_id in get_lost_nodes(sgraph, K):
            n = sgraph.get_node_interval(n_id)
            expr_data = sgraph.get_node_expr_data(n_id)
            # write a gtf feature for each lost node
            f = GTF.Feature()
            f.seqid = sgraph.chrom
            f.source = 'taco'
            f.feature = 'lost_node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0.0
            f.strand = strand_str
            f.phase = '.'
            f.attrs = {'graph_id': graph_id,
                       'expr': str(expr_data.mean())}
            print >>config.assembly_loss_gtf_fh, str(f)

    # smooth kmer graph
    smooth_graph(K)

    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    # hand the values to logging unformatted: the message is then only
    # built when debug output is actually enabled
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f',
                  sgraph.chrom, sgraph.start, sgraph.end,
                  strand_str, k, len(K),
                  source_expr)
    id_kmer_map = K.graph['id_kmer_map']

    paths = []
    for kmer_path, expr in find_paths(K, KMER_EXPR, config.path_frac,
                                      config.max_paths):
        path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d", expr, len(path))
        paths.append((path, expr))
    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d',
                  len(clusters), len(filtered))
    gene_isoforms = []
    for cluster in clusters:
        isoforms = [Isoform(path=path, expr=expr, rel_frac=rel_frac,
                            abs_frac=abs_frac)
                    for path, expr, rel_frac, abs_frac
                    in cluster.iterpaths()]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms