def _trim_change_point(self, cp):
    """Trim transfrag ends that reach into a change point interval.

    When ``cp.sign`` is negative, ``self.tree_ends`` is queried with
    ``(cp.pos, cp.end)`` and the last exon of every hit is replaced by
    an exon that keeps its start and ends at ``cp.pos``.  Otherwise
    ``self.tree_starts`` is queried with ``(cp.start, cp.pos)`` and the
    first exon of every hit is replaced by an exon that starts at
    ``cp.pos`` and keeps its end.  A hit is skipped when the boundary
    that would be kept lies inside the queried interval.

    Returns the number of transfrags that were trimmed.  (The count was
    previously computed but discarded.)
    """
    num_trimmed = 0
    if cp.sign < 0:
        for hit in self.tree_ends.find(cp.pos, cp.end):
            t = hit.value
            last_exon_start = t.exons[-1][0]
            # skip transfrags whose last exon starts inside the interval
            if cp.pos <= last_exon_start <= cp.end:
                continue
            # trim end left
            t.exons[-1] = Exon(last_exon_start, cp.pos)
            num_trimmed += 1
    else:
        for hit in self.tree_starts.find(cp.start, cp.pos):
            t = hit.value
            first_exon_end = t.exons[0][1]
            # skip transfrags whose first exon ends inside the interval
            if cp.start <= first_exon_end <= cp.pos:
                continue
            # trim start right
            t.exons[0] = Exon(cp.pos, first_exon_end)
            num_trimmed += 1
    return num_trimmed
Beispiel #2
0
def get_transcripts(stream):
    """Collect transcripts as collections of CDS/exons.

    Each item of *stream* may be ``str`` or ``bytes`` (bytes are decoded
    with ``bytes.decode()`` defaults).  Lines starting with ``#`` and
    blank lines are skipped.  Every other line is handed to ``parse``;
    for each ``exon`` feature an ``Exon`` is appended to the ``mRNA``
    record keyed by the feature's ``transcript_id`` attribute.  Records
    are created on first sight with ``start`` and ``end`` set to None.

    Returns a dict mapping transcript id -> mRNA record.

    Raises InputError for an item that is neither str nor bytes, and
    ValueError when an exon has no matching ``transcript_id`` attribute.
    """
    # raw string keeps the regex escape intact; compiled once per call
    transcript_id_re = re.compile(r'transcript_id "([a-zA-Z0-9\.]+)";')
    transcripts = {}

    for raw in stream:
        if isinstance(raw, str):
            line = raw
        elif isinstance(raw, bytes):
            line = raw.decode()
        else:
            raise InputError('Unsupported stream data')

        if line.startswith('#') or not line.strip():
            continue

        f = parse(line)
        if f.feature != 'exon':
            continue

        m = transcript_id_re.search(f.attribute)
        if not m:
            # NOTE: the message says "Gene id" although the pattern looks
            # for transcript_id; text kept unchanged for existing callers.
            raise ValueError('Gene id could not be found')
        index = m.group(1)
        if index not in transcripts:
            transcripts[index] = mRNA(seqid=f.seqid, start=None,
                                      end=None, strand=f.strand,
                                      exons=[], attributes='')

        transcripts[index].exons.append(Exon(seqid=f.seqid,
                                             start=f.start, end=f.end,
                                             strand=f.strand))

    return transcripts
 def _create_splice_graph(self):
     '''
     Build a directed graph from the transfrags yielded by
     self.itertransfrags().

     Each transfrag is cut at self.node_bounds via split_transfrag; every
     distinct resulting Exon gets an integer id (ids are handed out
     starting at 2) and consecutive pieces of a transfrag are linked by
     an edge.  Pieces of negative-strand transfrags are linked in
     reverse order.  The two lookup tables are stored on the graph under
     the 'node_id_map' and 'id_node_map' keys.

     returns networkx DiGraph object
     '''
     graph = nx.DiGraph()
     bounds = self.node_bounds
     node_id_map = {}
     id_node_map = {}
     next_id = 2
     for t in self.itertransfrags():
         # translate the pieces of this transfrag into integer node ids,
         # registering any piece not seen before
         path_ids = []
         for interval in split_transfrag(t, bounds):
             node = Exon(*interval)
             if node in node_id_map:
                 node_id = node_id_map[node]
             else:
                 node_id = next_id
                 next_id += 1
                 node_id_map[node] = node_id
                 id_node_map[node_id] = node
             path_ids.append(node_id)
         if t.strand == Strand.NEG:
             path_ids.reverse()
         # register the first node, then every following node together
         # with the edge leading to it
         head = path_ids[0]
         SGNode.add(graph, head, id_node_map[head], is_ref=t.is_ref)
         for u, v in zip(path_ids, path_ids[1:]):
             SGNode.add(graph, v, id_node_map[v], is_ref=t.is_ref)
             graph.add_edge(u, v)
     graph.graph['node_id_map'] = node_id_map
     graph.graph['id_node_map'] = id_node_map
     return graph
Beispiel #4
0
    def parse_gtf(gtf_lines, ignore_ref=True):
        '''
        Parse GTF lines into Transfrag objects.

        Every line is parsed with GTF.Feature.from_str and must carry a
        transcript id attribute.  Features flagged as reference (REF
        attribute, default '0') are skipped when ignore_ref is true.
        'transcript' features create a Transfrag; 'exon' features append
        an Exon to the Transfrag of their transcript, which must already
        have been seen.  Other feature types are ignored.

        returns OrderedDict key is transcript_id value is Transfrag

        raises GTFError on a duplicate transcript or on an exon that
        precedes its transcript
        '''
        transfrags = collections.OrderedDict()
        for line in gtf_lines:
            feature = GTF.Feature.from_str(line)
            t_id = feature.attrs[GTF.Attr.TRANSCRIPT_ID]
            is_ref = bool(int(feature.attrs.get(GTF.Attr.REF, '0')))
            if ignore_ref and is_ref:
                continue

            kind = feature.feature
            if kind == 'transcript':
                if t_id in transfrags:
                    raise GTFError("Transcript '%s' duplicate detected" % t_id)
                transfrags[t_id] = Transfrag.from_gtf(feature)
            elif kind == 'exon':
                if t_id not in transfrags:
                    logging.error('Feature: "%s"' % str(feature))
                    raise GTFError("Transcript '%s' exon feature appeared in "
                                   "gtf file prior to transcript feature" %
                                   t_id)
                transfrags[t_id].exons.append(Exon(feature.start, feature.end))
        return transfrags
Beispiel #5
0
 def _create_splice_graph(self):
     '''
     Build an SGraph from the transfrags yielded by self.itertransfrags().

     Each transfrag is cut at self.node_bounds via split_transfrag and
     the resulting Exon nodes are added to the graph as one path (in
     reverse order when self.strand is negative), together with the
     transfrag's is_ref flag.  Start/stop node properties are set once
     all paths have been added.
     '''
     graph = SGraph()
     for t in self.itertransfrags():
         path = []
         for interval in split_transfrag(t, self.node_bounds):
             path.append(Exon(*interval))
         if self.strand == Strand.NEG:
             path = path[::-1]
         graph.add_path(path, t.is_ref)
     # start and stop nodes can only be determined on the finished graph
     graph.set_start_stop_nodes()
     return graph
Beispiel #6
0
 def reconstruct_exons(self, path):
     '''
     Convert a path of node ids into a list of merged Exon intervals.

     path: iterable of node ids accepted by self.get_node_interval.  It
     is copied first, so the caller's sequence is NOT modified (it used
     to be reversed in place for negative-strand data).

     Node ids are put in small -> large genomic order (reversed when
     self.strand is negative), mapped to intervals, and runs of
     intervals where one ends exactly where the next starts are
     collapsed into a single Exon.

     returns list of Exon; an empty path yields an empty list
     '''
     nodes = list(path)
     if not nodes:
         return []
     # reverse negative stranded data so that all paths go from
     # small -> large genomic coords
     if self.strand == Strand.NEG:
         nodes.reverse()
     # convert from integer node labels to genome (start, end) intervals
     intervals = [self.get_node_interval(nid) for nid in nodes]
     # collapse contiguous nodes along path
     exons = []
     run_start = intervals[0].start
     run_end = intervals[0].end
     for interval in intervals[1:]:
         if run_end != interval.start:
             # gap found: emit the finished run and begin a new one
             exons.append(Exon(run_start, run_end))
             run_start = interval.start
         run_end = interval.end
     # emit the final run
     exons.append(Exon(run_start, run_end))
     return exons
Beispiel #7
0
def reconstruct_path(kmer_path, id_kmer_map, sgraph):
    """
    Convert a path of kmer ids into a list of merged Exon intervals.

    The node ids of the kmer at kmer_path[1] are taken in full, then the
    last node id of every kmer in kmer_path[2:-1] is appended; the first
    and last entries of kmer_path are not used (presumably sentinel
    entries -- confirm against the caller).  The node ids are put in
    small -> large genomic order (reversed when sgraph.strand is
    negative), mapped through sgraph.get_node_interval, and runs of
    intervals where one ends exactly where the next starts are collapsed
    into a single Exon.

    returns list of Exon
    """
    # expand kmer ids back into the underlying node ids
    node_ids = list(id_kmer_map[kmer_path[1]])
    for kmer_id in kmer_path[2:-1]:
        node_ids.append(id_kmer_map[kmer_id][-1])
    # negative strand paths are stored large -> small; flip them
    if sgraph.strand == Strand.NEG:
        node_ids.reverse()
    intervals = [sgraph.get_node_interval(nid) for nid in node_ids]
    # merge runs of touching intervals
    merged = []
    run_start = intervals[0].start
    run_end = intervals[0].end
    for interval in intervals[1:]:
        if run_end != interval.start:
            # gap found: emit the finished run and begin a new one
            merged.append(Exon(run_start, run_end))
            run_start = interval.start
        run_end = interval.end
    # emit the final run
    merged.append(Exon(run_start, run_end))
    return merged
Beispiel #8
0
def parse_gtf(gtf_iter, sample_id, gtf_expr_attr, is_ref):
    '''
    Parse GTF lines of one sample into Transfrag objects.

    Empty, whitespace-only and '#' lines are skipped.  Each 'transcript'
    feature creates a Transfrag whose id is "<sample_id>.<n>" (n counts
    transcripts from 1); its expression is 0.0 when is_ref is true,
    otherwise the float value of the gtf_expr_attr attribute.  Each
    'exon' feature appends an Exon to the Transfrag of its (original)
    transcript id, which must already have been seen.  Transfrags are
    created with exons=None, so Transfrag is assumed to supply an
    appendable exons container -- TODO confirm.

    returns a tuple (transfrags, total_expr): the values of the ordered
    transcript dict and the sum of all parsed expression values

    raises GTFError on a duplicate transcript, a missing expression
    attribute, or an exon that precedes its transcript
    '''
    transfrags = collections.OrderedDict()
    total_expr = 0.0
    next_num = 1
    for line in gtf_iter:
        if (not line) or (not line.strip()) or line.startswith("#"):
            continue
        feature = GTF.Feature.from_str(line)
        kind = feature.feature
        if kind == 'transcript':
            t_id = feature.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id in transfrags:
                raise GTFError("Transcript '%s' duplicate detected" % t_id)
            # transcripts are renamed after the sample they came from
            new_t_id = "%s.%d" % (sample_id, next_num)
            next_num += 1
            # reference transcripts carry no expression
            if is_ref:
                expr = 0.0
            elif gtf_expr_attr in feature.attrs:
                expr = float(feature.attrs[gtf_expr_attr])
                total_expr += expr
            else:
                raise GTFError("GTF expression attribute '%s' not found" %
                               (gtf_expr_attr))
            transfrags[t_id] = Transfrag(chrom=feature.seqid,
                                         strand=Strand.from_gtf(feature.strand),
                                         _id=new_t_id,
                                         expr=expr,
                                         is_ref=is_ref,
                                         exons=None)
        elif kind == 'exon':
            t_id = feature.attrs[GTF.Attr.TRANSCRIPT_ID]
            if t_id not in transfrags:
                logging.error('Feature: "%s"' % str(feature))
                raise GTFError("Transcript '%s' exon feature appeared in "
                               "gtf file prior to transcript feature" % t_id)
            transfrags[t_id].exons.append(Exon(feature.start, feature.end))
    return transfrags.values(), total_expr
Beispiel #9
0
def get_chains(G, introns=True):
    """
    Group the nodes of G into chains.

    Every node starts out as a chain of its own.  For each edge (u, v)
    accepted by the collapse predicate (can_collapse when introns is
    true, can_collapse_contiguous otherwise) the chains holding u and v
    are merged under a new parent Exon spanning from the smaller start
    to the larger end of the two previous parents.

    returns (node_chain_map, chains): a dict mapping each node to the
    parent of its chain, and a dict mapping each parent to the list of
    its member nodes sorted by start
    """
    collapsible = can_collapse if introns else can_collapse_contiguous
    node_chain_map = {}
    chains = {}
    # every node begins as the sole member of its own chain
    for node in G.nodes_iter():
        node_chain_map[node] = node
        chains[node] = {node}
    for u, v in G.edges_iter():
        if not collapsible(G, u, v):
            continue
        # detach the two chains that are about to be merged
        u_parent = node_chain_map[u]
        u_members = chains.pop(u_parent)
        v_parent = node_chain_map[v]
        v_members = chains.pop(v_parent)
        members = u_members | v_members
        # the new parent spans both previous parents
        if u_parent.start <= v_parent.start:
            span_start = u_parent.start
        else:
            span_start = v_parent.start
        if u_parent.end >= v_parent.end:
            span_end = u_parent.end
        else:
            span_end = v_parent.end
        parent = Exon(span_start, span_end)
        # re-point every member at the merged parent
        for member in members:
            node_chain_map[member] = parent
        chains[parent] = members
    # replace each member set by a list ordered by genome position
    by_start = operator.attrgetter('start')
    for parent in chains:
        chains[parent] = sorted(chains[parent], key=by_start)
    return node_chain_map, chains
Beispiel #10
0
 def from_bed(line):
     '''
     Build a Transfrag from one tab-separated BED line.

     Columns used (0-based): 0 chrom, 1 transcript start, 3 id,
     4 expression, 5 strand, 9 number of exons, 10 comma-separated block
     sizes, 11 comma-separated block starts relative to the transcript
     start.  The transfrag counts as reference when the part of its id
     before the first '.' equals Sample.REF_ID.
     '''
     cols = line.strip().split('\t')
     chrom = cols[0]
     tx_start = int(cols[1])
     _id = cols[3]
     is_ref = (_id.split('.')[0] == Sample.REF_ID)
     expr = float(cols[4])
     strand = Strand.from_bed(cols[5])
     num_exons = int(cols[9])
     sizes = cols[10].split(',')
     offsets = cols[11].split(',')
     exons = []
     for idx in range(num_exons):
         # block starts are offsets from the transcript start
         exon_start = tx_start + int(offsets[idx])
         exon_end = exon_start + int(sizes[idx])
         exons.append(Exon(exon_start, exon_end))
     return Transfrag(chrom=chrom, strand=strand, _id=_id, expr=expr,
                      is_ref=is_ref, exons=exons)
Beispiel #11
0
 def parse_gtf(gtf_lines):
     '''
     Parse GTF lines into Transfrag objects.

     'transcript' features create a Transfrag via Transfrag.from_gtf;
     'exon' features append an Exon to the Transfrag of their
     transcript, which must already have been seen.  Other feature
     types are ignored.

     returns OrderedDict mapping transcript_id to Transfrag

     raises GTFError on a duplicate transcript or on an exon that
     precedes its transcript
     '''
     transfrags = collections.OrderedDict()
     for line in gtf_lines:
         feature = GTF.Feature.from_str(line)
         kind = feature.feature
         if kind not in ('transcript', 'exon'):
             continue
         t_id = feature.attrs[GTF.Attr.TRANSCRIPT_ID]
         if kind == 'transcript':
             if t_id in transfrags:
                 raise GTFError("Transcript '%s' duplicate detected" % t_id)
             transfrags[t_id] = Transfrag.from_gtf(feature)
         else:
             if t_id not in transfrags:
                 logging.error('Feature: "%s"' % str(feature))
                 raise GTFError("Transcript '%s' exon feature appeared in "
                                "gtf file prior to transcript feature" %
                                t_id)
             transfrags[t_id].exons.append(Exon(feature.start, feature.end))
     return transfrags
Beispiel #12
0
def get_mrnas(stream, source=None):
    """Collect mRNA extents with exons.

    Each item of *stream* may be ``str`` or ``bytes`` (bytes are decoded
    with ``bytes.decode()`` defaults).  Lines starting with ``#`` and
    blank lines are skipped; every other line is handed to ``parse``.
    When *source* is given, features whose ``source`` differs are
    ignored.

    Handled feature types:

    * ``mRNA`` -- creates the record for its ``ID``.  If a placeholder
      record (``start`` is None) already exists under that id, its exons
      are carried over; a complete record under the same id is an error.
    * ``exon`` -- appended to the record named by ``Parent``; exons
      without a parent attribute are reported on stdout and skipped,
      exons whose parent is unknown are skipped silently.
    * ``five_prime_UTR`` -- appended as an exon to the record named by
      its ``ID``, creating a placeholder record if necessary.
    * ``intron`` -- stored as a placeholder record holding two
      artificial flanking exons.

    Returns a dict mapping id -> mRNA record.

    Raises InputError for an item that is neither str nor bytes,
    ValueError when a required id attribute is missing, and RuntimeError
    on duplicate mRNA / intron ids.
    """
    # raw strings keep the regex escapes intact; compiled once per call
    mrna_id_re = re.compile(r'ID=(\w[\w:]*)')
    parent_re = re.compile(r'Parent=(\w[\w:]*)')
    anon_id_re = re.compile(r'ID=(id\d\d*);')
    mrnas = {}

    for raw in stream:
        if isinstance(raw, str):
            line = raw
        elif isinstance(raw, bytes):
            line = raw.decode()
        else:
            raise InputError('Unsupported stream data')

        # blank lines carry no feature; skip them like comment lines
        if line.startswith('#') or not line.strip():
            continue

        f = parse(line)

        if source is not None and f.source != source:
            continue

        if f.feature == 'mRNA':
            m = mrna_id_re.search(f.attribute)
            if not m:
                raise ValueError('mRNA id could not be found')
            index = m.group(1)
            exons = []
            if index in mrnas:
                if mrnas[index].start is not None:
                    raise RuntimeError(
                        'mRNA with the same id is already present')
                # placeholder record seen earlier: keep the exons it
                # already collected (was a NameError: `mrna` undefined)
                exons = mrnas[index].exons

            # key=value pairs of the attribute column; malformed pairs
            # (no '=' or more than one) are ignored
            attributes = {}
            for pair in f.attribute.rstrip().split(';'):
                key_value = pair.split('=')
                if len(key_value) == 2:
                    attributes[key_value[0]] = key_value[1]

            mrnas[index] = mRNA(seqid=f.seqid,
                                start=f.start,
                                end=f.end,
                                strand=f.strand,
                                exons=exons,
                                attributes=attributes)

        elif f.feature == 'exon':
            m = parent_re.search(f.attribute)
            if not m:
                # there seem to be exons not assigned to mRNAs
                print('WARNING: Exon without parent: {0}'.format(line))
                continue

            index = m.group(1)
            if index not in mrnas:
                # parent was never registered; drop the exon
                continue

            mrnas[index].exons.append(
                Exon(seqid=f.seqid, start=f.start, end=f.end, strand=f.strand))

        elif f.feature == 'five_prime_UTR':
            # these things appear in RNA-seq, but are not mRNA
            m = anon_id_re.search(f.attribute)
            if not m:
                raise ValueError("5'UTR id could not be found")

            index = m.group(1)
            if index not in mrnas:
                mrnas[index] = mRNA(seqid=f.seqid,
                                    start=None,
                                    end=None,
                                    strand=None,
                                    exons=[],
                                    attributes={})

            mrnas[index].exons.append(
                Exon(seqid=f.seqid, start=f.start, end=f.end, strand=f.strand))

        elif f.feature == 'intron':
            # this is a hack
            # there are introns with no exons in the gff file, we create
            # fake exons for easier processing
            m = anon_id_re.search(f.attribute)
            if not m:
                raise ValueError('Intron id could not be found')

            index = m.group(1)
            if index in mrnas:
                raise RuntimeError('mRNA element already present')

            exon1 = Exon(seqid=f.seqid,
                         start=f.start - 2,
                         end=f.start - 1,
                         strand=f.strand)
            exon2 = Exon(seqid=f.seqid,
                         start=f.end + 1,
                         end=f.end + 2,
                         strand=f.strand)
            mrnas[index] = mRNA(seqid=f.seqid,
                                start=None,
                                end=None,
                                strand=None,
                                exons=[exon1, exon2],
                                attributes={})

    return mrnas