def prune_transcript_graph(G, strand, transcript_map,
                           min_trim_length=0,
                           trim_utr_fraction=0.0,
                           trim_intron_fraction=0.0):
    '''
    trims and collapses a strand-specific transcript graph, then yields
    one result per weakly connected component of the collapsed graph

    G: graph to prune; the nodes selected by trim_graph() are removed
    from it in place
    strand: strand of the graph; when equal to NEG_STRAND the node path
    of every transcript is reversed before it is scored
    transcript_map: object indexed by the keys of the mapping returned by
    get_transcript_node_map(); the 'score' attribute of each looked-up
    transcript is read
    min_trim_length: passed through unchanged to trim_graph()
    trim_utr_fraction: float specifying the fraction of the average UTR
    coverage below which the ends of the UTR will be trimmed
    trim_intron_fraction: float specifying the fraction of the average
    intron coverage below which intronic nodes will be removed

    yields (Gsub, strand, partial_paths) tuples where 'Gsub' is one
    connected component and 'partial_paths' is a list of
    (tuple of nodes, summed transcript score) pairs
    '''
    # trim utrs and intron retentions
    trim_nodes = trim_graph(G, strand, min_trim_length,
                            trim_utr_fraction,
                            trim_intron_fraction)
    G.remove_nodes_from(trim_nodes)
    # collapse consecutive nodes in graph
    # NOTE(review): create_transcript_graphs() calls this helper as
    # (G, introns=True) and unpacks two return values -- confirm which
    # signature is the current one
    H = collapse_strand_specific_graph(G, transcript_map, introns=True)
    # get connected components of graph which represent independent genes
    # unconnected components are considered different genes
    for Gsub in nx.weakly_connected_component_subgraphs(H):
        # get partial path data supporting graph
        transcript_node_map = get_transcript_node_map(Gsub)
        # transcripts that share the same node path pool their scores
        path_score_dict = collections.defaultdict(int)
        for t_id, nodes in transcript_node_map.items():
            # reverse path for negative strand transcripts (this reverses
            # the list held by transcript_node_map in place)
            if strand == NEG_STRAND:
                nodes.reverse()
            # get transcript scores
            t = transcript_map[t_id]
            path_score_dict[tuple(nodes)] += t.score
        # materialize as a list so the result has the same type on
        # python 2 and python 3
        yield Gsub, strand, list(path_score_dict.items())
def create_transcript_graphs(chrom, transcripts,
                             min_trim_length=0,
                             trim_utr_fraction=0.0,
                             trim_intron_fraction=0.0,
                             create_bedgraph=False,
                             bedgraph_filehs=None):
    '''
    builds trimmed, collapsed transcript graphs for every strand and
    returns them as a list of TranscriptGraph objects

    chrom: chromosome name; written to the bedgraph output and passed to
    the TranscriptGraph constructor
    transcripts: transcripts handed to partition_transcripts_by_strand()
    min_trim_length: passed through unchanged to trim_graph()
    trim_utr_fraction: passed through unchanged to trim_graph()
    trim_intron_fraction: passed through unchanged to trim_graph()
    create_bedgraph: when true, one tab-delimited line of
    (chrom, start, end, score) is written per node of each strand graph
    before trimming; nodes whose start is negative are skipped
    bedgraph_filehs: writable file objects indexed by strand; only
    accessed when 'create_bedgraph' is true and there is a line to write

    returns a list of TranscriptGraph objects, one per weakly connected
    component of each strand's collapsed graph; each object carries a
    'partial_paths' attribute holding a list of
    (tuple of collapsed nodes, summed transcript score) pairs
    '''
    def get_bedgraph_lines(chrom, G):
        # yields one (chrom, start, end, score) tuple per node in sorted
        # node order, skipping nodes with a negative start
        for n in sorted(G.nodes()):
            if n.start < 0:
                continue
            yield (chrom, n.start, n.end, G.node[n][NODE_SCORE])

    # partition transcripts by strand and resolve unstranded transcripts
    logging.debug("\tResolving unstranded transcripts")
    # the second return value is not used by this function
    strand_transcript_lists, _ref_transcripts = \
        partition_transcripts_by_strand(transcripts)
    # create strand-specific graphs using redistributed score
    logging.debug("\tCreating transcript graphs")
    transcript_graphs = []
    for strand, transcript_list in enumerate(strand_transcript_lists):
        # create strand specific transcript graph
        G = create_directed_graph(strand, transcript_list)
        # output bedgraph
        if create_bedgraph:
            for fields in get_bedgraph_lines(chrom, G):
                # write() instead of the 'print >>' statement so the
                # line is emitted the same way on python 2 and python 3
                bedgraph_filehs[strand].write(
                    '\t'.join(map(str, fields)) + '\n')
        # trim utrs and intron retentions
        trim_nodes = trim_graph(G, strand, min_trim_length,
                                trim_utr_fraction,
                                trim_intron_fraction)
        G.remove_nodes_from(trim_nodes)
        # collapse consecutive nodes in graph
        H, node_chain_map = collapse_strand_specific_graph(G, introns=True)
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        Gsubs = nx.weakly_connected_component_subgraphs(H)
        # add components as separate transcript graphs
        strand_graphs = []
        node_subgraph_map = {}
        for i, Gsub in enumerate(Gsubs):
            # remember which component each collapsed node belongs to
            for n in Gsub:
                node_subgraph_map[n] = i
            tg = TranscriptGraph(chrom, strand, Gsub)
            tg.partial_paths = collections.defaultdict(float)
            strand_graphs.append(tg)
        # populate transcript graphs with partial paths
        for t in transcript_list:
            # get original transcript nodes and subtract trimmed nodes
            # convert to collapsed nodes and bin according to subgraph
            # TODO: intronic transcripts may be split into multiple pieces,
            # should we allow this?
            subgraph_node_map = collections.defaultdict(set)
            for n in split_exons(t, G.graph['boundaries']):
                n = Exon(*n)
                if n in trim_nodes:
                    continue
                cn = node_chain_map[n]
                subgraph_id = node_subgraph_map[cn]
                subgraph_node_map[subgraph_id].add(cn)
            # add transcript node/score pairs to subgraphs
            for subgraph_id, subgraph_nodes in subgraph_node_map.items():
                # order nodes by start, descending on the negative strand
                subgraph_nodes = sorted(subgraph_nodes,
                                        key=operator.attrgetter('start'),
                                        reverse=(strand == NEG_STRAND))
                tg = strand_graphs[subgraph_id]
                tg.partial_paths[tuple(subgraph_nodes)] += t.score
        transcript_graphs.extend(strand_graphs)
    # convert the score dictionaries to lists of (path, score) pairs
    for tg in transcript_graphs:
        tg.partial_paths = list(tg.partial_paths.items())
    return transcript_graphs
def create_transcript_graphs(chrom, transcripts,
                             min_trim_length=0,
                             trim_utr_fraction=0.0,
                             trim_intron_fraction=0.0,
                             create_bedgraph=False,
                             bedgraph_filehs=None):
    '''
    builds trimmed, collapsed transcript graphs for every strand and
    returns them as a list of TranscriptGraph objects

    chrom: chromosome name; written to the bedgraph output and passed to
    the TranscriptGraph constructor
    transcripts: transcripts handed to partition_transcripts_by_strand()
    min_trim_length: passed through unchanged to trim_graph()
    trim_utr_fraction: passed through unchanged to trim_graph()
    trim_intron_fraction: passed through unchanged to trim_graph()
    create_bedgraph: when true, one tab-delimited line of
    (chrom, start, end, score) is written per node of each strand graph
    before trimming; nodes whose start is negative are skipped
    bedgraph_filehs: writable file objects indexed by strand; only
    accessed when 'create_bedgraph' is true and there is a line to write

    returns a list of TranscriptGraph objects, one per weakly connected
    component of each strand's collapsed graph; each object carries a
    'partial_paths' attribute holding a list of
    (tuple of collapsed nodes, summed transcript score) pairs
    '''
    def get_bedgraph_lines(chrom, G):
        # yields one (chrom, start, end, score) tuple per node in sorted
        # node order, skipping nodes with a negative start
        for n in sorted(G.nodes()):
            if n.start < 0:
                continue
            yield (chrom, n.start, n.end, G.node[n][NODE_SCORE])

    # partition transcripts by strand and resolve unstranded transcripts
    logging.debug("\tResolving unstranded transcripts")
    # the second return value is not used by this function
    strand_transcript_lists, _ref_transcripts = \
        partition_transcripts_by_strand(transcripts)
    # create strand-specific graphs using redistributed score
    logging.debug("\tCreating transcript graphs")
    transcript_graphs = []
    for strand, transcript_list in enumerate(strand_transcript_lists):
        # create strand specific transcript graph
        G = create_directed_graph(strand, transcript_list)
        # output bedgraph
        if create_bedgraph:
            for fields in get_bedgraph_lines(chrom, G):
                # write() instead of the 'print >>' statement so the
                # line is emitted the same way on python 2 and python 3
                bedgraph_filehs[strand].write(
                    '\t'.join(map(str, fields)) + '\n')
        # trim utrs and intron retentions
        trim_nodes = trim_graph(G, strand, min_trim_length,
                                trim_utr_fraction,
                                trim_intron_fraction)
        G.remove_nodes_from(trim_nodes)
        # collapse consecutive nodes in graph
        H, node_chain_map = collapse_strand_specific_graph(G, introns=True)
        # get connected components of graph which represent independent genes
        # unconnected components are considered different genes
        Gsubs = nx.weakly_connected_component_subgraphs(H)
        # add components as separate transcript graphs
        strand_graphs = []
        node_subgraph_map = {}
        for i, Gsub in enumerate(Gsubs):
            # remember which component each collapsed node belongs to
            for n in Gsub:
                node_subgraph_map[n] = i
            tg = TranscriptGraph(chrom, strand, Gsub)
            tg.partial_paths = collections.defaultdict(float)
            strand_graphs.append(tg)
        # populate transcript graphs with partial paths
        for t in transcript_list:
            # get original transcript nodes and subtract trimmed nodes
            # convert to collapsed nodes and bin according to subgraph
            # TODO: intronic transcripts may be split into multiple pieces,
            # should we allow this?
            subgraph_node_map = collections.defaultdict(set)
            for n in split_exons(t, G.graph['boundaries']):
                n = Exon(*n)
                if n in trim_nodes:
                    continue
                cn = node_chain_map[n]
                subgraph_id = node_subgraph_map[cn]
                subgraph_node_map[subgraph_id].add(cn)
            # add transcript node/score pairs to subgraphs
            for subgraph_id, subgraph_nodes in subgraph_node_map.items():
                # order nodes by start, descending on the negative strand
                subgraph_nodes = sorted(subgraph_nodes,
                                        key=operator.attrgetter('start'),
                                        reverse=(strand == NEG_STRAND))
                tg = strand_graphs[subgraph_id]
                tg.partial_paths[tuple(subgraph_nodes)] += t.score
        transcript_graphs.extend(strand_graphs)
    # convert the score dictionaries to lists of (path, score) pairs
    for tg in transcript_graphs:
        tg.partial_paths = list(tg.partial_paths.items())
    return transcript_graphs