def scaffold(args):
    """Run the scaffolding pipeline end to end and write the results to disk.

    Stages, in order:
      1. Load the scaffold graph from the FASTA/TSV/containment inputs.
      2. Optionally drop vertices whose sequence is shorter than
         ``args.min_ctg_len``.
      3. If the graph has edges, simplify it using paired-end support
         (contract, optionally cut tips, prune, contract again).
      4. Discard every remaining edge and rebuild edges from read clouds
         (well profiles), auto-saving that intermediate graph.
      5. Prune the read-cloud edges, contract, and save the final FASTA and
         ordering files.

    Args:
        args: object exposing the attributes read below -- ``fasta``,
            ``edges``, ``containment``, ``min_ctg_len``, ``cut_tip_len``,
            ``pe_abs_thr``, ``pe_rel_thr``, ``pe_rc_rel_thr``,
            ``rc_abs_thr``, ``rc_rel_edge_thr``, ``rc_rel_prun_thr`` and
            ``out`` (presumably an argparse namespace -- confirm at caller).

    Returns:
        None. Output is written to ``contracted.fasta`` (only when the loaded
        graph has edges), ``<out>.wellscaff.{fasta,tsv,containment}``,
        ``<out>.fasta`` and ``<out>.ordering``.
    """
    logging.info('Creating the scaffold graph')
    g = load_from_fasta_tsv(args.fasta, args.edges, args.containment)
    print_stats(g)

    # delete small vertices
    if args.min_ctg_len:
        logging.info('Removing vertices smaller than %d bp', args.min_ctg_len)
        n_removed = 0
        # Walk a snapshot: removing vertices while iterating the graph's own
        # collection could skip entries or raise.
        for v in list(g.vertices):
            if len(v.seq) < args.min_ctg_len:
                g.remove_vertex(v)
                n_removed += 1
        logging.info('Removed %d vertices', n_removed)
        print_stats(g)

    # prune scaffold edges
    if g.edges:
        logging.info('Simplifying the graph using paired-end reads')

        logging.info('Contracting unambigous paths')
        contract_edges(g, store_ordering=True)
        print_stats(g)
        # NOTE(review): fixed file name in the working directory; unlike the
        # other outputs it is not prefixed with args.out.
        save_fasta(g, 'contracted.fasta')

        if args.cut_tip_len:
            n_cut = cut_tips(g, d=args.cut_tip_len)
            logging.info('Cut %d tips shorter than %d bp',
                         n_cut, args.cut_tip_len)

        logging.info('Pruning edges with low support')
        n_pruned1 = prune_scaffold_edges(g,
                                         abs_support_thr=args.pe_abs_thr,
                                         rel_support_thr=args.pe_rel_thr)
        n_pruned2 = prune_scaffold_edges_via_wells(g, thr=args.pe_rc_rel_thr)
        logging.info('%d edges pruned', n_pruned1 + n_pruned2)

        logging.info('Contracting unambigous paths')
        contract_edges(g)
        print_stats(g)

    # delete all existing edges from the graph
    # (snapshot first, for the same reason as the vertex loop above)
    for e in list(g.edges):
        g.remove_edge(e)

    # create new edges whenever vertices have similar well profiles
    logging.info('Creating edges from read clouds')
    n_edges = make_wellscaff_edges(g,
                                   min_common=args.rc_abs_thr,
                                   min_thr=args.rc_rel_edge_thr)
    logging.info('%d scaffold edges from read clouds', n_edges)

    logging.info('Auto-saving graph with prefix %s.wellscaff', args.out)
    save_to_fasta_tsv(g,
                      '%s.wellscaff.fasta' % args.out,
                      '%s.wellscaff.tsv' % args.out,
                      '%s.wellscaff.containment' % args.out)

    logging.info('Pruning edges with low support')
    n_pruned = prune_via_wells(g,
                               min_common=args.rc_abs_thr,
                               min_thr=args.rc_rel_prun_thr)
    logging.info('%d edges pruned', n_pruned)

    logging.info('Contracting unambigous paths')
    contract_edges(g, store_ordering=True)
    print_stats(g)

    logging.info('Saving scaffolding results')
    save_fasta(g, '%s.fasta' % args.out)
    save_ordering(g, '%s.ordering' % args.out)
def _wells_msf(g):
    """Return the spanning forest of g's well-weighted networkx graph.

    Fetches ``g.nxgraph``, reweighs its edges with
    ``_reweigh_edges(..., type_='wells')`` and runs
    ``nx.minimum_spanning_tree`` on that same graph object, so the new
    weights are the ones the forest is computed from.
    """
    nxg = g.nxgraph
    # weigh edges according to how many wells they are sharing
    _reweigh_edges(nxg, g, type_='wells')
    # NOTE(review): the original comments call this a "maximum spanning
    # forest" although the call is minimum_spanning_tree; presumably
    # _reweigh_edges assigns weights for which the minimum is the one
    # wanted -- confirm.
    return nx.minimum_spanning_tree(nxg)


# NOTE(review): a second definition of scaffold_via_wells_mst appears later in
# this file; being later, it is the one bound to the name after import.
def scaffold_via_wells_mst(g):
    """Iteratively reduce g to unbranched paths using well-based spanning forests.

    Each round (at most 10, and only while ``_has_branches(msf)`` is true):
      1. remove every edge of ``g`` that is not in the current forest;
      2. contract edges;
      3. rebuild the forest and collect a trunk (``_mst_trunk``) for every
         tree with at least 4 nodes;
      4. remove every edge with an endpoint whose id is not on a trunk;
      5. contract again and rebuild the forest for the next round.

    Args:
        g: scaffold graph; modified in place. Must expose ``vertices``,
            ``edges``, ``nxgraph`` and ``remove_edge``.

    Returns:
        None. Progress is printed to stdout.
    """
    # initialize internal contig labels (used for downstream qc)
    for v in g.vertices:
        v.initialize_contigs()

    msf = _wells_msf(g)

    # keep simplifying the graph until the msf has no branching nodes
    n_iter = 1
    while _has_branches(msf) and n_iter <= 10:
        print('MSF simplificaiton iteration %d' % n_iter)

        # remove edges of g not selected in the forest; iterate a snapshot
        # because the loop removes edges from g
        n_removed = 0
        for e in list(g.edges):
            # forest nodes are (vertex id, connection) pairs
            e_nx = ((e.v1.id, e.connection[e.v1]),
                    (e.v2.id, e.connection[e.v2]))
            if not msf.has_edge(*e_nx):
                g.remove_edge(e)
                n_removed += 1
        print('%d edges not in MST removed.' % n_removed)

        # contract edges
        n_contracted = contract_edges(g, store_ordering=True)
        print('%d edges contracted.' % n_contracted)

        # g changed, so rebuild the weighted forest before taking trunks
        msf = _wells_msf(g)

        # collect the trunk of every tree that has at least 4 nodes
        trunk = []
        for mst in nx.connected_component_subgraphs(msf):
            if len(mst) >= 4:
                trunk.extend(_mst_trunk(mst, g))
        print(trunk)

        # remove edges that have an endpoint off the trunk; the first item
        # of each trunk entry is compared against vertex ids
        trunk_v = {node[0] for node in trunk}
        n_removed = 0
        for e in list(g.edges):
            if e.v1.id not in trunk_v or e.v2.id not in trunk_v:
                g.remove_edge(e)
                n_removed += 1
        print('%d edges not in trunk removed.' % n_removed)

        # contract one last time
        n_contracted = contract_edges(g, store_ordering=True)
        print('%d edges contracted.' % n_contracted)

        # rebuild the forest for the next loop test
        msf = _wells_msf(g)
        n_iter += 1
def _wells_msf(g):
    """Return the spanning forest of g's well-weighted networkx graph.

    Fetches ``g.nxgraph``, reweighs its edges with
    ``_reweigh_edges(..., type_="wells")`` and runs
    ``nx.minimum_spanning_tree`` on that same graph object, so the new
    weights are the ones the forest is computed from.
    """
    nxg = g.nxgraph
    # weigh edges according to how many wells they are sharing
    _reweigh_edges(nxg, g, type_="wells")
    # NOTE(review): the original comments call this a "maximum spanning
    # forest" although the call is minimum_spanning_tree; presumably
    # _reweigh_edges assigns weights for which the minimum is the one
    # wanted -- confirm.
    return nx.minimum_spanning_tree(nxg)


def scaffold_via_wells_mst(g):
    """Iteratively reduce g to unbranched paths using well-based spanning forests.

    Each round (at most 10, and only while ``_has_branches(msf)`` is true):
      1. remove every edge of ``g`` that is not in the current forest;
      2. contract edges;
      3. rebuild the forest and collect a trunk (``_mst_trunk``) for every
         tree with at least 4 nodes;
      4. remove every edge with an endpoint whose id is not on a trunk;
      5. contract again and rebuild the forest for the next round.

    Args:
        g: scaffold graph; modified in place. Must expose ``vertices``,
            ``edges``, ``nxgraph`` and ``remove_edge``.

    Returns:
        None. Progress is printed to stdout.
    """
    # initialize internal contig labels (used for downstream qc)
    for v in g.vertices:
        v.initialize_contigs()

    msf = _wells_msf(g)

    # keep simplifying the graph until the msf has no branching nodes
    n_iter = 1
    while _has_branches(msf) and n_iter <= 10:
        print("MSF simplificaiton iteration %d" % n_iter)

        # remove edges of g not selected in the forest; iterate a snapshot
        # because the loop removes edges from g
        n_removed = 0
        for e in list(g.edges):
            # forest nodes are (vertex id, connection) pairs
            e_nx = (
                (e.v1.id, e.connection[e.v1]),
                (e.v2.id, e.connection[e.v2]),
            )
            if not msf.has_edge(*e_nx):
                g.remove_edge(e)
                n_removed += 1
        print("%d edges not in MST removed." % n_removed)

        # contract edges
        n_contracted = contract_edges(g, store_ordering=True)
        print("%d edges contracted." % n_contracted)

        # g changed, so rebuild the weighted forest before taking trunks
        msf = _wells_msf(g)

        # collect the trunk of every tree that has at least 4 nodes
        trunk = []
        for mst in nx.connected_component_subgraphs(msf):
            if len(mst) >= 4:
                trunk.extend(_mst_trunk(mst, g))
        print(trunk)

        # remove edges that have an endpoint off the trunk; the first item
        # of each trunk entry is compared against vertex ids
        trunk_v = {node[0] for node in trunk}
        n_removed = 0
        for e in list(g.edges):
            if e.v1.id not in trunk_v or e.v2.id not in trunk_v:
                g.remove_edge(e)
                n_removed += 1
        print("%d edges not in trunk removed." % n_removed)

        # contract one last time
        n_contracted = contract_edges(g, store_ordering=True)
        print("%d edges contracted." % n_contracted)

        # rebuild the forest for the next loop test
        msf = _wells_msf(g)
        n_iter += 1