def matches(args):
    """Print exact matches between a reference and a set of contigs.

    The reference and the contigs are added to one index (inputs ending in
    ".gfa" go through ``read_gfa``, everything else is read as FASTA). After
    a "#<name>\\t<length>" line per FASTA contig, either maximal unique
    matches (``args.uniq``) or maximal exact matches are written to stdout
    as tab-separated rows: refname, refstart, ctgname, ctgstart, length, n,
    [unique,] orient.

    With ``args.rc`` a second index is built in which FASTA contigs are
    reverse complemented; those matches are printed with orient 1 and the
    contig start converted back to forward-strand coordinates.

    Attributes read from ``args``: sa64, sa1, lcp1, sa2, lcp2, cache,
    reference, contigs, uniq, minlength, rc.
    """
    lib = reveallib64 if args.sa64 else reveallib
    # sa1/lcp1: enable preconstruction of first SA and LCP array
    idx = lib.index(sa=args.sa1, lcp=args.lcp1, cache=args.cache)

    G = nx.DiGraph()
    G.graph['paths'] = []
    t = IntervalTree()

    reffile = os.path.basename(args.reference)
    ctgfile = os.path.basename(args.contigs)

    ref2length = {}
    contig2length = {}
    inputs = (
        (args.reference, reffile, ref2length),
        (args.contigs, ctgfile, contig2length),
    )
    for path, sample, lengths in inputs:
        idx.addsample(sample)
        if path.endswith(".gfa"):
            read_gfa(path, idx, t, G)
            continue
        G.graph['paths'].append(sample)
        for name, seq in fasta_reader(path):
            lengths[name] = len(seq)
            span = idx.addsequence(seq)
            node = Interval(span[0], span[1], name)
            t.add(node)
            G.add_node(node, offsets={sample: 0})

    # map nodes to connected components in the graph; the asserts check that
    # a component never mixes reference and contig nodes
    refnode2component = {}
    ctgnode2component = {}
    component2refnode = {}
    component2ctgnode = {}
    refcomponents = []
    ctgcomponents = []
    ctg2ref = {}
    ri = 0
    ci = 0
    for component in nx.connected_components(G.to_undirected()):
        members = list(component)
        if reffile in G.node[members[0]]['offsets']:
            for member in members:
                assert (reffile in G.node[member]['offsets'])
                refnode2component[member] = ri
                component2refnode[ri] = member
            ri += 1
            refcomponents.append(members)
        else:
            for member in members:
                assert (ctgfile in G.node[member]['offsets'])
                ctgnode2component[member] = ci
                component2ctgnode[ci] = member
            ci += 1
            ctgcomponents.append(members)

    # for each contig, print the length
    for name, length in contig2length.items():
        print("#%s\t%d" % (name, length))

    idx.construct()

    if args.uniq:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\torient")
        for mem in idx.getmums(args.minlength):
            refstart, ctgstart = mem[2][0], mem[2][1]
            # interval (graph node) that contains the start of the match
            rnode = t[refstart].pop()
            cnode = t[ctgstart].pop()
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                rnode[2], refstart - rnode[0],
                cnode[2], ctgstart - cnode[0],
                mem[0], mem[1], 0))
    else:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\tunique\torient")
        for mem in idx.getmems(args.minlength):
            refstart, ctgstart = mem[2][0], mem[2][1]
            rnode = t[refstart].pop()
            cnode = t[ctgstart].pop()
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                rnode[2], refstart - rnode[0],
                cnode[2], ctgstart - cnode[0],
                mem[0], mem[1], mem[3], 0))

    if not args.rc:
        return

    logging.debug("Indexing reverse complement...\n")

    # index reverse complement
    # sa2/lcp2: enable preconstruction of second SA and LCP array
    idx = lib.index(sa=args.sa2, lcp=args.lcp2)

    rcG = nx.DiGraph()
    t = IntervalTree()

    idx.addsample(reffile)
    if args.reference.endswith(".gfa"):
        read_gfa(args.reference, idx, t, rcG)
    else:
        rcG.graph['paths'] = set([reffile])
        for name, seq in fasta_reader(args.reference):
            span = idx.addsequence(seq)
            node = Interval(span[0], span[1], name)
            t.add(node)
            rcG.add_node(node, offsets={reffile: 0}, aligned=0)
            refseq = seq

    idx.addsample(ctgfile)
    if args.contigs.endswith(".gfa"):
        read_gfa(args.contigs, idx, t, rcG, revcomp=True)
    else:
        rcG.graph['paths'] = set([ctgfile])
        for name, seq in fasta_reader(args.contigs):
            span = idx.addsequence(rc(seq))
            node = Interval(span[0], span[1], name)
            t.add(node)
            rcG.add_node(node, offsets={ctgfile: 0}, aligned=0)

    idx.construct()

    if args.uniq:
        for mem in idx.getmums(args.minlength):
            refstart, ctgstart = mem[2][0], mem[2][1]
            rnode = t[refstart].pop()
            cnode = t[ctgstart].pop()
            ctglen = cnode[1] - cnode[0]
            # translate the start on the reverse-complemented contig back
            # to the forward strand
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                rnode[2], refstart - rnode[0],
                cnode[2], ctglen - ((ctgstart - cnode[0]) + mem[0]),
                mem[0], mem[1], 1))
    else:
        for mem in idx.getmems(args.minlength):
            refstart, ctgstart = mem[2][0], mem[2][1]
            rnode = t[refstart].pop()
            cnode = t[ctgstart].pop()
            ctglen = cnode[1] - cnode[0]
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                rnode[2], refstart - rnode[0],
                cnode[2], ctglen - ((ctgstart - cnode[0]) + mem[0]),
                mem[0], mem[1], mem[3], 1))
def _load_fasta_nodes(path, sample, idx, tree, graph, revcomp=False,
                      lengths=None, **attrs):
    """Add every FASTA record in *path* to the index, interval tree and graph.

    Records are reverse complemented before indexing when *revcomp* is set;
    *lengths*, when given, receives the record length per record name.
    """
    for name, seq in fasta_reader(path):
        if lengths is not None:
            lengths[name] = len(seq)
        span = idx.addsequence(rc(seq) if revcomp else seq)
        node = Interval(span[0], span[1], name)
        tree.add(node)
        graph.add_node(node, offsets={sample: 0}, **attrs)


def _print_match_rows(hits, tree, orient, with_unique):
    """Print one tab-separated row per match in *hits*.

    For orient 1 the contig start is converted from the reverse-complemented
    contig back to forward-strand coordinates.
    """
    for hit in hits:
        refstart = hit[2][0]
        ctgstart = hit[2][1]
        rnode = tree[refstart].pop()  # start position on match to node in graph
        cnode = tree[ctgstart].pop()
        if orient == 1:
            ctglen = cnode[1] - cnode[0]
            ctgpos = ctglen - ((ctgstart - cnode[0]) + hit[0])
        else:
            ctgpos = ctgstart - cnode[0]
        if with_unique:
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                rnode[2], refstart - rnode[0], cnode[2], ctgpos,
                hit[0], hit[1], hit[3], orient))
        else:
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                rnode[2], refstart - rnode[0], cnode[2], ctgpos,
                hit[0], hit[1], orient))


def matches(args):
    """Write exact matches between args.reference and args.contigs to stdout.

    Output consists of a "#<name>\\t<length>" line per FASTA contig, a "##"
    header line, and one row per match (MUMs when ``args.uniq`` is set, MEMs
    otherwise). When ``args.rc`` is set, matches against the reverse
    complemented contigs follow with orient 1.
    """
    if args.sa64:
        make_index = reveallib64.index
    else:
        make_index = reveallib.index

    # enable preconstruction of first SA and LCP array
    idx = make_index(sa=args.sa1, lcp=args.lcp1, cache=args.cache)

    G = nx.DiGraph()
    G.graph['paths'] = []
    t = IntervalTree()

    reffile = os.path.basename(args.reference)
    ctgfile = os.path.basename(args.contigs)

    ref2length = dict()
    idx.addsample(reffile)
    if args.reference.endswith(".gfa"):
        read_gfa(args.reference, idx, t, G)
    else:
        G.graph['paths'].append(reffile)
        _load_fasta_nodes(args.reference, reffile, idx, t, G,
                          lengths=ref2length)

    contig2length = dict()
    idx.addsample(ctgfile)
    if args.contigs.endswith(".gfa"):
        read_gfa(args.contigs, idx, t, G)
    else:
        G.graph['paths'].append(ctgfile)
        _load_fasta_nodes(args.contigs, ctgfile, idx, t, G,
                          lengths=contig2length)

    # map nodes to connected components in the graph
    refnode2component, ctgnode2component = dict(), dict()
    component2refnode, component2ctgnode = dict(), dict()
    refcomponents, ctgcomponents = [], []
    ctg2ref = dict()
    ri = ci = 0
    for nodes in nx.connected_components(G.to_undirected()):
        nodes = list(nodes)
        is_ref = reffile in G.node[nodes[0]]['offsets']
        for node in nodes:
            # check the graph is valid: a component holds one sample only
            if is_ref:
                assert (reffile in G.node[node]['offsets'])
                refnode2component[node] = ri
                component2refnode[ri] = node
            else:
                assert (ctgfile in G.node[node]['offsets'])
                ctgnode2component[node] = ci
                component2ctgnode[ci] = node
        if is_ref:
            ri += 1
            refcomponents.append(nodes)
        else:
            ci += 1
            ctgcomponents.append(nodes)

    # for each contig, print the length
    for name in contig2length:
        print("#%s\t%d" % (name, contig2length[name]))

    idx.construct()

    if args.uniq:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\torient")
        _print_match_rows(idx.getmums(args.minlength), t, 0, False)
    else:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\tunique\torient")
        _print_match_rows(idx.getmems(args.minlength), t, 0, True)

    if args.rc:
        logging.debug("Indexing reverse complement...\n")

        # index reverse complement;
        # enable preconstruction of second SA and LCP array
        idx = make_index(sa=args.sa2, lcp=args.lcp2)

        rcG = nx.DiGraph()
        t = IntervalTree()

        idx.addsample(reffile)
        if args.reference.endswith(".gfa"):
            read_gfa(args.reference, idx, t, rcG)
        else:
            rcG.graph['paths'] = set([reffile])
            _load_fasta_nodes(args.reference, reffile, idx, t, rcG, aligned=0)

        idx.addsample(ctgfile)
        if args.contigs.endswith(".gfa"):
            read_gfa(args.contigs, idx, t, rcG, revcomp=True)
        else:
            rcG.graph['paths'] = set([ctgfile])
            _load_fasta_nodes(args.contigs, ctgfile, idx, t, rcG,
                              revcomp=True, aligned=0)

        idx.construct()

        if args.uniq:
            _print_match_rows(idx.getmums(args.minlength), t, 1, False)
        else:
            _print_match_rows(idx.getmems(args.minlength), t, 1, True)
# chain_cmd(args): build a graph for the FASTA files in args.fastas by
# recursively chaining matches, then write it to "<args.output>.gfa".
#
# Steps visible in the code below:
#   - each FASTA is added as one sample to a reveallib index and its interval
#     is stored in an IntervalTree; a second record in a file logs
#     "Can't handle multi-fasta input..." and exits with status 1;
#   - the graph starts as a single edge between two sentinel nodes
#     (istart/iend), and a stack of work items (index, sample ids, from-node,
#     to-node, start coordinates, depth, keepedge) is processed until empty;
#   - every work item is handed to chain(); a node path of length 2 means no
#     further chain was found and the interval is emitted through
#     outputVariantNodes(); otherwise the chain is spliced in with
#     insertSubgraph() and each gap between consecutive chain nodes is either
#     pushed back on the stack (at least args.minn sequences left and
#     args.recurse == True) or emitted as variant nodes;
#   - overlapping matches print "Error overlapping matches" and exit(1);
#   - afterwards nodes receive 'seq' and 'offsets' attributes, edges receive
#     'paths' sets (one topological sort per sample), and write_gfa() is called;
#   - when args.output is None it is derived from the input basenames (text
#     before the first '.') joined with "_".
#
# NOTE(review): this listing has lost its line breaks and indentation, so the
# nesting of the statements cannot be read off the text; restore the original
# formatting before changing any logic.
# NOTE(review): relies on Python 2 print statements and on G.node[...] /
# subg.node[...] attribute access -- confirm the pinned networkx version.
def chain_cmd(args): fastas = args.fastas idx = reveallib.index() minn = args.minn tree = IntervalTree() for fasta in fastas: sample = os.path.basename(fasta) idx.addsample(sample) for i, t in enumerate(fasta_reader(fasta)): name, seq = t f, t = idx.addsequence(seq) tree[f:t] = sample if i == 1: logging.error( "Can't handle multi-fasta input. Use single fasta file per sequence." ) sys.exit(1) idx.construct() G = nx.DiGraph() G.graph['paths'] = idx.samples G.graph['path2id'] = dict() G.graph['id2path'] = dict() G.graph['startnodes'] = [] G.graph['endnodes'] = [] for sid, sample in enumerate(G.graph['paths']): G.graph['path2id'][sample] = sid G.graph['id2path'][sid] = sample k = len(idx.samples) T = idx.T istart = tuple( [-1] + [sep for sep in idx.nsep]) #no matches possible at these loci iend = tuple([sep for sep in idx.nsep] + [idx.n - 1]) #loci of sentinels, also no matches possible startcoords = tuple([0] + [sep + 1 for sep in idx.nsep]) G.add_node(istart, l=0) G.add_node(iend, l=0) G.add_edge(istart, iend) G.graph['startnodes'].append(istart) G.graph['endnodes'].append(iend) idc = range(idx.nsamples) stack = [(idx, idc, istart, iend, startcoords, 0, False)] while len(stack) != 0: idx, idc, p1, p2, startcoords, depth, keepedge = stack.pop() subg, pp1, pp2, nodepath = chain(idx, startcoords, args.minlength, depth, args.maxmums, recurse=args.recurse, uniq=True, gcmodel=args.gcmodel, wpen=args.wpen, wscore=args.wscore) if len(nodepath) == 2: #no more chain, output variant sequence localstart = tuple([-1] + [sep for sep in idx.nsep]) localend = tuple([sep - 1 for sep in idx.nsep] + [idx.n - 2]) lengths = tuple([e - s for s, e in zip(localstart, localend)]) outputVariantNodes(G, T, p1, p2, startcoords, lengths) if not keepedge: G.remove_edge(p1, p2) continue #replace the edge (start,end) in G with the chain in subg insertSubgraph(G, p1, p2, subg, pp1, pp2, keepedge) coordpath = list(nodepath) coordpath[0] = tuple([d + 1 for d in nodepath[0]]) nodepath[0] = p1 
nodepath[-1] = p2 fromcoord = coordpath[0] fromnode = nodepath[0] l = 0 #for every edge in subg construct idx and add to stack for node, pos in zip(nodepath[1:], coordpath[1:]): seq = [] idc_ = [] keepedge = False for i in idc: f = fromcoord[i] t = pos[i] assert (f >= 0) assert (t >= 0) if f + l < t: seq.append(T[f + l:t]) idc_.append(i) elif f + l == t: keepedge = True else: print "Error overlapping matches", f, l, t sys.exit(1) if len(seq) >= minn and args.recurse == True: idx = reveallib.index() for i, s in enumerate(seq): assert ('$' not in s) idx.addsample(str(i)) idx.addsequence(s) idx.construct() newoffsets = tuple([fromcoord[i] + l for i in idc_]) idc_ = range(len(newoffsets)) stack.append((idx, idc_, fromnode, node, newoffsets, depth + 1, keepedge)) else: varnodes = [fromcoord[i] + l for i in idc_] lengths = [pos[i] - (fromcoord[i] + l) for i in idc_] outputVariantNodes(G, T, fromnode, node, varnodes, lengths) if not keepedge: G.remove_edge(fromnode, node) fromcoord = pos fromnode = node if node != nodepath[-1]: l = subg.node[node]['l'] G.remove_node(istart) G.remove_node(iend) tot = 0 totn = 0 for node, data in G.nodes(data=True): G.node[node]['offsets'] = dict() if isinstance(node, tuple): G.node[node]['seq'] = T[node[0]:node[0] + data['l']] for c in node: intv = list(tree[c])[0] G.node[node]['offsets'][G.graph['path2id'][ intv[2]]] = c - intv[0] else: if 'l' in data: G.node[node]['seq'] = T[node:node + data['l']] intv = list(tree[node])[0] G.node[node]['offsets'][G.graph['path2id'][ intv[2]]] = node - intv[0] if 'aligned' in data: if data['aligned'] == 1: tot += data['l'] totn += 1 print "Aligned", tot, "bases in", totn, "nodes. Nodes total:", G.number_of_nodes( ), "Edges total:", G.number_of_edges() if args.mumplot: plotgraph(G, G.graph['paths'][0], G.graph['paths'][1], interactive=args.interactive) if args.output == None: pref = [] for f in args.fastas: bn = os.path.basename(f) if '.' 
in bn: pref.append(bn[:bn.find('.')]) else: pref.append(bn) args.output = "_".join(pref) #add paths annotation to edges for sample in G.graph['paths']: sid = G.graph['path2id'][sample] sg = [] for node, data in G.nodes(data=True): if sid in data['offsets']: sg.append(node) subgraph = G.subgraph(sg) topsort = list(nx.topological_sort(subgraph)) pnode = topsort[0] for node in topsort[1:]: if 'paths' in G[pnode][node]: G[pnode][node]['paths'].add(sid) else: G[pnode][node]['paths'] = {sid} pnode = node write_gfa(G, T, nometa=args.nometa, outputfile=args.output + '.gfa')
def align(aobjs, ref=None, minlength=20, minn=2, seedsize=None, threads=0,
          targetsample=None, maxsamples=None, maxmums=10000, wpen=1, wscore=1,
          sa64=False, pcutoff=1e-8, gcmodel="sumofpairs", maxsize=None,
          trim=True):
    """Align in-memory sequences into a graph.

    *aobjs* is an iterable of ``(name, sequence)`` tuples; items that are not
    tuples are ignored, as are sequences whose index interval is empty.
    Sequences are upper-cased before they are indexed.

    The keyword arguments are exposed to the ``schemes`` module as an
    attribute-style object (``schemes.args``), the way the command-line
    entry points pass their parsed arguments.

    Returns ``(G, idx)`` -- the graph and the index it was built from -- or
    ``None`` (after logging an error) when the initial graph is not a DAG.
    """
    # Must stay the first statement: it snapshots exactly the call arguments
    # so they can be passed to schemes as if they were the argparse object.
    kwargs = dict(locals())

    class dict2class(object):
        def __init__(self, d):
            self.__dict__ = d

    schemes.args = dict2class(kwargs)

    # global variables to simplify callbacks from c extension
    global t, G

    t = IntervalTree()
    idx = reveallib64.index() if sa64 else reveallib.index()

    G = nx.DiGraph()
    for key, empty in (('paths', []), ('path2id', dict()),
                       ('id2path', dict()), ('id2end', dict())):
        G.graph[key] = empty

    # artificial source and sink that tie all input sequences together
    startnode = uuid.uuid4().hex
    G.add_node(startnode)
    endnode = uuid.uuid4().hex
    G.add_node(endnode)

    for aobj in aobjs:
        if not isinstance(aobj, tuple):
            continue
        name, seq = aobj
        idx.addsample(name)
        intv = idx.addsequence(seq.upper())
        if intv[1] - intv[0] <= 0:
            continue  # nothing was indexed for this sequence

        Intv = Interval(intv[0], intv[1])
        t.add(Intv)

        paths = G.graph['paths']
        sid = len(paths)
        G.graph['path2id'][name] = sid
        G.graph['id2path'][sid] = name
        G.graph['id2end'][sid] = len(seq)
        paths.append(name)

        G.add_node(Intv, offsets={sid: 0}, aligned=0)
        G.add_edge(startnode, Intv, paths={sid}, ofrom='+', oto='+')
        G.add_edge(Intv, endnode, paths={sid}, ofrom='+', oto='+')

    if not nx.is_directed_acyclic_graph(G):
        logging.error("*** Input is not a DAG! Not supported.")
        return

    schemes.ts = t
    schemes.G = G

    idx.construct()
    idx.align(schemes.graphmumpicker, graphalign,
              threads=threads, wpen=wpen, wscore=wscore,
              minl=minlength, minn=minn)

    prune_nodes(G, T=idx.T)

    for sentinel in (startnode, endnode):
        G.remove_node(sentinel)

    return G, idx
# align_genomes(args): load the inputs in args.inputfiles into one index and
# one MultiDiGraph, run the alignment, and return (G, idx).
#
# Steps visible in the code below:
#   - t and G are module-level globals because the C extension calls back
#     into Python code that reads them (per the in-line comment);
#   - inputs ending in ".gfa" are added as a sample and read with read_gfa()
#     (only the first input receives the minsamples/maxsamples/targetsample
#     filters); everything else is passed to read_fasta();
#   - fewer than 2 samples in the index is fatal: the message is logged and
#     the process exits with status 1;
#   - a non-DAG input is only reported via logging.info, not rejected;
#   - every edge is asserted to carry a 'paths' attribute;
#   - both branches of the final if/else make the same idx.align() call; only
#     the log message differs.
#
# NOTE(review): this listing has lost its line breaks and indentation. In
# particular it cannot be told from the text whether the edge assert loop
# belongs to the "not a DAG" branch or runs unconditionally -- restore the
# original formatting before changing any logic.
# NOTE(review): `o` is assigned but never read; the trailing commented-out
# block looks like an earlier queue-based driver and is not executed.
def align_genomes(args): logging.info("Loading input...") #global variables to simplify callbacks from c extension global t, G # global reference # reference=args.reference t = IntervalTree() if args.sa64: idx = reveallib64.index(sa=args.sa, lcp=args.lcp, cache=args.cache) else: idx = reveallib.index(sa=args.sa, lcp=args.lcp, cache=args.cache) #G=nx.DiGraph() G = nx.MultiDiGraph() o = 0 schemes.args = args graph = False for i, sample in enumerate(args.inputfiles): if sample.endswith(".gfa"): idx.addsample(os.path.basename(sample)) graph = True logging.info("Reading graph: %s ..." % sample) if i == 0: read_gfa(sample, idx, t, G, minsamples=args.minsamples, maxsamples=args.maxsamples, targetsample=args.targetsample, remap=True) else: read_gfa(sample, idx, t, G, remap=True) else: #consider it to be a fasta file read_fasta(sample, idx, t, G, contigs=args.contigs, toupper=args.toupper) logging.debug("Graph contains the following paths: %s" % G.graph['paths']) logging.debug("Index contains the following samples: %s" % idx.samples) if len(idx.samples) <= 1: logging.fatal( "Specify at least 2 targets to construct alignment. In case of multi-fasta, consider the --nocontigs flag." ) sys.exit(1) if not nx.is_directed_acyclic_graph(G): logging.info("*** Input is not a DAG! 
...") for n1, n2, data in G.edges(data=True): assert ('paths' in data) schemes.ts = t schemes.G = G logging.info("Constructing index...") idx.construct() logging.info("Done.") if len(args.inputfiles) == 2 and not graph: logging.info("Constructing pairwise-alignment...") idx.align(schemes.graphmumpicker, graphalign, threads=args.threads, wpen=args.wpen, wscore=args.wscore, minl=args.minlength, minn=args.minn) else: logging.info("Constructing graph-based multi-alignment...") idx.align(schemes.graphmumpicker, graphalign, threads=args.threads, wpen=args.wpen, wscore=args.wscore, minl=args.minlength, minn=args.minn) # from multiprocessing import Process # from Queue import Queue # main=idx #make sure we keep the main ref count, since it has the reference to T # q=Queue() # q.put(idx) # while not q.empty(): # idx=q.get() # if len(args.inputfiles)>2: # multimums=idx.getmultimums(minlength=args.minlength, minn=args.minn) # else: # multimums=idx.mums(args.minlength) # if len(multimums)==0: # continue # ret=schemes.graphmumpicker(multimums,idx) # if ret==None: # continue # else: # splitmum,skipleft,skipright=ret # ret=graphalign(idx,splitmum) # if ret==None: # continue # else: # leading,trailing,matching,rest,merged,newleftnode,newrightnode=ret # ilead,itrail,ipar=idx.splitindex(leading,trailing,matching,rest,merged,newleftnode,newrightnode,skipleft,skipright) # if ilead!=None and ilead.n>1: # q.put(ilead) # if itrail!=None and itrail.n>1: # q.put(itrail) # if ipar!=None and ipar.n>1: # q.put(ipar) return G, idx
def _find_n_runs(seq, offset=0):
    """Return ``(starts, sizes)`` for every maximal run of 'N' in *seq*.

    Start positions are shifted by *offset*. A run that reaches the end of
    the sequence is reported too, so both lists always have equal length.
    """
    starts = []
    sizes = []
    run = 0
    for pos, base in enumerate(seq):
        if base == 'N':
            if run == 0:
                starts.append(offset + pos)
            run += 1
        elif run:
            sizes.append(run)
            run = 0
    if run:
        sizes.append(run)
    return starts, sizes


def plot(args):
    """Draw a mumplot (dot plot of maximal unique matches) for two FASTA files.

    ``args.fastas[0]`` is placed on the x-axis and ``args.fastas[1]`` on the
    y-axis. Forward matches are drawn in red; with ``args.rc`` matches
    against the reverse complemented query are added in green. Sequence
    boundaries are drawn as thin black lines, runs of 'N' are shaded when
    ``args.showgaps`` is set, and ``args.xregion`` / ``args.yregion``
    ("start:end[,start:end...]") are marked with dashed blue lines.

    The figure is shown when ``args.interactive`` is set; otherwise it is
    saved as "<stem of fastas[0]>_<stem of fastas[1]>.<args.extension>".
    Any other number of inputs logs a fatal message and returns.

    Changes relative to the previous implementation:
      * the output file stem is cut at the last '.' of the basename (the
        index used to be taken from the full path and applied to the
        basename);
      * a run of 'N' that reaches the end of a sequence is recorded, which
        keeps gap positions and gap sizes paired up;
      * reference gap positions include the offset of the sequence within
        the concatenated reference, like the query gaps already did;
      * sequence offsets are derived from len(seq) rather than a leaked
        loop variable.
    """
    vertgaps = []
    horzgaps = []
    vertgapsizes = []
    horzgapsizes = []
    ctgoffsets = []
    refoffsets = []
    qrylength = 0
    reflength = 0

    ax = plt.axes()

    if len(args.fastas) == 2:
        idx = reveallib64.index() if args.sa64 else reveallib.index()

        sample = args.fastas[0]
        idx.addsample(sample)
        refoffset = 0
        for name, seq in fasta_reader(sample):
            starts, sizes = _find_n_runs(seq, refoffset)
            horzgaps.extend(starts)
            horzgapsizes.extend(sizes)
            # +1 accounts for the separator that follows each sequence
            refoffset += len(seq) + 1
            reflength += len(seq) + 1
            refoffsets.append(refoffset)
            idx.addsequence(seq.upper())

        sample = args.fastas[1]
        idx.addsample(sample)
        qryoffset = 0
        for name, seq in fasta_reader(sample):
            starts, sizes = _find_n_runs(seq, qryoffset)
            vertgaps.extend(starts)
            vertgapsizes.extend(sizes)
            qryoffset += len(seq) + 1
            qrylength += len(seq) + 1
            ctgoffsets.append(qryoffset)
            idx.addsequence(seq.upper())

        qrylength = qrylength - 1

        idx.construct()

        print("Extracting mums...")
        mmems = [(mem[0], mem[1], [sp for gid, sp in mem[2]], 0)
                 for mem in idx.getmums(args.minlength)]

        sep = idx.nsep[0]

        if args.rc:
            # get mmems for reverse orientation
            idx = reveallib64.index() if args.sa64 else reveallib.index()

            sample = args.fastas[0]
            idx.addsample(sample)
            for name, seq in fasta_reader(sample):
                idx.addsequence(seq.upper())

            sample = args.fastas[1]
            idx.addsample(sample)
            qryintvs = []
            for name, seq in fasta_reader(sample):
                qryintvs.append(idx.addsequence(rc(seq.upper())))

            idx.construct()

            print("Extracting RC mums...")
            tmp = idx.getmums(args.minlength)
            vi = iter(qryintvs)
            v = next(vi)
            # make sure start positions are sorted
            tmp = [(m[0], m[1], sorted([sp for gid, sp in m[2]])) for m in tmp]
            tmp.sort(key=lambda m: m[2][1])  # sort by query pos
            nmmems = []
            for mem in tmp:
                # NOTE(review): only advances one query interval per match;
                # confirm this is intended when a query sequence has no
                # reverse-complement match at all.
                if mem[2][1] > v[1]:
                    v = next(vi)
                start, end = v
                # mirror the start within its query sequence
                newqstart = end - (mem[2][1] - start) - mem[0]
                nmmems.append((mem[0], mem[1], (mem[2][0], newqstart), 1))
            mmems += nmmems
            print("done.")
    else:
        logging.fatal("Can only create mumplot for 2 sequences or self plot for 1 sequence.")
        return

    start = 0
    end = sep
    qend = idx.n

    del idx

    if len(mmems) > args.maxmums:
        logging.info("Too many mums (%d), taking the %d largest." % (len(mmems), args.maxmums))
        mmems.sort(key=lambda mem: mem[0], reverse=True)  # sort by size
        mmems = mmems[:args.maxmums]  # take the n largest

    print("Drawing %d matches." % len(mmems))

    for mem in mmems:
        sps = sorted(mem[2])
        l = mem[0]
        sp1 = sps[0]
        sp2 = sps[1] - (sep + 1)  # query coordinates start after the separator
        ep1 = sp1 + l
        ep2 = sp2 + l

        if sp1 >= start and ep1 <= end:
            if mem[3] == 0:
                plt.plot([sp1, ep1], [sp2, ep2], 'r-')
            else:
                plt.plot([ep1, sp1], [sp2, ep2], 'g-')

    for p in ctgoffsets:
        plt.axhline(y=p, linewidth=.5, color='black', linestyle='solid')

    for p in refoffsets:
        plt.axvline(x=p, linewidth=.5, color='black', linestyle='solid')

    if args.showgaps:
        for p, l in zip(horzgaps, horzgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (p, 0),     # bottom left
                    l,          # width
                    qrylength,  # height
                    alpha=.1
                )
            )

        for p, l in zip(vertgaps, vertgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (0, p),     # bottom left
                    reflength,  # width
                    l,          # height
                    alpha=.1
                )
            )

    plt.xlim(start, end)
    plt.ylim(0, qend - end)
    plt.title(" vs. ".join(args.fastas))
    if len(args.fastas) == 2:
        plt.xlabel(args.fastas[0])
        plt.ylabel(args.fastas[1])
    else:
        plt.xlabel(args.fastas[0])
        plt.ylabel(args.fastas[0] + "_rc")
    plt.autoscale(enable=False)

    if args.xregion is not None:
        for region in args.xregion.split(","):
            rstart, rend = region.split(":")  # should be rectangle with alpha here
            plt.axvline(x=int(rstart), linewidth=3, color='b', linestyle='dashed')
            plt.axvline(x=int(rend), linewidth=3, color='b', linestyle='dashed')

    if args.yregion is not None:
        for region in args.yregion.split(","):
            rstart, rend = region.split(":")  # should be rectangle with alpha here
            plt.axhline(y=int(rstart), linewidth=3, color='b', linestyle='dashed')
            plt.axhline(y=int(rend), linewidth=3, color='b', linestyle='dashed')

    if args.interactive:
        plt.show()
    else:
        b1 = os.path.basename(args.fastas[0])
        b2 = os.path.basename(args.fastas[1])
        fn1 = b1[:b1.rfind('.')] if b1.find('.') != -1 else b1
        fn2 = b2[:b2.rfind('.')] if b2.find('.') != -1 else b2
        plt.savefig(fn1 + "_" + fn2 + "." + args.extension)
def _find_n_runs(seq, offset=0):
    """Return ``(starts, sizes)`` for every maximal run of 'N' in *seq*.

    Start positions are shifted by *offset*. A run that reaches the end of
    the sequence is reported too, so both lists always have equal length.
    """
    starts = []
    sizes = []
    run = 0
    for pos, base in enumerate(seq):
        if base == 'N':
            if run == 0:
                starts.append(offset + pos)
            run += 1
        elif run:
            sizes.append(run)
            run = 0
    if run:
        sizes.append(run)
    return starts, sizes


def plot(args):
    """Draw a mumplot (dot plot of maximal unique matches) for two FASTA files.

    ``args.fastas[0]`` is placed on the x-axis and ``args.fastas[1]`` on the
    y-axis; forward matches are red, reverse-complement matches
    (``args.rc``) green, and ``args.endpoints`` additionally marks match end
    points. A single input ending in ".bed" is delegated to ``bedplot``; any
    other input combination logs a fatal message and returns.

    Regions in ``args.xregion`` / ``args.yregion`` are comma separated
    "<start>-<end>" or "<start>:<end>" pairs; an invalid one is fatal
    (exit status 1). When both are given and the plot is not interactive,
    one image per (xregion, yregion) pair is saved, optionally widened by the
    comma separated ``args.flanksize`` values; otherwise a single image
    "<stem0>_<stem1>.<args.extension>" is written.

    The matplotlib backend is switched to 'Agg' for non-interactive use
    before pyplot is imported.

    Changes relative to the previous implementation:
      * a run of 'N' that reaches the end of a sequence is recorded, which
        keeps gap positions and gap sizes paired up;
      * reference gap positions include the offset of the sequence within
        the concatenated reference, like the query gaps already did;
      * sequence offsets are derived from len(seq) rather than a leaked
        loop variable;
      * the self-plot branch labels the y-axis instead of overwriting the
        x-axis label.
    """
    import matplotlib

    if not args.interactive:
        matplotlib.use('Agg')

    from matplotlib import pyplot as plt
    from matplotlib import patches as patches

    def parse_regions(spec, draw_line, key):
        """Parse a region list and mark each boundary with a dashed line."""
        regions = []
        for region in spec.split(","):
            if region.count("-") == 1:
                rstart, rend = region.split("-")  # should be rectangle with alpha here
            elif region.count(":") == 1:
                rstart, rend = region.split(":")  # should be rectangle with alpha here
            else:
                logging.fatal(
                    "Invalid region specification, use - : <start>-<end>")
                sys.exit(1)
            regions.append((int(rstart), int(rend)))
            for bound in (rstart, rend):
                draw_line(linewidth=1, color='b', linestyle='dashed',
                          **{key: int(bound)})
        return regions

    vertgaps = []
    horzgaps = []
    vertgapsizes = []
    horzgapsizes = []
    ctgoffsets = []
    refoffsets = []
    qrylength = 0
    reflength = 0

    ax = plt.axes()

    if len(args.fastas) == 2:
        idx = reveallib64.index() if args.sa64 else reveallib.index()

        sample = args.fastas[0]
        idx.addsample(sample)
        refoffset = 0
        for name, seq in fasta_reader(sample):
            starts, sizes = _find_n_runs(seq, refoffset)
            horzgaps.extend(starts)
            horzgapsizes.extend(sizes)
            # +1 accounts for the separator that follows each sequence
            refoffset += len(seq) + 1
            reflength += len(seq) + 1
            refoffsets.append(refoffset)
            idx.addsequence(seq.upper())

        sample = args.fastas[1]
        idx.addsample(sample)
        qryoffset = 0
        for name, seq in fasta_reader(sample):
            starts, sizes = _find_n_runs(seq, qryoffset)
            vertgaps.extend(starts)
            vertgapsizes.extend(sizes)
            qryoffset += len(seq) + 1
            qrylength += len(seq) + 1
            ctgoffsets.append(qryoffset)
            idx.addsequence(seq.upper())

        qrylength = qrylength - 1

        idx.construct()

        logging.info("Extracting mums...")
        mmems = idx.getmums(args.minlength)
        logging.info("Done.")

        sep = idx.nsep[0]

        if args.rc:
            # get mums for reverse orientation
            idx.construct(rc=True)
            logging.info("Extracting RC mums...")
            mmems += idx.getmums(args.minlength)
            logging.info("Done.")

    elif len(args.fastas) == 1 and args.fastas[0].endswith(".bed"):
        bedplot(args)
        return
    else:
        logging.fatal(
            "Can only create mumplot for 2 sequences or self plot for 1 sequence."
        )
        return

    start = 0
    end = sep
    qend = idx.n

    del idx

    if len(mmems) > args.maxmums:
        logging.info("Too many mums (%d), taking the %d largest." %
                     (len(mmems), args.maxmums))
        mmems.sort(key=lambda mem: mem[0], reverse=True)  # sort by size
        mmems = mmems[:args.maxmums]  # take the n largest

    logging.info("Drawing %d matches." % len(mmems))

    # Segments are collected into flat coordinate lists separated by None so
    # that each orientation is drawn with a single plot call.
    xlist, rcxlist = [], []
    ylist, rcylist = [], []
    for mem in mmems:
        sps = mem[1]
        l = mem[0]
        sp1 = sps[0]
        sp2 = sps[1] - (sep + 1)  # query coordinates start after the separator
        ep1 = sp1 + l
        ep2 = sp2 + l
        if sp1 >= start and ep1 <= end:
            if mem[2] == 0:
                xlist.extend((sp1, ep1, None))
                ylist.extend((sp2, ep2, None))
            else:
                rcxlist.extend((ep1, sp1, None))
                rcylist.extend((sp2, ep2, None))

    plt.plot(xlist, ylist, 'r-')
    plt.plot(rcxlist, rcylist, 'g-')

    if args.endpoints:
        plt.plot(xlist, ylist, 'b*')
        plt.plot(rcxlist, rcylist, 'y*')

    for p in ctgoffsets:
        plt.axhline(y=p, linewidth=.5, color='black', linestyle='solid')

    for p in refoffsets:
        plt.axvline(x=p, linewidth=.5, color='black', linestyle='solid')

    if args.showgaps:
        for p, l in zip(horzgaps, horzgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (p, 0),     # bottom left
                    l,          # width
                    qrylength,  # height
                    alpha=.1))

        for p, l in zip(vertgaps, vertgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (0, p),     # bottom left
                    reflength,  # width
                    l,          # height
                    alpha=.1))

    plt.xlim(start, end)
    plt.ylim(0, qend - end)
    plt.title(" vs. ".join(args.fastas))
    if len(args.fastas) == 2:
        plt.xlabel(args.fastas[0])
        plt.ylabel(args.fastas[1])
    else:
        plt.xlabel(args.fastas[0])
        plt.ylabel(args.fastas[0] + "_rc")
    plt.autoscale(enable=False)

    if args.xregion is not None:
        xregions = parse_regions(args.xregion, plt.axvline, 'x')

    if args.yregion is not None:
        yregions = parse_regions(args.yregion, plt.axhline, 'y')

    if args.interactive:
        plt.show()
        return

    b1 = os.path.basename(args.fastas[0])
    b2 = os.path.basename(args.fastas[1])
    fn1 = b1[:b1.rfind('.')] if b1.find('.') != -1 else b1
    fn2 = b2[:b2.rfind('.')] if b2.find('.') != -1 else b2

    if args.xregion is not None and args.yregion is not None:
        assert (len(xregions) == len(yregions))

        if args.flanksize is not None:
            flanksizes = [int(v) for v in args.flanksize.split(",")]
        else:
            flanksizes = [0] * len(xregions)

        # one zoomed image per region pair
        for xregion, yregion, flanksize in zip(xregions, yregions, flanksizes):
            plt.xlim(xregion[0] - flanksize, xregion[1] + flanksize)
            plt.ylim(yregion[0] - flanksize, yregion[1] + flanksize)
            plt.savefig(fn1 + "_" + str(xregion[0]) + "-" + str(xregion[1]) +
                        "_" + fn2 + "_" + str(yregion[0]) + "-" +
                        str(yregion[1]) + "." + args.extension)
    else:
        plt.savefig(fn1 + "_" + fn2 + "." + args.extension)
# chain_cmd(args): build a graph for the FASTA files in args.fastas by
# recursively chaining matches, then write it to "<args.output>.gfa".
#
# Steps visible in the code below:
#   - each FASTA is added as one sample to a reveallib index and its interval
#     is stored in an IntervalTree; a second record in a file logs
#     "Can't handle multi-fasta input..." and exits with status 1;
#   - the graph starts as a single edge between two sentinel nodes
#     (istart/iend), and a stack of work items (index, sample ids, from-node,
#     to-node, start coordinates, depth, keepedge) is processed until empty;
#   - every work item is handed to chain(); a node path of length 2 means no
#     further chain was found and the interval is emitted through
#     outputVariantNodes(); otherwise the chain is spliced in with
#     insertSubgraph() and each gap between consecutive chain nodes is either
#     pushed back on the stack (at least args.minn sequences left and
#     args.recurse==True) or emitted as variant nodes;
#   - overlapping matches print "Error overlapping matches" and exit(1);
#   - afterwards nodes receive 'seq' and 'offsets' attributes, edges receive
#     'paths' sets (one topological sort per sample), and write_gfa() is called;
#   - when args.output is None it is derived from the input basenames (text
#     before the first '.') joined with "_".
#
# NOTE(review): this listing has lost its line breaks and indentation, so the
# nesting of the statements cannot be read off the text; restore the original
# formatting before changing any logic.
# NOTE(review): relies on Python 2 print statements and on G.node[...] /
# subg.node[...] attribute access -- confirm the pinned networkx version.
def chain_cmd(args): fastas=args.fastas idx=reveallib.index() minn=args.minn tree=IntervalTree() for fasta in fastas: sample=os.path.basename(fasta) idx.addsample(sample) for i,t in enumerate(fasta_reader(fasta)): name,seq=t f,t=idx.addsequence(seq) tree[f:t]=sample if i==1: logging.error("Can't handle multi-fasta input. Use single fasta file per sequence.") sys.exit(1) idx.construct() G=nx.DiGraph() G.graph['paths']=idx.samples G.graph['path2id']=dict() G.graph['id2path']=dict() G.graph['startnodes']=[] G.graph['endnodes']=[] for sid,sample in enumerate(G.graph['paths']): G.graph['path2id'][sample]=sid G.graph['id2path'][sid]=sample k=len(idx.samples) T=idx.T istart=tuple([-1]+[sep for sep in idx.nsep]) #no matches possible at these loci iend=tuple([sep for sep in idx.nsep]+[idx.n-1]) #loci of sentinels, also no matches possible startcoords=tuple([0]+[sep+1 for sep in idx.nsep]) G.add_node(istart,l=0) G.add_node(iend,l=0) G.add_edge(istart,iend) G.graph['startnodes'].append(istart) G.graph['endnodes'].append(iend) idc=range(idx.nsamples) stack=[(idx,idc,istart,iend,startcoords,0,False)] while len(stack)!=0: idx,idc,p1,p2,startcoords,depth,keepedge=stack.pop() subg,pp1,pp2,nodepath=chain(idx,startcoords,args.minlength,depth,args.maxmums,recurse=args.recurse,uniq=True,gcmodel=args.gcmodel,wpen=args.wpen,wscore=args.wscore) if len(nodepath)==2: #no more chain, output variant sequence localstart=tuple([-1]+[sep for sep in idx.nsep]) localend=tuple([sep-1 for sep in idx.nsep]+[idx.n-2]) lengths=tuple([e-s for s,e in zip(localstart,localend)]) outputVariantNodes(G,T,p1,p2,startcoords,lengths) if not keepedge: G.remove_edge(p1,p2) continue #replace the edge (start,end) in G with the chain in subg insertSubgraph(G,p1,p2,subg,pp1,pp2,keepedge) coordpath=list(nodepath) coordpath[0]=tuple([d+1 for d in nodepath[0]]) nodepath[0]=p1 nodepath[-1]=p2 fromcoord=coordpath[0] fromnode=nodepath[0] l=0 #for every edge in subg construct idx and add to stack for node,pos in 
zip(nodepath[1:],coordpath[1:]): seq=[] idc_=[] keepedge=False for i in idc: f=fromcoord[i] t=pos[i] assert(f>=0) assert(t>=0) if f+l<t: seq.append(T[f+l:t]) idc_.append(i) elif f+l==t: keepedge=True else: print "Error overlapping matches",f,l,t sys.exit(1) if len(seq)>=minn and args.recurse==True: idx=reveallib.index() for i,s in enumerate(seq): assert('$' not in s) idx.addsample(str(i)) idx.addsequence(s) idx.construct() newoffsets=tuple([fromcoord[i]+l for i in idc_]) idc_=range(len(newoffsets)) stack.append((idx, idc_, fromnode, node, newoffsets, depth+1, keepedge)) else: varnodes=[fromcoord[i]+l for i in idc_] lengths=[pos[i]-(fromcoord[i]+l) for i in idc_] outputVariantNodes(G,T,fromnode,node,varnodes,lengths) if not keepedge: G.remove_edge(fromnode,node) fromcoord=pos fromnode=node if node!=nodepath[-1]: l=subg.node[node]['l'] G.remove_node(istart) G.remove_node(iend) tot=0 totn=0 for node,data in G.nodes(data=True): G.node[node]['offsets']=dict() if isinstance(node,tuple): G.node[node]['seq']=T[node[0]:node[0]+data['l']] for c in node: intv=list(tree[c])[0] G.node[node]['offsets'][G.graph['path2id'][intv[2]]]=c-intv[0] else: if 'l' in data: G.node[node]['seq']=T[node:node+data['l']] intv=list(tree[node])[0] G.node[node]['offsets'][G.graph['path2id'][intv[2]]]=node-intv[0] if 'aligned' in data: if data['aligned']==1: tot+=data['l'] totn+=1 print "Aligned",tot,"bases in",totn,"nodes. Nodes total:",G.number_of_nodes(),"Edges total:",G.number_of_edges() if args.mumplot: plotgraph(G, G.graph['paths'][0], G.graph['paths'][1], interactive=args.interactive) if args.output==None: pref=[] for f in args.fastas: bn=os.path.basename(f) if '.' 
in bn: pref.append(bn[:bn.find('.')]) else: pref.append(bn) args.output="_".join(pref) #add paths annotation to edges for sample in G.graph['paths']: sid=G.graph['path2id'][sample] sg=[] for node,data in G.nodes(data=True): if sid in data['offsets']: sg.append(node) subgraph=G.subgraph(sg) topsort=list(nx.topological_sort(subgraph)) pnode=topsort[0] for node in topsort[1:]: if 'paths' in G[pnode][node]: G[pnode][node]['paths'].add(sid) else: G[pnode][node]['paths']={sid} pnode=node write_gfa(G,T,nometa=args.nometa,outputfile=args.output+'.gfa')