Ejemplo n.º 1
0
def matches(args):
    """Print exact matches between a reference and a set of contigs.

    Both inputs (``args.reference`` and ``args.contigs``) are added to one
    index as two samples; ``.gfa`` inputs are loaded through ``read_gfa``,
    anything else is read with ``fasta_reader``. After a ``#name<TAB>length``
    line per fasta contig, a ``##`` header and one tab-separated row per
    match are printed. Matches come from ``idx.getmums`` when ``args.uniq``
    is set and from ``idx.getmems`` otherwise (the latter adds a ``unique``
    column taken from ``mem[3]``).

    When ``args.rc`` is set, a second index is built in which the fasta
    contigs are passed through ``rc`` and the matches found there are
    printed with orient ``1``; their contig start is mirrored inside the
    contig interval. No header is printed for this second pass.
    """
    new_index = reveallib64.index if args.sa64 else reveallib.index

    # enable preconstruction of first SA and LCP array
    idx = new_index(sa=args.sa1, lcp=args.lcp1, cache=args.cache)

    fwd_graph = nx.DiGraph()
    fwd_graph.graph['paths'] = []
    tree = IntervalTree()

    reffile = os.path.basename(args.reference)
    ctgfile = os.path.basename(args.contigs)

    ref2length = dict()
    contig2length = dict()
    forward_inputs = (
        (args.reference, reffile, ref2length),
        (args.contigs, ctgfile, contig2length),
    )
    for path, sample, lengths in forward_inputs:
        idx.addsample(sample)
        if path.endswith(".gfa"):
            read_gfa(path, idx, tree, fwd_graph)
            continue
        fwd_graph.graph['paths'].append(sample)
        for name, seq in fasta_reader(path):
            lengths[name] = len(seq)
            span = idx.addsequence(seq)
            interval = Interval(span[0], span[1], name)
            tree.add(interval)
            fwd_graph.add_node(interval, offsets={sample: 0})

    # map nodes to connected components in the graph; the asserts verify
    # that a component never mixes reference and contig nodes
    refnode2component = dict()
    ctgnode2component = dict()
    component2refnode = dict()
    component2ctgnode = dict()
    refcomponents = []
    ctgcomponents = []
    ref_count = 0
    ctg_count = 0
    for members in nx.connected_components(fwd_graph.to_undirected()):
        members = list(members)
        if reffile in fwd_graph.node[members[0]]['offsets']:
            for member in members:
                assert reffile in fwd_graph.node[member]['offsets']
                refnode2component[member] = ref_count
                component2refnode[ref_count] = member
            ref_count += 1
            refcomponents.append(members)
        else:
            for member in members:
                assert ctgfile in fwd_graph.node[member]['offsets']
                ctgnode2component[member] = ctg_count
                component2ctgnode[ctg_count] = member
            ctg_count += 1
            ctgcomponents.append(members)

    # for each contig, print the length
    for name, length in contig2length.items():
        print("#%s\t%d" % (name, length))

    idx.construct()

    uniq = args.uniq

    def emit(mem, lookup, orient):
        """Print one match row; *lookup* maps index positions to intervals."""
        refstart = mem[2][0]
        ctgstart = mem[2][1]
        rnode = lookup[refstart].pop()  # start position of match -> interval
        cnode = lookup[ctgstart].pop()
        ctgpos = ctgstart - cnode[0]
        if orient:
            # express the start relative to the other end of the contig
            ctgpos = (cnode[1] - cnode[0]) - (ctgpos + mem[0])
        row = [rnode[2], refstart - rnode[0], cnode[2], ctgpos, mem[0], mem[1]]
        if not uniq:
            row.append(mem[3])
        row.append(orient)
        print("\t".join(["%s"] * len(row)) % tuple(row))

    if uniq:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\torient")
        hits = idx.getmums(args.minlength)
    else:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\tunique\torient")
        hits = idx.getmems(args.minlength)
    for mem in hits:
        emit(mem, tree, 0)

    if not args.rc:
        return

    logging.debug("Indexing reverse complement...\n")

    # index reverse complement;
    # enable preconstruction of second SA and LCP array
    idx = new_index(sa=args.sa2, lcp=args.lcp2)

    rc_graph = nx.DiGraph()
    tree = IntervalTree()

    # (path, sample name, reverse-complement the sequences?)
    rc_inputs = (
        (args.reference, reffile, False),
        (args.contigs, ctgfile, True),
    )
    for path, sample, flip in rc_inputs:
        idx.addsample(sample)
        if path.endswith(".gfa"):
            if flip:
                read_gfa(path, idx, tree, rc_graph, revcomp=True)
            else:
                read_gfa(path, idx, tree, rc_graph)
            continue
        rc_graph.graph['paths'] = set([sample])
        for name, seq in fasta_reader(path):
            span = idx.addsequence(rc(seq) if flip else seq)
            interval = Interval(span[0], span[1], name)
            tree.add(interval)
            rc_graph.add_node(interval, offsets={sample: 0}, aligned=0)

    idx.construct()

    if uniq:
        hits = idx.getmums(args.minlength)
    else:
        hits = idx.getmems(args.minlength)
    for mem in hits:
        emit(mem, tree, 1)
Ejemplo n.º 2
0
def matches(args):
    """Report exact matches between ``args.reference`` and ``args.contigs``.

    The two inputs are indexed together as two samples (``read_gfa`` for
    paths ending in ``.gfa``, ``fasta_reader`` otherwise). Output, written
    with ``print``:

    * ``#<contig>\\t<length>`` for every contig read from fasta,
    * a ``##`` header line,
    * one tab-separated row per match with orient ``0``.

    ``args.uniq`` selects ``idx.getmums`` instead of ``idx.getmems``; rows
    from ``getmems`` carry an extra ``unique`` column (``mem[3]``).
    With ``args.rc`` a second index is built where fasta contigs go through
    ``rc`` first, and its matches are printed with orient ``1``.
    """

    def build_index(**options):
        # pick the reveallib64 module when args.sa64 is set
        if args.sa64:
            return reveallib64.index(**options)
        return reveallib.index(**options)

    def add_record(index, lookup, graph, sample, name, seq, **attrs):
        # index the sequence and register its interval as a graph node
        span = index.addsequence(seq)
        node = Interval(span[0], span[1], name)
        lookup.add(node)
        graph.add_node(node, offsets={sample: 0}, **attrs)

    def show(mem, lookup, orient, with_unique):
        # resolve both start positions to their intervals and print a row
        refstart, ctgstart = mem[2][0], mem[2][1]
        rnode = lookup[refstart].pop()
        cnode = lookup[ctgstart].pop()
        if orient:
            width = cnode[1] - cnode[0]
            ctgpos = width - ((ctgstart - cnode[0]) + mem[0])
        else:
            ctgpos = ctgstart - cnode[0]
        fields = (rnode[2], refstart - rnode[0], cnode[2], ctgpos, mem[0], mem[1])
        if with_unique:
            fields += (mem[3],)
        fields += (orient,)
        print("\t".join("%s" for _ in fields) % fields)

    # enable preconstruction of first SA and LCP array
    idx = build_index(sa=args.sa1, lcp=args.lcp1, cache=args.cache)

    G = nx.DiGraph()
    G.graph['paths'] = []
    lookup = IntervalTree()

    reffile = os.path.basename(args.reference)
    ctgfile = os.path.basename(args.contigs)

    ref2length = {}
    idx.addsample(reffile)
    if not args.reference.endswith(".gfa"):
        G.graph['paths'].append(reffile)
        for name, seq in fasta_reader(args.reference):
            ref2length[name] = len(seq)
            add_record(idx, lookup, G, reffile, name, seq)
    else:
        read_gfa(args.reference, idx, lookup, G)

    contig2length = {}
    idx.addsample(ctgfile)
    if not args.contigs.endswith(".gfa"):
        G.graph['paths'].append(ctgfile)
        for name, seq in fasta_reader(args.contigs):
            contig2length[name] = len(seq)
            add_record(idx, lookup, G, ctgfile, name, seq)
    else:
        read_gfa(args.contigs, idx, lookup, G)

    # map nodes to connected components in the graph
    refnode2component, component2refnode, refcomponents = {}, {}, []
    ctgnode2component, component2ctgnode, ctgcomponents = {}, {}, []
    for component in nx.connected_components(G.to_undirected()):
        component = list(component)
        if reffile in G.node[component[0]]['offsets']:
            rid = len(refcomponents)
            for node in component:
                # check the graph is valid
                assert reffile in G.node[node]['offsets']
                refnode2component[node] = rid
                component2refnode[rid] = node
            refcomponents.append(component)
        else:
            cid = len(ctgcomponents)
            for node in component:
                # check the graph is valid
                assert ctgfile in G.node[node]['offsets']
                ctgnode2component[node] = cid
                component2ctgnode[cid] = node
            ctgcomponents.append(component)

    # for each contig, print the length
    for name in contig2length:
        print("#%s\t%d" % (name, contig2length[name]))

    idx.construct()

    unique_only = args.uniq
    if unique_only:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\torient")
        for mem in idx.getmums(args.minlength):
            show(mem, lookup, 0, False)
    else:
        print("##refname\trefstart\tctgname\tctgstart\tlength\tn\tunique\torient")
        for mem in idx.getmems(args.minlength):
            show(mem, lookup, 0, True)

    if args.rc:
        logging.debug("Indexing reverse complement...\n")

        # index reverse complement;
        # enable preconstruction of second SA and LCP array
        idx = build_index(sa=args.sa2, lcp=args.lcp2)

        rcG = nx.DiGraph()
        lookup = IntervalTree()

        idx.addsample(reffile)
        if not args.reference.endswith(".gfa"):
            rcG.graph['paths'] = set([reffile])
            for name, seq in fasta_reader(args.reference):
                add_record(idx, lookup, rcG, reffile, name, seq, aligned=0)
        else:
            read_gfa(args.reference, idx, lookup, rcG)

        idx.addsample(ctgfile)
        if not args.contigs.endswith(".gfa"):
            rcG.graph['paths'] = set([ctgfile])
            for name, seq in fasta_reader(args.contigs):
                add_record(idx, lookup, rcG, ctgfile, name, rc(seq), aligned=0)
        else:
            read_gfa(args.contigs, idx, lookup, rcG, revcomp=True)

        idx.construct()

        if unique_only:
            for mem in idx.getmums(args.minlength):
                show(mem, lookup, 1, False)
        else:
            for mem in idx.getmems(args.minlength):
                show(mem, lookup, 1, True)
Ejemplo n.º 3
0
def chain_cmd(args):
    """Chain the sequences in ``args.fastas`` into a graph and write it as GFA.

    Every fasta file is added to one ``reveallib`` index as its own sample;
    a file that yields a second record aborts the program with exit code 1.
    Starting from a single edge between two artificial boundary nodes, the
    function repeatedly pops an (index, edge) work item from a stack, calls
    ``chain`` on it and splices the returned subgraph into ``G`` with
    ``insertSubgraph``. For every gap between consecutive chain nodes it
    either pushes a new work item with a fresh index over the gap sequences
    (when ``args.recurse`` is set and at least ``args.minn`` sequences have
    a non-empty gap) or emits the gap through ``outputVariantNodes``.

    Afterwards every node gets ``seq`` and ``offsets`` attributes, a summary
    line is printed, the graph is optionally plotted (``args.mumplot``),
    edges are annotated with the ids of the paths that traverse them, and
    the result is written to ``args.output + '.gfa'``. If ``args.output`` is
    None it is set to the input basenames (cut at the first '.') joined
    with '_'.
    """
    fastas = args.fastas
    idx = reveallib.index()
    minn = args.minn

    # maps index coordinates [f, t) to the sample they belong to
    tree = IntervalTree()

    for fasta in fastas:
        sample = os.path.basename(fasta)
        idx.addsample(sample)
        # NOTE: `t` is first the (name, seq) record and is then rebound to
        # the end coordinate returned by addsequence
        for i, t in enumerate(fasta_reader(fasta)):
            name, seq = t
            f, t = idx.addsequence(seq)
            tree[f:t] = sample
            # i == 1 means the file contained a second record
            if i == 1:
                logging.error(
                    "Can't handle multi-fasta input. Use single fasta file per sequence."
                )
                sys.exit(1)

    idx.construct()

    G = nx.DiGraph()
    G.graph['paths'] = idx.samples
    G.graph['path2id'] = dict()
    G.graph['id2path'] = dict()
    G.graph['startnodes'] = []
    G.graph['endnodes'] = []

    # path ids are the positions of the samples in G.graph['paths']
    for sid, sample in enumerate(G.graph['paths']):
        G.graph['path2id'][sample] = sid
        G.graph['id2path'][sid] = sample

    k = len(idx.samples)

    # keep a handle on the text of the top-level index: `idx` is rebound
    # inside the loop below, but slices are always taken from this T
    T = idx.T

    # artificial boundary nodes; both are removed again after the loop
    istart = tuple(
        [-1] + [sep for sep in idx.nsep])  #no matches possible at these loci
    iend = tuple([sep for sep in idx.nsep] +
                 [idx.n - 1])  #loci of sentinels, also no matches possible
    startcoords = tuple([0] + [sep + 1 for sep in idx.nsep])
    G.add_node(istart, l=0)
    G.add_node(iend, l=0)
    G.add_edge(istart, iend)

    G.graph['startnodes'].append(istart)
    G.graph['endnodes'].append(iend)

    idc = range(idx.nsamples)

    # work item: (index, sample indices, from-node, to-node,
    #             start coordinates in T, recursion depth, keepedge)
    stack = [(idx, idc, istart, iend, startcoords, 0, False)]

    while len(stack) != 0:
        idx, idc, p1, p2, startcoords, depth, keepedge = stack.pop()
        subg, pp1, pp2, nodepath = chain(idx,
                                         startcoords,
                                         args.minlength,
                                         depth,
                                         args.maxmums,
                                         recurse=args.recurse,
                                         uniq=True,
                                         gcmodel=args.gcmodel,
                                         wpen=args.wpen,
                                         wscore=args.wscore)

        if len(nodepath) == 2:  #no more chain, output variant sequence
            localstart = tuple([-1] + [sep for sep in idx.nsep])
            localend = tuple([sep - 1 for sep in idx.nsep] + [idx.n - 2])
            lengths = tuple([e - s for s, e in zip(localstart, localend)])
            outputVariantNodes(G, T, p1, p2, startcoords, lengths)
            if not keepedge:
                G.remove_edge(p1, p2)
            continue

        #replace the edge (start,end) in G with the chain in subg
        insertSubgraph(G, p1, p2, subg, pp1, pp2, keepedge)

        # coordpath keeps the coordinates returned by chain, while the two
        # ends of nodepath are replaced by the nodes that exist in G
        coordpath = list(nodepath)
        coordpath[0] = tuple([d + 1 for d in nodepath[0]])
        nodepath[0] = p1
        nodepath[-1] = p2

        fromcoord = coordpath[0]
        fromnode = nodepath[0]
        # length of the node the current gap starts after (0 for the first)
        l = 0

        #for every edge in subg construct idx and add to stack
        for node, pos in zip(nodepath[1:], coordpath[1:]):
            seq = []
            idc_ = []
            keepedge = False

            for i in idc:
                f = fromcoord[i]
                t = pos[i]
                assert (f >= 0)
                assert (t >= 0)
                if f + l < t:
                    # non-empty gap for sample i
                    seq.append(T[f + l:t])
                    idc_.append(i)
                elif f + l == t:
                    # the two nodes are adjacent for sample i, so the
                    # direct edge between them has to stay
                    keepedge = True
                else:
                    print "Error overlapping matches", f, l, t
                    sys.exit(1)

            if len(seq) >= minn and args.recurse == True:
                # enough sequences with a gap: index just the gap
                # sequences and chain them in a later iteration
                idx = reveallib.index()
                for i, s in enumerate(seq):
                    assert ('$' not in s)
                    idx.addsample(str(i))
                    idx.addsequence(s)
                idx.construct()

                newoffsets = tuple([fromcoord[i] + l for i in idc_])
                idc_ = range(len(newoffsets))
                stack.append((idx, idc_, fromnode, node, newoffsets, depth + 1,
                              keepedge))
            else:
                varnodes = [fromcoord[i] + l for i in idc_]
                lengths = [pos[i] - (fromcoord[i] + l) for i in idc_]
                outputVariantNodes(G, T, fromnode, node, varnodes, lengths)
                if not keepedge:
                    G.remove_edge(fromnode, node)

            fromcoord = pos
            fromnode = node

            # the last entry of nodepath is p2, which is not looked up in subg
            if node != nodepath[-1]:
                l = subg.node[node]['l']

    G.remove_node(istart)
    G.remove_node(iend)

    # attach sequence and per-path offsets to every node and count the
    # bases / nodes flagged as aligned
    tot = 0
    totn = 0
    for node, data in G.nodes(data=True):
        G.node[node]['offsets'] = dict()

        if isinstance(node, tuple):
            # tuple node: one coordinate per sequence it covers; the
            # sequence is sliced at the first coordinate
            G.node[node]['seq'] = T[node[0]:node[0] + data['l']]
            for c in node:
                intv = list(tree[c])[0]
                G.node[node]['offsets'][G.graph['path2id'][
                    intv[2]]] = c - intv[0]
        else:
            # non-tuple node: a single coordinate in T
            if 'l' in data:
                G.node[node]['seq'] = T[node:node + data['l']]
            intv = list(tree[node])[0]
            G.node[node]['offsets'][G.graph['path2id'][
                intv[2]]] = node - intv[0]

        if 'aligned' in data:
            if data['aligned'] == 1:
                tot += data['l']
                totn += 1

    print "Aligned", tot, "bases in", totn, "nodes. Nodes total:", G.number_of_nodes(
    ), "Edges total:", G.number_of_edges()

    if args.mumplot:
        plotgraph(G,
                  G.graph['paths'][0],
                  G.graph['paths'][1],
                  interactive=args.interactive)

    # default output prefix: input basenames, each cut at its first '.'
    if args.output == None:
        pref = []
        for f in args.fastas:
            bn = os.path.basename(f)
            if '.' in bn:
                pref.append(bn[:bn.find('.')])
            else:
                pref.append(bn)
        args.output = "_".join(pref)

    #add paths annotation to edges
    for sample in G.graph['paths']:
        sid = G.graph['path2id'][sample]
        sg = []
        for node, data in G.nodes(data=True):
            if sid in data['offsets']:
                sg.append(node)
        # walk the nodes of this path in topological order and tag each
        # consecutive pair's edge with the path id
        subgraph = G.subgraph(sg)
        topsort = list(nx.topological_sort(subgraph))
        pnode = topsort[0]
        for node in topsort[1:]:
            if 'paths' in G[pnode][node]:
                G[pnode][node]['paths'].add(sid)
            else:
                G[pnode][node]['paths'] = {sid}
            pnode = node

    write_gfa(G, T, nometa=args.nometa, outputfile=args.output + '.gfa')
Ejemplo n.º 4
0
def align(aobjs,
          ref=None,
          minlength=20,
          minn=2,
          seedsize=None,
          threads=0,
          targetsample=None,
          maxsamples=None,
          maxmums=10000,
          wpen=1,
          wscore=1,
          sa64=False,
          pcutoff=1e-8,
          gcmodel="sumofpairs",
          maxsize=None,
          trim=True):
    """Align in-memory sequences and return the resulting graph and index.

    ``aobjs`` is an iterable whose ``(name, sequence)`` tuples are indexed
    (upper-cased) and inserted into a fresh ``nx.DiGraph`` between two
    temporary start/end nodes; entries that are not tuples are skipped, as
    are sequences whose index interval is empty. All keyword arguments are
    exposed to ``schemes`` as attributes of ``schemes.args``.

    Returns ``(G, idx)`` after ``idx.align`` and ``prune_nodes`` have run,
    or ``None`` (after logging an error) if the initial graph is not a DAG.

    Side effects: rebinds the module globals ``t`` and ``G`` and sets
    ``schemes.args``, ``schemes.ts`` and ``schemes.G``.
    """
    # Snapshot of the call arguments. This has to stay the first statement:
    # locals() must not yet contain anything but the parameters.
    kwargs = dict(locals())

    class dict2class(object):
        """Expose the keys of a dict as attributes, like an argparse result."""

        def __init__(self, d):
            self.__dict__ = d

    schemes.args = dict2class(kwargs)

    # global variables to simplify callbacks from c extension
    global t, G

    t = IntervalTree()
    idx = reveallib64.index() if sa64 else reveallib.index()

    G = nx.DiGraph()
    G.graph['paths'] = []
    for key in ('path2id', 'id2path', 'id2end'):
        G.graph[key] = dict()

    # temporary source/sink nodes shared by all sequences
    startnode = uuid.uuid4().hex
    G.add_node(startnode)
    endnode = uuid.uuid4().hex
    G.add_node(endnode)

    for aobj in aobjs:
        if not isinstance(aobj, tuple):
            continue  # only (name, sequence) tuples are handled here

        name, seq = aobj
        idx.addsample(name)
        span = idx.addsequence(seq.upper())
        if span[1] - span[0] <= 0:
            continue  # nothing was indexed for this sequence

        node = Interval(span[0], span[1])
        t.add(node)

        # the path id is the position the name gets in G.graph['paths']
        sid = len(G.graph['paths'])
        G.graph['path2id'][name] = sid
        G.graph['id2path'][sid] = name
        G.graph['id2end'][sid] = len(seq)
        G.graph['paths'].append(name)

        G.add_node(node, offsets={sid: 0}, aligned=0)
        G.add_edge(startnode, node, paths={sid}, ofrom='+', oto='+')
        G.add_edge(node, endnode, paths={sid}, ofrom='+', oto='+')

    if not nx.is_directed_acyclic_graph(G):
        logging.error("*** Input is not a DAG! Not supported.")
        return

    schemes.ts = t
    schemes.G = G

    idx.construct()
    idx.align(schemes.graphmumpicker,
              graphalign,
              threads=threads,
              wpen=wpen,
              wscore=wscore,
              minl=minlength,
              minn=minn)

    prune_nodes(G, T=idx.T)

    for temporary in (startnode, endnode):
        G.remove_node(temporary)

    return G, idx
Ejemplo n.º 5
0
def align_genomes(args):
    """Load all ``args.inputfiles`` into one index/graph and align them.

    Files ending in ``.gfa`` are registered as a sample and read with
    ``read_gfa`` (only the first input receives the ``minsamples`` /
    ``maxsamples`` / ``targetsample`` filters); every other file is handed
    to ``read_fasta``. The program exits with status 1 when the index ends
    up with fewer than two samples.

    Returns the tuple ``(G, idx)`` after ``idx.align`` has run.

    Side effects: rebinds the module globals ``t`` and ``G`` and sets
    ``schemes.args``, ``schemes.ts`` and ``schemes.G``.
    """
    logging.info("Loading input...")
    # global variables to simplify callbacks from c extension
    global t, G

    t = IntervalTree()

    make_index = reveallib64.index if args.sa64 else reveallib.index
    idx = make_index(sa=args.sa, lcp=args.lcp, cache=args.cache)

    G = nx.MultiDiGraph()

    schemes.args = args

    graph = False  # becomes True as soon as one input is a gfa graph

    for i, sample in enumerate(args.inputfiles):
        if not sample.endswith(".gfa"):
            # consider it to be a fasta file
            read_fasta(sample,
                       idx,
                       t,
                       G,
                       contigs=args.contigs,
                       toupper=args.toupper)
            continue

        idx.addsample(os.path.basename(sample))
        graph = True

        logging.info("Reading graph: %s ..." % sample)
        if i == 0:
            filters = dict(minsamples=args.minsamples,
                           maxsamples=args.maxsamples,
                           targetsample=args.targetsample)
        else:
            filters = {}
        read_gfa(sample, idx, t, G, remap=True, **filters)

    logging.debug("Graph contains the following paths: %s" % G.graph['paths'])

    logging.debug("Index contains the following samples: %s" % idx.samples)

    if len(idx.samples) <= 1:
        logging.fatal(
            "Specify at least 2 targets to construct alignment. In case of multi-fasta, consider the --nocontigs flag."
        )
        sys.exit(1)

    if not nx.is_directed_acyclic_graph(G):
        logging.info("*** Input is not a DAG! ...")

    # every edge has to carry a 'paths' annotation
    assert all('paths' in data for _, _, data in G.edges(data=True))

    schemes.ts = t
    schemes.G = G

    logging.info("Constructing index...")
    idx.construct()

    logging.info("Done.")

    # the alignment call is the same in both cases; only the message differs
    pairwise = len(args.inputfiles) == 2 and not graph
    if pairwise:
        logging.info("Constructing pairwise-alignment...")
    else:
        logging.info("Constructing graph-based multi-alignment...")

    idx.align(schemes.graphmumpicker,
              graphalign,
              threads=args.threads,
              wpen=args.wpen,
              wscore=args.wscore,
              minl=args.minlength,
              minn=args.minn)

    return G, idx
Ejemplo n.º 6
0
def _n_runs(seq, offset):
    """Locate the runs of 'N' in *seq*.

    Returns two parallel lists: the start position of every run, shifted by
    *offset*, and the length of every run. A run that extends to the end of
    the sequence is included, so both lists always have the same length.
    """
    starts = []
    sizes = []
    run = 0
    for pos, base in enumerate(seq):
        if base == 'N':
            if run == 0:
                starts.append(offset + pos)
            run += 1
        elif run:
            sizes.append(run)
            run = 0
    if run:
        sizes.append(run)
    return starts, sizes


def plot(args):
    """Draw a mum plot for the two fasta files in ``args.fastas``.

    Both files are indexed together; unique matches of at least
    ``args.minlength`` (``idx.getmums``) are drawn as red segments. With
    ``args.rc`` the second file is additionally indexed through ``rc`` and
    those matches are drawn in green. If more than ``args.maxmums`` matches
    are found only the longest ones are drawn. Sequence boundaries are
    drawn as black lines, runs of 'N' as translucent rectangles when
    ``args.showgaps`` is set, and the comma-separated ``start:end`` pairs in
    ``args.xregion`` / ``args.yregion`` as dashed blue lines.

    The figure is shown when ``args.interactive`` is set; otherwise it is
    saved as ``<ref>_<qry>.<args.extension>``, where both stems are the
    input basenames without their last extension.

    Logs a fatal message and returns without plotting unless exactly two
    fasta files are given.

    NOTE(review): gaps are detected on the sequence as read, so lower-case
    'n' is not treated as a gap although the indexed sequence is
    upper-cased -- confirm whether that is intended.
    """
    vertgaps = []
    horzgaps = []
    vertgapsizes = []
    horzgapsizes = []
    ctgoffsets = []
    refoffsets = []
    qrylength = 0
    reflength = 0
    ax = plt.axes()

    if len(args.fastas) != 2:
        logging.fatal("Can only create mumplot for 2 sequences or self plot for 1 sequence.")
        return

    new_index = reveallib64.index if args.sa64 else reveallib.index
    idx = new_index()

    # reference: x-axis
    sample = args.fastas[0]
    idx.addsample(sample)
    refoffset = 0
    for name, seq in fasta_reader(sample):
        # gap starts are shifted by the running offset so that they stay
        # correct for every sequence of a multi-fasta reference
        starts, sizes = _n_runs(seq, refoffset)
        horzgaps.extend(starts)
        horzgapsizes.extend(sizes)
        # +1 for the separator that follows each sequence; using len(seq)
        # rather than the last loop index also covers empty sequences
        refoffset += len(seq) + 1
        reflength += len(seq) + 1
        refoffsets.append(refoffset)
        idx.addsequence(seq.upper())

    # query: y-axis
    sample = args.fastas[1]
    idx.addsample(sample)
    qryoffset = 0
    for name, seq in fasta_reader(sample):
        starts, sizes = _n_runs(seq, qryoffset)
        vertgaps.extend(starts)
        vertgapsizes.extend(sizes)
        qryoffset += len(seq) + 1
        qrylength += len(seq) + 1
        ctgoffsets.append(qryoffset)
        idx.addsequence(seq.upper())

    qrylength = qrylength - 1
    idx.construct()

    print("Extracting mums...")
    # (length, n, [start positions], orientation 0 = forward)
    mmems = [(mem[0], mem[1], [sp for gid, sp in mem[2]], 0)
             for mem in idx.getmums(args.minlength)]

    # position of the separator between reference and query in the index
    sep = idx.nsep[0]

    if args.rc:
        # get mmems for reverse orientation
        idx = new_index()

        sample = args.fastas[0]
        idx.addsample(sample)
        for name, seq in fasta_reader(sample):
            idx.addsequence(seq.upper())

        sample = args.fastas[1]
        idx.addsample(sample)

        qryintvs = []
        for name, seq in fasta_reader(sample):
            qryintvs.append(idx.addsequence(rc(seq.upper())))

        idx.construct()

        print("Extracting RC mums...")
        tmp = idx.getmums(args.minlength)

        # make sure start positions are sorted, then sort by query pos
        tmp = [(m[0], m[1], sorted([sp for gid, sp in m[2]])) for m in tmp]
        tmp.sort(key=lambda m: m[2][1])

        intervals = iter(qryintvs)
        current = next(intervals, None)

        nmmems = []
        for mem in tmp:
            # skip every query interval that ends before this match; a
            # single step is not enough when a contig has no match at all
            while mem[2][1] > current[1]:
                current = next(intervals)
            start, end = current
            # mirror the query start inside its interval
            newqstart = end - (mem[2][1] - start) - mem[0]
            nmmems.append((mem[0], mem[1], (mem[2][0], newqstart), 1))

        mmems += nmmems

        print("done.")

    start = 0
    end = sep
    qend = idx.n

    del idx

    if len(mmems) > args.maxmums:
        logging.info("Too many mums (%d), taking the %d largest." % (len(mmems), args.maxmums))
        mmems.sort(key=lambda mem: mem[0], reverse=True)  # sort by size
        mmems = mmems[:args.maxmums]  # take the n largest

    print("Drawing %d matches." % len(mmems))

    for mem in mmems:
        sps = sorted(mem[2])
        l = mem[0]
        sp1 = sps[0]
        sp2 = sps[1] - (sep + 1)  # query position relative to the query start
        ep1 = sp1 + l
        ep2 = sp2 + l

        if sp1 >= start and ep1 <= end:
            if mem[3] == 0:
                plt.plot([sp1, ep1], [sp2, ep2], 'r-')
            else:
                plt.plot([ep1, sp1], [sp2, ep2], 'g-')

    for p in ctgoffsets:
        plt.axhline(y=p, linewidth=.5, color='black', linestyle='solid')

    for p in refoffsets:
        plt.axvline(x=p, linewidth=.5, color='black', linestyle='solid')

    if args.showgaps:
        for p, l in zip(horzgaps, horzgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (p, 0),  # bottom left
                    l,  # width
                    qrylength,  # height
                    alpha=.1
                )
            )

        for p, l in zip(vertgaps, vertgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (0, p),  # bottom left
                    reflength,  # width
                    l,  # height
                    alpha=.1
                )
            )

    plt.xlim(start, end)
    plt.ylim(0, qend - end)
    plt.title(" vs. ".join(args.fastas))
    plt.xlabel(args.fastas[0])
    plt.ylabel(args.fastas[1])
    plt.autoscale(enable=False)

    if args.xregion is not None:
        for region in args.xregion.split(","):
            rstart, rend = region.split(":")  # should be rectangle with alpha here
            plt.axvline(x=int(rstart), linewidth=3, color='b', linestyle='dashed')
            plt.axvline(x=int(rend), linewidth=3, color='b', linestyle='dashed')

    if args.yregion is not None:
        for region in args.yregion.split(","):
            rstart, rend = region.split(":")  # should be rectangle with alpha here
            plt.axhline(y=int(rstart), linewidth=3, color='b', linestyle='dashed')
            plt.axhline(y=int(rend), linewidth=3, color='b', linestyle='dashed')

    if args.interactive:
        plt.show()
    else:
        b1 = os.path.basename(args.fastas[0])
        b2 = os.path.basename(args.fastas[1])
        # the cut position has to come from the basename itself; an index
        # taken from the full path points past the extension
        fn1 = b1[:b1.rfind('.')] if '.' in b1 else b1
        fn2 = b2[:b2.rfind('.')] if '.' in b2 else b2
        plt.savefig(fn1 + "_" + fn2 + "." + args.extension)
Ejemplo n.º 7
0
def plot(args):
    """Draw a dot plot of the maximal unique matches between two fasta files.

    The first file in ``args.fastas`` is laid out along the x-axis and the
    second one along the y-axis. Forward matches are drawn in red,
    reverse-complement matches (only extracted when ``args.rc`` is set) in
    green. A single ``.bed`` argument is delegated to ``bedplot``.

    Attributes read from ``args``: ``fastas``, ``interactive``, ``sa64``,
    ``minlength``, ``rc``, ``maxmums``, ``endpoints``, ``showgaps``,
    ``xregion``, ``yregion``, ``flanksize`` and ``extension``.

    ``xregion``/``yregion`` are comma separated ``<start>-<end>`` (or
    ``<start>:<end>``) pairs. When both are given in non-interactive mode one
    image is written per (x, y) region pair, widened by the matching entry of
    the comma separated ``flanksize``; a single flank size is applied to every
    region pair.

    The function shows the figure when ``args.interactive`` is set and
    otherwise writes it to the current working directory. It exits the process
    with status 1 on an invalid region or flank specification.
    """
    import matplotlib

    if not args.interactive:
        matplotlib.use('Agg')  # file-only backend, has to be chosen before pyplot is imported

    from matplotlib import pyplot as plt
    from matplotlib import patches as patches

    ax = plt.axes()

    if len(args.fastas) == 1 and args.fastas[0].endswith(".bed"):
        bedplot(args)
        return

    if len(args.fastas) != 2:
        logging.fatal(
            "Can only create mumplot for 2 sequences or self plot for 1 sequence."
        )
        return

    if args.sa64:
        idx = reveallib64.index()
    else:
        idx = reveallib.index()

    reffasta, qryfasta = args.fastas

    # Gap starts and sequence boundaries are relative to the start of their
    # own sample, which is also the coordinate system of the matching axis.
    horzgaps, horzgapsizes, refoffsets = _plot_add_sample(idx, reffasta)
    vertgaps, vertgapsizes, ctgoffsets = _plot_add_sample(idx, qryfasta)

    reflength = refoffsets[-1] if refoffsets else 0
    qrylength = (ctgoffsets[-1] if ctgoffsets else 0) - 1

    idx.construct()

    logging.info("Extracting mums...")
    mmems = idx.getmums(args.minlength)
    logging.info("Done.")

    sep = idx.nsep[0]

    if args.rc:
        # get mums for reverse orientation
        idx.construct(rc=True)

        logging.info("Extracting RC mums...")
        mmems += idx.getmums(args.minlength)
        logging.info("Done.")

    start = 0
    end = sep
    qend = idx.n

    del idx  # the index is not needed for drawing, release it

    if len(mmems) > args.maxmums:
        logging.info("Too many mums (%d), taking the %d largest." %
                     (len(mmems), args.maxmums))
        mmems.sort(key=lambda mem: mem[0], reverse=True)  # sort by size
        mmems = mmems[:args.maxmums]  # take the n largest

    logging.info("Drawing %d matches." % len(mmems))

    # Every match contributes two points followed by None, so that a single
    # plot call draws all matches as disconnected line segments.
    xlist, rcxlist = [], []
    ylist, rcylist = [], []

    for mem in mmems:
        length = mem[0]
        positions = mem[1]

        sp1 = positions[0]
        sp2 = positions[1] - (sep + 1)  # shift into query coordinates
        ep1 = sp1 + length
        ep2 = sp2 + length

        if sp1 < start or ep1 > end:
            continue

        if mem[2] == 0:
            xlist.extend((sp1, ep1, None))
            ylist.extend((sp2, ep2, None))
        else:
            # reverse-complement match: x runs backwards so the segment is
            # drawn as an anti-diagonal
            rcxlist.extend((ep1, sp1, None))
            rcylist.extend((sp2, ep2, None))

    plt.plot(xlist, ylist, 'r-')
    plt.plot(rcxlist, rcylist, 'g-')

    if args.endpoints:
        plt.plot(xlist, ylist, 'b*')
        plt.plot(rcxlist, rcylist, 'y*')

    for p in ctgoffsets:
        plt.axhline(y=p, linewidth=.5, color='black', linestyle='solid')

    for p in refoffsets:
        plt.axvline(x=p, linewidth=.5, color='black', linestyle='solid')

    if args.showgaps:
        for p, size in zip(horzgaps, horzgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (p, 0),  # bottom left
                    size,  # width
                    qrylength,  # height
                    alpha=.1))

        for p, size in zip(vertgaps, vertgapsizes):
            ax.add_patch(
                patches.Rectangle(
                    (0, p),  # bottom left
                    reflength,  # width
                    size,  # height
                    alpha=.1))

    plt.xlim(start, end)
    plt.ylim(0, qend - end)
    plt.title(" vs. ".join(args.fastas))
    # Exactly two inputs are guaranteed at this point.
    plt.xlabel(reffasta)
    plt.ylabel(qryfasta)
    plt.autoscale(enable=False)

    xregions = _plot_parse_regions(args.xregion)
    yregions = _plot_parse_regions(args.yregion)

    for rstart, rend in xregions:
        plt.axvline(x=rstart, linewidth=1, color='b', linestyle='dashed')
        plt.axvline(x=rend, linewidth=1, color='b', linestyle='dashed')

    for rstart, rend in yregions:
        plt.axhline(y=rstart, linewidth=1, color='b', linestyle='dashed')
        plt.axhline(y=rend, linewidth=1, color='b', linestyle='dashed')

    if args.interactive:
        plt.show()
        return

    b1 = os.path.basename(reffasta)
    b2 = os.path.basename(qryfasta)

    fn1 = b1[:b1.rfind('.')] if b1.find('.') != -1 else b1
    fn2 = b2[:b2.rfind('.')] if b2.find('.') != -1 else b2

    if args.xregion is None or args.yregion is None:
        plt.savefig(fn1 + "_" + fn2 + "." + args.extension)
        return

    # Regions are paired up positionally, so the counts have to agree.
    if len(xregions) != len(yregions):
        logging.fatal(
            "Number of x regions (%d) and y regions (%d) differ." %
            (len(xregions), len(yregions)))
        sys.exit(1)

    if args.flanksize is not None:
        try:
            flanksizes = [int(v) for v in args.flanksize.split(",")]
        except ValueError:
            logging.fatal("Invalid flank size specification: %s" %
                          args.flanksize)
            sys.exit(1)
    else:
        flanksizes = [0]

    if len(flanksizes) == 1:
        # a single flank size applies to every region pair
        flanksizes = flanksizes * len(xregions)
    elif len(flanksizes) != len(xregions):
        logging.fatal(
            "Number of flank sizes (%d) does not match number of regions (%d)."
            % (len(flanksizes), len(xregions)))
        sys.exit(1)

    for xregion, yregion, flanksize in zip(xregions, yregions, flanksizes):
        plt.xlim(xregion[0] - flanksize, xregion[1] + flanksize)
        plt.ylim(yregion[0] - flanksize, yregion[1] + flanksize)
        plt.savefig(fn1 + "_" + str(xregion[0]) + "-" + str(xregion[1]) +
                    "_" + fn2 + "_" + str(yregion[0]) + "-" +
                    str(yregion[1]) + "." + args.extension)


def _plot_scan_gaps(seq, offset=0):
    """Return (starts, sizes) of all runs of 'N' in seq, starts shifted by offset."""
    starts = []
    sizes = []
    runstart = None
    for i, c in enumerate(seq):
        if c == 'N':
            if runstart is None:
                runstart = i
        elif runstart is not None:
            starts.append(offset + runstart)
            sizes.append(i - runstart)
            runstart = None
    if runstart is not None:
        # the run extends up to the end of the sequence
        starts.append(offset + runstart)
        sizes.append(len(seq) - runstart)
    return starts, sizes


def _plot_add_sample(idx, fasta):
    """Add all sequences of fasta to idx; return (gapstarts, gapsizes, boundaries)."""
    gapstarts = []
    gapsizes = []
    boundaries = []
    offset = 0
    idx.addsample(fasta)
    for _name, seq in fasta_reader(fasta):
        starts, sizes = _plot_scan_gaps(seq, offset)
        gapstarts.extend(starts)
        gapsizes.extend(sizes)
        # One extra position per sequence, presumably for the separator the
        # index places after it -- TODO confirm against reveallib.
        offset += len(seq) + 1
        boundaries.append(offset)
        idx.addsequence(seq.upper())
    return gapstarts, gapsizes, boundaries


def _plot_parse_regions(spec):
    """Parse '<start>-<end>[,...]' into a list of int pairs; exit(1) if malformed."""
    if spec is None:
        return []
    regions = []
    for region in spec.split(","):
        if region.count("-") == 1:
            bounds = region.split("-")
        elif region.count(":") == 1:
            bounds = region.split(":")
        else:
            bounds = None

        try:
            if bounds is None:
                raise ValueError(region)
            regions.append((int(bounds[0]), int(bounds[1])))
        except ValueError:
            logging.fatal(
                "Invalid region specification, use - : <start>-<end>")
            sys.exit(1)
    return regions
Ejemplo n.º 8
0
def chain_cmd(args):
    """Build an alignment graph for the given fasta files by chaining matches and write it as GFA.

    Every fasta in ``args.fastas`` is added to one index as its own sample and
    has to hold a single sequence; a second record makes the command log an
    error and exit with status 1. Starting from a graph that consists of one
    edge between a start and an end sentinel node, each edge on the work stack
    is replaced by the chain that ``chain`` finds for the sequence between its
    end points. The stretches between consecutive chain nodes are either
    indexed again and pushed on the stack (when ``args.recurse`` is set and at
    least ``args.minn`` samples still have sequence there) or emitted through
    ``outputVariantNodes``.

    Attributes read from ``args``: ``fastas``, ``minn``, ``minlength``,
    ``maxmums``, ``recurse``, ``gcmodel``, ``wpen``, ``wscore``, ``mumplot``,
    ``interactive``, ``output`` and ``nometa``. ``args.output`` is assigned
    when it is None. The graph is written to ``args.output + '.gfa'``.

    NOTE: this function uses Python 2 print statements.
    """
    fastas=args.fastas
    idx=reveallib.index()
    minn=args.minn
    
    # maps the index interval of each sequence to the sample it came from
    tree=IntervalTree()
    
    for fasta in fastas:
        sample=os.path.basename(fasta)
        idx.addsample(sample)
        for i,t in enumerate(fasta_reader(fasta)):
            name,seq=t
            # t is rebound here: from the (name, seq) record to the end of the interval
            f,t=idx.addsequence(seq)
            tree[f:t]=sample
            # the check runs after the second sequence has already been added
            if i==1:
                logging.error("Can't handle multi-fasta input. Use single fasta file per sequence.")
                sys.exit(1)
    
    idx.construct()
    
    G=nx.DiGraph()
    G.graph['paths']=idx.samples
    G.graph['path2id']=dict()
    G.graph['id2path']=dict()
    G.graph['startnodes']=[]
    G.graph['endnodes']=[]

    # number the samples in index order, lookups in both directions
    for sid,sample in enumerate(G.graph['paths']):
        G.graph['path2id'][sample]=sid
        G.graph['id2path'][sid]=sample
    
    k=len(idx.samples) # not used below
    
    # T of the top level index; all node sequences further down are sliced from it
    T=idx.T
    
    # sentinel nodes: one coordinate per sample, just outside each sample's sequence
    istart=tuple([-1]+[sep for sep in idx.nsep]) #no matches possible at these loci
    iend=tuple([sep for sep in idx.nsep]+[idx.n-1]) #loci of sentinels, also no matches possible
    startcoords=tuple([0]+[sep+1 for sep in idx.nsep])
    G.add_node(istart,l=0)
    G.add_node(iend,l=0)
    G.add_edge(istart,iend)
    
    G.graph['startnodes'].append(istart)
    G.graph['endnodes'].append(iend)
    
    idc=range(idx.nsamples)

    # work list of edges that still have to be refined; an entry is
    # (index, sample indices, from node, to node, start coordinates in T, depth, keepedge)
    stack=[(idx,idc,istart,iend,startcoords,0,False)]
    
    while len(stack)!=0:
        idx,idc,p1,p2,startcoords,depth,keepedge=stack.pop()
        # NOTE(review): chain is defined elsewhere; from its use below it returns a
        # subgraph, that subgraph's two end nodes and the list of nodes on the chain
        subg,pp1,pp2,nodepath=chain(idx,startcoords,args.minlength,depth,args.maxmums,recurse=args.recurse,uniq=True,gcmodel=args.gcmodel,wpen=args.wpen,wscore=args.wscore)

        if len(nodepath)==2: #no more chain, output variant sequence
            # sequence lengths within this (sub)index, derived from its separator positions
            localstart=tuple([-1]+[sep for sep in idx.nsep])
            localend=tuple([sep-1 for sep in idx.nsep]+[idx.n-2])
            lengths=tuple([e-s for s,e in zip(localstart,localend)])
            outputVariantNodes(G,T,p1,p2,startcoords,lengths)
            if not keepedge:
                G.remove_edge(p1,p2)
            continue
         
        #replace the edge (start,end) in G with the chain in subg
        insertSubgraph(G,p1,p2,subg,pp1,pp2,keepedge)
        
        # coordpath keeps the coordinates of the chain, nodepath gets the graph
        # nodes p1/p2 substituted for its first and last entry
        coordpath=list(nodepath)
        coordpath[0]=tuple([d+1 for d in nodepath[0]])
        nodepath[0]=p1
        nodepath[-1]=p2

        fromcoord=coordpath[0]
        fromnode=nodepath[0]
        l=0 # length of the node we are coming from, 0 for the start of the chain
        
        #for every edge in subg construct idx and add to stack
        for node,pos in zip(nodepath[1:],coordpath[1:]):
            seq=[]
            idc_=[]
            keepedge=False

            for i in idc:
                f=fromcoord[i]
                t=pos[i]
                assert(f>=0)
                assert(t>=0)
                if f+l<t:
                    # this sample has sequence between the two chain nodes
                    seq.append(T[f+l:t])
                    idc_.append(i)
                elif f+l==t:
                    # the two nodes are adjacent in this sample, so the edge between them stays
                    keepedge=True
                else:
                    print "Error overlapping matches",f,l,t
                    sys.exit(1)
            
            if len(seq)>=minn and args.recurse==True:
                # enough samples left: index the sequences in between and chain them later
                idx=reveallib.index()
                for i,s in enumerate(seq):
                    assert('$' not in s)
                    idx.addsample(str(i))
                    idx.addsequence(s)
                idx.construct()
                
                newoffsets=tuple([fromcoord[i]+l for i in idc_])
                idc_=range(len(newoffsets))
                stack.append((idx, idc_, fromnode, node, newoffsets, depth+1, keepedge))
            else:
                # no further chaining, emit the sequences in between as they are
                varnodes=[fromcoord[i]+l for i in idc_]
                lengths=[pos[i]-(fromcoord[i]+l) for i in idc_]
                outputVariantNodes(G,T,fromnode,node,varnodes,lengths)
                if not keepedge:
                    G.remove_edge(fromnode,node)
            
            fromcoord=pos
            fromnode=node
            
            # the last entry of nodepath is p2, which is not looked up in subg
            if node!=nodepath[-1]:
                l=subg.node[node]['l']
    
    # the sentinels only served as anchors
    G.remove_node(istart)
    G.remove_node(iend)
    
    tot=0 # summed length 'l' of the nodes with aligned==1
    totn=0 # number of nodes with aligned==1
    for node,data in G.nodes(data=True):
        G.node[node]['offsets']=dict()
        
        if isinstance(node,tuple):
            # tuple node: one coordinate per sample; the sequence is taken at the first one
            G.node[node]['seq']=T[node[0]:node[0]+data['l']]
            for c in node:
                # interval that contains c; its data field is the sample name stored above
                intv=list(tree[c])[0]
                G.node[node]['offsets'][G.graph['path2id'][intv[2]]]=c-intv[0]
        else:
            # node keyed by a single coordinate in T
            if 'l' in data:
                G.node[node]['seq']=T[node:node+data['l']]
            intv=list(tree[node])[0]
            G.node[node]['offsets'][G.graph['path2id'][intv[2]]]=node-intv[0]
        
        if 'aligned' in data:
            if data['aligned']==1:
                tot+=data['l']
                totn+=1

    print "Aligned",tot,"bases in",totn,"nodes. Nodes total:",G.number_of_nodes(),"Edges total:",G.number_of_edges()
    
    if args.mumplot:
        plotgraph(G, G.graph['paths'][0], G.graph['paths'][1], interactive=args.interactive)
    
    if args.output==None:
        # default output prefix: the input basenames, each cut at its first dot, joined by '_'
        pref=[]
        for f in args.fastas:
            bn=os.path.basename(f)
            if '.' in bn:
                pref.append(bn[:bn.find('.')])
            else:
                pref.append(bn)
        args.output="_".join(pref)
    
    #add paths annotation to edges
    for sample in G.graph['paths']:
        sid=G.graph['path2id'][sample]
        sg=[]
        for node,data in G.nodes(data=True):
            if sid in data['offsets']:
                sg.append(node)
        subgraph=G.subgraph(sg)
        topsort=list(nx.topological_sort(subgraph))
        # NOTE(review): G[pnode][node] raises KeyError when two consecutive nodes of
        # the topological order are not joined by an edge; this presumably relies on
        # the nodes of one sample forming a single path -- confirm
        pnode=topsort[0]
        for node in topsort[1:]:
            if 'paths' in G[pnode][node]:
                G[pnode][node]['paths'].add(sid)
            else:
                G[pnode][node]['paths']={sid}
            pnode=node

    write_gfa(G,T,nometa=args.nometa,outputfile=args.output+'.gfa')