コード例 #1
0
def unzip(args):
    """Unzip a GFA graph, or one bubble of it, and write the result as GFA.

    Reads ``args.graph[0]`` into a ``networkx.DiGraph``.  When neither
    ``args.source`` nor ``args.sink`` is given the whole graph is handed to
    ``unzip_graph``; otherwise a ``bubbles.Bubble`` is built from the two
    nodes and handed to ``unzip_bubble``.  The graph is then written to
    ``args.output + ".gfa"``, or, when ``args.output`` is not set, next to
    the input with the last extension replaced by ``.unzipped.gfa``.

    Args:
        args: Parsed command-line namespace.  Reads ``graph`` (sequence of
            file names, only the first is used), ``source``, ``sink``,
            ``minunzip`` and ``output``.

    Returns:
        None.  Logs a fatal message and returns early when the input file
        name does not end in ``.gfa`` or ``.gfa.gz``.
    """
    graphfile = args.graph[0]
    if not graphfile.endswith((".gfa", ".gfa.gz")):
        logging.fatal("Invalid gfa file.")
        return

    G = nx.DiGraph()
    utils.read_gfa(graphfile, None, None, G, remap=False)

    if args.source is None and args.sink is None:
        unzip_graph(G, args, minunzip=args.minunzip)
    else:
        b = bubbles.Bubble(G, args.source, args.sink)
        # idoffset is one past the largest integer node id (presumably the
        # first free id for nodes created while unzipping -- confirm against
        # unzip_bubble).  Guard the empty case: max() of an empty list raises.
        intids = [n for n in G.nodes() if isinstance(n, int)]
        idoffset = max(intids) + 1 if intids else 0
        unzip_bubble(G, b, minunzip=args.minunzip, idoffset=idoffset)

    if args.output is None:
        of = os.path.splitext(graphfile)[0] + ".unzipped.gfa"
    else:
        of = args.output + ".gfa"

    utils.write_gfa(G, None, outputfile=of)
コード例 #2
0
def chop_cmd(args):
    """Chop a GFA graph and write the resulting overlap graph.

    Reads ``args.graph[0]``, runs ``chop`` on it with ``k=args.k`` and
    ``extend=args.extend``, then replaces every non-string node's ``seq``
    with ``prefix + seq + suffix`` and writes the graph as GFA.  Optionally
    also writes the node sequences as FASTA and validates the result.

    Output names are ``args.output + ".gfa"`` / ``".fasta"``, or, when
    ``args.output`` is not set, the input name with its extension replaced
    by ``.chopped.gfa`` / ``.chopped.fasta``.

    Args:
        args: Parsed command-line namespace.  Reads ``graph``, ``output``,
            ``k``, ``extend``, ``fasta`` (write FASTA too), ``lw`` (FASTA
            line width) and ``check`` (validate k-mer coverage).

    Returns:
        None.  Logs a fatal message and returns early when the input file
        name does not end in ``.gfa``.

    Raises:
        SystemExit: With status 1 when ``args.check`` is set and a k-length
            substring of an original path is missing from the chopped nodes.
    """
    graphfile = args.graph[0]
    if not graphfile.endswith(".gfa"):
        logging.fatal("Invalid gfa file.")
        return

    G = nx.DiGraph()
    utils.read_gfa(graphfile, None, None, G, remap=False)

    if args.output is None:
        base = os.path.splitext(graphfile)[0]
        fof = base + ".chopped.fasta"
        gof = base + ".chopped.gfa"
    else:
        fof = args.output + ".fasta"
        gof = args.output + ".gfa"

    if args.check:
        # Keep an untouched copy: G is handed to chop() and rewritten below,
        # while the validation step needs the original paths and sequences.
        Gorg = G.copy()

    chop(G, k=args.k, extend=args.extend)

    logging.debug("Merging node sequence...")
    for node in G.nodes():
        if isinstance(node, str):  # skip start/end nodes
            continue
        data = G.node[node]
        data['seq'] = data['prefix'] + data['seq'] + data['suffix']
    logging.debug("Done.")

    logging.debug("Write overlap graph...")
    utils.write_gfa(G, None, outputfile=gof, remap=False)
    logging.debug("Done.")

    if args.fasta:
        logging.debug("Write corresponding fasta file...")
        with open(fof, 'w') as ff:
            for node in G.nodes():
                if isinstance(node, str):  # skip start/end nodes
                    continue
                seq = G.node[node]['seq']
                ff.write(">" + str(node) + "\n")
                # Step through the sequence in line-width chunks; this avoids
                # the float that len(seq)/lw yields under true division.
                for start in range(0, len(seq), args.lw):
                    ff.write(seq[start:start + args.lw] + "\n")
        logging.debug("Done.")

    if args.check:
        logging.debug("Validate if all k-mers from the original graph are contained in overlap graph...")
        import extract
        # '$' separates node sequences so a k-mer cannot match across nodes.
        r = "$".join([G.node[node]['seq'] for node in G])
        for path in Gorg.graph['paths']:
            logging.debug("Check: %s" % path)
            s = extract.extract(Gorg, path)
            # +1 so the final k-length substring of the path is checked too.
            for i in range(len(s) - args.k + 1):
                kmer = s[i:i + args.k]
                if kmer not in r:
                    logging.error("Flat representation does not cover all k-length substrings for %s, could not find: %s!"%(path,kmer))
                    sys.exit(1)
        logging.debug("Done.")
コード例 #3
0
def comp_cmd(args):
    """Run ``comp`` on a GFA graph and write the result to ``<name>.rc.gfa``.

    Reads ``args.graph[0]`` into a ``networkx.DiGraph``, replaces it with
    the graph returned by ``comp`` and writes that graph with metadata
    enabled (``nometa=False``).

    Args:
        args: Parsed command-line namespace; only ``graph[0]`` is read.

    Returns:
        None.
    """
    infile = args.graph[0]

    g = nx.DiGraph()
    g.graph['paths'] = []
    utils.read_gfa(infile, None, None, g, targetsample=None)
    g = comp(g)

    # Rewrite only a trailing ".gfa".  str.replace would change every ".gfa"
    # in the path (directories included) and would leave the name unchanged
    # when the input has no ".gfa" at all.
    ext = '.gfa'
    stem = infile[:-len(ext)] if infile.endswith(ext) else infile
    utils.write_gfa(g, "", outputfile=stem + '.rc.gfa', nometa=False)
コード例 #4
0
ファイル: chop.py プロジェクト: jasperlinthorst/reveal
def chop_cmd(args):
    """Chop a GFA graph and write the resulting overlap graph.

    Reads ``args.graph[0]``, runs ``chop`` on it with ``k=args.k`` and
    ``extend=args.extend``, then replaces every node's ``seq`` with
    ``prefix + seq + suffix`` and writes the graph as GFA.  Optionally also
    writes the node sequences as FASTA and validates the result.

    Output names are ``args.output + ".gfa"`` / ``".fasta"``, or, when
    ``args.output`` is not set, the input name with its extension replaced
    by ``.chopped.gfa`` / ``.chopped.fasta``.

    Args:
        args: Parsed command-line namespace.  Reads ``graph``, ``output``,
            ``k``, ``extend``, ``fasta`` (write FASTA too), ``lw`` (FASTA
            line width) and ``check`` (validate k-mer coverage).

    Returns:
        None.  Logs a fatal message and returns early when the input file
        name does not end in ``.gfa``.

    Raises:
        SystemExit: With status 1 when ``args.check`` is set and a k-length
            substring of an original path is missing from the chopped nodes.
    """
    source = args.graph[0]
    if not source.endswith(".gfa"):
        logging.fatal("Invalid gfa file.")
        return

    G = nx.DiGraph()
    utils.read_gfa(source, None, None, G, remap=False)

    if args.output is None:
        stem = os.path.splitext(source)[0]
        fof, gof = stem + ".chopped.fasta", stem + ".chopped.gfa"
    else:
        fof, gof = args.output + ".fasta", args.output + ".gfa"

    if args.check:
        # Snapshot before chop(): validation compares against the original.
        Gorg = G.copy()

    chop(G, k=args.k, extend=args.extend)

    logging.debug("Merging node sequence...")
    for node in G.nodes():
        attrs = G.node[node]
        attrs['seq'] = attrs['prefix'] + attrs['seq'] + attrs['suffix']
    logging.debug("Done.")

    logging.debug("Write overlap graph...")
    utils.write_gfa(G, None, outputfile=gof, remap=False)
    logging.debug("Done.")

    if args.fasta:
        logging.debug("Write corresponding fasta file...")
        width = args.lw
        with open(fof, 'w') as ff:
            for node in G.nodes():
                seq = G.node[node]['seq']
                ff.write(">" + str(node) + "\n")
                # Chunked slicing instead of len(seq)/width, which becomes a
                # float (and breaks range) under true division.
                for offset in range(0, len(seq), width):
                    ff.write(seq[offset:offset + width] + "\n")
        logging.debug("Done.")

    if args.check:
        logging.debug("Validate if all k-mers from the original graph are contained in overlap graph...")
        import extract
        k = args.k
        # '$' separates node sequences so a k-mer cannot match across nodes.
        r = "$".join([G.node[node]['seq'] for node in G])
        for path in Gorg.graph['paths']:
            logging.debug("Check: %s" % path)
            s = extract.extract(Gorg, path)
            # Window starts run through len(s)-k inclusive, so the last
            # k-length substring is validated as well.
            for i in range(len(s) - k + 1):
                if r.find(s[i:i + k]) == -1:
                    logging.error("Flat representation does not cover all k-length substrings for %s, could not find: %s!"%(path,s[i:i+k]))
                    sys.exit(1)
        logging.debug("Done.")
コード例 #5
0
ファイル: unzip.py プロジェクト: jasperlinthorst/reveal
def unzip(args):
    """Unzip a GFA graph, or one bubble of it, and write the result as GFA.

    Reads ``args.graph[0]`` into a ``networkx.MultiDiGraph``.  When neither
    ``args.source`` nor ``args.sink`` is given the whole graph is handed to
    ``unzip_graph``; otherwise a ``bubbles.Bubble`` is built from the two
    nodes and handed to ``unzip_bubble``.  The graph is then written to
    ``args.output + ".gfa"``, or, when ``args.output`` is not set, next to
    the input as ``<name>.unzipped.gfa``.

    Args:
        args: Parsed command-line namespace.  Reads ``graph`` (only the
            first entry), ``source``, ``sink``, ``minunzip`` and ``output``.

    Returns:
        None.  Logs a fatal message and returns early when the input file
        name does not end in ``.gfa``.
    """
    gfafile = args.graph[0]
    if not gfafile.endswith(".gfa"):
        logging.fatal("Invalid gfa file.")
        return

    G = nx.MultiDiGraph()
    utils.read_gfa(gfafile, None, None, G, remap=False)

    if args.source is None and args.sink is None:
        unzip_graph(G, minunzip=args.minunzip)
    else:
        b = bubbles.Bubble(G, args.source, args.sink)
        # One past the largest integer node id; falls back to 0 when the
        # graph holds no integer ids, where max() would otherwise raise.
        numeric = [n for n in G.nodes() if isinstance(n, int)]
        offset = max(numeric) + 1 if numeric else 0
        unzip_bubble(G, b, minunzip=args.minunzip, idoffset=offset)

    if args.output is None:
        of = os.path.splitext(gfafile)[0] + ".unzipped.gfa"
    else:
        of = args.output + ".gfa"

    utils.write_gfa(G, None, outputfile=of)
コード例 #6
0
ファイル: convert.py プロジェクト: jasperlinthorst/reveal
def convert(args):
    """Convert each file in ``args.graphs`` to another graph format.

    * ``.gfa`` input is read and, depending on ``args.type``, rewritten as
      gfa (``.rewrite.gfa``), written as gml, or converted to maf.
    * ``.fa`` / ``.fasta`` / ``.fna`` input becomes a gfa graph with one node
      per sequence; every sequence is registered under the file's base name
      with sample id 0.
    * Any other extension logs a fatal message and stops processing.

    Args:
        args: Parsed command-line namespace.  Reads ``graphs``, ``nocycles``,
            ``type``, ``minsamples``, ``maxsamples``, ``targetsample``,
            ``hwm`` and ``partition``.
    """
    fasta_suffixes = (".fa", ".fasta", ".fna")

    for graph in args.graphs:
        # Plain digraph when cycles are excluded, multigraph otherwise.
        g = nx.DiGraph() if args.nocycles else nx.MultiDiGraph()
        g.graph['paths'] = []
        g.graph['path2id'] = {}
        g.graph['id2path'] = {}

        if graph.endswith(".gfa"):  # gfa to gml/gfa/maf
            utils.read_gfa(graph, None, None, g,
                           minsamples=args.minsamples,
                           maxsamples=args.maxsamples,
                           targetsample=args.targetsample)
            target = args.type
            if target == "gfa":
                fn = graph.replace(".gfa", ".rewrite.gfa")
                utils.write_gfa(g, "", outputfile=fn)
                logging.info("gfa graph written to: %s" % fn)
            elif target == "gml":
                fn = utils.write_gml(g, "",
                                     hwm=args.hwm,
                                     outputfile=graph.replace(".gfa", ""),
                                     partition=args.partition)
                logging.info("gml graph written to: %s" % fn)
            elif target == "maf":
                logging.info("Converting graph to maf..")
                graph2maf(g, graph.replace(".gfa", ".maf"))
            continue

        if graph.endswith(fasta_suffixes):  # assume fasta to gfa
            sample = os.path.basename(graph)
            for nodeid, (_name, seq) in enumerate(utils.fasta_reader(graph)):
                g.graph['paths'].append(sample)
                g.graph['path2id'][sample] = 0
                g.graph['id2path'][0] = sample
                g.add_node(nodeid, offsets={0: 0}, seq=seq)
            filename = graph.rsplit(".", 1)[0] + ".gfa"
            utils.write_gfa(g, "", outputfile=filename)
            logging.info("gfa graph written to: %s" % filename)
            continue

        logging.fatal("Unknown filetype, need gfa or fasta extension.")
        return
コード例 #7
0
ファイル: comp.py プロジェクト: jasperlinthorst/reveal
def comp_cmd(args):
    """Run ``comp`` on a GFA graph and write the result to ``<name>.rc.gfa``.

    Reads ``args.graph[0]`` into a ``networkx.DiGraph``, passes it through
    ``comp`` and writes the returned graph with ``nometa=False``.

    Args:
        args: Parsed command-line namespace; only ``graph[0]`` is read.

    Returns:
        None.
    """
    path = args.graph[0]

    g = nx.DiGraph()
    g.graph['paths'] = []
    utils.read_gfa(path, None, None, g, targetsample=None)
    g = comp(g)

    # Strip just the final ".gfa" before appending ".rc.gfa": str.replace
    # touches every occurrence in the path and is a no-op when none exists,
    # which would make the output name equal to the input name.
    if path.endswith('.gfa'):
        path = path[:-len('.gfa')]
    utils.write_gfa(g, "", outputfile=path + '.rc.gfa', nometa=False)
コード例 #8
0
def convert(args):
    """Convert each file in ``args.graphs`` to another graph format.

    * ``.gfa`` input is read and, depending on ``args.type``, rewritten as
      gfa (``.rewrite.gfa``), written as gml, or converted to maf.
    * ``.maf`` input is turned into a graph and written as both gml and gfa.
    * ``.fa`` / ``.fasta`` / ``.fna`` input becomes a gfa graph: built by
      ``utils.aln2graph`` when ``args.aligned`` is set, otherwise with one
      node per sequence linked between a shared start and end node.
    * Any other extension logs a fatal message and stops processing.

    Args:
        args: Parsed command-line namespace.  Reads ``graphs``, ``nocycles``,
            ``type``, ``minsamples``, ``maxsamples``, ``targetsample``,
            ``hwm``, ``partition`` and ``aligned``.
    """
    fasta_suffixes = (".fa", ".fasta", ".fna")

    for graph in args.graphs:
        g = nx.DiGraph() if args.nocycles else nx.MultiDiGraph()
        g.graph['paths'] = []
        g.graph['path2id'] = {}
        g.graph['id2path'] = {}

        if graph.endswith(".gfa"):  # gfa to gml/gfa/maf
            utils.read_gfa(graph, None, None, g,
                           minsamples=args.minsamples,
                           maxsamples=args.maxsamples,
                           targetsample=args.targetsample,
                           remap=False)
            kind = args.type
            if kind == "gfa":
                fn = graph.replace(".gfa", ".rewrite.gfa")
                utils.write_gfa(g, "", outputfile=fn)
                logging.info("gfa graph written to: %s" % fn)
            elif kind == "gml":
                fn = utils.write_gml(g, "",
                                     hwm=args.hwm,
                                     outputfile=graph.replace(".gfa", ""),
                                     partition=args.partition)
                logging.info("gml graph written to: %s" % fn)
            elif kind == "maf":
                logging.info("Converting graph to maf..")
                graph2maf(g, graph.replace(".gfa", ".maf"))
            continue

        if graph.endswith(".maf"):  # multiple alignment format -> graph
            g = maf2graph(graph)
            stem = graph[:graph.rfind(".")]
            utils.write_gml(g, "", outputfile=stem + ".gml")

            filename = stem + ".gfa"
            utils.write_gfa(g, "", outputfile=filename)
            logging.debug("gfa graph written to: %s" % filename)
            continue

        if not graph.endswith(fasta_suffixes):
            logging.fatal("Unknown filetype, need gfa or fasta extension.")
            return

        # Remaining case: fasta input, assume fasta to gfa.
        if args.aligned:
            names = []
            seqs = []
            for name, seq in utils.fasta_reader(graph, keepdash=True):
                seqs.append(seq)
                names.append(name)
            g, nid = utils.aln2graph(seqs, names)
        else:
            # Random hex ids for the shared start/end nodes.
            start = uuid.uuid4().hex
            end = uuid.uuid4().hex
            g.graph['startnodes'] = [start]
            g.graph['endnodes'] = [end]
            g.add_node(start, offsets={})
            g.add_node(end, offsets={})
            for sid, (name, seq) in enumerate(utils.fasta_reader(graph)):
                g.graph['paths'].append(name)
                g.graph['path2id'][name] = sid
                g.graph['id2path'][sid] = name
                g.node[start]['offsets'][sid] = 0
                g.node[end]['offsets'][sid] = len(seq)
                g.add_node(sid, offsets={sid: 0}, seq=seq)
                g.add_edge(start, sid, paths={sid})
                g.add_edge(sid, end, paths={sid})

        filename = graph[:graph.rfind(".")] + ".gfa"
        utils.write_gfa(g, "", outputfile=filename)
        logging.debug("gfa graph written to: %s" % filename)