Exemple #1
0
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph. Mutual
    best edges are represented as thicker edges.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; implementation notes live in comments instead.
    #
    # `args` is the list of command-line tokens for this sub-command. Nothing
    # is returned: the resulting graph is written to `best[.<query>].gexf` in
    # the current working directory.
    p = OptionParser(graph.__doc__)
    p.add_option(
        "--query",
        default=-1,
        type="int",
        help="Search from node, -1 to select random node, 0 to disable",
    )
    p.add_option("--contig", help="Search from contigs, use comma to separate")
    p.add_option(
        "--largest",
        default=0,
        type="int",
        help="Only show largest components",
    )
    p.add_option("--maxsize", default=500, type="int", help="Max graph size")
    p.add_option(
        "--nomutualbest",
        default=False,
        action="store_true",
        help="Do not plot mutual best edges as heavy",
    )
    # Presumably registers --maxerr and --frgctg, which are read below.
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bestedges,) = args
    query = opts.query
    contig = opts.contig
    largest = opts.largest
    frgctg = opts.frgctg
    edgeweight = not opts.nomutualbest
    G = read_graph(bestedges, maxerr=opts.maxerr)

    if largest:
        # Rank the components by size so that the *largest* ones are kept;
        # connected_components() gives no ordering guarantee. This also avoids
        # connected_component_subgraphs(), which newer networkx no longer has.
        components = sorted(nx.connected_components(G), key=len, reverse=True)
        c = min(len(components), largest)
        logging.debug(
            "{0} components found, {1} retained".format(len(components), c)
        )
        kept = nx.Graph()
        for members in components[:c]:
            # data=True keeps the edge attributes so that the edge weights
            # are still available to the neighborhood extraction below.
            kept.add_edges_from(G.subgraph(members).edges(data=True))
        G = kept

    if query:
        if query == -1:
            # Materialize the nodes: newer networkx returns a view that is
            # indexed by node key, which random.choice() cannot sample from.
            query = choice(list(G.nodes()))
        reads_to_ctgs = parse_ctgs(bestedges, frgctg)
        if contig:
            contigs = set(contig.split(","))
            core = [k for k, v in reads_to_ctgs.items() if v in contigs]
        else:
            ctg = reads_to_ctgs.get(query)
            core = [k for k, v in reads_to_ctgs.items() if v == ctg]
            logging.debug(
                "Reads ({0}) extended from the same contig {1}".format(
                    len(core), ctg
                )
            )

        # Extract a local neighborhood
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges(data=edgeweight))
        G = SG

        # Label every node with its contig ("na" when unknown) and report
        # how many nodes each contig contributes.
        seen = []
        for n, attrib in G.nodes(data=True):
            label = reads_to_ctgs.get(n, "na")
            attrib["label"] = label
            seen.append(label)
        c = Counter(seen)
        cc = ["{0}({1})".format(k, v) for k, v in c.most_common()]
        print("Contigs: {0}".format(" ".join(cc)), file=sys.stderr)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug(
        "Graph written to `{0}` (|V|={1}, |E|={2})".format(
            gexf, len(G), G.size()
        )
    )
Exemple #2
0
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph. Mutual
    best edges are represented as thicker edges.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring above doubles as the OptionParser usage text and is
    # therefore left untouched; explanations are given as comments.
    #
    # `args` holds the command-line tokens for this sub-command. The function
    # has no return value; its output is the file `best[.<query>].gexf`.
    p = OptionParser(graph.__doc__)
    p.add_option("--query", default=-1, type="int", help="Search from node, -1 to select random node, 0 to disable")
    p.add_option("--contig", help="Search from contigs, use comma to separate")
    p.add_option("--largest", default=0, type="int", help="Only show largest components")
    p.add_option("--maxsize", default=500, type="int", help="Max graph size")
    p.add_option("--nomutualbest", default=False, action="store_true", help="Do not plot mutual best edges as heavy")
    # Presumably adds the --maxerr and --frgctg options consumed below.
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    query = opts.query
    contig = opts.contig
    largest = opts.largest
    frgctg = opts.frgctg
    edgeweight = not opts.nomutualbest
    G = read_graph(bestedges, maxerr=opts.maxerr)

    if largest:
        # Sort by size first: connected_components() does not promise any
        # order, so slicing the raw result would keep arbitrary components.
        # Building subgraphs from node sets also works on networkx releases
        # that dropped connected_component_subgraphs().
        node_sets = sorted(nx.connected_components(G), key=len, reverse=True)
        c = min(len(node_sets), largest)
        logging.debug("{0} components found, {1} retained".format(len(node_sets), c))
        trimmed = nx.Graph()
        for node_set in node_sets[:c]:
            # Carry the edge attributes over so weights are not lost.
            trimmed.add_edges_from(G.subgraph(node_set).edges(data=True))
        G = trimmed

    if query:
        if query == -1:
            # list(...) is required on newer networkx, where G.nodes() is a
            # key-indexed view rather than a sequence.
            query = choice(list(G.nodes()))
        reads_to_ctgs = parse_ctgs(bestedges, frgctg)
        if contig:
            contigs = set(contig.split(","))
            core = [k for k, v in reads_to_ctgs.items() if v in contigs]
        else:
            ctg = reads_to_ctgs.get(query)
            core = [k for k, v in reads_to_ctgs.items() if v == ctg]
            logging.debug("Reads ({0}) extended from the same contig {1}".format(len(core), ctg))

        # Extract a local neighborhood
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges(data=edgeweight))
        G = SG

        # Tag each node with its contig name ("na" if it has none) and
        # summarize the contig composition of the extracted neighborhood.
        seen = []
        for n, attrib in G.nodes(data=True):
            label = reads_to_ctgs.get(n, "na")
            attrib["label"] = label
            seen.append(label)
        c = Counter(seen)
        cc = ["{0}({1})".format(k, v) for k, v in c.most_common()]
        # sys.stderr.write() behaves the same on Python 2 and Python 3,
        # unlike the `print >>` statement.
        sys.stderr.write("Contigs: {0}\n".format(" ".join(cc)))

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})".format(gexf, len(G), G.size()))
Exemple #3
0
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring is used as the OptionParser usage text, so it is
    # kept as is. `args` is the list of command-line tokens; the function
    # returns nothing and writes `best[.<query>].gexf` plus a pickled cache
    # of the parsed graph next to the input file.
    import os.path
    import pickle

    import networkx as nx
    from jcvi.algorithms.graph import graph_stats, graph_local_neighborhood

    p = OptionParser(graph.__doc__)
    p.add_option("--maxerr", default=100, type="int", help="Maximum error rate")
    p.add_option("--query", default=-1, type="int", help="Search from node")
    p.add_option("--largest", default=1, type="int", help="Only show largest components")
    p.add_option("--maxsize", default=100, type="int", help="Max graph size")
    p.add_option("--contigs", help="Annotate graph with contig membership, "
                    " typically from `asm.posmap.frgctg`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    maxerr = opts.maxerr
    query = opts.query
    largest = opts.largest
    logging.debug("Max error = {0}%".format(maxerr))

    # Derive the cache name from the file name only, so that dots in the
    # directory part (e.g. "./best.edges") do not truncate the path.
    dirname, basename = os.path.split(bestedges)
    bestgraph = os.path.join(
        dirname, basename.split(".")[0] + ".err{0}.graph".format(maxerr)
    )
    if need_update(bestedges, bestgraph):
        G = nx.Graph()
        with open(bestedges) as fp:
            for row in fp:
                # Skip comment lines and blank lines.
                if row.startswith("#") or not row.strip():
                    continue
                id1, _lib_id, best5, _o1, best3, _o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                if j1 < maxerr or j2 < maxerr:
                    G.add_node(id1)
                # The ids are ints at this point, so compare against the int
                # 0; comparing with the string '0' is always true and would
                # add edges to a bogus node 0.
                if best5 != 0 and j1 < maxerr:
                    G.add_edge(best5, id1)
                if best3 != 0 and j2 < maxerr:
                    G.add_edge(id1, best3)
        # Plain pickle is used because the gpickle helpers are not available
        # in newer networkx releases.
        with open(bestgraph, "wb") as fw:
            pickle.dump(G, fw, pickle.HIGHEST_PROTOCOL)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    # SECURITY NOTE(review): unpickling executes arbitrary code; the cache
    # file must only ever come from this tool, never from an untrusted source.
    with open(bestgraph, "rb") as fr:
        G = pickle.load(fr)
    graph_stats(G)

    # Large graphs are reduced to a neighborhood around the query node.
    if len(G) > 10000:
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=query,
                                     maxsize=opts.maxsize)
        SG.add_edges_from(H.edges())
        G = SG

    if largest > 1:  # only works for un-directed graph
        # connected_components() may be a generator and is not ordered, so
        # materialize it and rank by size before keeping the top entries.
        components = sorted(nx.connected_components(G), key=len, reverse=True)
        c = min(len(components), largest)
        logging.debug(
            "{0} components found, {1} retained".format(len(components), c)
        )

        kept = nx.Graph()
        for members in components[:c]:
            kept.add_edges_from(G.subgraph(members).edges())
        G = kept

    if opts.contigs:
        reads_to_ctgs = parse_ctgs(bestedges, opts.contigs)
        annotate_contigs(G, reads_to_ctgs)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug(
        "Graph written to `{0}` (|V|={1}, |E|={2})".format(
            gexf, len(G), G.size()
        )
    )