def merge(args):
    """
    %prog merge graphs

    Merge multiple graphs together and visualize.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept terse; implementation notes live in comments instead.
    #
    # Each positional argument is a graph file. Graph i is read into a shared
    # BiGraph using color i of --colorlist, then the result is drawn to
    # `merged.png`.
    p = OptionParser(merge.__doc__)
    p.add_option(
        "--colorlist",
        default="black,red,pink,blue,green",
        help="The color palette",
    )
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    colorlist = opts.colorlist.split(",")
    # Validate explicitly rather than with `assert`: assertions are stripped
    # under `python -O`, in which case zip() below would silently drop every
    # graph beyond the end of the palette.
    if len(colorlist) < len(args):
        sys.exit("Need more colors in --colorlist")

    g = BiGraph()
    for graphfile, color in zip(args, colorlist):
        g.read(graphfile, color=color)

    g.draw("merged.png")
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # Split the positional arguments by file extension; anything that is
    # neither a .bed nor an .anchors file is ignored.
    bedfiles, anchorfiles = [], []
    for name in args:
        if name.endswith(".bed"):
            bedfiles.append(name)
        elif name.endswith(".anchors"):
            anchorfiles.append(name)

    # TODO: Use Markov clustering to sparsify the edges
    # Every anchor pair joins its two members into the same family.
    families = Grouper()
    for anchorfile in anchorfiles:
        for a, b, _block_id in AnchorFile(anchorfile).iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug(
        "Total families: {}, Gene members: {}".format(len(families), len(allowed))
    )

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    graph = BiGraph()
    for bedfile in bedfiles:
        # Only genes that belong to some family are loaded from the BED file.
        print_edges(graph, Bed(bedfile, include=allowed), families)
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    parser = OptionParser(fromblast.__doc__)
    parser.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path",
    )
    parser.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance",
    )
    parser.set_verbose(help="Print verbose reports to stdout")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, subjectfasta = args

    def orientation_tag(hit):
        # ">" for a "+" hit, "<" for anything else.
        return ">" if hit.orientation == "+" else "<"

    # groupby() below needs the hits of one query to be adjacent.
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)

    graph = BiGraph()
    for _query, hits in groupby(blast, key=lambda x: x.query):
        hits = list(hits)
        # --clique links every pair of hits; the default links neighbours only.
        if opts.clique:
            candidate_pairs = combinations(hits, 2)
        else:
            candidate_pairs = pairwise(hits)

        for a, b in candidate_pairs:
            if a.subject == b.subject:
                continue

            dist, _ = range_distance(
                (a.query, a.qstart, a.qstop, "+"),
                (b.query, b.qstart, b.qstop, "+"),
                distmode="ee",
            )
            if dist > opts.maxdist:
                continue

            graph.add_edge(
                a.subject, b.subject, orientation_tag(a), orientation_tag(b)
            )

    graph_to_agp(graph, blastfile, subjectfasta, verbose=opts.verbose)
def graph(self):
    """Build and return a BiGraph from the scaffolds of this object.

    For each scaffold yielded by `self.iter_scaffold()`, the tig names are
    recorded in `self.scf[scaffold]`, and consecutive lines are linked by an
    edge that carries their orientations and the gap of the first line as
    its length. A scaffold with a single line contributes a lone node.
    """
    bigraph = BiGraph()
    for scaffold, members in self.iter_scaffold():
        self.scf[scaffold] = [member.tig for member in members]

        for left, right in pairwise(members):
            bigraph.add_edge(
                left.tig, right.tig, left.o, right.o, length=left.gaps
            )

        if len(members) == 1:  # Singleton scaffold
            bigraph.add_node(members[0].tig)

    return bigraph
def __init__(self, filename):
    """Load overlap lines from `filename` and build the best-edge graph.

    Every row is parsed as an OVLLine and appended to self. Tigs tagged as
    contained are collected in `self.contained`. Dovetail overlaps
    ("a->b" / "b->a") are bucketed per end of tig `a`, the highest-scoring
    overlap of each bucket is flagged `best`, and `self.graph` receives one
    edge per best overlap whose two tigs are both uncontained.
    """
    super(OVL, self).__init__(filename)

    contained = set()
    alledges = defaultdict(list)
    fp = must_open(filename)
    for row in fp:
        o = OVLLine(row)
        self.append(o)

        tag = o.tag
        if tag == "a in b":
            contained.add(o.a)
        elif tag == "b in a":
            contained.add(o.b)
        elif tag == "a->b":
            alledges[o.a + "-3`"].append(o)
        elif tag == "b->a":
            # Still keyed on tig `a`, but on its other end.
            alledges[o.a + "-5`"].append(o)

    logging.debug(
        "Imported {} links. Contained tigs: {}".format(len(self), len(contained))
    )
    self.contained = contained

    logging.debug("Pruning edges to keep the mutual best")
    for candidates in alledges.values():
        max(candidates, key=lambda x: x.score).best = True

    self.graph = BiGraph()
    for o in self:
        if not o.best:
            continue

        # Orient the pair so that the edge always leaves the first tig.
        if o.tag == "a->b":
            a, b = o.a, o.b
        elif o.tag == "b->a":
            a, b = o.b, o.a

        if not (a in contained or b in contained):
            bstrand = "<" if o.bstrand == "-" else ">"
            self.graph.add_edge(a, b, ">", bstrand, length=o.score)
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(connect.__doc__)
    parser.add_option(
        "--clip",
        default=2000,
        type="int",
        help="Only consider end of contigs",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile, blastfile = args
    clip = opts.clip

    sizes = Sizes(fastafile).mapping

    # Keep only the hits that reach into the first or last `clip` bases of
    # their subject; hits lying strictly in the interior are dropped.
    end_hits = []
    for hit in Blast(blastfile):
        size = sizes[hit.subject]
        left_limit, right_limit = min(size, clip), max(0, size - clip)
        if hit.sstart <= left_limit or hit.sstop >= right_limit:
            end_hits.append(hit)

    def by_query(hit):
        return hit.query

    # groupby() needs the hits of one query to be adjacent.
    end_hits.sort(key=by_query)

    graph = BiGraph()
    for _query, group in groupby(end_hits, key=by_query):
        group = sorted(group, key=lambda x: x.qstart)
        if len(set(x.subject for x in group)) == 1:
            # A query that touches a single subject cannot link two contigs.
            continue

        print("\n".join(str(x) for x in group))
        for a, b in pairwise(group):
            if a.subject == b.subject:
                continue

            alen = a.qstop - a.qstart + 1
            blen = b.qstop - b.qstart + 1
            ov = range_intersect((a.qstart, a.qstop), (b.qstart, b.qstop))
            if ov:
                # Turn the intersection interval into its length.
                ostart, ostop = ov
                ov = ostop - ostart + 1

            print(ov, alen, blen)
            if ov and (ov > alen / 2 or ov > blen / 2):
                print("Too much overlap ({0})".format(ov))
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            graph.add_edge(a.subject, b.subject, atag, btag)

    graph_to_agp(graph, blastfile, fastafile, verbose=False)
def partition(args):
    """
    %prog partition happy.txt synteny.graph

    Select edges from another graph and merge it with the certain edges built
    from the HAPPY mapping data.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # For every line of the HAPPY file a small graph is built from that
    # line's edges, augmented with the edges of the synteny graph whose two
    # endpoints both occur on the line, printed to stderr and drawn to
    # `A<NN>.<format>` (NN is the 1-based line number).
    allowed_format = ("png", "ps")
    p = OptionParser(partition.__doc__)
    p.add_option("--prefix", help="Add prefix to the name")
    p.add_option(
        "--namestart",
        default=0,
        type="int",
        help="Use a shorter name, starting index",
    )
    p.add_option(
        "--format",
        default="png",
        choices=allowed_format,
        help="Generate image of format",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    happyfile, graphfile = args
    bg = BiGraph()
    bg.read(graphfile, color="red")
    prefix = opts.prefix

    # Use a context manager so the HAPPY file handle is closed even when
    # parsing or drawing fails part-way through.
    with open(happyfile) as fp:
        for i, row in enumerate(fp):
            nns = happy_nodes(row, prefix=prefix)
            nodes = set(nns)
            edges = happy_edges(row, prefix=prefix)

            small_graph = BiGraph()
            for (a, b, oa, ob), is_uncertain in edges:
                color = "gray" if is_uncertain else "black"
                small_graph.add_edge(a, b, oa, ob, color=color)

            for (u, v), e in bg.edges.items():
                # Grab edge if both vertices are on the same line
                if u in nodes and v in nodes:
                    uv = (str(u), str(v))
                    if uv in small_graph.edges:
                        # supported by both evidences
                        small_graph.edges[uv].color = "blue"
                    else:
                        # NOTE(review): add_edge() is called with an edge
                        # object here but with (a, b, oa, ob, color=...)
                        # above; confirm BiGraph.add_edge accepts both forms.
                        small_graph.add_edge(e)

            print(small_graph, file=sys.stderr)

            pngfile = "A{0:02d}.{1}".format(i + 1, opts.format)
            # The first and last nodes of the line are highlighted.
            telomeres = (nns[0], nns[-1])
            small_graph.draw(
                pngfile, namestart=opts.namestart, nodehighlight=telomeres, dpi=72
            )

    legend = [
        "Edge colors:",
        "[BLUE] Experimental + Synteny",
        "[BLACK] Experimental certain",
        "[GRAY] Experimental uncertain",
        "[RED] Synteny only",
        "Rectangle nodes are telomeres.",
    ]
    print("\n".join(legend), file=sys.stderr)
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    #
    # Side effects: sorts the BLAST file by query, writes the graph to
    # `graph.txt`, and writes `<blastfile>.agp` holding one object per
    # multi-contig path plus one object per unscaffolded contig.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path [default: %default]",
    )
    p.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance [default: %default]",
    )
    p.add_option(
        "--verbose",
        default=False,
        action="store_true",
        help="Print verbose reports to stdout [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    # groupby() below needs the hits of one query to be adjacent.
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for _query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, _ = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            # NOTE(review): this passes a BiEdge object to add_edge();
            # confirm that matches the BiGraph API this module is paired with.
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    logging.debug(str(g))

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # print() with one argument behaves the same on Python 2 and 3;
            # the former `print m` statement is a syntax error on Python 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug(
        "Graph decomposed to {0} paths with {1} components.".format(npaths, ntigs)
    )

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0
    # The context manager closes the AGP file even if writing fails.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-")) for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            # Named `scaffold_name` rather than `object` to avoid shadowing
            # the builtin.
            scaffold_name = "pmol_{0:04d}".format(i)
            order_to_agp(scaffold_name, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, _size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue
            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))