def extractSubgraph(gf): # graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_after_rr.gv")) graph1 =SimpleGraph().ReadDot(os.path.join(gf)) vertex_ids = graph1.v.keys() # print "{|}|" + "|".join(["id " + r + "\\\\" for r in edge_ids]) print "{|}|" + "|".join(["\"" + str(r) + "\"" for r in vertex_ids]) print " ".join(graph1.e.keys())
def extractSubgraph(dir, flye_dir, contigs): basic.ensure_dir_existance(dir) d = parseUPaths(flye_dir) edge_ids = [] for contig in contigs: for s in d[contig]: edge_ids.append(s) graph = SimpleGraph().ReadDot( os.path.join(flye_dir, "20-repeat", "graph_after_rr.gv")) vertex_ids = set() len = 0 for eid in edge_ids: len += graph.e[eid].len vertex_ids.add(graph.e[eid].start) vertex_ids.add(graph.e[eid].end) if len > 10000: break # print "{|}|" + "|".join(["id " + r + "\\\\" for r in edge_ids]) print "{|}|" + "|".join(["\"" + str(r) + "\"" for r in vertex_ids])
import sys import os sys.path.append("py") from common.SimpleGraph import SimpleGraph from common import basic g = SimpleGraph() g.ReadDot(sys.argv[1]) basic.ensure_dir_existance(sys.argv[2]) args = sys.argv[3:] if "merge" in args: g = g.Merge() cnt = 0 oppa = [] for comp in g.Split(1000000000): if len(comp) < 3: if len(g.v[comp[0]].inc) + len(g.v[comp[0]].out) + len( g.v[comp[-1]].inc) + len(g.v[comp[-1]].out) <= 2: pass else: oppa.extend(comp) if len(oppa) > 30: comp = list(oppa) oppa = [] else: continue print cnt, len(comp) f = open(os.path.join(sys.argv[2], str(cnt) + ".dot"), "w") g.Draw(comp, f)
def main(flye_dir, rf, dir, edge_id, k): params.technology = "nano" params.k = k basic.ensure_dir_existance(dir) basic.CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) print "Reading graph" graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa")) print "Parsing edge mapping" id_map = parseUPaths(flye_dir) edge_ids = edge_id.split(",") print "Extracting relevant graph component" res = open(os.path.join(dir, "contigs.fasta"), "w") unique = dict() for eid in edge_ids: for e in graph.v[graph.e[eid].start].inc: if basic.Normalize(e.id) in edge_ids: continue if len(e.seq) < 10000: if e.id.startswith("-"): unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:]) else: unique[e.id] = NamedSequence(e.seq, e.id) else: if e.id.startswith("-"): unique[e.id[1:] + "l"] = NamedSequence( basic.RC(e.seq[:5000]), e.id[1:] + "l") else: unique[e.id + "r"] = NamedSequence(e.seq[-5000:], e.id + "r") for e in graph.v[graph.e[eid].end].out: if basic.Normalize(e.id) in edge_ids: continue if len(e.seq) < 10000: if e.id.startswith("-"): unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:]) else: unique[e.id] = NamedSequence(e.seq, e.id) else: if e.id.startswith("-"): unique[e.id[1:] + "r"] = NamedSequence( basic.RC(e.seq[-5000:]), e.id[1:] + "r") else: unique[e.id + "l"] = NamedSequence(e.seq[:5000], e.id + "l") for c in unique.values(): print c.id SeqIO.write(c, res, "fasta") res.close() old_ids = [] for eid in edge_ids: for olde in id_map[eid[len("edge_"):]]: old_ids.append(basic.Normalize(olde)) print "Finding reads that align to", edge_ids print "Old ids:", old_ids relevant_read_ids = set() for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"), "r").readlines(): s = s.split() if s[0] != "Aln": continue if s[6].split("_")[1] in old_ids: relevant_read_ids.add(s[2][1:]) print s[2][1:], s[6].split("_")[1] print "Reading reads" res = open(os.path.join(dir, "reads.fasta"), "w") for read in SeqIO.parse_fasta(open(rf, "r")): if 
read.id in relevant_read_ids and len(read) > k * 1.2: SeqIO.write(read, res, "fasta") res.close()
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    """Build the initial collection of unique contigs for repeat resolution.

    Three modes:
      * default (force_unique is None, not all_unique): read the assembly
        graph, estimate average coverage over the longest edges spanning half
        the total length, then keep edges that look unique by coverage and
        length, discarding those that align to an alternative out-edge.
      * force_unique given: keep exactly those contig ids.
      * all_unique: keep every contig.
    Unless all_unique short-circuits it, the kept contigs are polished with
    *reads* before being returned.

    :returns: ContigCollection of (possibly polished) unique contigs.
    """
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))  # (length, coverage) per edge
        tmp_cov = []
        # fix: was sum(l for c, l in covs), which summed coverages; the
        # running total is decremented by edge *lengths* below, so it must
        # start as half the total assembly length.
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        # length-weighted mean coverage over the longest edges covering half
        # the assembly
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        # a near-identical alignment to a sibling edge means
                        # this edge is not really unique
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
import sys

sys.path.append("py")
from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

# Script: dump every canonical edge of a GFA assembly graph as FASTA records
# on stdout. Usage: script.py <graph.gfa>
graph = SimpleGraph().ReadGFA(sys.argv[1])
for edge_id in graph.e:
    if not basic.isCanonocal(edge_id):
        continue
    SeqIO.write(graph.e[edge_id], sys.stdout, "fasta")
import os
import sys

sys.path.append("py")
from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

if __name__ == "__main__":
    # Script: convert a GFA graph to <prefix>.dot + <prefix>.fasta,
    # optionally merging unbranching paths first.
    # Usage: script.py <graph.gfa> <output_prefix> [merge]
    graph = SimpleGraph()
    graph.ReadGFA(sys.argv[1])
    fasta_out = open(sys.argv[2] + ".fasta", "w")
    dot_out = open(sys.argv[2] + ".dot", "w")
    if "merge" in sys.argv:
        graph = graph.Merge()
    graph.Print(dot_out)
    graph.PrintFasta(fasta_out)
    fasta_out.close()
    dot_out.close()
if len(comp) > 1: yield graph.Component(comp) def SplitGraph(graph, calculator): max_cov = graph.covPerc(0.5) for comp in SplitGraphByCondition(graph, lambda edge: edge.len >= calculator.edge_length and edge.cov < max_cov * 1.8): cov = calculator.calculateComponentCoverage(comp, max_cov) if cov == 0: print "Zero component" for comp1 in SplitGraphByCondition(comp, calculator.uniqueCondition(cov)): print comp1.v.__len__(), comp1.e.__len__() yield comp1, cov if __name__ == "__main__": g = SimpleGraph() g.ReadDot(sys.argv[1]) basic.ensure_dir_existance(sys.argv[2]) args = sys.argv[3:] if "merge" in args: g = g.Merge() diploid = "--diploid" in args cnt = 0 oppa = [] simple = 0 complex = 0 max_cov = g.covPerc(0.5) if diploid: calculator = DipolidCalculator(150000) else: calculator = HaploidCalculator(150000)
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length): params.technology = "nano" basic.ensure_dir_existance(dir) basic.CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) print " ".join(sys.argv) print "Reading graph" graph = SimpleGraph().ReadDot( os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")) graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"), True) print "Extracting relevant graph component" edge_ids = edge_id.split(",") to_resolve = to_resolve.split(",") to_resolve = [(a, int(b)) for a, b in zip(to_resolve[0::2], to_resolve[1::2])] unique = uniqueNeighbours(edge_ids, graph, min_contig_length) if rf == "none": return print "Finding reads that align to", edge_ids reads_to_resolve = dict() # type: Dict[str, List[str]] for eid, mult in to_resolve: reads_to_resolve[eid] = [] for unique_edge, initial in unique: reads_to_resolve[initial] = [] relevant_read_ids = set() for rid, eid in parseReadDump( os.path.join(flye_dir, "20-repeat", "read_alignment_dump")): if eid in edge_ids: relevant_read_ids.add(rid) print rid, eid for rid, eid in parseReadDump( os.path.join(flye_dir, "20-repeat", "read_alignment_dump")): if rid in relevant_read_ids and eid in reads_to_resolve: reads_to_resolve[eid].append(rid) for eid in reads_to_resolve: reads_to_resolve[eid] = list(set(reads_to_resolve[eid])) print "Reading reads" res_reads = ContigStorage() res = open(os.path.join(dir, "reads.fasta"), "w") for read in SeqIO.parse_by_name(rf): if read.id in relevant_read_ids: res_reads.add(Contig(read.seq, read.id)) SeqIO.write(read, res, "fasta") res.close() random_down = open(os.path.join(dir, "random_down.fasta"), "w") cnt = 0 for read in res_reads: if cnt % 5 == 0: SeqIO.write(read, random_down, "fasta") cnt += 1 random_down.close() res = open(os.path.join(dir, "contigs.fasta"), "w") lcf = open(os.path.join(dir, "contigs.lc"), "w") for eid, mult in to_resolve: repeat_reads = [res_reads[rid] for rid in 
reads_to_resolve[eid]] print reads_to_resolve[eid] print map(str, repeat_reads) split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult, repeat_reads, min_contig_length) if split_contigs is None: print "Failed to resove edge", eid, "Aborting" print "Edge", eid, "was split into", mult, "copies" for contig, contig_reads in split_contigs: print contig.id SeqIO.write(contig, res, "fasta") lcf.write(contig.id + "\n") lcf.write(" ".join([r.id for r in contig_reads]) + "\n") res = open(os.path.join(dir, "contigs.fasta"), "w") for unique_edge, initial in unique: print unique_edge.id SeqIO.write(unique_edge, res, "fasta") lcf.write(unique_edge.id + "\n") lcf.write(" ".join(reads_to_resolve[initial]) + "\n") res.close()
def main(flye_dir, output_dir, diploid): basic.ensure_dir_existance(output_dir) CreateLog(output_dir) print("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])) print("Modifications:") print subprocess.check_output(["git", "diff"]) graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv") edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta") dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump") if diploid: calculator = DipolidCalculator(150000) else: calculator = HaploidCalculator(150000) print "Reading graph from", graph_file graph = SimpleGraph() graph.ReadDot(graph_file) print "Reading sequences from", edge_file graph.FillSeq(edge_file, True) print "Splitting graph", edge_file componentRecords, edgecomp = constructComponentRecords(graph, calculator) print "Reading alignment dump from", dump_file rcnt = 0 for rid, eids in AlignmentDumpParser(dump_file).parse(): compids = set() eids = map(basic.Normalize, eids) for eid in eids: for compid in edgecomp[eid]: compids.add(compid) for compid in compids: comp_eids = [ eid for eid in eids if eid in componentRecords[compid].component.e ] if comp_eids.__len__() == 0: print "GOPA", compid, compids, rid, eids componentRecords[compid].addRead(rid, eids) rcnt += 1 if rcnt % 100000 == 0: print "Processed", rcnt, "reads" print "Filling flye repeat resolution results" flye_next = FillFlyeNext(componentRecords, os.path.join(flye_dir, "flye.log")) for compRec in componentRecords: half = compRec.half() for norm_eid in compRec.unique: for eid in [norm_eid, basic.Reverse(norm_eid)]: if eid not in compRec.component.e: assert not basic.isCanonocal(eid) assert basic.Reverse(eid) in compRec.component.e continue if compRec.component.e[eid].end in half: if compRec.component.isBorder( compRec.component.e[eid].end): compRec.out += 1 if compRec.component.isBorder( compRec.component.e[eid].start): compRec.inc += 1 if not compRec.component.isBorder( compRec.component.e[eid].end): if 
flye_next[eid] is None: compRec.unresolved_connections += 1 else: compRec.resolved_connections.append( (eid, flye_next[eid])) if flye_next[eid] not in compRec.component.e: compRec.outside_connections += 1 basic.ensure_dir_existance(output_dir) print "Printing components to disk" subdataset_dir = os.path.join(output_dir, "subdatasets") basic.ensure_dir_existance(subdataset_dir) order = range(componentRecords.__len__()) order = sorted(order, key=lambda i: componentRecords[i].score()) ordered_components = [ componentRecords[order[i]] for i in range(len(order)) ] componentRecords = ordered_components basic.ensure_dir_existance(os.path.join(output_dir, "pics")) for i, component in enumerate(componentRecords): comp_dir = os.path.join(subdataset_dir, str(i)) component.dump(comp_dir) fig_name = os.path.join(comp_dir, "graph.dot") component.draw(fig_name, calculator) if component.component.__len__() <= 100: fig_file = os.path.join(output_dir, "pics", str(i) + ".dot") component.draw(fig_file, calculator) table_file = os.path.join(output_dir, "table.txt") print "Printing table to file", table_file f = open(table_file, "w") f.write( "Id v e unique inc out repeats unresolved resolved outside zero hub badborder score\n" ) for i, compRec in enumerate(componentRecords): comp = compRec.component f.write(" ".join([ str(i), str(comp.v.__len__()), str(comp.e.__len__()), str(compRec.unique.__len__() * 2), str(compRec.inc), str(compRec.out), str(compRec.repeat_edges), str(compRec.unresolved_connections), str(compRec.resolved_connections.__len__()), str(compRec.outside_connections), str(compRec.zero), str(compRec.red), str(compRec.bad_border), str(compRec.overcovered_edges), str(compRec.score()) ]) + "\n") f.close() table_file = os.path.join(output_dir, "list.txt") f = open(table_file, "w") for a in range(len(componentRecords)): f.write(str(a) + "\n") f.close()