def align(dir, contigs_file):
    CreateLog(dir)
    contigs = list(SeqIO.parse_fasta(open(contigs_file, "r")))
    assert len(contigs) == 2
    contigs = [Contig(contigs[0].seq, contigs[0].id), Contig(contigs[1].seq, contigs[1].id)]
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    als = iter_align(aligner, contigs[0], contigs[1])
    printVar(os.path.join(dir, "diff.txt"), als)
    for al in als:
        print al
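# Example invocation (an illustrative sketch; the file names are placeholders,
# only the signature of align() above comes from the code):
#
#   align("work_dir", "two_contigs.fasta")  # FASTA must contain exactly two records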
def main(args):
    dir = args[4]
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    sys.stdout.info("Starting graph-free recruitment")
    print " ".join(args)
    sys.stdout.info("Loading repeat sequences")
    seqs = ContigStorage().loadFromFasta(open(args[1], "r"), False)
    sys.stdout.info("Loading reads")
    reads = ContigStorage().loadFromFasta(open(args[2], "r"), False)
    k = int(args[3])
    recruit(seqs, reads, k, dir)
    sys.stdout.info("Finished graph-free recruitment")
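# Example command line (inferred from the argv layout in main() above; the
# script name and the value of k are placeholders):
#
#   python recruit.py repeats.fasta reads.fasta 500 out_dir
#   # args[1] = repeat sequences, args[2] = reads, args[3] = k, args[4] = output dir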
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        # Every contig appears twice (identical length pairs); keep one copy of each.
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
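# Example invocation (illustrative; the file names and k are placeholders):
#
#   draw("contigs.fasta", "out_dir", 1000)
#   # Prints each nontrivial repeat component as a block figure on stdout.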
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                events = scorer.polyshMatching(tmp_al.matchingSequence(), params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        events = scorer.polyshMatching(al.matchingSequence(), params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
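# Example invocation (illustrative; the paths are placeholders):
#
#   evaluatePI("out_dir", "contigs.fasta", "initial.fasta", "reference.fasta")
#   # Compares matching events of the final contigs and the initial sequences
#   # against the reference within the "interesting" segments printed above.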
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
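# Example invocation (illustrative; k and the paths are placeholders):
#
#   main(1000, "out_dir", "contigs.fasta", "reads.fasta")
#   # Writes canonical reads with more than one alignment longer than k
#   # to out_dir/reads.fasta.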
    return new_contig, relevant_als + finished_als


class FakePolishingArgs:
    def __init__(self):
        self.num_iters = params.num_iters
        self.platform = params.technology
        self.threads = params.threads


if __name__ == "__main__":
    reads_file = sys.argv[2]
    consensus_file = sys.argv[3]
    dir = sys.argv[1]
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"), num_names=False)
    if "accurate" in extra_params:
        res = []
        als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
            if basic.isCanonocal(rid):
                contig = ref[rid]
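# Example command line (inferred from the sys.argv layout above; the script
# name is hypothetical):
#
#   python polish_runner.py out_dir reads.fasta consensus.fasta accurate
#   # sys.argv[1] = working dir, [2] = reads, [3] = consensus; extra arguments
#   # such as "accurate" select the polishing mode.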
def assemble(args, bin_path):
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        cl_params.reads_file = os.path.dirname(__file__) + "/../../test_dataset/reads.fasta"
        cl_params.genome_size = 30000
        cl_params.dir = os.path.dirname(__file__) + "/../../test_results"
        ref.loadFromFile(os.path.dirname(__file__) + "/../../test_dataset/axbctbdy.fasta", False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        print subprocess.check_output(["git", "diff"])
    sys.stdout.info("Preparing initial state")
    if cl_params.debug:
        save_handler = SaveHandler(os.path.join(cl_params.dir, "saves"))
    else:
        save_handler = None
    if cl_params.load_from is not None:
        # tmp = cl_params.focus
        sys.stdout.info("Loading initial state from saves")
        cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(TokenReader(open(cl_params.load_from, "r")))
        cl_params.parse(args)
        # cl_params.focus = tmp
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        dot_plot.printAll(sys.stdout)
        printState(lines)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)
        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample)
        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            subprocess.check_call([os.path.join(params.bin_path, "flye"), "--meta", "-o", assembly_dir,
                                   "-t", str(cl_params.threads), "--" + params.technology + "-raw", reads_file,
                                   "--genome-size", str(cl_params.genome_size), "--min-overlap", str(params.k)])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list) == 0:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir)
            # graph_file, contigs_file, disjointigs_file, rep_dir, graph_file_after, contigs_file_after = parseFlyeDir(assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8
        disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)
        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov,
                                         aligner, polisher, reads, cl_params.force_unique, all_unique)
        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs)
        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads, polisher)
        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)
        # dot_plot.printAll(sys.stdout)
        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines))
        for line in list(lines.unique()):  # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)
        if cl_params.debug:
            sys.stdout.info("Saving initial state")
            try:
                writer = save_handler.getWriter()
                sys.stdout.info("Save details:", writer.info)
                saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot)
            except Exception as e:
                _, _, tb = sys.exc_info()
                sys.stdout.warn("Could not write save")
                traceback.print_tb(tb)
                sys.stdout.info("Message:", e.message)
    sys.stdout.trace("Disjointig alignments")
    for line in lines:
        sys.stdout.trace(line.disjointig_alignments)
    sys.stdout.info("Starting expanding alignment-consensus loop")
    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler)
    dot_plot.printAll(sys.stdout)
    sys.stdout.trace("Final result:")
    lines.printToFasta(open(os.path.join(cl_params.dir, "lines.fasta"), "w"))
    lines.printKnottedToFasta(open(os.path.join(cl_params.dir, "assembly.fasta"), "w"))
    printState(lines)
    sys.stdout.info("Finished")
    secs = int(time.time() - start)
    days = secs / 60 / 60 / 24
    hours = secs / 60 / 60 % 24
    mins = secs / 60 % 60
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins))
    if cl_params.test:
        passed = False
        for al in aligner.dotplotAlign(lines, ref):
            if len(al) > len(al.seg_to.contig) - 3000:
                passed = True
                break
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")
import sys

sys.path.append("py")

from common import basic, params
from common.basic import CreateLog
from alignment.align_tools import Aligner, DirDistributor
from common.line_align import Scorer
from common.sequences import ContigStorage

if __name__ == "__main__":
    basic.ensure_dir_existance(sys.argv[1])
    CreateLog(sys.argv[1])
    reads = ContigStorage().loadFromFile(sys.argv[2])
    contigs = ContigStorage().loadFromFile(sys.argv[3])
    scorer = Scorer()
    dd = DirDistributor(sys.argv[1])
    aligner = Aligner(dd)
    for read in reads.unique():
        print "Processing read", read
        als = [scorer.polyshAlignment(al, params.alignment_correction_radius)
               for al in aligner.localAlign([read], contigs)]
        for al1 in als:
            for al2 in als:
                if al1.seg_to.contig == al2.seg_to.contig:
                    continue
                print al1, "vs", al2
                scorer.scoreInCorrectSegments(al1,
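# Example command line (inferred from the sys.argv usage above; the script
# name is hypothetical):
#
#   python score_reads.py out_dir reads.fasta contigs.fasta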
def main(args):
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique, "fasta")
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"), unique, "fasta")
    unique.close()
    print "Selecting reads"
    reads = set()
    cur_read = None
    als = []
    # The hard-coded read id below is a leftover debugging trace for one read.
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []
        else:
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()
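# Example command line (inferred from the sys.argv layout above; the script
# name and the repeat description file are placeholders):
#
#   python select_reads.py flye_dir repeats.txt reads.fasta out_dir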
def main(flye_dir, output_dir, diploid):
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    print "Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])
    print "Modifications:"
    print subprocess.check_output(["git", "diff"])
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print "Reading graph from", graph_file
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print "Reading sequences from", edge_file
    graph.FillSeq(edge_file, True)
    print "Splitting graph", edge_file
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print "Reading alignment dump from", dump_file
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        compids = set()
        eids = map(basic.Normalize, eids)
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            comp_eids = [eid for eid in eids if eid in componentRecords[compid].component.e]
            if len(comp_eids) == 0:
                print "GOPA", compid, compids, rid, eids
            componentRecords[compid].addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print "Processed", rcnt, "reads"
    print "Filling flye repeat resolution results"
    flye_next = FillFlyeNext(componentRecords, os.path.join(flye_dir, "flye.log"))
    for compRec in componentRecords:
        half = compRec.half()
        for norm_eid in compRec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in compRec.component.e:
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in compRec.component.e
                    continue
                if compRec.component.e[eid].end in half:
                    if compRec.component.isBorder(compRec.component.e[eid].end):
                        compRec.out += 1
                    if compRec.component.isBorder(compRec.component.e[eid].start):
                        compRec.inc += 1
                    if not compRec.component.isBorder(compRec.component.e[eid].end):
                        if flye_next[eid] is None:
                            compRec.unresolved_connections += 1
                        else:
                            compRec.resolved_connections.append((eid, flye_next[eid]))
                            if flye_next[eid] not in compRec.component.e:
                                compRec.outside_connections += 1
    basic.ensure_dir_existance(output_dir)
    print "Printing components to disk"
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    order = range(len(componentRecords))
    order = sorted(order, key=lambda i: componentRecords[i].score())
    componentRecords = [componentRecords[order[i]] for i in range(len(order))]
    basic.ensure_dir_existance(os.path.join(output_dir, "pics"))
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        fig_name = os.path.join(comp_dir, "graph.dot")
        component.draw(fig_name, calculator)
        if len(component.component) <= 100:
            fig_file = os.path.join(output_dir, "pics", str(i) + ".dot")
            component.draw(fig_file, calculator)
    table_file = os.path.join(output_dir, "table.txt")
    print "Printing table to file", table_file
    f = open(table_file, "w")
    f.write("Id v e unique inc out repeats unresolved resolved outside zero red badborder overcovered score\n")
    for i, compRec in enumerate(componentRecords):
        comp = compRec.component
        f.write(" ".join([str(i), str(len(comp.v)), str(len(comp.e)), str(len(compRec.unique) * 2),
                          str(compRec.inc), str(compRec.out), str(compRec.repeat_edges),
                          str(compRec.unresolved_connections), str(len(compRec.resolved_connections)),
                          str(compRec.outside_connections), str(compRec.zero), str(compRec.red),
                          str(compRec.bad_border), str(compRec.overcovered_edges), str(compRec.score())]) + "\n")
    f.close()
    table_file = os.path.join(output_dir, "list.txt")
    f = open(table_file, "w")
    for a in range(len(componentRecords)):
        f.write(str(a) + "\n")
    f.close()
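# Example invocation (illustrative; paths are placeholders, the diploid flag
# selects DipolidCalculator over HaploidCalculator):
#
#   main("flye_out", "analysis_out", diploid=True)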
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type: AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1), al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)
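# Example invocation (illustrative; the model file, k, and paths are placeholders):
#
#   main("scores.model", 1000, "out_dir", "contigs.fasta", "reads.fasta")
#   # Prints "win"/"win1000"/"draw" score comparisons for each canonical read
#   # with exactly one non-contradicting alignment.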