def test3(self):
    """Check the self-alignments of line "c" after a local sequence correction.

    A single line "ACGTACGTACGT" is created and two identical
    self-alignments (offsets 4 and 8) are registered in a LineDotPlot.
    Positions 3..6 of the line are then corrected with the sequence "TCC"
    and the string form of ``dp.auto_alignments["c"]`` is compared with the
    expected listing.
    """
    storage = NewLineStorage(DisjointigCollection(), self.aligner)
    contig_line = storage.addNew("ACGTACGTACGT", "c")
    dot_plot = LineDotPlot(storage, self.aligner)

    # Two exact self-matches of the periodic sequence.
    shift_by_four = AlignmentPiece.Identical(
        contig_line.segment(0, 8), contig_line.segment(4, 12))
    shift_by_eight = AlignmentPiece.Identical(
        contig_line.segment(0, 4), contig_line.segment(8, 12))
    for piece in (shift_by_four, shift_by_eight):
        dot_plot.addAlignment(piece)

    # Replace positions 3..6 of the line with the sequence of a temporary contig.
    patch = AlignmentPiece.Identical(
        Contig("TCC", "tmp").asSegment(), contig_line.segment(3, 6))
    contig_line.correctSequence([patch])

    expected = "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
    observed = str(list(dot_plot.auto_alignments["c"]))
    assert observed == expected
def test1(self):
    """Check the c1 -> c2 dot-plot alignment after correcting the start of c2.

    Two 16-character lines are created, their full-length alignment is
    passed through ``self.scorer.polyshAlignment`` and stored in a
    LineDotPlot. The first three positions of the second line are then
    corrected with "AGG" and the string form of the alignments stored under
    ``alignmentsToFrom[line2.id][line1.id]`` is compared with the expected
    listing.
    """
    storage = NewLineStorage(DisjointigCollection(), self.aligner)
    first = storage.addNew("ACGTAAAAGGGTACGT", "c1")
    second = storage.addNew("ACGTAAGGGGGTACGT", "c2")

    polished = self.scorer.polyshAlignment(
        AlignmentPiece.Identical(first.asSegment(), second.asSegment()),
        params.alignment_correction_radius)

    dot_plot = LineDotPlot(storage, self.aligner)
    dot_plot.addAlignment(polished)

    # Replace positions 0..3 of the second line with the sequence of a temporary contig.
    patch = AlignmentPiece.Identical(
        Contig("AGG", "tmp").asSegment(), second.segment(0, 3))
    second.correctSequence([patch])

    expected = "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
    observed = str(list(dot_plot.alignmentsToFrom[second.id][first.id]))
    assert observed == expected
def genAll(self, aligner):
    # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
    """Build line storage, dot plot and read collection from this object's data.

    Disjointigs, contigs and reads are taken from ``self.disjointigs``,
    ``self.contigs`` and ``self.reads``.

    :param aligner: aligner handed to the line storage and the dot plot, and
        used to align the reads to the disjointigs.
    :return: tuple ``(lines, dot_plot, reads)``.
    """
    disjointig_collection = DisjointigCollection()
    for source in self.disjointigs:
        disjointig_collection.addNew(source.seq, source.id)

    from disjointig_resolve.line_storage import NewLineStorage
    storage = NewLineStorage(disjointig_collection, aligner)
    # Printed line names are the line id followed by its translated-back form.
    storage.name_printer = lambda item: item.id + "_" + self.translateBack(
        item, aligner)

    for contig in self.contigs:
        added = storage.addNew(contig.seq, contig.id)
        # Identity alignment from a contig copy of the line onto the line itself.
        detached = added.asSegment().asContig().asSegment()
        anchored = added.asSegment()
        added.initial.add(AlignmentPiece.Identical(detached, anchored))

    dot_plot = LineDotPlot(storage, aligner)
    dot_plot.construct(aligner)
    storage.alignDisjointigs()

    read_collection = ReadCollection()
    for read in self.reads:
        read_collection.addNewRead(read)

    read_alignments = aligner.localAlign(read_collection, disjointig_collection)
    disjointig_collection.addAlignments(read_alignments)
    return storage, dot_plot, read_collection
def loadAll(handler):
    # type: (TokenReader) -> Tuple[Params, Aligner, ContigCollection, ReadCollection, DisjointigCollection, NewLineStorage, LineDotPlot]
    """Restore a complete assembler state from a save stream.

    Objects are read from ``handler`` in a fixed order: parameters, aligner,
    contigs, disjointigs, lines and dot plot. Reads are not stored in the
    save; they are re-read from ``cl_params.reads_file``.

    Progress is reported through ``sys.stdout.info``; this assumes
    ``sys.stdout`` has been replaced by a logger object offering that method
    (TODO confirm where it is installed).

    :param handler: token reader positioned at the start of a saved state.
    :return: tuple ``(cl_params, aligner, contigs, reads, disjointigs, lines,
        dot_plot)``.
    """
    cl_params = Params()
    cl_params.load(handler)
    aligner = Aligner.load(handler)

    sys.stdout.info("Loading contigs")
    contigs = ContigCollection()
    contigs.load(handler)

    sys.stdout.info("Loading reads")
    # NOTE(review): assemble() calls CreateReadCollection with three arguments
    # (reads_file, cut_reads, downsample) while only two are passed here --
    # confirm against the current signature.
    reads = CreateReadCollection(cl_params.reads_file, cl_params.downsample)
    # NOTE(review): the module-level params.downsample is used here whereas the
    # call above uses cl_params.downsample -- confirm which one is intended.
    # The handle is closed as soon as the reads are loaded instead of being
    # left for the garbage collector.
    with open(cl_params.reads_file, "r") as reads_handle:
        reads.loadFromFasta(reads_handle, downsample=params.downsample)
    # Disjointigs and lines are loaded against a copy of the reads extended by
    # addAllRC(); the returned collection is the unextended one.
    tmp_reads = reads.copy().addAllRC()

    sys.stdout.info("Loading disjointigs")
    disjointigs = DisjointigCollection()
    disjointigs.load(handler, tmp_reads)

    sys.stdout.info("Loading lines")
    lines = NewLineStorage(disjointigs, aligner)
    lines.load(handler, tmp_reads, contigs)

    sys.stdout.info("Loading dot plot")
    dot_plot = LineDotPlot(lines, aligner)
    dot_plot.load(handler)

    sys.stdout.info("Loading finished")
    return cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot
def assemble(args, bin_path):
    """Run the whole assembly pipeline for one command line.

    Steps, in order: parse and check the parameters, create the log, build
    the initial state (either loaded from a save file or constructed from
    reads, optionally running Flye), run the expanding alignment-consensus
    loop (``EACL``), write ``lines.fasta`` and ``assembly.fasta`` into the
    output directory and report the elapsed time. In test mode the result is
    finally aligned against the bundled reference.

    Messages are emitted through ``sys.stdout.info`` / ``warn`` / ``trace``;
    this assumes ``sys.stdout`` is replaced by a logger object (presumably by
    ``CreateLog`` -- TODO confirm).

    :param args: command-line arguments handed to ``Params().parse``.
    :param bin_path: directory stored in ``params.bin_path``; the ``flye``
        executable is looked up inside it.
    :return: None.
    """
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        # Test mode overrides input/output locations with the bundled dataset.
        here = os.path.dirname(__file__)
        cl_params.reads_file = here + "/../../test_dataset/reads.fasta"
        cl_params.genome_size = 30000
        cl_params.dir = here + "/../../test_results"
        ref.loadFromFile(here + "/../../test_dataset/axbctbdy.fasta", False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        # Parenthesised single-argument form prints the same text under Python 2.
        print(subprocess.check_output(["git", "diff"]))

    sys.stdout.info("Preparing initial state")
    if cl_params.debug:
        save_handler = SaveHandler(os.path.join(cl_params.dir, "saves"))
    else:
        save_handler = None

    if cl_params.load_from is not None:
        sys.stdout.info("Loading initial state from saves")
        # The save stream is closed once everything has been loaded from it.
        with open(cl_params.load_from, "r") as save_stream:
            cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(TokenReader(save_stream))
        # Re-parse so that the current command line is applied to the loaded parameters.
        cl_params.parse(args)
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        dot_plot.printAll(sys.stdout)
        printState(lines)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)
        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample)

        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            # The file must be flushed and closed before Flye reads it.
            with open(reads_file, "w") as reads_out:
                reads.print_fasta(reads_out)
            subprocess.check_call([
                os.path.join(params.bin_path, "flye"), "--meta",
                "-o", assembly_dir,
                "-t", str(cl_params.threads),
                "--" + params.technology + "-raw", reads_file,
                "--genome-size", str(cl_params.genome_size),
                "--min-overlap", str(params.k)])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list) == 0:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            with open(reads_file, "w") as reads_out:
                reads.print_fasta(reads_out)
            disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8

        disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)
        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov, aligner,
                                         polisher, reads, cl_params.force_unique, all_unique)
        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs)
        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads,
                                       polisher)

        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)

        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines))
        # Iterate over a snapshot because lines are removed inside the loop.
        for line in list(lines.unique()):  # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)

        if cl_params.debug:
            sys.stdout.info( "Saving initial state")
            # Saving is best effort: a failure is reported and the run continues.
            try:
                writer = save_handler.getWriter()
                sys.stdout.info("Save details:", writer.info)
                saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot)
            except Exception as e:
                _, _, tb = sys.exc_info()
                sys.stdout.warn("Could not write save")
                traceback.print_tb(tb)
                # Uses the same lower-case logger method as every other call here.
                sys.stdout.info( "Message:", e.message)

    sys.stdout.trace( "Disjointig alignments")
    for line in lines:
        sys.stdout.trace( line.disjointig_alignments)

    sys.stdout.info("Starting expanding alignment-consensus loop")
    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler)
    dot_plot.printAll(sys.stdout)

    sys.stdout.trace( "Final result:")
    with open(os.path.join(cl_params.dir, "lines.fasta"), "w") as lines_out:
        lines.printToFasta(lines_out)
    with open(os.path.join(cl_params.dir, "assembly.fasta"), "w") as assembly_out:
        lines.printKnottedToFasta(assembly_out)
    printState(lines)
    sys.stdout.info("Finished")

    # Explicit floor division keeps whole-unit arithmetic independent of the
    # interpreter's "/" semantics.
    secs = int(time.time() - start)
    days = secs // 60 // 60 // 24
    hours = secs // 60 // 60 % 24
    mins = secs // 60 % 60
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins))

    if cl_params.test:
        # The test passes when some alignment to the reference is at most
        # 3000 positions shorter than the contig it aligns to.
        passed = False
        for al in aligner.dotplotAlign(lines, ref):
            if len(al) > len(al.seg_to.contig) - 3000:
                passed = True
                break
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")