Beispiel #1
0
 def testCase(self, instance):
     """Build a dot plot from a serialized dataset and compare its dump.

     ``instance`` is an iterable of string tokens; they are joined with
     single spaces and parsed with a ``TokenReader``. The dataset structure
     is read first, and the token that follows it in the same stream is the
     expected dot-plot dump (with spaces and newlines removed).

     On a mismatch, the alignments intersecting every disjointig are printed
     before the assertion fails.
     """
     reader = TokenReader(StringIO(" ".join(instance)))
     loaded = TestDataset.loadStructure(reader)
     collection = DisjointigCollection()
     for source in loaded.disjointigs:
         collection.addNew(source.seq, source.id)
     plot = DotPlot(collection)
     plot.construct(self.aligner)

     # Serialize the dot plot into memory and drop all spaces and newlines
     # so that it can be compared with the single expected token.
     buf = StringIO()
     writer = TokenWriter(buf)
     plot.save(writer)
     actual = "".join(ch for ch in buf.getvalue() if ch not in (" ", "\n"))

     expected = reader.readToken()
     matches = actual == expected
     if not matches:
         # Diagnostic output to help understand which alignments differ.
         for item in collection:
             print(list(plot.allInter(item.asSegment())))
     assert matches, "\n" + actual + "\n" + expected
Beispiel #2
0
 def test3(self):
     """Check self-alignments of line "c" after a sequence correction.

     Two identical self-alignments are registered in the dot plot, then
     positions 3..6 of the line are corrected using the contig "TCC". The
     string form of the resulting auto-alignments of "c" must match the
     recorded expectation.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     line = storage.addNew("ACGTACGTACGT", "c")
     plot = LineDotPlot(storage, self.aligner)

     # (from_start, from_end, to_start, to_end) of each self-alignment.
     coordinates = ((0, 8, 4, 12), (0, 4, 8, 12))
     pieces = [
         AlignmentPiece.Identical(line.segment(fs, fe), line.segment(ts, te))
         for fs, fe, ts, te in coordinates
     ]
     for piece in pieces:
         plot.addAlignment(piece)

     patch = AlignmentPiece.Identical(
         Contig("TCC", "tmp").asSegment(), line.segment(3, 6))
     line.correctSequence([patch])

     expected = "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
     observed = str(list(plot.auto_alignments["c"]))
     assert observed == expected
Beispiel #3
0
 def test1(self):
     """Check the c1 -> c2 alignment after correcting the start of c2.

     A full-length alignment between the two lines is polished by the
     scorer and added to the dot plot. The first three positions of "c2"
     are then corrected using the contig "AGG", and the string form of the
     alignments stored from "c1" to "c2" must match the recorded
     expectation.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     first = storage.addNew("ACGTAAAAGGGTACGT", "c1")
     second = storage.addNew("ACGTAAGGGGGTACGT", "c2")

     raw = AlignmentPiece.Identical(first.asSegment(), second.asSegment())
     polished = self.scorer.polyshAlignment(
         raw, params.alignment_correction_radius)

     plot = LineDotPlot(storage, self.aligner)
     plot.addAlignment(polished)

     patch = AlignmentPiece.Identical(
         Contig("AGG", "tmp").asSegment(), second.segment(0, 3))
     second.correctSequence([patch])

     observed = list(plot.alignmentsToFrom[second.id][first.id])
     assert str(observed) == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
Beispiel #4
0
def loadAll(handler):
    # type: (TokenReader) -> Tuple[Params, Aligner, ContigCollection, ReadCollection, DisjointigCollection, NewLineStorage, LineDotPlot]
    """Restore a complete assembler state from a token stream.

    The components are read from ``handler`` in a fixed order: command line
    parameters, aligner, contigs, disjointigs, lines and finally the dot
    plot. Reads are not stored in the stream; they are re-read from the
    FASTA file named by the loaded parameters.

    :param handler: token reader positioned at the start of a saved state.
    :return: tuple ``(cl_params, aligner, contigs, reads, disjointigs,
        lines, dot_plot)``.
    """
    cl_params = Params()
    cl_params.load(handler)
    aligner = Aligner.load(handler)
    sys.stdout.info("Loading contigs")
    contigs = ContigCollection()
    contigs.load(handler)
    sys.stdout.info("Loading reads")
    reads = CreateReadCollection(cl_params.reads_file, cl_params.downsample)
    # NOTE(review): the downsample value below is taken from the module-level
    # `params`, while the collection above uses `cl_params.downsample` --
    # confirm that this difference is intended.
    # The result of loadFromFasta is discarded, so the file is consumed
    # during the call and the handle can be closed right after it.
    with open(cl_params.reads_file, "r") as reads_handle:
        reads.loadFromFasta(reads_handle, downsample=params.downsample)
    # Disjointigs and lines are loaded against the reads extended with
    # their reverse complements; the returned `reads` is left untouched.
    tmp_reads = reads.copy().addAllRC()
    sys.stdout.info("Loading disjointigs")
    disjointigs = DisjointigCollection()
    disjointigs.load(handler, tmp_reads)
    sys.stdout.info("Loading lines")
    lines = NewLineStorage(disjointigs, aligner)
    lines.load(handler, tmp_reads, contigs)
    sys.stdout.info("Loading dot plot")
    dot_plot = LineDotPlot(lines, aligner)
    dot_plot.load(handler)
    sys.stdout.info("Loading finished")
    return cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot
 def genAll(self, aligner):
     # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
     """Materialize this dataset as line storage, dot plot and reads.

     Disjointigs, contigs and reads stored on ``self`` are copied into
     freshly created collections. Every contig becomes a line whose
     ``initial`` alignments contain an identical alignment of a contig copy
     of the line onto the line itself. The dot plot is constructed, the
     disjointigs are aligned to the lines, and finally the reads are
     aligned to the disjointigs.

     :param aligner: aligner used for the line storage, the dot plot and
         the read-to-disjointig alignment.
     :return: tuple ``(lines, dot_plot, reads)``.
     """
     dis_collection = DisjointigCollection()
     for item in self.disjointigs:
         dis_collection.addNew(item.seq, item.id)

     from disjointig_resolve.line_storage import NewLineStorage
     storage = NewLineStorage(dis_collection, aligner)

     def printName(line):
         # Line id followed by its translation back into dataset terms.
         return line.id + "_" + self.translateBack(line, aligner)

     storage.name_printer = printName

     for contig in self.contigs:
         added = storage.addNew(contig.seq, contig.id)
         contig_copy_seg = added.asSegment().asContig().asSegment()
         added.initial.add(
             AlignmentPiece.Identical(contig_copy_seg, added.asSegment()))

     plot = LineDotPlot(storage, aligner)
     plot.construct(aligner)
     storage.alignDisjointigs()

     read_collection = ReadCollection()
     for read in self.reads:
         read_collection.addNewRead(read)
     dis_collection.addAlignments(
         aligner.localAlign(read_collection, dis_collection))
     return storage, plot, read_collection
def CreateDisjointigCollection(d_files, dir, aligner, reads):
    """Load disjointigs and extend them so that they cover the reads.

    Disjointigs are loaded from every FASTA file in ``d_files`` and the
    reads are aligned to them. A read is considered "good" when it has an
    alignment that does not contradict the target disjointig and spans the
    read up to ``params.bad_end_length`` on each side. If more than 99% of
    the reads are good, the collection is returned after attaching the read
    alignments. Otherwise the remaining reads are written to
    ``badreads.fasta``, new disjointigs are constructed from them in the
    ``assembly0`` sub-directory, added to the collection, and the combined
    collection is written to ``disjointigs.fasta``.

    :param d_files: iterable of paths to disjointig FASTA files.
    :param dir: working directory for the files produced here.
    :param aligner: aligner providing ``localAlign``.
    :param reads: read collection to align against the disjointigs.
    :return: the disjointig collection with read alignments attached.
    :raises ZeroDivisionError: if ``reads`` is empty (unchanged from the
        original behaviour).
    """
    sys.stdout.info("Creating disjointig collection")
    disjointigs = DisjointigCollection()
    for f in d_files:
        with open(f, "r") as fasta_in:
            disjointigs.loadFromFasta(fasta_in)
    sys.stdout.info("Extending disjointig collection")
    # Base size that is scaled by the fraction of read length left uncovered.
    clen = 5000000
    bad_reads = reads.cleanCopy()
    tlen0 = sum(map(len, bad_reads))
    good_reads = set()
    for al in aligner.localAlign(reads, disjointigs):
        if not al.contradictingRTC(al.seg_to.contig.asSegment(), params.bad_end_length) and len(al.seg_from.contig) > len(al) - 2 * params.bad_end_length:
            good_reads.add(al.seg_from.contig.id)
    sys.stdout.info("Fraction of reads without full alignment to disjointigs:", 1 - float(len(good_reads)) / len(reads))
    if len(good_reads) > 0.99 * len(bad_reads):
        sys.stdout.info("Alomst all reads have good alignments. Skipping disjointig collection extension.")
        disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
        return disjointigs
    rf = os.path.join(dir, "badreads.fasta")
    bad_reads = bad_reads.filter(lambda read: read.id not in good_reads)
    tlen = sum(map(len, bad_reads))
    with open(rf, "w") as fasta_out:
        bad_reads.print_fasta(fasta_out)
    # Integer share of `clen` proportional to the total length of bad reads;
    # presumably the expected size of the extra assembly -- verify against
    # constructDisjointigs.
    target_len = tlen * clen // tlen0
    assembly_dir = os.path.join(dir, "assembly0")
    disjointigs_file = constructDisjointigs(bad_reads, target_len, assembly_dir)

    # The original guarded this section with a constant `code = 0`, so the
    # "Could not assemble new disjointigs" branch could never run.
    with open(disjointigs_file, "r") as fasta_in:
        disjointigs.loadFromFasta(fasta_in)
    sys.stdout.trace("Disjointigs:")
    for dis in disjointigs:
        sys.stdout.trace(dis.id, len(dis))
    with open(os.path.join(dir, "disjointigs.fasta"), "w") as fasta_out:
        disjointigs.writeToFasta(fasta_out)
    sys.stdout.info("Aligning reads to disjointigs")
    disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
    return disjointigs
Beispiel #7
0
def assemble(args, bin_path):
    """Run the whole assembly pipeline for the given command line.

    The initial state (aligner, reads, contigs, disjointigs, lines and dot
    plot) is either loaded from a save file (``cl_params.load_from``) or
    built from scratch, optionally running Flye or the disjointig
    construction first. The expanding alignment-consensus loop (``EACL``) is
    then executed and the results are written to ``lines.fasta``,
    ``assembly.fasta`` and ``lines.info`` in the output directory.

    :param args: command line arguments, parsed with ``Params().parse``.
    :param bin_path: directory with helper binaries; stored in
        ``params.bin_path`` and used to locate the ``flye`` executable.
    :return: None.
    """
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        # Built-in test mode: use the bundled dataset and reference.
        cl_params.reads_file = os.path.dirname(
            __file__) + "/../../test_dataset/reads.fasta"
        cl_params.genome_size = 30000
        cl_params.dir = os.path.dirname(__file__) + "/../../test_results"
        ref.loadFromFile(
            os.path.dirname(__file__) + "/../../test_dataset/axbctbdy.fasta",
            False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:",
                        subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        print(subprocess.check_output(["git", "diff"]))
    sys.stdout.info("Preparing initial state")
    # Saving of intermediate states is currently disabled.
    save_handler = None

    if cl_params.load_from is not None:
        sys.stdout.info("Loading initial state from saves")
        # The loaders consume the token stream sequentially, so the file can
        # be closed as soon as loadAll returns.
        with open(cl_params.load_from, "r") as load_file:
            cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(
                TokenReader(load_file))
        # Re-apply the current command line on top of the loaded parameters.
        cl_params.parse(args)
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor),
                             dot_plot)
        # The original left `extender` undefined on this path, so the EACL
        # call below raised NameError. No recruiter is used here, as in the
        # non-precruiting fresh-start path.
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot, reads,
                                None)
        dot_plot.printAll(sys.stdout)
        printState(lines, sys.stdout)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)

        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads,
                                     cl_params.downsample)

        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            # Close the file before Flye reads it so that all data is flushed.
            with open(reads_file, "w") as reads_out:
                reads.print_fasta(reads_out)
            subprocess.check_call([
                os.path.join(params.bin_path,
                             "flye"), "--meta", "-o", assembly_dir, "-t",
                str(cl_params.threads), "--" + params.technology + "-raw",
                reads_file, "--genome-size",
                str(cl_params.genome_size), "--min-overlap",
                str(params.k)
            ])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list
                 ) == 0 and not cl_params.precruiting:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            with open(reads_file, "w") as reads_out:
                reads.print_fasta(reads_out)
            disjointigs_file = constructDisjointigs(reads,
                                                    params.expected_size,
                                                    assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8

        if cl_params.precruiting:
            disjointigs = DisjointigCollection()
        else:
            disjointigs = CreateDisjointigCollection(
                cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)

        if cl_params.debug and not cl_params.precruiting:
            # Dump every disjointig id followed by its read alignments.
            with open(os.path.join(cl_params.dir, "disjonting_als.txt"),
                      "w") as df:
                for disjointig in disjointigs:
                    df.write(disjointig.id + "\n")
                    for al in disjointig.read_alignments:
                        df.write(str(al) + "\n")

        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file,
                                         cl_params.contigs_file,
                                         cl_params.min_cov, aligner, polisher,
                                         reads, cl_params.force_unique,
                                         all_unique)

        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs, cl_params.mink)

        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher,
                               cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs,
                                         disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file,
                                       aligner, contigs, disjointigs, reads,
                                       polisher)

        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)
        if cl_params.precruiting:
            recruiter = PairwiseReadRecruiter(aligner, reads, lines)
            # NOTE(review): `params.debug` is only set to True further below
            # in this function; confirm whether `cl_params.debug` was meant.
            if params.debug:
                # Fixed: the original joined the builtin `dir` function
                # instead of the output directory.
                with open(os.path.join(cl_params.dir, "pairwise.info"),
                          "w") as rec_dump:
                    recruiter.als.dump(rec_dump)
        else:
            recruiter = None

        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor),
                             dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot, reads,
                                recruiter)
        extender.updateAllStructures(
            itertools.chain.from_iterable(line.completely_resolved
                                          for line in lines))
        # Iterate over a copy because lines may be removed inside the loop.
        for line in list(lines.unique()):  # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)

    sys.stdout.trace("Disjointig alignments")
    for line in lines:
        sys.stdout.trace(line.disjointig_alignments)
    sys.stdout.info("Starting expanding alignment-consensus loop")
    if cl_params.debug:
        debugger.debugger = debugger.Debugger(
            os.path.join(cl_params.dir, "debug"), lines, dot_plot, reads,
            aligner)
        params.debug = True

    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines,
         reads, save_handler)

    sys.stdout.trace("Final result:")
    with open(os.path.join(cl_params.dir, "lines.fasta"), "w") as lines_out:
        lines.printToFasta(lines_out)
    with open(os.path.join(cl_params.dir, "assembly.fasta"),
              "w") as assembly_out:
        lines.printKnottedToFasta(assembly_out)
    with open(os.path.join(cl_params.dir, "lines.info"), "w") as info_out:
        printState(lines, info_out)
    # Explicit floor division keeps the integer arithmetic the original
    # relied on.
    secs = int(time.time() - start)
    days = secs // 60 // 60 // 24
    hours = secs // 60 // 60 % 24
    mins = secs // 60 % 60
    sys.stdout.info("Results can be found in",
                    os.path.join(cl_params.dir, "assembly.fasta"))
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" %
                    (days, hours, mins))
    if cl_params.test:
        # The test passes when some alignment of a line to the reference
        # covers the reference contig except for at most 3000 positions.
        passed = any(
            len(al) > len(al.seg_to.contig) - 3000
            for al in aligner.dotplotAlign(lines, ref))
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")