Example #1
0
 def splitFromContigs(self, contigs, max_contig=50000, cut_size=20000):
     # type: (ContigStorage, int, int) -> None
     """Create lines from contigs, keeping only both ends of long contigs.

     Every contig returned by ``contigs.unique()`` is replaced by its ``rc``
     when ``basic.isCanonocal`` rejects its id. A contig of length at most
     ``max_contig`` becomes one line named ``"L" + id``. A longer contig
     becomes two lines built from its first and last ``cut_size`` characters
     (names ending in ``"l"`` and ``"r"``); they are tied together with the
     remaining middle part of the contig passed as the gap sequence. Each new
     line stores in ``initial`` an identical alignment from the contig part
     it was copied from.
     """
     for contig in contigs.unique():
         if not basic.isCanonocal(contig.id):
             contig = contig.rc
         base_name = "L" + contig.id
         if len(contig) <= max_contig:
             # Short contig: a single line covering the whole sequence.
             whole_line = self.addNew(contig.seq, base_name)
             whole_line.initial.add(
                 AlignmentPiece.Identical(contig.asSegment(),
                                          whole_line.asSegment()))
             continue
         # Long contig: only the two ends become lines.
         left_line = self.addNew(contig.seq[:cut_size], base_name + "l")
         right_line = self.addNew(contig.seq[-cut_size:], base_name + "r")
         contig_head = contig.asSegment().prefix(length=cut_size)
         left_line.initial.add(
             AlignmentPiece.Identical(contig_head, left_line.asSegment()))
         contig_tail = contig.asSegment().suffix(length=cut_size)
         right_line.initial.add(
             AlignmentPiece.Identical(contig_tail, right_line.asSegment()))
         # The uncovered middle of the contig is kept as the gap of the knot.
         gap_size = len(contig) - 2 * cut_size
         left_line.tie(right_line, gap_size, contig.seq[cut_size:-cut_size])
Example #2
0
    def splitLine(self, seg):
        # type: (Segment) -> Tuple[NewLine, NewLine]
        """Replace the line containing ``seg`` by two lines overlapping on ``seg``.

        The left line is the prefix of the original line up to ``seg.right``
        (id suffix ``"l"``), the right line is the suffix starting at
        ``seg.left`` (id suffix ``"r"``). Initial alignments, correct and
        completely resolved segments, disjointig alignments and read
        alignments of the original line are copied to each half they
        intersect. Listeners are notified through ``notifySplitLine``, the
        original line is removed and its knots are re-tied to the new lines.

        :param seg: segment of the line to split; both halves contain it.
        :return: the pair ``(left line, right line)``.
        """
        sys.stdout.trace("Line operation Split", seg)
        line = seg.contig  # type: NewLine
        left_seg = line.asSegment().prefix(pos=seg.right)
        left_line = self.addNew(left_seg.Seq(), line.id + "l")
        right_seg = line.asSegment().suffix(pos=seg.left)
        right_line = self.addNew(right_seg.Seq(), line.id + "r")
        to_left = AlignmentPiece.Identical(left_seg, left_line.asSegment())
        to_right = AlignmentPiece.Identical(right_seg, right_line.asSegment())
        # (part of the old line, new line, alignment old part -> new line);
        # every kind of data is copied to the left half first, then the right.
        halves = ((left_seg, left_line, to_left),
                  (right_seg, right_line, to_right))

        for part_seg, part_line, embedding in halves:
            part_line.initial.addAll([
                al.embed(embedding) for al in line.initial.allInter(
                    part_seg, params.min_alignment_size)
            ])
        for part_seg, part_line, embedding in halves:
            part_line.correct_segments.addAll(
                line.correct_segments.cap(
                    seg=part_seg, min_inter=params.k).map(embedding))
        for part_seg, part_line, embedding in halves:
            part_line.completely_resolved.addAll(
                line.completely_resolved.cap(
                    seg=part_seg, min_inter=params.k).map(
                        embedding).filterBySize(min=params.k))
        for part_seg, part_line, embedding in halves:
            part_line.disjointig_alignments.addAll([
                al.embed(embedding) for al in
                line.disjointig_alignments.allInter(part_seg, params.k)
            ])
        for part_seg, part_line, embedding in halves:
            for al in line.read_alignments:
                if al.seg_to.interSize(part_seg) > params.k:
                    part_line.addReadAlignment(al.embed(embedding))

        line.cleanReadAlignments()
        self.notifySplitLine(to_left, to_right)
        self.remove(line)
        # NOTE(review): the knots are read after remove(), and the second one
        # only after the first tie(); keep this order unless tie()/remove()
        # are confirmed not to modify the knots of the old line.
        if line.knot is not None:
            right_line.tie(line.knot.line_right, line.knot.gap,
                           line.knot.gap_seq)
        if line.rc.knot is not None:
            left_line.rc.tie(line.rc.knot.line_right, line.rc.knot.gap,
                             line.rc.knot.gap_seq)
        return left_line, right_line
Example #3
0
 def polishSmallSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
     """Polish the sequence of ``seg`` with the reads aligned to it.

     The segment sequence is wrapped into two random 200-character flanks
     and polished with the parts of the reads that align to ``seg``. A read
     part gets the left/right flank attached when its alignment comes within
     20 positions of the corresponding end of ``seg``. If polishing raises
     ``PolishException``, the reads that cover the whole segment are tried
     one by one as the polishing base. If that fails as well, the unpolished
     segment sequence is used.

     :param seg: segment to polish.
     :param als: read alignments whose ``seg_to`` is compared against ``seg``.
     :return: alignment from the resulting contig to ``seg``. When no
         alignment in ``als`` covers ``seg`` completely, an identical
         alignment from a copy of ``seg`` to ``seg`` is returned instead.
     :raises AssertionError: when none of the alignments of the resulting
         contig to the flanked base satisfies
         ``al.seg_from.left < 10 and al.rc.seg_from.left < 10``.
     """
     if not any(al.seg_to.contains(seg) for al in als):
         sys.stdout.log(common.log_params.LogPriority.warning, "Warning",
                        seg, "has no covering reads")
         return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
     reads = []
     start = basic.randomSequence(200)
     end = basic.randomSequence(200)
     for al in als:
         new_seq = ""
         al = al.reduce(target=seg)
         if al.seg_to.left < seg.left + 20:
             new_seq += start
         new_seq += al.seg_from.Seq()
         if al.seg_to.right > seg.right - 20:
             new_seq += end
         reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
     base = Contig(start + seg.Seq() + end, "base")
     polished = None
     try:
         polished = Contig(self.polish(reads, base), "polished")
     except PolishException:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "has a sequence very different from reads. Using reads to correct."
         )
         # `reads` was built from `als` in the same order, so zip pairs every
         # original alignment with the read part derived from it.
         for al, read in zip(als, reads):
             if al.seg_to.contains(seg):
                 try:
                     polished = Contig(
                         self.polish(reads, Contig(read.seq, read.id)),
                         "polished")
                     break
                 except PolishException:
                     # This read did not work as a base either; try the next.
                     pass
     if polished is None:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "could not be corrected even though some reads cover it.")
         polished = seg.asContig()
     polished_als = list(
         self.aligner.overlapAlign([polished], ContigStorage([base])))
     for al in polished_als:
         if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
             # Translate coordinates of the flanked base back to `seg`.
             mapping = AlignmentPiece.Identical(
                 base.segment(len(start),
                              len(base) - len(end)), seg)
             return al.compose(mapping)
     # Raised explicitly (not `assert False`) so the failure is still
     # reported when Python runs with -O instead of returning None.
     raise AssertionError("No alignment from polished to base: " +
                          str(polished_als))
Example #4
0
 def testManual(self):
     """Check cigars produced by ``polyshAlignment`` for two contig pairs.

     Each case starts from an identical alignment between two hand-made
     contigs of equal length and compares the cigar of the corrected
     alignment with the expected one.
     """
     cases = [
         ("ACGTTAAACGT", "ACGTTTAACGT", "4M1D2M1I4M"),
         ("ACATGATCACT", "ACGTGAAACGT", "6M1I3M1D1M"),
     ]
     for seq_from, seq_to, expected_cigar in cases:
         source = Contig(seq_from, "from")
         target = Contig(seq_to, "to")
         identical = AlignmentPiece.Identical(source.asSegment(),
                                              target.asSegment())
         corrected = self.scorer.polyshAlignment(
             identical, params.alignment_correction_radius)
         assert corrected.cigar == expected_cigar, str(
             corrected.asMatchingStrings())
Example #5
0
 def test3(self):
     """Check auto alignments of a line after its sequence is corrected.

     Two identical self-alignments of line ``c`` are registered in the dot
     plot, then positions 3..6 of the line are corrected with ``"TCC"`` and
     the resulting list of auto alignments is compared with the expected one.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     line = storage.addNew("ACGTACGTACGT", "c")
     dot_plot = LineDotPlot(storage, self.aligner)
     shifts = (((0, 8), (4, 12)), ((0, 4), (8, 12)))
     self_alignments = [
         AlignmentPiece.Identical(line.segment(from_left, from_right),
                                  line.segment(to_left, to_right))
         for (from_left, from_right), (to_left, to_right) in shifts
     ]
     for piece in self_alignments:
         dot_plot.addAlignment(piece)
     patch = AlignmentPiece.Identical(
         Contig("TCC", "tmp").asSegment(), line.segment(3, 6))
     line.correctSequence([patch])
     observed = str(list(dot_plot.auto_alignments["c"]))
     assert observed == "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
Example #6
0
 def test1(self):
     """Check an alignment between two lines after one of them is corrected.

     A polished alignment from ``c1`` to ``c2`` is registered in the dot
     plot, the first three positions of ``c2`` are corrected with ``"AGG"``
     and the alignment stored for the pair is compared with the expected one.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     first = storage.addNew("ACGTAAAAGGGTACGT", "c1")
     second = storage.addNew("ACGTAAGGGGGTACGT", "c2")
     whole = AlignmentPiece.Identical(first.asSegment(), second.asSegment())
     polished = self.scorer.polyshAlignment(
         whole, params.alignment_correction_radius)
     dot_plot = LineDotPlot(storage, self.aligner)
     dot_plot.addAlignment(polished)
     patch = AlignmentPiece.Identical(
         Contig("AGG", "tmp").asSegment(), second.segment(0, 3))
     second.correctSequence([patch])
     stored = dot_plot.alignmentsToFrom[second.id][first.id]
     assert str(list(stored)) == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
Example #7
0
 def testManual(self):
     """Check ``compose`` and ``composeTargetDifference`` on three contigs.

     Both ways of combining the alignments c1->c2 and c2->c3 are expected to
     give the same 12-match alignment from c1 to c3.
     """
     first = Contig("ACGTACGTACGT", "c1")
     second = Contig("ACGTAGGTACGT", "c2")
     third = Contig("ACTTACGTACGT", "c3")
     first_to_second = AlignmentPiece.Identical(first.asSegment(),
                                                second.asSegment())
     second_to_third = AlignmentPiece.Identical(second.asSegment(),
                                                third.asSegment())

     composed = first_to_second.compose(second_to_third)
     assert composed.__repr__() == "(c1[0:12-0]->c3[0:12-0]:0.92)"
     assert composed.cigar == "12M"

     second_to_first = first_to_second.reverse()
     via_target = second_to_first.composeTargetDifference(second_to_third)
     assert via_target.__repr__() == "(c1[0:12-0]->c3[0:12-0]:0.92)"
     assert via_target.cigar == "12M"
Example #8
0
 def __iter__(self):
     # type: () -> Generator[AlignmentPiece]
     """Iterate over stored alignments, their reverses and the whole line.

     Yields every element of ``self.content`` as is, then the ``reverse()``
     of every element in the same order, and finally
     ``AlignmentPiece.Identical`` built from the whole line segment.
     """
     for flipped in (False, True):
         for piece in self.content:
             yield piece.reverse() if flipped else piece
     yield AlignmentPiece.Identical(self.line.asSegment())
Example #9
0
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build a line storage from an init file and write the lines to fasta.

    The init file is read through ``TokenReader``: first the number of
    lines, then for every line a contig id followed by a list of read ids.
    For each entry a line is created from the contig sequence, the listed
    reads are aligned to it and sufficiently long alignments are attached
    as read alignments. Lines shorter than ``params.k + 200`` are extended
    to the right with ``polisher.polishEnd``.

    :param dir: directory where ``initial_lines.fasta`` is written.
    :param lc_file: path of the init file.
    :param aligner: aligner used for read alignment (``overlapAlign``).
    :param contigs: storage the contigs are looked up in by id.
    :param disjointigs: disjointig collection passed to ``NewLineStorage``.
    :param reads: collection the reads are looked up in by id.
    :param polisher: object providing ``polishEnd``.
    :return: the populated line storage.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # The file stays open only while it is parsed and is always closed.
    with open(lc_file, "r") as lc_handle:
        f = TokenReader(lc_handle)
        n = f.readInt()
        for _ in range(n):
            contig_id = f.readToken()
            contig = contigs[contig_id]
            assert contig.id == contig_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            for al in aligner.overlapAlign([reads[rid] for rid in read_ids],
                                           ContigStorage([line])):
                if len(al.seg_to) >= min(params.k, len(line) - 100):
                    # The alignment is attached to whatever contig it
                    # targets (taken from al.seg_to, not assumed to be line).
                    tmp_line = al.seg_to.contig  # type: NewLine
                    tmp_line.addReadAlignment(al)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(
                    new_contig.suffix(pos=len(line)).Seq(), new_als)
            line.correct_segments.add(line.asSegment().shrink(100))
            line.completely_resolved.add(line.asSegment().shrink(100))
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close (and thereby flush) the fasta file before continuing.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
Example #10
0
 def testManual(self):
     """Exercise ``AlignmentStorage`` on identical alignments of two contigs.

     Covers iteration, the ``rc`` view, coverage calculation and filtering,
     ``addAndMergeRight`` with an already stored alignment, and
     ``getAlignmentsTo``.
     """
     source = Contig("ACGTACGTACGT", "from")
     target = Contig("ACGTACGTACGT", "to")

     def identical(from_range, to_range):
         # Identical alignment between the given ranges of the two contigs.
         return AlignmentPiece.Identical(source.segment(*from_range),
                                         target.segment(*to_range))

     def shown(items):
         return str(list(items))

     al1 = identical((0, 4), (0, 4))
     al2 = identical((0, 4), (4, 8))
     al3 = identical((4, 8), (8, 12))
     storage = AlignmentStorage()
     storage.addAll([al1, al2, al3])
     assert shown(storage) == "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"
     assert shown(storage.rc) == "[(-from[4:12-4]->-to[0:4]:1.000), (-from[8:12-0]->-to[4:12-4]:1.000), (-from[8:12-0]->-to[8:12-0]:1.000)]"
     assert shown(storage.calculateCoverage()) == "[(to[0:12-0], 1)]"
     assert shown(storage.filterByCoverage(0, 1)) == "[]"
     assert shown(storage.filterByCoverage(1, 2)) == "[to[0:12-0]]"
     assert shown(storage.filterByCoverage(2)) == "[]"

     # Adding an alignment that is already stored must not change the list.
     storage.addAndMergeRight(al3)
     assert shown(storage) == "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"

     al4 = identical((2, 8), (2, 8))
     al5 = identical((4, 10), (4, 10))
     storage.addAll([al4, al5])
     assert shown(storage.calculateCoverage()) == "[(to[0:2], 1), (to[2:4], 2), (to[4:12-4], 3), (to[8:12-2], 2), (to[10:12-0], 1)]"
     assert shown(storage.filterByCoverage(2, 3)) == "[to[2:4], to[8:12-2]]"
     assert shown(storage.filterByCoverage(2)) == "[to[2:12-2]]"
     assert shown(storage.getAlignmentsTo(target.segment(2, 3))) == "[(from[0:4]->to[0:4]:1.000), (from[2:12-4]->to[2:12-4]:1.000)]"
     assert shown(storage.getAlignmentsTo(target.segment(2, 6))) == "[(from[2:12-4]->to[2:12-4]:1.000)]"
Example #11
0
 def testManual(self):
     """Check position mapping and alignment composition of ``Correction``.

     The correction is built from one polished alignment between positions
     5..12 of two 16-character contigs.
     """
     c1 = Contig("ACGTAAAAGGGTACGT", "c1")
     c2 = Contig("ACGTAAGGGGGTACGT", "c2")
     seed = AlignmentPiece.Identical(c1.segment(5, 12), c2.segment(5, 12))
     polished = self.scorer.polyshAlignment(
         seed, params.alignment_correction_radius)
     correction = Correction(c1, c2, [polished])

     mapped_up = correction.mapPositionsUp(range(len(c2)))
     assert mapped_up == [
         0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15
     ]
     mapped_down = correction.mapPositionsDown(range(len(c1)))
     assert mapped_down == [
         0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15
     ]

     queries = [
         AlignmentPiece.Identical(c2.segment(left, right))
         for left, right in ((0, 4), (6, 8), (6, 16), (7, 16))
     ]
     composed = str(correction.composeQueryDifferences(queries))
     assert composed == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
Example #12
0
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads,
                       polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build a line storage from an init file and write the lines to fasta.

    The init file is read through ``TokenReader``: first the number of
    lines, then for every line a contig id followed by a list of read ids.
    For each entry a line is created from the contig sequence and the listed
    reads are aligned to it (all reads when the list is empty). For every
    read only one alignment is kept: the first one after sorting by
    decreasing integer percent identity and then by decreasing length.
    Lines shorter than ``params.k + 200`` are extended to the right with
    ``polisher.polishEnd``.

    :param dir: directory where ``initial_lines.fasta`` is written.
    :param lc_file: path of the init file.
    :param aligner: aligner used for read alignment (``overlapAlign``).
    :param contigs: storage the contigs are looked up in by id.
    :param disjointigs: disjointig collection passed to ``NewLineStorage``.
    :param reads: collection the reads are looked up in by id.
    :param polisher: object providing ``polishEnd``.
    :return: the populated line storage.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # The file stays open only while it is parsed and is always closed.
    with open(lc_file, "r") as lc_handle:
        f = TokenReader(lc_handle)
        n = f.readInt()
        for _ in range(n):
            contig_id = f.readToken()
            contig = contigs[contig_id]
            assert contig.id == contig_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            line_reads = [reads[rid] for rid in read_ids]
            if not line_reads:
                sys.stdout.warn(
                    "No read alignments in initialization for line",
                    line.id, "Realigning all reads")
                line_reads = reads
            als = [
                al for al in aligner.overlapAlign(line_reads,
                                                  ContigStorage([line]))
                if len(al.seg_to) >= min(1500, len(line) - 100)
            ]
            # Sorting by read id first makes groupby see each read once; the
            # remaining keys put the preferred alignment first in its group.
            als = sorted(als,
                         key=lambda al: (al.seg_from.contig.id, -int(
                             al.percentIdentity() * 100), -len(al)))
            for key, read_als in itertools.groupby(
                    als, key=lambda al: al.seg_from.contig.id):
                # groupby never yields an empty group, so next() is safe.
                al = next(read_als)
                tmp_line = al.seg_to.contig  # type: NewLine
                tmp_line.addReadAlignment(al)
            correct_seg = line.asSegment().shrink(100)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(
                    new_contig.suffix(pos=len(line)).Seq(), new_als)
                if len(correct_seg) < params.k:
                    correct_seg = correct_seg.expandRight(params.k -
                                                          len(correct_seg))
            line.correct_segments.add(correct_seg)
            line.completely_resolved.add(correct_seg)
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close (and thereby flush) the fasta file before continuing.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
Example #13
0
 def test5(self):
     """Check dot plot alignments after an insertion is made into a line.

     The line built from ``"abc"`` is corrected with a sequence that has an
     extra piece inserted between the sequences of letters ``a`` and ``b``;
     the alignments intersecting the whole line are then compared with the
     expected list.
     """
     dataset = TestDataset("abcABC")
     lower_name = dataset.addContig("abc")
     upper_name = dataset.addContig("ABC")
     lines, dp, reads = dataset.genAll(self.aligner)
     line = lines[lower_name]
     seq_a = dataset.alphabet["a"].seq
     seq_b = dataset.alphabet["b"].seq
     insertion = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
     tmp = Contig(seq_a + insertion + seq_b, "tmp")
     # Anchor both sides of the insertion to the matching parts of the line.
     left_anchor = AlignmentPiece.Identical(tmp.prefix(len=len(seq_a)),
                                            line.prefix(len=len(seq_a)))
     right_anchor = AlignmentPiece.Identical(
         tmp.asSegment().suffix(length=len(seq_b)),
         line.segment(len(seq_a),
                      len(seq_a) + len(seq_b)))
     merged = AlignmentPiece.MergeFittingAlignments(
         [left_anchor, right_anchor])
     line.correctSequence([merged])
     observed = str(list(dp.allInter(line.asSegment())))
     assert observed == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
Example #14
0
 def loadLine(self, handler, disjointigs, reads, contigs):
     # type: (TokenReader, DisjointigCollection, ReadCollection, ContigCollection) -> None
     """Restore this line from a token stream.

     Reads the id and the sequence, then the initial alignments, the correct
     and completely resolved segments, and the disjointig and read
     alignments. Every loaded read alignment is also registered on its read.
     ``contigs`` is not used by this method.
     """
     self.id = handler.readToken()
     self.seq = handler.readToken()
     self.rc.id = basic.Reverse(self.id)
     initial_count = handler.readInt()
     for _ in range(initial_count):
         # Three tokens are skipped before the target segment and one after
         # it; presumably they hold the rest of the saved alignment, which
         # is rebuilt as an identical alignment from a copy of the segment.
         for _skipped in range(3):
             handler.readToken()
         target_seg = Segment.load(handler, self)
         handler.readToken()
         source_seg = target_seg.asContig().asSegment()
         self.initial.add(AlignmentPiece.Identical(source_seg, target_seg))
     self.correct_segments.load(handler, self)
     self.completely_resolved.load(handler, self)
     self.disjointig_alignments.load(handler, disjointigs, self)
     self.read_alignments.load(handler, reads, self)
     for read_al in self.read_alignments:
         aligned_read = read_al.seg_from.contig  # type: AlignedRead
         aligned_read.addAlignment(read_al)
     self.max_extension = False
 def genAll(self, aligner):
     # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
     """Build the line storage, dot plot and read collection of this dataset.

     Disjointigs, contigs and reads of the dataset are copied into fresh
     collections. Every contig becomes a line with an identical initial
     alignment from a copy of itself. The dot plot is constructed,
     disjointigs are aligned to the lines and the reads are aligned to the
     disjointigs.

     :param aligner: aligner used by the storage, dot plot and read mapping.
     :return: tuple ``(lines, dot plot, reads)``.
     """
     disjointigs = DisjointigCollection()
     for disjointig in self.disjointigs:
         disjointigs.addNew(disjointig.seq, disjointig.id)
     from disjointig_resolve.line_storage import NewLineStorage
     lines = NewLineStorage(disjointigs, aligner)

     def printName(line):
         # Printed name of a line: its id followed by its translation.
         return line.id + "_" + self.translateBack(line, aligner)

     lines.name_printer = printName
     for contig in self.contigs:
         added = lines.addNew(contig.seq, contig.id)
         copy_seg = added.asSegment().asContig().asSegment()
         added.initial.add(
             AlignmentPiece.Identical(copy_seg, added.asSegment()))
     dp = LineDotPlot(lines, aligner)
     dp.construct(aligner)
     lines.alignDisjointigs()
     reads = ReadCollection()
     for read in self.reads:
         reads.addNewRead(read)
     read_als = aligner.localAlign(reads, disjointigs)
     disjointigs.addAlignments(read_als)
     return lines, dp, reads
Example #16
0
    def mergeLines(self, alignment, k):
        # type: (AlignmentPiece, int) -> NewLine
        """Merge the two lines connected by ``alignment`` into one new line.

        ``alignment`` goes from a segment of the first line (``seg_from``) to
        a segment of the second line (``seg_to``). The first line is cut at
        the right end of the alignment and the second line at its left end,
        the first line is corrected so that it matches the second line on
        the overlap, and a new line consisting of the remaining prefix of
        the first line followed by the whole second line is added to this
        storage. Initial alignments, correct and completely resolved
        segments, disjointig alignments and read alignments of both lines
        are transferred to the new line. Listeners are notified through
        ``notifyMergedLines``, both original lines are removed and their
        outer knots are re-tied to the new line.

        :param alignment: alignment from the first line to the second line;
            the two lines must be different.
        :param k: when the alignment is shorter than ``k + 100`` the first
            line is first extended with sequence of the second line; also
            passed to the merge of the ``completely_resolved`` segments.
        :return: the newly created merged line.
        """
        sys.stdout.trace("Line operation Merge", alignment.seg_from.contig,
                         alignment.seg_to.contig, alignment)
        line1 = alignment.seg_from.contig  #type: NewLine
        line2 = alignment.seg_to.contig  #type: NewLine
        assert line1 != line2
        if len(alignment) < k + 100:
            sys.stdout.trace(
                "Prolonging line to ensure alignment of at least k")
            # Piece of line2 right after the alignment, long enough to bring
            # the alignment length up to k + 100.
            seg = line2.segment(
                alignment.seg_to.right,
                alignment.seg_to.right + k + 100 - len(alignment))
            line1.extendRight(seg.Seq())
            alignment = alignment.mergeDistant(
                AlignmentPiece.Identical(
                    line1.asSegment().suffix(length=len(seg)), seg))
        # Cutting hanging tips of both lines
        # NOTE(review): al_storage is filled but never read afterwards.
        al_storage = AlignmentStorage()
        al_storage.add(alignment)
        # The alignment is tracked by a storage registered as a listener of
        # both lines; it is re-read from the storage after the cuts below.
        storage = TwoLineAlignmentStorage(line1, line2)
        line2.addListener(storage)
        line1.addListener(storage.reverse)
        storage.add(alignment)
        if alignment.seg_from.right < len(line1):
            line1.cutRight(alignment.seg_from.right)
            sys.stdout.trace("Cut right")
            sys.stdout.trace(list(storage.content)[0])
            sys.stdout.trace("\n".join(
                list(storage.content)[0].asMatchingStrings()))
            sys.stdout.trace(list(storage.content)[0].cigar)
        if alignment.seg_to.left > 0:
            # The left tip of line2 is removed by cutting its rc on the right.
            line2.rc.cutRight(len(line2) - alignment.seg_to.left)
            sys.stdout.trace("Cut left")
            sys.stdout.trace(list(storage.content)[0])
            sys.stdout.trace("\n".join(
                list(storage.content)[0].asMatchingStrings()))
            sys.stdout.trace(list(storage.content)[0].cigar)
        alignment = list(storage.content)[0]  # type: AlignmentPiece
        line2.removeListener(storage)
        line1.removeListener(storage.reverse)

        # Making sure line sequences match on the overlap
        # new_seq is the part of line1 before the overlap followed by all of
        # line2.
        if alignment.seg_from.left > 0:
            new_seq = Contig(
                line1.asSegment().prefix(pos=alignment.seg_from.left).Seq() +
                line2.seq, "new_seq")
        else:
            new_seq = Contig(line2.seq, "new_seq")
        al2 = AlignmentPiece.Identical(
            line2.asSegment(),
            new_seq.asSegment().suffix(length=len(line2)))
        sys.stdout.trace("Al2:", al2)
        # After composing and reversing, the alignment goes from new_seq to
        # line1 and is used to correct line1.
        alignment = alignment.compose(al2).reverse()
        sys.stdout.trace("Composed alignment", alignment)
        sys.stdout.trace("\n".join(alignment.asMatchingStrings()))
        sys.stdout.trace(alignment.cigar)
        assert alignment.seg_to.right == len(line1)
        assert alignment.seg_from.left == al2.seg_to.left
        line1.correctSequence([alignment])

        # Now lines have exact match
        name = "(" + ",".join(
            basic.parseLineName(line1.id) +
            basic.parseLineName(line2.id)) + ")"
        line = self.addNew(new_seq.seq, name)
        assert line.seq.startswith(line1.seq)
        assert line.seq.endswith(line2.seq)
        # Embeddings of the old lines into the new one: line1 is its prefix,
        # line2 its suffix.
        al1 = AlignmentPiece.Identical(
            line1.asSegment(),
            line.asSegment().prefix(length=len(line1)))
        al2 = AlignmentPiece.Identical(
            line2.asSegment(),
            line.asSegment().suffix(length=len(line2)))

        # Transfer everything stored on the old lines into coordinates of
        # the new line, merging the two sources.
        line.initial.addAll(
            line1.initial.targetAsSegment(al1.seg_to).merge(
                line2.initial.targetAsSegment(al2.seg_to)))
        line.correct_segments.addAll(
            line1.correct_segments.contigAsSegment(al1.seg_to).merge(
                line2.correct_segments.contigAsSegment(al2.seg_to)))
        line.completely_resolved.addAll(
            line1.completely_resolved.contigAsSegment(al1.seg_to).merge(
                line2.completely_resolved.contigAsSegment(al2.seg_to), k))
        line.disjointig_alignments.addAll(
            line1.disjointig_alignments.targetAsSegment(al1.seg_to).merge(
                line2.disjointig_alignments.targetAsSegment(al2.seg_to)))
        for al in line1.read_alignments.targetAsSegment(al1.seg_to).merge(
                line2.read_alignments.targetAsSegment(al2.seg_to)):
            line.addReadAlignment(al)
        line1.cleanReadAlignments()
        line2.cleanReadAlignments()

        self.notifyMergedLines(al1, al2)
        # Knots are captured before the old lines are removed.
        knot_right = line2.knot
        knot_left = line1.rc.knot
        self.remove(line1)
        self.remove(line2)
        if knot_right is not None:
            if knot_right.line_right == line1:
                # line2 was tied back to line1, so the merged line is tied to
                # itself.
                line.tie(line, knot_right.gap, knot_right.gap_seq)
            else:
                line.tie(knot_right.line_right, knot_right.gap,
                         knot_right.gap_seq)
        # A left knot leading to line2.rc is skipped here; presumably it is
        # the mirror of the self-tie handled above - TODO confirm.
        if knot_left is not None and knot_left.line_right != line2.rc:
            line.rc.tie(knot_left.line_right, knot_left.gap, knot_left.gap_seq)
        return line
Example #17
0
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, float, int) -> Tuple[Contig, List[AlignmentPiece]]
     """Extend the right end of a contig using reads aligned to it.

     The contig is taken from the target of the first alignment in ``als``
     (so ``als`` must not be empty) and copied; the original contig is not
     modified here. The copy is then extended in rounds. In each round a
     base sequence is built from the current end of the contig and the
     continuation of one of the reads, polished with the continuations of
     all still relevant reads, and appended up to the position where
     enough reads still support it.

     :param als: read alignments to the contig being extended.
     :param min_cov: minimal number of relevant alignments needed to go on,
         and minimal support required at the chosen cutoff position.
     :param min_cov_frac: the number of contradicting alignments counted
         near a position must not exceed ``(1 - min_cov_frac) * support``.
     :param max_extension: the loop stops once the contig has grown by at
         least this much; ``None`` means practically unlimited. Since whole
         extensions are appended before the check, the result may be longer
         than ``len(contig) + max_extension``.
     :return: the extended copy of the contig and the alignments (still
         relevant ones followed by finished ones) re-targeted to it.
     """
     if max_extension is None:
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     # Work on a copy of the contig.
     new_contig = contig.asSegment().asContig()
     # Presumably keeps only alignments ending within 100 positions of the
     # right end of the contig - TODO confirm the rc coordinate convention.
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         # Alignments that no longer intersect the last 100 positions of
         # the contig, or fail the rc.seg_from.left > 100 check (presumably:
         # not enough read left beyond the alignment), are moved to
         # finished_als.
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         # Common prefix of the base and of all reduced reads: a fixed
         # 8-character tag, a random sequence of params.flanking_size and the
         # last (at most) params.flanking_size characters of the contig.
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         # One artificial read per relevant alignment: the common prefix
         # followed by the part of the read after the end of its alignment.
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         # Try the reads one by one as the source of the base sequence until
         # one of them gives a sufficiently long extension.
         for base_al in relevant_als:
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # The base is the common prefix followed by the part of the
             # read right after its alignment, of length at most
             # max(params.window_size, params.k).
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             # Alignments from the previous attempt are dropped.
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             # For every reduced read pick the alignment that starts at
             # position 0 of the polished base and reaches furthest right.
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 # Every reduced read is required to have such an alignment.
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             # Scan the trimmed alignments by increasing right end. `support`
             # is the number of alignments not yet passed; `contra` collects
             # passed alignments for which contradictingRTCRight() holds and
             # contra_index skips those ending more than 50 positions before
             # the current one. The cutoff advances while the support is at
             # least min_cov and the contradictions are few enough.
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             # Accept the attempt only if it extends the contig by more than
             # 100 positions beyond the common prefix.
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 # Maps the accepted part of the polished base onto the new
                 # suffix of the candidate contig.
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 # Maps the tail of every original read onto the part of its
                 # reduced read that follows the common prefix.
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 # Alignments of the original reads to the new suffix of the
                 # candidate; None when the read barely reaches past the
                 # common prefix or starts too close to the cutoff.
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 # Old alignments re-targeted to the prefix of the candidate
                 # that corresponds to the contig before this extension.
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         # Join the old alignment with its continuation; the
                         # result is re-polished when the two parts are at
                         # least 10 apart on either sequence.
                         al = al1.mergeDistant(al2)
                         if al is None:
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 # Extension accepted: start the next round.
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         if len(new_contig) >= max_len:
             break
         # No read gave a usable extension in this round.
         if not found:
             break
     return new_contig, relevant_als + finished_als
Example #18
0
 def fillFromContigs(self, contigs):
     # type: (Iterable[Contig]) -> None
     """Add one line per contig of ``UniqueList(contigs)``.

     Each line is named ``"L" + contig.id``, gets the sequence of the contig
     and stores in ``initial`` an identical alignment from the whole contig
     to the whole line.
     """
     for source in UniqueList(contigs):
         new_line = self.addNew(source.seq, "L" + source.id)
         whole_contig = source.asSegment()
         new_line.initial.add(
             AlignmentPiece.Identical(whole_contig, new_line.asSegment()))