Beispiel #1
0
 def polishAndAnalyse(self, reads, polishing_base, reliable_start = None):
     # type: (ReadCollection, Contig, Optional[int]) -> Consensus
     """Polish ``polishing_base`` with ``reads`` and compute per-position read support.

     The polished sequence is wrapped in a Contig named "contig", a cleaned
     copy of ``reads`` is aligned to it, and every alignment is classified:

     * contra -- ``al.contradicting(...)`` is true for the polished contig;
     * late   -- the alignment starts (``seg_to.left``) after ``reliable_start``;
     * ok     -- everything else; only these contribute to the coverage.

     :param reads: reads used both for polishing and for the coverage count.
     :param polishing_base: sequence handed to ``self.polish``.
     :param reliable_start: largest accepted alignment start; defaults to
         ``len(polishing_base)``.
     :return: ``Consensus`` built from the polished sequence and a list of
         ``len(sequence) + 1`` coverage values.
     """
     if reliable_start is None:
         reliable_start = len(polishing_base)
     polished = Contig(self.polish(reads, polishing_base), "contig")
     coverage = [0] * (len(polished) + 1)
     realigned = ReadCollection().extendClean(reads)
     self.aligner.alignReadCollection(realigned, [polished])
     n_contra = n_ok = n_late = 0
     for read in realigned:
         for piece in read.alignmentsTo(polished.asSegment()):  # type: AlignmentPiece
             if piece.contradicting(polished.asSegment()):
                 n_contra += 1
                 continue
             if piece.seg_to.left > reliable_start:
                 n_late += 1
                 continue
             # Difference-array encoding: +1 where the alignment starts and
             # -1 where it ends; the running sum below turns it into coverage.
             coverage[piece.seg_to.left] += 1
             coverage[piece.seg_to.right] -= 1
             n_ok += 1
     running = 0
     for pos, delta in enumerate(coverage):
         running += delta
         coverage[pos] = running
     sys.stdout.trace("Polyshed and analysed using", len(realigned), "reads. Ok:", n_ok, "late:", n_late, "contra:", n_contra)
     return Consensus(polished.seq, coverage)
Beispiel #2
0
 def testManual(self):
     """Check the cigar produced by ``self.scorer.polyshAlignment``.

     Each case starts from ``AlignmentPiece.Identical`` between two contigs of
     equal length and expects the listed cigar after correction with
     ``params.alignment_correction_radius``.
     """
     cases = (
         ("ACGTTAAACGT", "ACGTTTAACGT", "4M1D2M1I4M"),
         ("ACATGATCACT", "ACGTGAAACGT", "6M1I3M1D1M"),
     )
     for query_seq, target_seq, expected_cigar in cases:
         query = Contig(query_seq, "from")
         target = Contig(target_seq, "to")
         identity = AlignmentPiece.Identical(query.asSegment(),
                                             target.asSegment())
         corrected = self.scorer.polyshAlignment(
             identity, params.alignment_correction_radius)
         assert corrected.cigar == expected_cigar, str(
             corrected.asMatchingStrings())
Beispiel #3
0
 def testManual(self):
     """Check ``compose`` and ``composeTargetDifference`` on identity alignments.

     Both ways of chaining c1->c2 and c2->c3 are expected to give the same
     c1->c3 alignment: same repr and a pure-match cigar.
     """
     first = Contig("ACGTACGTACGT", "c1")
     second = Contig("ACGTAGGTACGT", "c2")
     third = Contig("ACTTACGTACGT", "c3")
     first_to_second = AlignmentPiece.Identical(first.asSegment(),
                                                second.asSegment())
     second_to_third = AlignmentPiece.Identical(second.asSegment(),
                                                third.asSegment())
     expected_repr = "(c1[0:12-0]->c3[0:12-0]:0.92)"

     composed = first_to_second.compose(second_to_third)
     assert composed.__repr__() == expected_repr
     assert composed.cigar == "12M"

     second_to_first = first_to_second.reverse()
     via_difference = second_to_first.composeTargetDifference(second_to_third)
     assert via_difference.__repr__() == expected_repr
     assert via_difference.cigar == "12M"
Beispiel #4
0
 def testManual(self):
     """Check ``GlueOverlappingAlignments`` and ``reduce`` on hand-made alignments."""
     query = Contig("ACGTACGTA", "from")
     target = Contig("ACTACGTACGTACAT", "to")
     head = AlignmentPiece(query.asSegment(), target.segment(0, 8), "2M1I6M")
     tail = AlignmentPiece(query.segment(0, 8), target.segment(7, 15), "8M")

     merged = AlignmentPiece.GlueOverlappingAlignments([head, tail])
     assert merged.cigar == "2M1I5M8M", str(merged) + " " + merged.cigar
     assert merged.seg_from.Seq() == "ACGTACGTACGTACGT", \
         str(merged) + " " + merged.cigar

     # Restricting the query to a prefix must trim the cigar accordingly.
     for query_end, expected_cigar in ((2, "2M"), (3, "2M"), (4, "2M1I1M")):
         reduced = head.reduce(query=query.segment(0, query_end))
         assert reduced.cigar == expected_cigar
Beispiel #5
0
def splitRepeat(aligner, seq, mult, all_reads_list, min_contig_length):
    """Try to split a repeat by running ``splitSegKmeans`` on windows of ``seq``.

    ``seq`` is wrapped in a Contig named "base" and scanned left to right in
    consecutive, non-overlapping windows of ``min_contig_length``. The first
    window for which ``splitSegKmeans`` returns something other than ``None``
    wins. If no full window succeeds, one last attempt is made on the suffix
    of length ``min(min_contig_length, len(seq))``.

    :param aligner: passed through to ``splitSegKmeans``.
    :param seq: sequence to split.
    :param mult: passed through to ``splitSegKmeans``.
    :param all_reads_list: passed through to ``splitSegKmeans``.
    :param min_contig_length: window length; must be non-zero.
    :return: the first non-``None`` result of ``splitSegKmeans``, otherwise
        whatever the final suffix attempt returns.
    """
    base = Contig(seq, "base")
    # Floor division keeps the window count an int on Python 2 and Python 3
    # alike; with true division range() would be handed a float.
    for i in range(len(seq) // min_contig_length):
        left = i * min_contig_length
        res = splitSegKmeans(aligner,
                             base.segment(left, left + min_contig_length),
                             mult, all_reads_list)
        if res is not None:
            return res
    tail = base.asSegment().suffix(length=min(min_contig_length, len(seq)))
    return splitSegKmeans(aligner, tail, mult, all_reads_list)
Beispiel #6
0
 def test5(self):
     """Check the alignments reported by ``dp.allInter`` after ``correctSequence``.

     A temporary contig is built from the "a" and "b" alphabet sequences with
     a fixed insert between them; its two ends are mapped identically onto
     the start of line "abc", merged, and applied as a correction.
     """
     dataset = TestDataset("abcABC")
     abc_name = dataset.addContig("abc")
     dataset.addContig("ABC")
     lines, dp, _ = dataset.genAll(self.aligner)
     abc_line = lines[abc_name]
     seq_a = dataset.alphabet["a"].seq
     seq_b = dataset.alphabet["b"].seq
     insert = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
     patched = Contig(seq_a + insert + seq_b, "tmp")
     left_al = AlignmentPiece.Identical(patched.prefix(len=len(seq_a)),
                                        abc_line.prefix(len=len(seq_a)))
     right_al = AlignmentPiece.Identical(
         patched.asSegment().suffix(length=len(seq_b)),
         abc_line.segment(len(seq_a), len(seq_a) + len(seq_b)))
     correction = AlignmentPiece.MergeFittingAlignments([left_al, right_al])
     abc_line.correctSequence([correction])
     found = str(list(dp.allInter(abc_line.asSegment())))
     assert found == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
Beispiel #7
0
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
     """Repeatedly extend the target contig of ``als`` to the right using read tails.

     Works on a copy of ``als[0].seg_to.contig``. In every round the
     alignments that still touch the last 100 positions of the current contig
     are turned into artificial reads (a common ``start`` prefix followed by
     the part of each read after its alignment). One read tail at a time is
     used as a polishing base; the polished result is appended to the contig
     up to a cutoff position chosen from the right ends of the re-aligned
     reads. The loop ends when fewer than ``min_cov`` alignments remain, when
     no read produced an extension in a round, or when the contig has grown
     by ``max_extension``.

     :param als: alignments to the contig being extended; the contig is taken
         from ``als[0].seg_to.contig``.
     :param min_cov: minimal number of alignments needed to keep extending,
         and minimal remaining support while choosing the cutoff.
     :param min_cov_frac: the number of recent contradicting alignments may
         not exceed ``(1 - min_cov_frac) * support`` while choosing the cutoff.
     :param max_extension: maximal total extension length; ``None`` means a
         very large bound.
     :return: ``(new_contig, alignments)`` where ``alignments`` is the list of
         still-relevant alignments followed by the finished ones.
     """
     if max_extension is None:
         # No explicit limit: fall back to a very large bound.
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     # Work on a fresh copy so the input contig itself is not modified here.
     new_contig = contig.asSegment().asContig()
     # NOTE(review): presumably ``al.rc.seg_to.left < 100`` selects alignments
     # ending within 100 positions of the contig's right end -- confirm the
     # meaning of ``rc`` coordinates.
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         # Keep only alignments that intersect the last 100 positions of the
         # current contig and whose read (in rc coordinates) has more than
         # 100 positions before the alignment; the rest are finished.
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         # Common prefix of all artificial reads: a fixed tag, a random flank
         # and the last (up to) params.flanking_size bases of the contig.
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         # One artificial read per relevant alignment: ``start`` followed by
         # the part of the read starting at the alignment's right end.
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         # Try each read in turn as the polishing base until one extends the contig.
         for base_al in relevant_als:
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # The base is ``start`` followed by the piece of the read just
             # past its alignment, at most max(params.window_size, params.k)
             # long. (The original comment cited 500-base pieces; the actual
             # sizes come from ``params``.)
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             # Drop alignments left over from a previous base attempt.
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             # For every artificial read pick the alignment that starts at
             # position 0 of the polished base and reaches furthest right.
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 # Every artificial read is required to align from position 0.
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             # Choose the cutoff: walk the alignments by increasing right end.
             # ``support`` counts alignments not yet passed; ``contra`` holds
             # passed alignments flagged by contradictingRTCRight(), and
             # ``contra_index`` skips those ending more than 50 positions
             # before the current right end.
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             # Accept the extension only if it adds more than 100 positions
             # beyond the ``start`` prefix.
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 # Identity map from the accepted part of the polished base
                 # onto the newly appended suffix of the candidate contig.
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 # Identity maps from each original read's tail onto the
                 # matching tail of its artificial read (``start`` excluded).
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 # Chain original read -> artificial read -> polished base ->
                 # candidate contig; None marks reads whose alignment barely
                 # leaves ``start`` or begins too close to the cutoff.
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 # Re-target the existing alignments onto the prefix of the
                 # candidate that corresponds to the old contig.
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 # Rebinding relevant_als while iterating over it is safe only
                 # because this branch always ends with ``break``.
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         al = al1.mergeDistant(al2)
                         if al is None:
                             # Merge failed: keep the old alignment as is.
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             # The pieces were at least 10 apart on the read
                             # or the contig: re-polish the merged alignment.
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         if len(new_contig) >= max_len:
             break
         if not found:
             break
     return new_contig, relevant_als + finished_als
Beispiel #8
0
    def mergeLines(self, alignment, k):
        # type: (AlignmentPiece, int) -> NewLine
        """Merge the two lines connected by ``alignment`` into one new line.

        ``line1`` is ``alignment.seg_from.contig`` and ``line2`` is
        ``alignment.seg_to.contig``. The method

        1. prolongs ``line1`` with sequence from ``line2`` if the alignment is
           shorter than ``k + 100``;
        2. cuts the part of ``line1`` to the right of the alignment and the
           part of ``line2`` to the left of it;
        3. corrects ``line1`` so that its overlap matches ``line2``;
        4. creates a new line via ``self.addNew`` whose sequence is the
           unaligned prefix of ``line1`` followed by ``line2``, and transfers
           the stored segments and alignments of both lines onto it;
        5. removes ``line1`` and ``line2`` and re-ties the knots they had.

        :param alignment: alignment from ``line1`` to ``line2``; the two lines
            must be different.
        :param k: the alignment is prolonged to at least ``k + 100``; also
            passed to the merge of ``completely_resolved`` segments.
        :return: the newly created merged line.
        """
        sys.stdout.trace("Line operation Merge", alignment.seg_from.contig,
                         alignment.seg_to.contig, alignment)
        line1 = alignment.seg_from.contig  #type: NewLine
        line2 = alignment.seg_to.contig  #type: NewLine
        assert line1 != line2
        if len(alignment) < k + 100:
            sys.stdout.trace(
                "Prolonging line to ensure alignment of at least k")
            # Take the piece of line2 right after the alignment that is
            # needed to bring the alignment length up to k + 100.
            seg = line2.segment(
                alignment.seg_to.right,
                alignment.seg_to.right + k + 100 - len(alignment))
            line1.extendRight(seg.Seq())
            alignment = alignment.mergeDistant(
                AlignmentPiece.Identical(
                    line1.asSegment().suffix(length=len(seg)), seg))
        # Cutting hanging tips of both lines
        # NOTE(review): al_storage is filled but not read again within this
        # method -- confirm whether it is still needed.
        al_storage = AlignmentStorage()
        al_storage.add(alignment)
        # The storage is registered as a listener on both lines, presumably so
        # that the alignment it holds follows the cuts below; the updated
        # alignment is read back from storage.content afterwards.
        storage = TwoLineAlignmentStorage(line1, line2)
        line2.addListener(storage)
        line1.addListener(storage.reverse)
        storage.add(alignment)
        if alignment.seg_from.right < len(line1):
            line1.cutRight(alignment.seg_from.right)
            sys.stdout.trace("Cut right")
            sys.stdout.trace(list(storage.content)[0])
            sys.stdout.trace("\n".join(
                list(storage.content)[0].asMatchingStrings()))
            sys.stdout.trace(list(storage.content)[0].cigar)
        if alignment.seg_to.left > 0:
            # Cutting the left tip of line2 is done as a right cut on line2.rc.
            line2.rc.cutRight(len(line2) - alignment.seg_to.left)
            sys.stdout.trace("Cut left")
            sys.stdout.trace(list(storage.content)[0])
            sys.stdout.trace("\n".join(
                list(storage.content)[0].asMatchingStrings()))
            sys.stdout.trace(list(storage.content)[0].cigar)
        alignment = list(storage.content)[0]  # type: AlignmentPiece
        line2.removeListener(storage)
        line1.removeListener(storage.reverse)

        # Making sure line sequences match on the overlap
        if alignment.seg_from.left > 0:
            # Merged sequence: part of line1 before the alignment, then line2.
            new_seq = Contig(
                line1.asSegment().prefix(pos=alignment.seg_from.left).Seq() +
                line2.seq, "new_seq")
        else:
            new_seq = Contig(line2.seq, "new_seq")
        al2 = AlignmentPiece.Identical(
            line2.asSegment(),
            new_seq.asSegment().suffix(length=len(line2)))
        sys.stdout.trace("Al2:", al2)
        # Re-express the alignment as new_seq -> line1 so that it can be
        # handed to line1.correctSequence.
        alignment = alignment.compose(al2).reverse()
        sys.stdout.trace("Composed alignment", alignment)
        sys.stdout.trace("\n".join(alignment.asMatchingStrings()))
        sys.stdout.trace(alignment.cigar)
        assert alignment.seg_to.right == len(line1)
        assert alignment.seg_from.left == al2.seg_to.left
        line1.correctSequence([alignment])

        # Now lines have exact match
        name = "(" + ",".join(
            basic.parseLineName(line1.id) +
            basic.parseLineName(line2.id)) + ")"
        line = self.addNew(new_seq.seq, name)
        assert line.seq.startswith(line1.seq)
        assert line.seq.endswith(line2.seq)
        # Identity embeddings of line1 (as prefix) and line2 (as suffix) into
        # the merged line; their targets are used to transfer stored data.
        al1 = AlignmentPiece.Identical(
            line1.asSegment(),
            line.asSegment().prefix(length=len(line1)))
        al2 = AlignmentPiece.Identical(
            line2.asSegment(),
            line.asSegment().suffix(length=len(line2)))

        # Move every per-line collection onto the merged line, merging the
        # contributions of line1 and line2.
        line.initial.addAll(
            line1.initial.targetAsSegment(al1.seg_to).merge(
                line2.initial.targetAsSegment(al2.seg_to)))
        line.correct_segments.addAll(
            line1.correct_segments.contigAsSegment(al1.seg_to).merge(
                line2.correct_segments.contigAsSegment(al2.seg_to)))
        line.completely_resolved.addAll(
            line1.completely_resolved.contigAsSegment(al1.seg_to).merge(
                line2.completely_resolved.contigAsSegment(al2.seg_to), k))
        line.disjointig_alignments.addAll(
            line1.disjointig_alignments.targetAsSegment(al1.seg_to).merge(
                line2.disjointig_alignments.targetAsSegment(al2.seg_to)))
        for al in line1.read_alignments.targetAsSegment(al1.seg_to).merge(
                line2.read_alignments.targetAsSegment(al2.seg_to)):
            line.addReadAlignment(al)
        line1.cleanReadAlignments()
        line2.cleanReadAlignments()

        self.notifyMergedLines(al1, al2)
        # Remember the knots before the old lines are removed.
        knot_right = line2.knot
        knot_left = line1.rc.knot
        self.remove(line1)
        self.remove(line2)
        if knot_right is not None:
            if knot_right.line_right == line1:
                # line2 was tied back to line1, so the merged line is tied to
                # itself (presumably a circular line -- confirm).
                line.tie(line, knot_right.gap, knot_right.gap_seq)
            else:
                line.tie(knot_right.line_right, knot_right.gap,
                         knot_right.gap_seq)
        # The left knot is skipped when it pointed at line2.rc: that is the
        # same connection already handled by the self-tie above.
        if knot_left is not None and knot_left.line_right != line2.rc:
            line.rc.tie(knot_left.line_right, knot_left.gap, knot_left.gap_seq)
        return line