Esempio n. 1
0
 def splitFromContigs(self, contigs, max_contig=50000, cut_size=20000):
     # type: (ContigStorage, int, int) -> None
     """Create lines from every unique contig of ``contigs``.

     A contig whose id fails ``basic.isCanonocal`` is swapped for its ``rc``
     counterpart first. Contigs of length at most ``max_contig`` become a
     single line named ``"L" + id``. Longer contigs become two lines built
     from the first and the last ``cut_size`` characters (names suffixed
     with ``"l"`` and ``"r"``); the two are tied together with the
     remaining middle of the sequence as the gap.
     """
     for contig in contigs.unique():
         if not basic.isCanonocal(contig.id):
             contig = contig.rc
         base_name = "L" + contig.id
         if len(contig) <= max_contig:
             # Short enough: a single line covering the whole contig.
             whole = self.addNew(contig.seq, base_name)
             whole.initial.add(
                 AlignmentPiece.Identical(contig.asSegment(),
                                          whole.asSegment()))
             continue
         # Long contig: keep only the two ends as separate lines.
         head = self.addNew(contig.seq[:cut_size], base_name + "l")
         tail = self.addNew(contig.seq[-cut_size:], base_name + "r")
         head_source = contig.asSegment().prefix(length=cut_size)
         head.initial.add(
             AlignmentPiece.Identical(head_source, head.asSegment()))
         tail_source = contig.asSegment().suffix(length=cut_size)
         tail.initial.add(
             AlignmentPiece.Identical(tail_source, tail.asSegment()))
         gap_length = len(contig) - 2 * cut_size
         gap_seq = contig.seq[cut_size:-cut_size]
         head.tie(tail, gap_length, gap_seq)
Esempio n. 2
0
    def splitLine(self, seg):
        # type: (Segment) -> Tuple[NewLine, NewLine]
        """Replace the line containing ``seg`` with two new lines.

        The first new line (id suffix ``"l"``) is built from the prefix of
        the old line taken at ``seg.right``; the second (id suffix ``"r"``)
        from the suffix taken at ``seg.left``. Initial alignments, correct
        and completely resolved segments, disjointig alignments and read
        alignments of the old line are transferred to whichever new line
        they intersect. The old line is removed from the storage and its
        knots (if any) are re-tied to the new lines.

        Returns the pair ``(left line, right line)``.
        """
        sys.stdout.trace("Line operation Split", seg)
        old_line = seg.contig  # type: NewLine
        k = params.k
        min_al = params.min_alignment_size

        left_part = old_line.asSegment().prefix(pos=seg.right)
        left_line = self.addNew(left_part.Seq(), old_line.id + "l")
        right_part = old_line.asSegment().suffix(pos=seg.left)
        right_line = self.addNew(right_part.Seq(), old_line.id + "r")
        # Identity mappings from each part of the old line onto its new line.
        to_left = AlignmentPiece.Identical(left_part, left_line.asSegment())
        to_right = AlignmentPiece.Identical(right_part,
                                            right_line.asSegment())

        left_initial = [
            al.embed(to_left)
            for al in old_line.initial.allInter(left_part, min_al)
        ]
        left_line.initial.addAll(left_initial)
        right_initial = [
            al.embed(to_right)
            for al in old_line.initial.allInter(right_part, min_al)
        ]
        right_line.initial.addAll(right_initial)

        left_correct = old_line.correct_segments.cap(seg=left_part,
                                                     min_inter=k)
        left_line.correct_segments.addAll(left_correct.map(to_left))
        right_correct = old_line.correct_segments.cap(seg=right_part,
                                                      min_inter=k)
        right_line.correct_segments.addAll(right_correct.map(to_right))

        left_resolved = old_line.completely_resolved.cap(seg=left_part,
                                                         min_inter=k)
        left_line.completely_resolved.addAll(
            left_resolved.map(to_left).filterBySize(min=k))
        right_resolved = old_line.completely_resolved.cap(seg=right_part,
                                                          min_inter=k)
        right_line.completely_resolved.addAll(
            right_resolved.map(to_right).filterBySize(min=k))

        left_disjointig = [
            al.embed(to_left)
            for al in old_line.disjointig_alignments.allInter(left_part, k)
        ]
        left_line.disjointig_alignments.addAll(left_disjointig)
        right_disjointig = [
            al.embed(to_right)
            for al in old_line.disjointig_alignments.allInter(right_part, k)
        ]
        right_line.disjointig_alignments.addAll(right_disjointig)

        # All reads are transferred to the left line before any is
        # transferred to the right line.
        transfers = ((left_part, to_left, left_line),
                     (right_part, to_right, right_line))
        for part, mapping, new_line in transfers:
            for al in old_line.read_alignments:
                if al.seg_to.interSize(part) > params.k:
                    new_line.addReadAlignment(al.embed(mapping))

        old_line.cleanReadAlignments()
        self.notifySplitLine(to_left, to_right)
        self.remove(old_line)

        knot = old_line.knot
        if knot is not None:
            right_line.tie(knot.line_right, knot.gap, knot.gap_seq)
        rc_knot = old_line.rc.knot
        if rc_knot is not None:
            left_line.rc.tie(rc_knot.line_right, rc_knot.gap,
                             rc_knot.gap_seq)
        return left_line, right_line
Esempio n. 3
0
 def polishSmallSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
     """Polish the sequence of ``seg`` using the read alignments ``als``.

     Returns an alignment from the polished sequence to ``seg``. When no
     alignment in ``als`` has a target containing ``seg``, a warning is
     logged and an identity alignment from a copy of the segment is
     returned instead.

     If polishing against the segment itself raises ``PolishException``,
     each read whose alignment contains ``seg`` is tried as the polishing
     base in turn; if all of them fail, the unpolished segment is used.

     Raises:
         AssertionError: if none of the alignments of the polished contig
             to the flanked base passes the end-proximity check.
     """
     if not any(al.seg_to.contains(seg) for al in als):
         sys.stdout.log(common.log_params.LogPriority.warning, "Warning",
                        seg, "has no covering reads")
         return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
     # Random flanks are attached to the base and to the reads that reach
     # within 20 positions of the corresponding end of the segment.
     start = basic.randomSequence(200)
     end = basic.randomSequence(200)
     reads = []
     for al in als:
         reduced = al.reduce(target=seg)
         new_seq = ""
         if reduced.seg_to.left < seg.left + 20:
             new_seq += start
         new_seq += reduced.seg_from.Seq()
         if reduced.seg_to.right > seg.right - 20:
             new_seq += end
         reads.append(NamedSequence(new_seq, reduced.seg_from.contig.id))
     base = Contig(start + seg.Seq() + end, "base")
     polished = None
     try:
         polished = Contig(self.polish(reads, base), "polished")
     except PolishException:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "has a sequence very different from reads. Using reads to correct."
         )
         for al, read in zip(als, reads):
             if al.seg_to.contains(seg):
                 try:
                     polished = Contig(
                         self.polish(reads, Contig(read.seq, read.id)),
                         "polished")
                     break
                 except PolishException:
                     # Deliberate best effort: try the next covering read.
                     pass
     if polished is None:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "could not be corrected even though some reads cover it.")
         polished = seg.asContig()
     polished_als = list(
         self.aligner.overlapAlign([polished], ContigStorage([base])))
     for al in polished_als:
         if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
             # Translate from the flanked base back to the original segment.
             mapping = AlignmentPiece.Identical(
                 base.segment(len(start),
                              len(base) - len(end)), seg)
             return al.compose(mapping)
     # Raised explicitly: ``assert False`` would be stripped under ``-O``
     # and the function would then silently return None.
     raise AssertionError("No alignment from polished to base: " +
                          str(polished_als))
Esempio n. 4
0
 def testManual(self):
     """Check the CIGAR produced by ``polyshAlignment`` on two hand-made pairs.

     Each case starts from an identity alignment between two equally long
     contigs and compares the corrected CIGAR with the expected one.
     """
     cases = (
         ("ACGTTAAACGT", "ACGTTTAACGT", "4M1D2M1I4M"),
         ("ACATGATCACT", "ACGTGAAACGT", "6M1I3M1D1M"),
     )
     for seq_from, seq_to, expected_cigar in cases:
         query = Contig(seq_from, "from")
         target = Contig(seq_to, "to")
         identity = AlignmentPiece.Identical(query.asSegment(),
                                             target.asSegment())
         corrected = self.scorer.polyshAlignment(
             identity, params.alignment_correction_radius)
         assert corrected.cigar == expected_cigar, str(
             corrected.asMatchingStrings())
Esempio n. 5
0
 def test1(self):
     """Check the dot plot entry between two lines after one is corrected.

     A polished alignment between lines ``c1`` and ``c2`` is registered in
     the dot plot, the first three positions of ``c2`` are corrected, and
     the stored alignment from ``c1`` to ``c2`` is compared with the
     expected string.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     first = storage.addNew("ACGTAAAAGGGTACGT", "c1")
     second = storage.addNew("ACGTAAGGGGGTACGT", "c2")
     identity = AlignmentPiece.Identical(first.asSegment(),
                                         second.asSegment())
     polished = self.scorer.polyshAlignment(
         identity, params.alignment_correction_radius)
     dot_plot = LineDotPlot(storage, self.aligner)
     dot_plot.addAlignment(polished)
     patch = AlignmentPiece.Identical(
         Contig("AGG", "tmp").asSegment(), second.segment(0, 3))
     second.correctSequence([patch])
     observed = str(list(dot_plot.alignmentsToFrom[second.id][first.id]))
     assert observed == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
Esempio n. 6
0
 def test3(self):
     """Check a line's self-alignments in the dot plot after a correction.

     Two identity self-alignments of line ``c`` are registered, positions
     3..6 of the line are corrected, and the resulting list of
     auto-alignments is compared with the expected string.
     """
     storage = NewLineStorage(DisjointigCollection(), self.aligner)
     line = storage.addNew("ACGTACGTACGT", "c")
     dot_plot = LineDotPlot(storage, self.aligner)
     shifted_long = AlignmentPiece.Identical(line.segment(0, 8),
                                             line.segment(4, 12))
     shifted_short = AlignmentPiece.Identical(line.segment(0, 4),
                                              line.segment(8, 12))
     for piece in (shifted_long, shifted_short):
         dot_plot.addAlignment(piece)
     patch = AlignmentPiece.Identical(
         Contig("TCC", "tmp").asSegment(), line.segment(3, 6))
     line.correctSequence([patch])
     observed = str(list(dot_plot.auto_alignments["c"]))
     assert observed == "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
Esempio n. 7
0
 def testManual(self):
     """Check ``compose`` and ``composeTargetDifference`` on three contigs.

     Both ways of combining the c1->c2 and c2->c3 identity alignments are
     expected to give the same c1->c3 alignment with CIGAR ``12M``.
     """
     first, second, third = [
         Contig(seq, name)
         for seq, name in (("ACGTACGTACGT", "c1"),
                           ("ACGTAGGTACGT", "c2"),
                           ("ACTTACGTACGT", "c3"))
     ]
     first_to_second = AlignmentPiece.Identical(first.asSegment(),
                                                second.asSegment())
     second_to_third = AlignmentPiece.Identical(second.asSegment(),
                                                third.asSegment())
     expected_repr = "(c1[0:12-0]->c3[0:12-0]:0.92)"

     composed = first_to_second.compose(second_to_third)
     assert repr(composed) == expected_repr
     assert composed.cigar == "12M"

     second_to_first = first_to_second.reverse()
     via_difference = second_to_first.composeTargetDifference(
         second_to_third)
     assert repr(via_difference) == expected_repr
     assert via_difference.cigar == "12M"
Esempio n. 8
0
 def merge(self, other):
     # type: (AlignmentStorage) -> AlignmentStorage
     """Combine this storage with ``other`` into a new ``AlignmentStorage``.

     Alignments are grouped by (target contig, query contig). Within a
     group each alignment of this storage is merged with at most one
     alignment of ``other``: the first one, ordered by query start, that
     begins before the left alignment's query end, passes ``canMergeTo``
     and for which ``MergeOverlappingAlignments`` succeeds. Alignments of
     ``other`` that were not consumed by a merge are added unchanged.
     """
     left_items = [(al, -1) for al in self]
     right_items = [(al, 1) for al in other]
     # Index-based key: tuple parameters in lambdas are Python 2 only.
     new_items = sorted(
         left_items + right_items,
         key=lambda item: (item[0].seg_to.contig.id,
                           item[0].seg_from.contig.id,
                           item[0].seg_from.left))
     new_storage = AlignmentStorage()
     groups = itertools.groupby(
         new_items,
         lambda item: (item[0].seg_to.contig, item[0].seg_from.contig))
     for _, group in groups:
         al_sides = list(group)
         als_left = sorted(
             (al for al, side in al_sides if side == -1),
             key=lambda al: al.seg_from.right)  # type: List[AlignmentPiece]
         als_right = sorted(
             (al for al, side in al_sides if side == 1),
             key=lambda al: al.seg_from.left)  # type: List[AlignmentPiece]
         merged = []
         for al in als_left:
             for j, candidate in enumerate(als_right):
                 if candidate is None:
                     # Already consumed by an earlier merge.
                     continue
                 if candidate.seg_from.left >= al.seg_from.right:
                     # Sorted by query start: no later candidate can overlap.
                     break
                 if al.canMergeTo(candidate):
                     tmp = AlignmentPiece.MergeOverlappingAlignments(
                         [al, candidate])
                     if tmp is not None:
                         al = tmp
                         als_right[j] = None
                         break
             merged.append(al)
         new_storage.addAll(merged)
         new_storage.addAll([al for al in als_right if al is not None])
     return new_storage
Esempio n. 9
0
 def polishSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
     """Polish ``seg`` window by window and glue the results together.

     The contig is cut into windows of width ``max(900, params.k)``, each
     widened by 50 positions on both sides and clipped to the contig. Every
     alignment in ``als`` is assigned to the windows its target intersects.
     Windows intersecting ``seg`` are polished with ``polishSmallSegment``,
     the results are glued with ``GlueOverlappingAlignments`` and the glued
     alignment is reduced to ``seg``.
     """
     sys.stdout.trace("Polishing segment", seg)
     w = max(900, params.k)
     overlap = 50
     contig_len = len(seg.contig)
     # Explicit floor division: window indices must be ints for range().
     first = seg.left // w
     last = min(seg.right + w - 1, contig_len) // w
     n_windows = last - first + 1
     segs = [
         Segment(seg.contig, max(0, i * w - overlap),
                 min(contig_len, (i + 1) * w + overlap))
         for i in range(first, last + 1)
     ]
     als_by_segment = [[] for _ in range(n_windows)]
     for al in als:
         left_idx = al.seg_to.left // w
         right_idx = al.seg_to.right // w + 1
         # Scan one window beyond each side; inter() decides membership.
         for i in range(max(0, left_idx - first - 1),
                        min(n_windows, right_idx - first + 2)):
             if al.seg_to.inter(segs[i]):
                 als_by_segment[i].append(al)
     res_als = [
         self.polishSmallSegment(window, window_als)
         for window, window_als in zip(segs, als_by_segment)
         if window.inter(seg)
     ]
     res = AlignmentPiece.GlueOverlappingAlignments(res_als)
     return res.reduce(target=seg)
Esempio n. 10
0
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build a ``NewLineStorage`` from the init file ``lc_file``.

    The file starts with the number of lines; each record is a contig id
    followed by the ids of the reads to align to that line. Alignments whose
    target is at least ``min(params.k, len(line) - 100)`` long are attached
    to the line. Lines shorter than ``params.k + 200`` are extended to the
    right with ``polisher.polishEnd``. The resulting lines are written to
    ``initial_lines.fasta`` inside ``dir`` and disjointigs are aligned to
    them.

    Both the init file and the FASTA output are closed before returning.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    with open(lc_file, "r") as init_handle:
        f = TokenReader(init_handle)
        n = f.readInt()
        for _ in range(n):
            line_id = f.readToken()
            contig = contigs[line_id]
            assert contig.id == line_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            for al in aligner.overlapAlign([reads[rid] for rid in read_ids], ContigStorage([line])):
                if len(al.seg_to) >= min(params.k, len(line) - 100):
                    tmp_line = al.seg_to.contig # type: NewLine
                    tmp_line.addReadAlignment(al)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(list(line.read_alignments), max_extension=params.k + 100 - len(line))
                line.extendRight(new_contig.suffix(pos=len(line)).Seq(), new_als)
            line.correct_segments.add(line.asSegment().shrink(100))
            line.completely_resolved.add(line.asSegment().shrink(100))
            line.initial.add(AlignmentPiece.Identical(line.asSegment().asContig().asSegment(), line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Closing the handle here guarantees the FASTA is flushed to disk.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
Esempio n. 11
0
 def __iter__(self):
     # type: () -> Generator[AlignmentPiece]
     """Yield every stored alignment, then every stored alignment reversed,
     and finally the identity alignment of the whole line."""
     for flipped in (False, True):
         for piece in self.content:
             yield piece.reverse() if flipped else piece
     yield AlignmentPiece.Identical(self.line.asSegment())
Esempio n. 12
0
 def alignAndFilter(self, reads, ref_storage, mode):
     # type: (Iterable[Contig], ContigStorage, str) -> Generator[AlignmentPiece]
     """Align ``reads`` to ``ref_storage`` and yield the filtered alignments.

     Records returned by ``self.align`` are collected per query name; each
     time the query name changes (and once more at the end) the collected
     batch is passed through the filter registered for ``mode`` and its
     output is yielded. Unmapped records are skipped. In ``"dotplot"`` mode
     alignments are split with ``splitRef``, in ``"local"`` and ``"ava-pb"``
     modes with ``splitRead``; otherwise they are kept whole.
     """
     als_filter = self.filters[mode]
     read_storage = ContigStorage(reads, False)
     split_by_read = mode in ("local", "ava-pb")
     batch = []
     for rec in self.align(read_storage, list(ref_storage.unique()), mode):
         if rec.is_unmapped:
             continue
         if batch and rec.query_name != batch[0].seg_from.contig.id:
             # A new read started: flush what was collected for the last one.
             for al in list(als_filter(batch)):
                 yield al
             batch = []
         if batch:
             seq_from = batch[0].seg_from.contig
         else:
             seq_from = read_storage[rec.query_name]
         seq_to = ref_storage[rec.tname]
         piece = AlignmentPiece.FromSamRecord(seq_from, seq_to, rec)
         if piece is None:
             continue
         if mode == "dotplot":
             batch.extend(piece.splitRef())
         elif split_by_read:
             batch.extend(piece.splitRead())
         else:
             batch.append(piece)
     if batch:
         for al in list(als_filter(batch)):
             yield al
Esempio n. 13
0
def printAlignments(sam_handler, reference_handler, reads_handler):
    """Print the alignments of a SAM file together with their matching strings.

    Reference and reads are loaded from the given FASTA handlers. Every SAM
    record whose query is a known read and whose target is a known reference
    is converted to an ``AlignmentPiece``; records that cannot be converted
    are reported by query and target name. Alignments are printed in order
    of their target start. When ``splitRead`` yields more than one piece, a
    marker line is printed above the matching strings with ``[`` and ``]``
    at the first and last query position of each piece. The final line
    shows the total number of ``-`` characters in the first and in the
    second matching strings.
    """
    print("Loading reference")
    cc = ContigStorage(add_rc=False).loadFromFasta(reference_handler, False)
    print("Loading query")
    reads = ContigStorage().loadFromFasta(reads_handler, False)
    print("Loading result")
    res = []
    for rec in sam_parser.Samfile(sam_handler):
        if rec.query_name in reads.items and cc[rec.tname] is not None:
            al = AlignmentPiece.FromSamRecord(reads[rec.query_name], cc[rec.tname], rec)
            if al is None:
                print("%s %s" % (rec.query_name, rec.tname))
                continue
            if al.seg_to.contig not in cc:
                al = al.rc
            res.append(al)
    print("Printing result %s" % len(res))
    res = sorted(res, key=lambda al: al.seg_to.left)
    # NOTE: an alternative ordering is by decreasing alignment length:
    #    res = sorted(res, key = lambda al: len(al))[::-1]
    up = 0
    down = 0
    for al in res:
        print(al)
        # Split once and reuse instead of re-splitting for every use.
        pieces = list(al.splitRead())
        print(pieces)
        s1, s2 = al.asMatchingStrings()
        up += s1.count("-")
        down += s2.count("-")
        if len(pieces) > 1:
            # Alternating first/last query positions of the pieces.
            nums = []
            for piece in pieces:
                nums.append(piece.seg_from.left)
                nums.append(piece.seg_from.right - 1)
            s = []
            cur_num = 0
            cur = al.seg_from.left
            for c in s1:
                # The bounds check avoids an IndexError once every boundary
                # has been consumed but characters remain in s1.
                if cur_num < len(nums) and cur == nums[cur_num] and c != "-":
                    s.append("[" if cur_num % 2 == 0 else "]")
                    cur_num += 1
                else:
                    s.append("-" if cur_num % 2 == 0 else "+")
                if c != "-":
                    cur += 1
            print("".join(s))
        print(s1)
        print(s2)
    print("%s %s" % (up, down))
Esempio n. 14
0
 def testManual(self):
     """Exercise ``AlignmentStorage``: iteration, rc view, coverage, lookup.

     Three identity alignments are stored first; two overlapping ones are
     added later to check coverage computation with depth above one.
     """
     def shown(items):
         # Same rendering the assertions compare against.
         return str(list(items))

     query = Contig("ACGTACGTACGT", "from")
     target = Contig("ACGTACGTACGT", "to")
     initial = [
         AlignmentPiece.Identical(query.segment(q_left, q_right),
                                  target.segment(t_left, t_right))
         for q_left, q_right, t_left, t_right in ((0, 4, 0, 4),
                                                  (0, 4, 4, 8),
                                                  (4, 8, 8, 12))
     ]
     storage = AlignmentStorage()
     storage.addAll(initial)
     initial_repr = "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"
     assert shown(storage) == initial_repr
     assert shown(
         storage.rc
     ) == "[(-from[4:12-4]->-to[0:4]:1.000), (-from[8:12-0]->-to[4:12-4]:1.000), (-from[8:12-0]->-to[8:12-0]:1.000)]"
     assert shown(storage.calculateCoverage()) == "[(to[0:12-0], 1)]"
     assert shown(storage.filterByCoverage(0, 1)) == "[]"
     assert shown(storage.filterByCoverage(1, 2)) == "[to[0:12-0]]"
     assert shown(storage.filterByCoverage(2)) == "[]"

     # Re-adding the last alignment must leave the content unchanged.
     storage.addAndMergeRight(initial[2])
     assert shown(storage) == initial_repr

     overlapping = [
         AlignmentPiece.Identical(query.segment(left, right),
                                  target.segment(left, right))
         for left, right in ((2, 8), (4, 10))
     ]
     storage.addAll(overlapping)
     assert shown(
         storage.calculateCoverage()
     ) == "[(to[0:2], 1), (to[2:4], 2), (to[4:12-4], 3), (to[8:12-2], 2), (to[10:12-0], 1)]"
     assert shown(storage.filterByCoverage(2, 3)) == "[to[2:4], to[8:12-2]]"
     assert shown(storage.filterByCoverage(2)) == "[to[2:12-2]]"
     assert shown(
         storage.getAlignmentsTo(target.segment(2, 3))
     ) == "[(from[0:4]->to[0:4]:1.000), (from[2:12-4]->to[2:12-4]:1.000)]"
     assert shown(storage.getAlignmentsTo(
         target.segment(2, 6))) == "[(from[2:12-4]->to[2:12-4]:1.000)]"
Esempio n. 15
0
 def addAndMergeLeft(self, al):
     # type: (AlignmentPiece) -> None
     """Add ``al``, merging it into the first compatible stored alignment.

     On a non-canonical storage the call is delegated to
     ``self.rc.addAndMergeRight`` with ``al.rc``. Otherwise the stored
     items are scanned for one that intersects ``al`` on both query and
     target, starts no earlier than ``al`` on the query, and can be merged
     with it by ``MergeOverlappingAlignments``; that item is replaced by
     the merged alignment. If no such item exists, ``al`` is added as is.
     """
     if not self.isCanonical():
         self.rc.addAndMergeRight(al.rc)
         return
     for i, existing in enumerate(self.items): # type: int, AlignmentPiece
         overlaps = al.seg_from.inter(existing.seg_from) and al.seg_to.inter(existing.seg_to)
         if not overlaps or existing.seg_from.left < al.seg_from.left:
             continue
         combined = AlignmentPiece.MergeOverlappingAlignments([al, existing])
         if combined is not None:
             self.items[i] = combined
             return
     self.add(al)
Esempio n. 16
0
 def testManual(self):
     """Check position mapping and alignment composition of ``Correction``.

     The correction is built from one polished alignment between positions
     5..12 of two 16-character contigs.
     """
     source = Contig("ACGTAAAAGGGTACGT", "c1")
     corrected = Contig("ACGTAAGGGGGTACGT", "c2")
     identity = AlignmentPiece.Identical(source.segment(5, 12),
                                         corrected.segment(5, 12))
     polished = self.scorer.polyshAlignment(
         identity, params.alignment_correction_radius)
     corr = Correction(source, corrected, [polished])

     expected_up = [0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15]
     assert corr.mapPositionsUp(range(len(corrected))) == expected_up
     expected_down = [0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15]
     assert corr.mapPositionsDown(range(len(source))) == expected_down

     self_alignments = [
         AlignmentPiece.Identical(corrected.segment(left, right))
         for left, right in ((0, 4), (6, 8), (6, 16), (7, 16))
     ]
     observed = str(corr.composeQueryDifferences(self_alignments))
     assert observed == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
Esempio n. 17
0
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads,
                       polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build a ``NewLineStorage`` from the init file ``lc_file``.

    The file starts with the number of lines; each record is a contig id
    followed by the ids of the reads to align to that line. When a record
    lists no reads, a warning is emitted and all ``reads`` are aligned
    instead. Alignments whose target is at least
    ``min(1500, len(line) - 100)`` long are kept, and for every read only
    the first one in order of (higher integer percent identity, greater
    length) is attached to the line. Lines shorter than ``params.k + 200``
    are extended to the right with ``polisher.polishEnd``. The resulting
    lines are written to ``initial_lines.fasta`` inside ``dir`` and
    disjointigs are aligned to them.

    Both the init file and the FASTA output are closed before returning.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    with open(lc_file, "r") as init_handle:
        f = TokenReader(init_handle)
        n = f.readInt()
        for _ in range(n):
            line_id = f.readToken()
            contig = contigs[line_id]
            assert contig.id == line_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            line_reads = [reads[rid] for rid in read_ids]
            if len(line_reads) == 0:
                sys.stdout.warn("No read alignments in initialization for line",
                                line.id, "Realigning all reads")
                line_reads = reads
            # The line is not modified while alignments are collected, so
            # the length threshold is computed once.
            min_target_len = min(1500, len(line) - 100)
            als = [
                al for al in aligner.overlapAlign(line_reads,
                                                  ContigStorage([line]))
                if len(al.seg_to) >= min_target_len
            ]
            als = sorted(als,
                         key=lambda al: (al.seg_from.contig.id, -int(
                             al.percentIdentity() * 100), -len(al)))
            for key, read_als in itertools.groupby(
                    als, key=lambda al: al.seg_from.contig.id):
                # Sorted best-first within a read: keep only the first one.
                al = next(read_als)
                tmp_line = al.seg_to.contig  # type: NewLine
                tmp_line.addReadAlignment(al)
            correct_seg = line.asSegment().shrink(100)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(new_contig.suffix(pos=len(line)).Seq(),
                                 new_als)
                if len(correct_seg) < params.k:
                    correct_seg = correct_seg.expandRight(params.k -
                                                          len(correct_seg))
            line.correct_segments.add(correct_seg)
            line.completely_resolved.add(correct_seg)
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Closing the handle here guarantees the FASTA is flushed to disk.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
Esempio n. 18
0
 def test5(self):
     """Check the dot plot after inserting a long stretch into a line.

     The correction contig consists of letter ``a``, a fixed inserted
     sequence and letter ``b``; its ends are aligned to the matching parts
     of line ``abc`` and merged into one alignment used for the correction.
     """
     dataset = TestDataset("abcABC")
     corrected_name = dataset.addContig("abc")
     dataset.addContig("ABC")
     lines, dp, reads = dataset.genAll(self.aligner)
     line = lines[corrected_name]
     seq_a = dataset.alphabet["a"].seq
     seq_b = dataset.alphabet["b"].seq
     inserted = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
     tmp = Contig(seq_a + inserted + seq_b, "tmp")
     head = AlignmentPiece.Identical(tmp.prefix(len=len(seq_a)),
                                     line.prefix(len=len(seq_a)))
     tail = AlignmentPiece.Identical(
         tmp.asSegment().suffix(length=len(seq_b)),
         line.segment(len(seq_a), len(seq_a) + len(seq_b)))
     correction = AlignmentPiece.MergeFittingAlignments([head, tail])
     line.correctSequence([correction])
     observed = str(list(dp.allInter(line.asSegment())))
     assert observed == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
Esempio n. 19
0
 def testManual(self):
     """Check ``GlueOverlappingAlignments`` and ``reduce`` on small inputs."""
     query = Contig("ACGTACGTA", "from")
     target = Contig("ACTACGTACGTACAT", "to")
     with_insertion = AlignmentPiece(query.asSegment(),
                                     target.segment(0, 8), "2M1I6M")
     plain = AlignmentPiece(query.segment(0, 8), target.segment(7, 15),
                            "8M")
     glued = AlignmentPiece.GlueOverlappingAlignments(
         [with_insertion, plain])
     assert glued.cigar == "2M1I5M8M", str(glued) + " " + glued.cigar
     assert glued.seg_from.Seq(
     ) == "ACGTACGTACGTACGT", str(glued) + " " + glued.cigar
     # Reducing to query prefixes of growing length.
     for prefix_end, expected_cigar in ((2, "2M"), (3, "2M"),
                                        (4, "2M1I1M")):
         reduced = with_insertion.reduce(
             query=query.segment(0, prefix_end))
         assert reduced.cigar == expected_cigar
Esempio n. 20
0
 def loadLine(self, handler, disjointigs, reads, contigs):
     # type: (TokenReader, DisjointigCollection, ReadCollection, ContigCollection) -> None
     """Populate this line from the token stream ``handler``.

     Reads the id and the sequence, then a counted list of initial-alignment
     records. From each record only the target segment is used (the other
     four tokens are read and discarded) and an identity alignment from a
     copy of that segment is stored in ``self.initial``. The correct
     segments, completely resolved segments, disjointig alignments and read
     alignments are then loaded by their own ``load`` methods, and every
     read alignment is registered with its read. ``contigs`` is not used.
     """
     self.id = handler.readToken()
     self.seq = handler.readToken()
     self.rc.id = basic.Reverse(self.id)
     for _ in range(handler.readInt()):
         # Three leading tokens of the record are skipped.
         for _skipped in range(3):
             handler.readToken()
         target_seg = Segment.load(handler, self)
         # One trailing token of the record is skipped as well.
         handler.readToken()
         source_seg = target_seg.asContig().asSegment()
         self.initial.add(AlignmentPiece.Identical(source_seg, target_seg))
     self.correct_segments.load(handler, self)
     self.completely_resolved.load(handler, self)
     self.disjointig_alignments.load(handler, disjointigs, self)
     self.read_alignments.load(handler, reads, self)
     for al in self.read_alignments:
         aligned_read = al.seg_from.contig #type: AlignedRead
         aligned_read.addAlignment(al)
     self.max_extension = False
Esempio n. 21
0
 def genAll(self, aligner):
     # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
     """Build the line storage, dot plot and read collection of this dataset.

     Disjointigs and contigs of the dataset are copied into fresh
     collections, every new line receives an identity initial alignment,
     the dot plot is constructed, disjointigs are aligned to the lines and
     the reads are locally aligned to the disjointigs.
     """
     disjointigs = DisjointigCollection()
     for dis in self.disjointigs:
         disjointigs.addNew(dis.seq, dis.id)
     from disjointig_resolve.line_storage import NewLineStorage
     lines = NewLineStorage(disjointigs, aligner)

     def printer(line):
         # Line id followed by its translation back through the dataset.
         return line.id + "_" + self.translateBack(line, aligner)

     lines.name_printer = printer
     for contig in self.contigs:
         new_line = lines.addNew(contig.seq, contig.id)
         source = new_line.asSegment().asContig().asSegment()
         new_line.initial.add(
             AlignmentPiece.Identical(source, new_line.asSegment()))
     dot_plot = LineDotPlot(lines, aligner)
     dot_plot.construct(aligner)
     lines.alignDisjointigs()
     reads = ReadCollection()
     for read in self.reads:
         reads.addNewRead(read)
     read_als = aligner.localAlign(reads, disjointigs)
     disjointigs.addAlignments(read_als)
     return lines, dot_plot, reads
Esempio n. 22
0
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, int, int) -> Tuple[Contig, List[AlignmentPiece]]
     if max_extension is None:
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     new_contig = contig.asSegment().asContig()
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         for base_al in relevant_als:
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # Base consists 500 random nucleotides and 500 last nucls from the polished sequence a segment of read of length at most 500
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         al = al1.mergeDistant(al2)
                         if al is None:
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         if len(new_contig) >= max_len:
             break
         if not found:
             break
     return new_contig, relevant_als + finished_als
Esempio n. 23
0
 def fillFromContigs(self, contigs):
     # type: (Iterable[Contig]) -> None
     """Create one line for every contig produced by UniqueList(contigs).

     Each new line gets the contig sequence and the id "L" + contig id,
     and an identity alignment of the whole contig onto the whole line
     is stored in the line's ``initial`` collection.
     """
     for source in UniqueList(contigs):
         line_name = "L" + source.id
         created = self.addNew(source.seq, line_name)
         # The line is an exact copy of the contig, so the mapping between
         # the two full segments is an identity alignment.
         created.initial.add(
             AlignmentPiece.Identical(source.asSegment(),
                                      created.asSegment()))
Esempio n. 24
0
    def mergeLines(self, alignment, k):
        # type: (AlignmentPiece, int) -> NewLine
        """Merge the two lines joined by ``alignment`` into one new line.

        ``line1`` is the contig of ``alignment.seg_from`` and ``line2`` the
        contig of ``alignment.seg_to``. The method:

        1. extends ``line1`` with sequence taken from ``line2`` when the
           alignment is shorter than ``k + 100``;
        2. cuts ``line1`` to the right of the alignment and ``line2`` to the
           left of it;
        3. corrects ``line1`` using the alignment against the merged
           sequence (prefix of ``line1`` before the alignment + ``line2``);
        4. creates the merged line via ``self.addNew`` and moves the
           ``initial``, ``correct_segments``, ``completely_resolved``,
           ``disjointig_alignments`` and read alignments of both lines
           onto it;
        5. notifies about the merge, removes both old lines and re-ties
           the knots they had to the new line.

        :param alignment: alignment from a segment of ``line1`` to a
            segment of ``line2``; the two lines must differ.
        :param k: length parameter; used for the minimal alignment length
            (``k + 100``) and passed to the ``completely_resolved`` merge.
        :return: the newly created merged line.
        :raises AssertionError: if both segments lie on the same line or
            one of the internal consistency checks fails.
        """
        sys.stdout.trace("Line operation Merge", alignment.seg_from.contig,
                         alignment.seg_to.contig, alignment)
        line1 = alignment.seg_from.contig  #type: NewLine
        line2 = alignment.seg_to.contig  #type: NewLine
        assert line1 != line2
        # A too-short overlap is prolonged: the piece of line2 that follows
        # the aligned region is appended to line1 and the alignment is merged
        # with the identity alignment over that appended piece.
        if len(alignment) < k + 100:
            sys.stdout.trace(
                "Prolonging line to ensure alignment of at least k")
            seg = line2.segment(
                alignment.seg_to.right,
                alignment.seg_to.right + k + 100 - len(alignment))
            line1.extendRight(seg.Seq())
            alignment = alignment.mergeDistant(
                AlignmentPiece.Identical(
                    line1.asSegment().suffix(length=len(seg)), seg))
        # Cutting hanging tips of both lines
        # NOTE(review): al_storage is not read again within this method --
        # confirm whether it is still needed.
        al_storage = AlignmentStorage()
        al_storage.add(alignment)
        # The two-line storage is registered as a listener on both lines and
        # the alignment is re-read from it after the cuts (presumably the
        # listener keeps the stored alignment in sync with line edits --
        # TODO confirm).
        storage = TwoLineAlignmentStorage(line1, line2)
        line2.addListener(storage)
        line1.addListener(storage.reverse)
        storage.add(alignment)
        # Drop the part of line1 that lies to the right of the alignment.
        if alignment.seg_from.right < len(line1):
            line1.cutRight(alignment.seg_from.right)
            sys.stdout.trace("Cut right")
            sys.stdout.trace(list(storage.content)[0])
            sys.stdout.trace("\n".join(
                list(storage.content)[0].asMatchingStrings()))
            sys.stdout.trace(list(storage.content)[0].cigar)
        # Drop the part of line2 before the alignment; this is expressed as
        # a right cut on the reverse complement of line2.
        if alignment.seg_to.left > 0:
            line2.rc.cutRight(len(line2) - alignment.seg_to.left)
            sys.stdout.trace("Cut left")
            sys.stdout.trace(list(storage.content)[0])
            sys.stdout.trace("\n".join(
                list(storage.content)[0].asMatchingStrings()))
            sys.stdout.trace(list(storage.content)[0].cigar)
        # Pick up the alignment as currently held by the storage, then
        # detach the listeners registered above.
        alignment = list(storage.content)[0]  # type: AlignmentPiece
        line2.removeListener(storage)
        line1.removeListener(storage.reverse)

        # Making sure line sequences match on the overlap
        # Merged sequence: the part of line1 before the alignment start
        # (if any) followed by the whole of line2.
        if alignment.seg_from.left > 0:
            new_seq = Contig(
                line1.asSegment().prefix(pos=alignment.seg_from.left).Seq() +
                line2.seq, "new_seq")
        else:
            new_seq = Contig(line2.seq, "new_seq")
        # Identity mapping of line2 onto the suffix of the merged sequence.
        al2 = AlignmentPiece.Identical(
            line2.asSegment(),
            new_seq.asSegment().suffix(length=len(line2)))
        sys.stdout.trace("Al2:", al2)
        # Compose with al2 and reverse; the asserts below check that the
        # result ends at the end of line1 and starts where line2 starts
        # inside new_seq.
        alignment = alignment.compose(al2).reverse()
        sys.stdout.trace("Composed alignment", alignment)
        sys.stdout.trace("\n".join(alignment.asMatchingStrings()))
        sys.stdout.trace(alignment.cigar)
        assert alignment.seg_to.right == len(line1)
        assert alignment.seg_from.left == al2.seg_to.left
        line1.correctSequence([alignment])

        # Now lines have exact match
        # The merged line is named after the parsed names of both lines,
        # comma-separated and wrapped in parentheses.
        name = "(" + ",".join(
            basic.parseLineName(line1.id) +
            basic.parseLineName(line2.id)) + ")"
        line = self.addNew(new_seq.seq, name)
        assert line.seq.startswith(line1.seq)
        assert line.seq.endswith(line2.seq)
        # Identity mappings of the old lines onto the prefix / suffix of the
        # merged line; used to retarget everything stored on the old lines.
        al1 = AlignmentPiece.Identical(
            line1.asSegment(),
            line.asSegment().prefix(length=len(line1)))
        al2 = AlignmentPiece.Identical(
            line2.asSegment(),
            line.asSegment().suffix(length=len(line2)))

        # Retarget each collection of both lines onto the merged line and
        # merge the two retargeted collections.
        line.initial.addAll(
            line1.initial.targetAsSegment(al1.seg_to).merge(
                line2.initial.targetAsSegment(al2.seg_to)))
        line.correct_segments.addAll(
            line1.correct_segments.contigAsSegment(al1.seg_to).merge(
                line2.correct_segments.contigAsSegment(al2.seg_to)))
        line.completely_resolved.addAll(
            line1.completely_resolved.contigAsSegment(al1.seg_to).merge(
                line2.completely_resolved.contigAsSegment(al2.seg_to), k))
        line.disjointig_alignments.addAll(
            line1.disjointig_alignments.targetAsSegment(al1.seg_to).merge(
                line2.disjointig_alignments.targetAsSegment(al2.seg_to)))
        for al in line1.read_alignments.targetAsSegment(al1.seg_to).merge(
                line2.read_alignments.targetAsSegment(al2.seg_to)):
            line.addReadAlignment(al)
        line1.cleanReadAlignments()
        line2.cleanReadAlignments()

        self.notifyMergedLines(al1, al2)
        # Remember the outer knots before the old lines are removed: the
        # knot of line2 and the knot of the reverse complement of line1.
        knot_right = line2.knot
        knot_left = line1.rc.knot
        self.remove(line1)
        self.remove(line2)
        if knot_right is not None:
            if knot_right.line_right == line1:
                # line2 was tied back to line1, so the merged line is tied
                # to itself.
                line.tie(line, knot_right.gap, knot_right.gap_seq)
            else:
                line.tie(knot_right.line_right, knot_right.gap,
                         knot_right.gap_seq)
        # The left knot is re-tied on the reverse complement of the merged
        # line unless it pointed at line2.rc.
        if knot_left is not None and knot_left.line_right != line2.rc:
            line.rc.tie(knot_left.line_right, knot_left.gap, knot_left.gap_seq)
        return line
Esempio n. 25
0
 def load(self, handler, collection_from, collection_to):
     # type: (TokenReader, Any, Any) -> None
     """Read alignment pieces from ``handler`` and add them to this object.

     An integer count is read first via ``handler.readInt()``; then that
     many pieces are loaded with ``AlignmentPiece.load`` (using the two
     given collections) and passed one by one to ``self.add``.
     """
     count = handler.readInt()
     for _ in range(count):
         piece = AlignmentPiece.load(handler, collection_from, collection_to)
         self.add(piece)