def splitFromContigs(self, contigs, max_contig=50000, cut_size=20000):
    # type: (ContigStorage, int, int) -> None
    """Register lines for every unique contig, splitting the long ones.

    Each contig is first replaced by its reverse complement when its id is
    not canonical.  A contig no longer than ``max_contig`` becomes a single
    line named ``"L" + id``.  A longer contig becomes two lines built from
    its first and last ``cut_size`` characters (``...l`` and ``...r``),
    tied together with the middle part of the contig as the gap sequence.
    """
    for contig in contigs.unique():
        if not basic.isCanonocal(contig.id):
            contig = contig.rc

        # Short contig: one line covering the whole sequence.
        if len(contig) <= max_contig:
            whole = self.addNew(contig.seq, "L" + contig.id)
            whole.initial.add(
                AlignmentPiece.Identical(contig.asSegment(),
                                         whole.asSegment()))
            continue

        # Long contig: keep only the two ends and remember the gap.
        left = self.addNew(contig.seq[:cut_size], "L" + contig.id + "l")
        right = self.addNew(contig.seq[-cut_size:], "L" + contig.id + "r")
        left_origin = contig.asSegment().prefix(length=cut_size)
        left.initial.add(
            AlignmentPiece.Identical(left_origin, left.asSegment()))
        right_origin = contig.asSegment().suffix(length=cut_size)
        right.initial.add(
            AlignmentPiece.Identical(right_origin, right.asSegment()))
        gap_length = len(contig) - 2 * cut_size
        left.tie(right, gap_length, contig.seq[cut_size:-cut_size])
def splitLine(self, seg):
    # type: (Segment) -> Tuple[NewLine, NewLine]
    """Split the line containing ``seg`` into two overlapping lines.

    The left line is the prefix of the original line up to ``seg.right``
    and the right line is the suffix starting at ``seg.left``, so both
    halves contain ``seg``.  Initial alignments, correct segments, resolved
    segments, disjointig alignments and read alignments are transferred to
    each half, listeners are notified, the original line is removed and its
    knots are re-tied to the new halves.

    Returns the pair ``(left_line, right_line)``.
    """
    sys.stdout.trace("Line operation Split", seg)
    line = seg.contig  # type: NewLine
    seg1 = line.asSegment().prefix(pos=seg.right)
    line1 = self.addNew(seg1.Seq(), line.id + "l")
    seg2 = line.asSegment().suffix(pos=seg.left)
    line2 = self.addNew(seg2.Seq(), line.id + "r")
    al1 = AlignmentPiece.Identical(seg1, line1.asSegment())
    al2 = AlignmentPiece.Identical(seg2, line2.asSegment())

    # (part of the old line, new line, embedding old part -> new line);
    # every transfer step handles the left half first, then the right one.
    halves = ((seg1, line1, al1), (seg2, line2, al2))

    for part, half, embedding in halves:
        half.initial.addAll([
            al.embed(embedding)
            for al in line.initial.allInter(part, params.min_alignment_size)
        ])
    for part, half, embedding in halves:
        capped = line.correct_segments.cap(seg=part, min_inter=params.k)
        half.correct_segments.addAll(capped.map(embedding))
    for part, half, embedding in halves:
        capped = line.completely_resolved.cap(seg=part, min_inter=params.k)
        half.completely_resolved.addAll(
            capped.map(embedding).filterBySize(min=params.k))
    for part, half, embedding in halves:
        half.disjointig_alignments.addAll([
            al.embed(embedding)
            for al in line.disjointig_alignments.allInter(part, params.k)
        ])
    for part, half, embedding in halves:
        for al in line.read_alignments:
            if al.seg_to.interSize(part) > params.k:
                half.addReadAlignment(al.embed(embedding))

    line.cleanReadAlignments()
    self.notifySplitLine(al1, al2)
    self.remove(line)

    # Re-attach the knots of the removed line to the matching halves.
    right_knot = line.knot
    if right_knot is not None:
        line2.tie(right_knot.line_right, right_knot.gap, right_knot.gap_seq)
    left_knot = line.rc.knot
    if left_knot is not None:
        line1.rc.tie(left_knot.line_right, left_knot.gap, left_knot.gap_seq)
    return line1, line2
def polishSmallSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish a short segment using the reads aligned to it.

    :param seg: segment to polish.
    :param als: alignments of reads to the contig that contains ``seg``.
    :return: an alignment from the polished sequence to ``seg``.  If no
        alignment fully covers ``seg``, or polishing fails for every
        candidate base, the result is built from the unpolished sequence
        of ``seg``.
    :raises AssertionError: if the polished sequence cannot be aligned
        back to the flanked base sequence.
    """
    if not any(al.seg_to.contains(seg) for al in als):
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "has no covering reads")
        return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)

    # Random flanks are attached to the base and to every read that reaches
    # the corresponding end of the segment (within 20 positions).
    reads = []
    start = basic.randomSequence(200)
    end = basic.randomSequence(200)
    for al in als:
        new_seq = ""
        al = al.reduce(target=seg)
        if al.seg_to.left < seg.left + 20:
            new_seq += start
        new_seq += al.seg_from.Seq()
        if al.seg_to.right > seg.right - 20:
            new_seq += end
        reads.append(NamedSequence(new_seq, al.seg_from.contig.id))

    base = Contig(start + seg.Seq() + end, "base")
    polished = None
    try:
        polished = Contig(self.polish(reads, base), "polished")
    except PolishException:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "has a sequence very different from reads. Using reads to correct."
        )
        # Fall back to using each covering read in turn as the base.
        for al, read in zip(als, reads):
            if al.seg_to.contains(seg):
                try:
                    polished = Contig(
                        self.polish(reads, Contig(read.seq, read.id)),
                        "polished")
                    break
                except PolishException:
                    # This read did not work as a base; try the next one.
                    pass
    if polished is None:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "could not be corrected even though some reads cover it.")
        polished = seg.asContig()

    als = list(self.aligner.overlapAlign([polished], ContigStorage([base])))
    for al in als:
        # Accept an alignment that starts and ends within 10 positions of
        # the ends of the polished sequence.
        if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
            mapping = AlignmentPiece.Identical(
                base.segment(len(start), len(base) - len(end)), seg)
            return al.compose(mapping)
    # Raised explicitly rather than via ``assert False`` so that the failure
    # is not silently dropped when Python runs with -O.
    raise AssertionError("No alignment from polished to base: " + str(als))
def testManual(self):
    """Check the CIGAR that the scorer produces for two small hand-made pairs."""
    # (source sequence, target sequence, expected CIGAR after correction)
    cases = (
        ("ACGTTAAACGT", "ACGTTTAACGT", "4M1D2M1I4M"),
        ("ACATGATCACT", "ACGTGAAACGT", "6M1I3M1D1M"),
    )
    for seq_from, seq_to, expected_cigar in cases:
        source = Contig(seq_from, "from")
        target = Contig(seq_to, "to")
        identical = AlignmentPiece.Identical(source.asSegment(),
                                             target.asSegment())
        corrected = self.scorer.polyshAlignment(
            identical, params.alignment_correction_radius)
        assert corrected.cigar == expected_cigar, str(
            corrected.asMatchingStrings())
def test3(self):
    """Correcting a line must update its self-alignments in the dot plot."""
    storage = NewLineStorage(DisjointigCollection(), self.aligner)
    line = storage.addNew("ACGTACGTACGT", "c")
    dot_plot = LineDotPlot(storage, self.aligner)

    # Two self-alignments of the periodic sequence, shifted by 4 and by 8.
    shifted = (
        AlignmentPiece.Identical(line.segment(0, 8), line.segment(4, 12)),
        AlignmentPiece.Identical(line.segment(0, 4), line.segment(8, 12)),
    )
    for piece in shifted:
        dot_plot.addAlignment(piece)

    patch = Contig("TCC", "tmp")
    correction = AlignmentPiece.Identical(patch.asSegment(),
                                          line.segment(3, 6))
    line.correctSequence([correction])

    expected = "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
    observed = str(list(dot_plot.auto_alignments["c"]))
    assert observed == expected
def test1(self):
    """Correcting the target line must update the alignment between two lines."""
    storage = NewLineStorage(DisjointigCollection(), self.aligner)
    line1 = storage.addNew("ACGTAAAAGGGTACGT", "c1")
    line2 = storage.addNew("ACGTAAGGGGGTACGT", "c2")

    raw = AlignmentPiece.Identical(line1.asSegment(), line2.asSegment())
    corrected = self.scorer.polyshAlignment(
        raw, params.alignment_correction_radius)

    dot_plot = LineDotPlot(storage, self.aligner)
    dot_plot.addAlignment(corrected)

    patch = Contig("AGG", "tmp")
    line2.correctSequence(
        [AlignmentPiece.Identical(patch.asSegment(), line2.segment(0, 3))])

    observed = str(list(dot_plot.alignmentsToFrom[line2.id][line1.id]))
    assert observed == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
def testManual(self):
    """Composition of alignments c1->c2->c3, directly and through a reversed piece."""
    first = Contig("ACGTACGTACGT", "c1")
    middle = Contig("ACGTAGGTACGT", "c2")
    last = Contig("ACTTACGTACGT", "c3")
    first_to_middle = AlignmentPiece.Identical(first.asSegment(),
                                               middle.asSegment())
    middle_to_last = AlignmentPiece.Identical(middle.asSegment(),
                                              last.asSegment())

    # Direct composition c1 -> c2 -> c3.
    composed = first_to_middle.compose(middle_to_last)
    assert repr(composed) == "(c1[0:12-0]->c3[0:12-0]:0.92)"
    assert composed.cigar == "12M"

    # The same result starting from the reversed alignment c2 -> c1.
    middle_to_first = first_to_middle.reverse()
    via_difference = middle_to_first.composeTargetDifference(middle_to_last)
    assert repr(via_difference) == "(c1[0:12-0]->c3[0:12-0]:0.92)"
    assert via_difference.cigar == "12M"
def __iter__(self):
    # type: () -> Generator[AlignmentPiece]
    """Yield stored alignments, then their reverses, then the identity.

    The final item is the identical alignment of the whole line to itself.
    """
    for reversed_pass in (False, True):
        for piece in self.content:
            if reversed_pass:
                yield piece.reverse()
            else:
                yield piece
    yield AlignmentPiece.Identical(self.line.asSegment())
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build the initial line storage from an init file.

    The init file starts with the number of records; each record is a
    contig id followed by the ids of the reads assigned to it.  For every
    record a line is created from the contig, the listed reads are aligned
    to it, and lines shorter than ``params.k + 200`` are extended to the
    right with ``polisher.polishEnd``.

    :param dir: output directory; ``initial_lines.fasta`` is written there.
    :param lc_file: path of the init file.
    :param aligner: aligner used to align reads to lines.
    :param contigs: lookup of contigs by id.
    :param disjointigs: disjointig collection handed to the line storage.
    :param reads: lookup of reads by id.
    :param polisher: object providing ``polishEnd``.
    :return: the populated line storage.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # ``with`` guarantees the init file is closed even if parsing fails.
    with open(lc_file, "r") as init_handle:
        f = TokenReader(init_handle)
        n = f.readInt()
        for _ in range(n):
            contig_id = f.readToken()
            contig = contigs[contig_id]
            assert contig.id == contig_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            for al in aligner.overlapAlign([reads[rid] for rid in read_ids],
                                           ContigStorage([line])):
                # Keep alignments that span at least k positions of the line
                # (or nearly the whole line when the line itself is short).
                if len(al.seg_to) >= min(params.k, len(line) - 100):
                    tmp_line = al.seg_to.contig  # type: NewLine
                    tmp_line.addReadAlignment(al)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(new_contig.suffix(pos=len(line)).Seq(),
                                 new_als)
            line.correct_segments.add(line.asSegment().shrink(100))
            line.completely_resolved.add(line.asSegment().shrink(100))
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close (and thereby flush) the FASTA file as soon as it is written.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
def testManual(self):
    """Exercise AlignmentStorage: iteration, rc view, coverage and lookups."""
    source = Contig("ACGTACGTACGT", "from")
    target = Contig("ACGTACGTACGT", "to")
    first = AlignmentPiece.Identical(source.segment(0, 4),
                                     target.segment(0, 4))
    second = AlignmentPiece.Identical(source.segment(0, 4),
                                      target.segment(4, 8))
    third = AlignmentPiece.Identical(source.segment(4, 8),
                                     target.segment(8, 12))
    storage = AlignmentStorage()
    storage.addAll([first, second, third])

    three_pieces = "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"
    assert str(list(storage)) == three_pieces
    assert str(
        list(storage.rc)
    ) == "[(-from[4:12-4]->-to[0:4]:1.000), (-from[8:12-0]->-to[4:12-4]:1.000), (-from[8:12-0]->-to[8:12-0]:1.000)]"

    # Three disjoint pieces cover the target exactly once.
    assert str(list(storage.calculateCoverage())) == "[(to[0:12-0], 1)]"
    assert str(list(storage.filterByCoverage(0, 1))) == "[]"
    assert str(list(storage.filterByCoverage(1, 2))) == "[to[0:12-0]]"
    assert str(list(storage.filterByCoverage(2))) == "[]"

    # Adding the third piece again via addAndMergeRight changes nothing.
    storage.addAndMergeRight(third)
    assert str(list(storage)) == three_pieces

    # Two overlapping pieces make the coverage non-uniform.
    fourth = AlignmentPiece.Identical(source.segment(2, 8),
                                      target.segment(2, 8))
    fifth = AlignmentPiece.Identical(source.segment(4, 10),
                                     target.segment(4, 10))
    storage.addAll([fourth, fifth])
    assert str(
        list(storage.calculateCoverage())
    ) == "[(to[0:2], 1), (to[2:4], 2), (to[4:12-4], 3), (to[8:12-2], 2), (to[10:12-0], 1)]"
    assert str(list(storage.filterByCoverage(2, 3))) == "[to[2:4], to[8:12-2]]"
    assert str(list(storage.filterByCoverage(2))) == "[to[2:12-2]]"

    narrow = target.segment(2, 3)
    assert str(
        list(storage.getAlignmentsTo(narrow))
    ) == "[(from[0:4]->to[0:4]:1.000), (from[2:12-4]->to[2:12-4]:1.000)]"
    wide = target.segment(2, 6)
    assert str(list(
        storage.getAlignmentsTo(wide))) == "[(from[2:12-4]->to[2:12-4]:1.000)]"
def testManual(self):
    """Check position mapping and alignment composition of a Correction."""
    seq_to = Contig("ACGTAAAAGGGTACGT", "c1")
    seq_from = Contig("ACGTAAGGGGGTACGT", "c2")
    raw = AlignmentPiece.Identical(seq_to.segment(5, 12),
                                   seq_from.segment(5, 12))
    polished = self.scorer.polyshAlignment(
        raw, params.alignment_correction_radius)
    corr = Correction(seq_to, seq_from, [polished])

    positions_up = corr.mapPositionsUp(range(len(seq_from)))
    assert positions_up == [
        0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15
    ]
    positions_down = corr.mapPositionsDown(range(len(seq_to)))
    assert positions_down == [
        0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15
    ]

    # Identity alignments on several segments of c2 to be pushed through.
    bounds = ((0, 4), (6, 8), (6, 16), (7, 16))
    pieces = [
        AlignmentPiece.Identical(seq_from.segment(left, right))
        for left, right in bounds
    ]
    assert str(
        corr.composeQueryDifferences(pieces)
    ) == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
    """Build the initial line storage from an init file.

    The init file starts with the number of records; each record is a
    contig id followed by the ids of the reads assigned to it.  For every
    record a line is created from the contig and reads are aligned to it
    (all reads when the record lists none).  Only the best alignment per
    read is kept.  Lines shorter than ``params.k + 200`` are extended to
    the right with ``polisher.polishEnd``.

    :param dir: output directory; ``initial_lines.fasta`` is written there.
    :param lc_file: path of the init file.
    :param aligner: aligner used to align reads to lines.
    :param contigs: lookup of contigs by id.
    :param disjointigs: disjointig collection handed to the line storage.
    :param reads: read collection, indexable by read id.
    :param polisher: object providing ``polishEnd``.
    :return: the populated line storage.
    """
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    # ``with`` guarantees the init file is closed even if parsing fails.
    with open(lc_file, "r") as init_handle:
        f = TokenReader(init_handle)
        n = f.readInt()
        for _ in range(n):
            contig_id = f.readToken()
            contig = contigs[contig_id]
            assert contig.id == contig_id
            line = lines.addNew(contig.seq, contig.id)
            read_ids = f.readTokens()
            line_reads = [reads[rid] for rid in read_ids]
            if not line_reads:
                sys.stdout.warn("No read alignments in initialization for line",
                                line.id, "Realigning all reads")
                line_reads = reads
            # Alignments must span 1500 positions, or nearly the whole line
            # when the line itself is short.
            min_span = min(1500, len(line) - 100)
            als = [
                al
                for al in aligner.overlapAlign(line_reads,
                                               ContigStorage([line]))
                if len(al.seg_to) >= min_span
            ]
            # Sort by read id, then best identity, then longest, so the first
            # alignment of every group below is the best one for that read.
            als = sorted(als,
                         key=lambda al: (al.seg_from.contig.id,
                                         -int(al.percentIdentity() * 100),
                                         -len(al)))
            for key, read_als in itertools.groupby(
                    als, key=lambda al: al.seg_from.contig.id):
                al = list(read_als)[0]
                tmp_line = al.seg_to.contig  # type: NewLine
                tmp_line.addReadAlignment(al)
            correct_seg = line.asSegment().shrink(100)
            if len(line) < params.k + 200:
                new_contig, new_als = polisher.polishEnd(
                    list(line.read_alignments),
                    max_extension=params.k + 100 - len(line))
                line.extendRight(new_contig.suffix(pos=len(line)).Seq(),
                                 new_als)
                # Only a line shorter than k + 200 can have a shrunk segment
                # shorter than k; grow it to the right up to length k.
                if len(correct_seg) < params.k:
                    correct_seg = correct_seg.expandRight(
                        params.k - len(correct_seg))
            line.correct_segments.add(correct_seg)
            line.completely_resolved.add(correct_seg)
            line.initial.add(
                AlignmentPiece.Identical(
                    line.asSegment().asContig().asSegment(),
                    line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    # Close (and thereby flush) the FASTA file as soon as it is written.
    with open(os.path.join(dir, "initial_lines.fasta"), "w") as fasta_handle:
        lines.writeToFasta(fasta_handle)
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
def test5(self):
    """Inserting a long sequence into a line must keep the dot plot consistent."""
    dataset = TestDataset("abcABC")
    corrected_name = dataset.addContig("abc")
    name2 = dataset.addContig("ABC")
    lines, dp, reads = dataset.genAll(self.aligner)
    line = lines[corrected_name]

    part_a = dataset.alphabet["a"].seq
    part_b = dataset.alphabet["b"].seq
    insertion = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
    tmp = Contig(part_a + insertion + part_b, "tmp")

    # Align the flanks of the new sequence to the "a" and "b" parts of the line.
    left_al = AlignmentPiece.Identical(tmp.prefix(len=len(part_a)),
                                       line.prefix(len=len(part_a)))
    right_al = AlignmentPiece.Identical(
        tmp.asSegment().suffix(length=len(part_b)),
        line.segment(len(part_a), len(part_a) + len(part_b)))
    merged = AlignmentPiece.MergeFittingAlignments([left_al, right_al])
    line.correctSequence([merged])

    observed = str(list(dp.allInter(line.asSegment())))
    assert observed == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
def loadLine(self, handler, disjointigs, reads, contigs):
    # type: (TokenReader, DisjointigCollection, ReadCollection, ContigCollection) -> None
    """Restore this line from a token stream.

    Reads the id and sequence, then a counted list of initial-alignment
    records (only the target segment of each record is used), then the
    correct segments, resolved segments, disjointig alignments and read
    alignments.  Every loaded read alignment is also registered on its read.
    """
    self.id = handler.readToken()
    self.seq = handler.readToken()
    self.rc.id = basic.Reverse(self.id)

    record_count = handler.readInt()
    for _ in range(record_count):
        # Three tokens before the segment and one after it are skipped.
        for _skipped in range(3):
            handler.readToken()
        target = Segment.load(handler, self)
        handler.readToken()
        origin = target.asContig().asSegment()
        self.initial.add(AlignmentPiece.Identical(origin, target))

    self.correct_segments.load(handler, self)
    self.completely_resolved.load(handler, self)
    self.disjointig_alignments.load(handler, disjointigs, self)
    self.read_alignments.load(handler, reads, self)
    for read_al in self.read_alignments:
        aligned_read = read_al.seg_from.contig  # type: AlignedRead
        aligned_read.addAlignment(read_al)
    self.max_extension = False
def genAll(self, aligner):
    # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
    """Build lines, a dot plot and a read collection from this dataset.

    Returns the tuple ``(lines, dot_plot, reads)``.
    """
    disjointigs = DisjointigCollection()
    for disjointig in self.disjointigs:
        disjointigs.addNew(disjointig.seq, disjointig.id)

    from disjointig_resolve.line_storage import NewLineStorage
    lines = NewLineStorage(disjointigs, aligner)

    def printable_name(line):
        return line.id + "_" + self.translateBack(line, aligner)

    lines.name_printer = printable_name

    # Every dataset contig becomes a line aligned identically to its own copy.
    for contig in self.contigs:
        created = lines.addNew(contig.seq, contig.id)
        origin = created.asSegment().asContig().asSegment()
        created.initial.add(
            AlignmentPiece.Identical(origin, created.asSegment()))

    dot_plot = LineDotPlot(lines, aligner)
    dot_plot.construct(aligner)
    lines.alignDisjointigs()

    reads = ReadCollection()
    for read in self.reads:
        reads.addNewRead(read)
    read_alignments = aligner.localAlign(reads, disjointigs)
    disjointigs.addAlignments(read_alignments)
    return lines, dot_plot, reads
def mergeLines(self, alignment, k):
    # type: (AlignmentPiece, int) -> NewLine
    """Merge the two lines connected by ``alignment`` into one new line.

    ``line1`` is the contig of ``alignment.seg_from`` and ``line2`` the
    contig of ``alignment.seg_to``; they must be different lines.  The new
    line consists of the part of ``line1`` before the alignment followed by
    the whole ``line2``.  Both source lines are removed from this storage
    and their knots are re-tied to the new line.

    :param alignment: alignment from ``line1`` to ``line2``.
    :param k: the alignment is prolonged if it is shorter than ``k + 100``.
    :return: the newly created merged line.
    """
    sys.stdout.trace("Line operation Merge", alignment.seg_from.contig,
                     alignment.seg_to.contig, alignment)
    line1 = alignment.seg_from.contig  #type: NewLine
    line2 = alignment.seg_to.contig  #type: NewLine
    assert line1 != line2
    if len(alignment) < k + 100:
        sys.stdout.trace(
            "Prolonging line to ensure alignment of at least k")
        # Append to line1 the piece of line2 that follows the alignment, so
        # that the merged alignment reaches length k + 100.
        seg = line2.segment(
            alignment.seg_to.right,
            alignment.seg_to.right + k + 100 - len(alignment))
        line1.extendRight(seg.Seq())
        alignment = alignment.mergeDistant(
            AlignmentPiece.Identical(
                line1.asSegment().suffix(length=len(seg)), seg))
    # Cutting hanging tips of both lines
    # NOTE(review): al_storage is not read again within this function.
    al_storage = AlignmentStorage()
    al_storage.add(alignment)
    # The storage listens to both lines while they are cut; the alignment is
    # re-read from it afterwards.
    storage = TwoLineAlignmentStorage(line1, line2)
    line2.addListener(storage)
    line1.addListener(storage.reverse)
    storage.add(alignment)
    if alignment.seg_from.right < len(line1):
        line1.cutRight(alignment.seg_from.right)
        sys.stdout.trace("Cut right")
        sys.stdout.trace(list(storage.content)[0])
        sys.stdout.trace("\n".join(
            list(storage.content)[0].asMatchingStrings()))
        sys.stdout.trace(list(storage.content)[0].cigar)
    if alignment.seg_to.left > 0:
        # Cutting the reverse complement on the right removes the part of
        # line2 to the left of the alignment.
        line2.rc.cutRight(len(line2) - alignment.seg_to.left)
        sys.stdout.trace("Cut left")
        sys.stdout.trace(list(storage.content)[0])
        sys.stdout.trace("\n".join(
            list(storage.content)[0].asMatchingStrings()))
        sys.stdout.trace(list(storage.content)[0].cigar)
    alignment = list(storage.content)[0]  # type: AlignmentPiece
    line2.removeListener(storage)
    line1.removeListener(storage.reverse)
    # Making sure line sequences match on the overlap
    if alignment.seg_from.left > 0:
        new_seq = Contig(
            line1.asSegment().prefix(pos=alignment.seg_from.left).Seq() +
            line2.seq, "new_seq")
    else:
        new_seq = Contig(line2.seq, "new_seq")
    al2 = AlignmentPiece.Identical(
        line2.asSegment(),
        new_seq.asSegment().suffix(length=len(line2)))
    sys.stdout.trace("Al2:", al2)
    # After composing and reversing, the alignment goes from new_seq to line1
    # and is used to correct line1 to the merged sequence.
    alignment = alignment.compose(al2).reverse()
    sys.stdout.trace("Composed alignment", alignment)
    sys.stdout.trace("\n".join(alignment.asMatchingStrings()))
    sys.stdout.trace(alignment.cigar)
    assert alignment.seg_to.right == len(line1)
    assert alignment.seg_from.left == al2.seg_to.left
    line1.correctSequence([alignment])
    # Now lines have exact match
    name = "(" + ",".join(
        basic.parseLineName(line1.id) + basic.parseLineName(line2.id)) + ")"
    line = self.addNew(new_seq.seq, name)
    assert line.seq.startswith(line1.seq)
    assert line.seq.endswith(line2.seq)
    # Embeddings of the two source lines into the merged line.
    al1 = AlignmentPiece.Identical(
        line1.asSegment(),
        line.asSegment().prefix(length=len(line1)))
    al2 = AlignmentPiece.Identical(
        line2.asSegment(),
        line.asSegment().suffix(length=len(line2)))
    # Transfer the per-line data of both source lines onto the merged line.
    line.initial.addAll(
        line1.initial.targetAsSegment(al1.seg_to).merge(
            line2.initial.targetAsSegment(al2.seg_to)))
    line.correct_segments.addAll(
        line1.correct_segments.contigAsSegment(al1.seg_to).merge(
            line2.correct_segments.contigAsSegment(al2.seg_to)))
    line.completely_resolved.addAll(
        line1.completely_resolved.contigAsSegment(al1.seg_to).merge(
            line2.completely_resolved.contigAsSegment(al2.seg_to), k))
    line.disjointig_alignments.addAll(
        line1.disjointig_alignments.targetAsSegment(al1.seg_to).merge(
            line2.disjointig_alignments.targetAsSegment(al2.seg_to)))
    for al in line1.read_alignments.targetAsSegment(al1.seg_to).merge(
            line2.read_alignments.targetAsSegment(al2.seg_to)):
        line.addReadAlignment(al)
    line1.cleanReadAlignments()
    line2.cleanReadAlignments()
    self.notifyMergedLines(al1, al2)
    # Remember the outer knots before the source lines are removed.
    knot_right = line2.knot
    knot_left = line1.rc.knot
    self.remove(line1)
    self.remove(line2)
    if knot_right is not None:
        if knot_right.line_right == line1:
            # line2 was tied back to line1, so the merged line is tied to
            # itself.
            line.tie(line, knot_right.gap, knot_right.gap_seq)
        else:
            line.tie(knot_right.line_right, knot_right.gap,
                     knot_right.gap_seq)
    if knot_left is not None and knot_left.line_right != line2.rc:
        line.rc.tie(knot_left.line_right, knot_left.gap, knot_left.gap_seq)
    return line
def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
    """Iteratively extend the right end of a contig using aligned reads.

    The target contig is taken from ``als[0]``, so ``als`` must not be empty.
    Each round builds a base sequence from one read, polishes it with the
    other reads, and appends the polished part that enough reads support.

    :param als: alignments of reads to the contig being extended.
    :param min_cov: minimal number of reads needed to attempt and to accept
        an extension.
    :param min_cov_frac: bounds the contradicting reads tolerated near a
        position to ``(1 - min_cov_frac) * support``.
    :param max_extension: extension stops once the contig has grown by at
        least this much; ``None`` means a very large limit.
    :return: the extended contig and the alignments re-targeted to it
        (still-active alignments first, then finished ones).
    """
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    new_contig = contig.asSegment().asContig()
    # NOTE(review): rc.seg_to.left < 100 presumably selects alignments ending
    # within 100 positions of the target's right end - confirm.
    relevant_als = [
        al.changeTargetContig(new_contig) for al in als
        if al.rc.seg_to.left < 100
    ]
    finished_als = []
    while True:
        # Keep alignments that touch the last 100 positions of the contig
        # and pass the rc.seg_from.left > 100 check; others are finished.
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(
                    length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        # Common prefix: fixed 8-mer, random flank, then the contig tail of
        # at most params.flanking_size characters.
        start = "ACGTTCGA" + basic.randomSequence(
            params.flanking_size) + new_contig.asSegment().suffix(
                length=min(params.flanking_size, len(new_contig))).Seq()
        # Each reduced read is the prefix plus the read part that follows
        # its alignment.
        reduced_read_list = [
            AlignedRead.new(
                start + al.seg_from.contig.asSegment().suffix(
                    pos=al.seg_from.right).Seq(),
                str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)
        ]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # Base is `start` followed by at most max(params.window_size,
            # params.k) characters of the read past the end of its alignment.
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(
                    len(base_al.seg_from.contig), base_al.seg_from.right +
                    max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base),
                                   "polished_base")
            for al in self.aligner.localAlign(
                    reduced_reads,
                    ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            # For each reduced read pick the alignment that starts at target
            # position 0 and reaches furthest to the right.
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (
                        (candidate_alignments[-1] is None
                         or candidate_alignments[-1].seg_to.right <
                         al.seg_to.right)):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            # Scan alignment ends left to right.  `support` counts alignments
            # not yet passed; contra[contra_index:] are contradicting
            # alignments ending within 50 of the current end.
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments,
                             key=lambda al: al.seg_to.right):
                while contra_index < len(contra) and contra[
                        contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (
                        1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index,
                                     (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:",
                             [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            # Accept the extension only if it adds more than 100 characters
            # beyond the common prefix.
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                 "Extended for", cutoff_pos - len(start),
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
                found = True
                new_contig_candidate = Contig(
                    new_contig.seq + polished_base[len(start):cutoff_pos],
                    "candidate")
                # Maps the accepted part of the polished base onto the new
                # tail of the candidate contig.
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(
                        pos=len(new_contig)))
                # Maps the tail of each original read onto the matching tail
                # of its reduced read.
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(
                        AlignmentPiece.Identical(seg_from, seg_to))
                # Alignments of original reads to the new tail; None when the
                # read does not reach usefully into the accepted part.
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(
                                tmp.compose(embedding))
                corrected_relevant_alignments = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in relevant_als
                ]
                # Join each old alignment with its extension; fall back to
                # the old one when there is no extension or merging fails.
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments,
                                    embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            al = al1
                        elif al1.seg_from.dist(
                                al2.seg_from) >= 10 or al1.seg_to.dist(
                                    al2.seg_to) >= 10:
                            al = scorer.polyshAlignment(
                                al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in finished_als
                ]
                new_contig = new_contig_candidate
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al,
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
        if len(new_contig) >= max_len:
            break
        if not found:
            break
    return new_contig, relevant_als + finished_als
def fillFromContigs(self, contigs):
    # type: (Iterable[Contig]) -> None
    """Create one line per contig of ``UniqueList(contigs)``.

    Each line is named ``"L" + contig.id`` and gets an identical alignment
    from its contig as the initial alignment.
    """
    for source in UniqueList(contigs):
        created = self.addNew(source.seq, "L" + source.id)
        identity = AlignmentPiece.Identical(source.asSegment(),
                                            created.asSegment())
        created.initial.add(identity)