def __init__(self, seq, id, extension_handler, rc = None):
    # type: (str, str, ExtensionHandler, Optional[NewLine]) -> None
    """Initialise a line and, when ``rc`` is not given, build its twin.

    :param seq: sequence of the line.
    :param id: identifier of the line.
    :param extension_handler: listener appended to the listener list of a
        primary line; its ``rc`` attribute is handed to the twin.
    :param rc: already constructed twin line. When supplied, this object
        takes the ``rc`` views of the twin's storages and listeners instead
        of creating fresh ones.
    """
    self.extensionHandler = extension_handler
    self.seq = seq
    self.id = id  # type: str
    self.circular = False
    self.name_printer = None
    self.max_extension = False
    if rc is not None:
        # Twin construction: every storage is the rc view of the storage
        # owned by the line that is creating us.
        self.initial = rc.initial.rc  # type: AlignmentStorage
        self.correct_segments = rc.correct_segments.rc  # type: SegmentStorage
        self.completely_resolved = rc.completely_resolved.rc  # type: SegmentStorage
        self.disjointig_alignments = rc.disjointig_alignments.rc  # type: AlignmentStorage
        self.read_alignments = rc.read_alignments.rc  # type: ReadAlignmentStorage
        self.listeners = [listener.rc for listener in rc.listeners]  # type: List[LineListener]
    else:
        self.initial = AlignmentStorage()
        self.correct_segments = SegmentStorage()
        self.completely_resolved = SegmentStorage()
        self.disjointig_alignments = AlignmentStorage()
        self.read_alignments = ReadAlignmentStorage()
        own_storages = [
            self.initial,
            self.correct_segments,
            self.completely_resolved,
            self.disjointig_alignments,
            self.read_alignments,
        ]
        self.listeners = own_storages + [extension_handler]  # type: List[LineListener]
        # The twin reads our storages and listeners, so they must exist
        # before it is constructed.
        rc = NewLine(basic.RC(seq), basic.Reverse(self.id),
                     extension_handler.rc, self)  # type: NewLine
        # addListener touches self.rc.listeners, hence rc is attached first.
        self.rc = rc
        self.addListener(ReadAlignmentListener(self))
    Contig.__init__(self, seq, id, rc)
    self.rc = rc  # type: NewLine
    self.knot = None  # type: Knot
def splitBad(self, lines):
    # type: (NewLineStorage) -> None
    """Remove or split lines whose read coverage is unreliable.

    The median of all coverage records is computed first. For every unique
    line, the segments whose coverage (by reads that are not
    ``contradictingRTC``) lies between ``params.reliable_coverage`` and
    7/4 of the median are collected and merged. A line with no such segment
    is removed, a line covered almost entirely is kept untouched, and any
    other line is cut and split so that one line remains per segment.

    :param lines: storage of lines; it is modified in place.
    """
    coverage_records = [
        rec
        for line in lines
        for rec in line.read_alignments.calculateCoverage(params.k)
    ]
    median = self.medianCoverage(coverage_records)
    sys.stdout.info("Median coverage determined as", median)
    merge_distance = max(params.k - params.bad_end_length * 2, params.k / 2)
    # Ids are collected up front because the storage is modified in the loop.
    for line_id in [line.id for line in lines.unique()]:
        line = lines[line_id]
        consistent = AlignmentStorage()
        consistent.addAll(
            al for al in line.read_alignments if not al.contradictingRTC())
        covered = SegmentStorage().addAll(
            consistent.filterByCoverage(mi=params.reliable_coverage,
                                        ma=median * 7 / 4,
                                        k=params.k))
        covered.mergeSegments(merge_distance)
        if len(covered) == 0:
            sys.stdout.warn("No part of a unique edge is covered by reads", line.id)
            lines.removeLine(line)
            continue
        if len(covered) == 1 and len(covered[0]) > len(line) - 10:
            sys.stdout.info("Whole line", line.id, "is covered by reads")
            continue
        sys.stdout.info("Line", line.id, "has poorly covered regions. Splitting into", len(covered), "parts")
        sys.stdout.trace(covered)
        # Work from the rightmost covered segment towards the left.
        next_left = covered[-1].left
        line.cutRight(covered[-1].right)
        for seg in reversed(list(covered)[:-1]):
            # Split on the gap between this segment and the previous one;
            # an empty segment is used when the two overlap.
            split_right = seg.right if next_left < seg.right else next_left
            line, _ = lines.splitLine(line.segment(next_left, split_right))
            line.cutRight(seg.right)
            next_left = seg.left
        # Trim whatever precedes the leftmost covered segment.
        line.rc.cutRight(len(covered[0]))
def attemptJump(self, rec):
    # type: (Record) -> bool
    """Try to move the resolved segment of ``rec`` further to the right.

    Targets of alignments that start away from both ends of their source
    (by more than ``min(params.bad_end_length, params.k / 2)``) are
    collected as "bad" segments, both from ``rec`` itself and from the dot
    plot. The complement of those segments inside the window
    ``[rec.resolved.right - params.k, bound]`` is then searched for a
    segment of length at least ``params.k`` that ends to the right of the
    current resolved segment.

    :param rec: record whose ``resolved`` segment may be replaced.
    :return: True if ``rec.setResolved`` was called, False otherwise.
    """
    bound = self.findAndFilterResolvedBound(rec, params.l)
    margin = min(params.bad_end_length, params.k / 2)
    window = rec.line.segment(rec.resolved.right - params.k, bound)

    def starts_inside(al):
        # True when the alignment is clipped by more than margin on both sides.
        return al.seg_from.left > margin and al.rc.seg_from.left > margin

    bad_segments = SegmentStorage()
    for al in rec:
        if al.seg_to.left > bound:
            break
        if starts_inside(al):
            bad_segments.add(al.seg_to)
    for al in self.dot_plot.allInter(window):
        if starts_inside(al):
            bad_segments.add(al.seg_to)
    bad_segments.mergeSegments(params.k - 200)
    sys.stdout.trace("Bad segments:", bad_segments)
    good_segments = bad_segments.reverse(rec.line, params.k - 100).reduce(window)
    for candidate in good_segments:
        clipped = Segment(candidate.contig, max(0, candidate.left), candidate.right)
        for found in self.segmentsWithGoodCopies(rec.resolved, clipped, params.k):
            if len(found) < params.k:
                continue
            if found.right > rec.resolved.right:
                rec.setResolved(found)
                return True
    return False
def segmentsWithGoodCopies(self, resolved, seg, inter_size):
    # type: (Segment, Segment, int) -> List[Segment]
    """Return the parts of ``seg`` left after excluding problematic regions.

    Dot-plot alignments intersecting ``seg`` are examined. Two kinds of
    regions are collected as excluded:

    * images on ``seg.contig`` of the not-yet-correct parts of the source
      line of an alignment (taken to the right of the left end of the
      source line's first initial alignment);
    * an expanded suffix of the target contig after an alignment whose
      source ends within 50 positions of its line end and whose target
      reaches ``resolved.right - 100``.

    :param resolved: currently resolved segment.
    :param seg: segment to analyse.
    :param inter_size: minimal alignment target length, also used as the
        base for the merge/reverse distances.
    :return: complement of the excluded regions, reduced to ``seg``.
    """
    relevant = [
        al for al in self.dot_plot.allInter(seg)
        if al.seg_from.left > 20 or al.rc.seg_to.left > 20 or al.isIdentical()
    ]
    excluded = SegmentStorage()
    for al in relevant:
        line = al.seg_from.contig  # type: NewLine
        long_enough = len(al.seg_to) >= inter_size
        if long_enough and al.seg_from.right > line.initial[0].seg_to.left:
            tail = line.suffix(pos=line.initial[0].seg_to.left)
            cap = al.seg_from.cap(tail)
            incorrect = line.correct_segments.reverse(line, inter_size - 1).reduce(cap)
            matching = al.matchingSequence()
            sys.stdout.trace("Incorrect: ", line, cap, incorrect)
            for unpolished in incorrect:
                mapped = matching.mapSegDown(seg.contig, unpolished, mapIn=False)
                sys.stdout.trace("Relevant unpolished k-mer segment alignment:", unpolished, mapped)
                excluded.add(mapped)
        if al.rc.seg_from.left < 50 and al.seg_to.right >= resolved.right - 100:
            after_al = al.seg_to.contig.suffix(pos=al.seg_to.right)
            excluded.add(after_al.expand(inter_size + 100))
            sys.stdout.trace("Incoming line:", resolved, seg, al)
    excluded.mergeSegments(inter_size - 1)
    min_len = inter_size - 1 - max(100, inter_size / 10)
    return list(excluded.reverse(seg.contig, min_len).reduce(seg))
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    """Print an evaluation of contigs and initial sequences against a reference.

    Three FASTA files are loaded. Targets of overlap alignments of the
    initial sequences to the contigs are merged per contig and reversed
    (via ``SegmentStorage.reverse``) to obtain the "interesting" segments;
    contigs with no such alignment keep their whole segment. Local
    alignments of the contigs to the reference are then scored on those
    segments, followed by overlap alignments of the initial sequences to
    the reference. All results are printed to standard output.

    :param dir: working directory; created if missing, receives the log and
        an ``alignments`` subdirectory.
    :param contigs_file: path to the FASTA file with contigs.
    :param initial_file: path to the FASTA file with initial sequences.
    :param ref_file: path to the FASTA file with the reference.
    """
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    # Input handles are closed as soon as loading finishes.
    # NOTE(review): relies on loadFromFasta consuming the handle before it
    # returns the storage - confirm against ContigStorage.
    with open(contigs_file, "r") as handler:
        contigs = ContigStorage().loadFromFasta(handler, False)
    with open(initial_file, "r") as handler:
        initial = ContigStorage().loadFromFasta(handler, False)
    with open(ref_file, "r") as handler:
        ref = ContigStorage().loadFromFasta(handler, False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    # groupby below needs segments of the same contig to be adjacent.
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print("Interesting segments:")
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print(list(csegs))
    print("Analysis of contigs")
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print(al)
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # The original printed an unbound name `events`; the result
                # of the scoring call is what is reported here.
                # NOTE(review): presumably polyshMatching returns the list
                # of events - confirm against Scorer.
                events = scorer.polyshMatching(tmp_al.matchingSequence(),
                                               params.score_counting_radius)
                print(" ".join(map(str, (tmp_al.seg_from, tmp_al.seg_to, events))))
        print("")
    print("Analysis of initial")
    for al in aligner.overlapAlign(initial, ref):
        events = scorer.polyshMatching(al.matchingSequence(),
                                       params.score_counting_radius)
        print(" ".join(map(str, (al.seg_from, al.seg_to, events))))
def polyshSegments(self, line, to_polysh):
    # type: (NewLine, Iterable[Segment]) -> List[Segment]
    """Polish the given segments of ``line`` and apply the corrections.

    The segments are copied into a storage that is registered as a listener
    of ``line`` for the duration of the call, then merged and sorted. Each
    segment is polished with the read alignments intersecting it, and all
    resulting corrections are applied to the line in one operation.

    :param line: line to correct.
    :param to_polysh: segments of ``line`` to polish.
    :return: contents of the tracking storage after the correction.
    """
    tracked = SegmentStorage()
    line.addListener(tracked)
    tracked.addAll(to_polysh)
    tracked.mergeSegments()
    tracked.sort()
    corrections = AlignmentStorage()
    for target in tracked:
        supporting_reads = list(line.read_alignments.allInter(target))
        corrections.add(self.polisher.polishSegment(target, supporting_reads))
    line.correctSequence(list(corrections))
    line.removeListener(tracked)
    return list(tracked)
def correctSequences(self, interesting_segments):
    # type: (Iterable[Segment]) -> List[Segment]
    """Polish the regions following the correct segments found for the input.

    For every input segment, the correct segment found for it and the next
    correct segment on the same line bound a region, padded by
    ``params.k / 2``, that has to be corrected. Regions are grouped by
    normalised line id. For each group an extension of the line is
    attempted on either side when the region comes within 200 positions of
    that end, and the merged regions are polished.

    :param interesting_segments: segments to process.
    :return: forward and backward segments of all groups, as held by the
        listener storages when each group finishes.
    """
    half_k = params.k / 2
    to_correct = []
    for seg in list(interesting_segments):
        line = seg.contig  # type: NewLine
        correct = line.correct_segments.find(seg)
        following = line.correct_segments.find(line.suffix(correct.right), 1)
        if following is None:
            right = len(line)
        else:
            right = min(len(line), following.left + half_k)
        to_correct.append(line.segment(correct.right - half_k, right))
    to_correct.sort(key=lambda seg: (basic.Normalize(seg.contig.id), seg.left))
    corrected = []
    grouped = itertools.groupby(
        to_correct, key=lambda seg: basic.Normalize(seg.contig.id))
    for line_id, group in grouped:  # type: NewLine, Iterable[Segment]
        line = None  # type: NewLine
        forward = SegmentStorage()
        backward = SegmentStorage()
        for seg in list(group):
            if seg.contig.id == line_id:
                forward.add(seg)
                line = seg.contig
            else:
                # Segment lies on the reverse-complement strand of the group.
                backward.add(seg)
                line = seg.contig.rc
        to_polysh = SegmentStorage()
        to_polysh.addAll(forward)
        to_polysh.addAll(backward.rc)
        to_polysh.mergeSegments()
        # The storages listen to the line while extension is attempted.
        line.addListener(to_polysh)
        line.addListener(forward)
        line.rc.addListener(backward)
        sys.stdout.trace("Polishing:", to_polysh)
        if (not line.max_extension) and to_polysh[-1].RC().left < 200:
            old_right = to_polysh[-1].right
            if self.attemptExtend(line):
                to_polysh.add(line.asSegment().suffix(pos=old_right))
                forward.add(line.asSegment().suffix(pos=old_right))
        if (not line.rc.max_extension) and to_polysh[0].left < 200:
            old_rc_right = to_polysh[0].RC().right
            if self.attemptExtend(line.rc):
                to_polysh.rc.add(line.rc.asSegment().suffix(pos=old_rc_right))
                backward.add(line.rc.asSegment().suffix(pos=old_rc_right))
        to_polysh.mergeSegments()
        forward.mergeSegments()
        backward.mergeSegments()
        # to_polysh is detached before polishing; polyshSegments registers
        # its own storage as a listener.
        line.removeListener(to_polysh)
        self.polyshSegments(line, to_polysh)
        line.removeListener(forward)
        line.rc.removeListener(backward)
        corrected.extend(forward)
        corrected.extend(backward)
        line.updateCorrectSegments(line.asSegment())
    return corrected
def testManual(self):
    """Check string form, merging and ``find`` of ``SegmentStorage``.

    Four adjacent unit segments must stay separate after
    ``mergeSegments(1)`` and collapse into one after ``mergeSegments()``.
    ``find`` is then checked on a storage with two separated segments.
    """
    contig = Contig("ACGT", "test")
    storage = SegmentStorage()
    for left in range(4):
        storage.add(contig.segment(left, left + 1))
    separate = "ReadStorage+:[test[0:1], test[1:2], test[2:4-1], test[3:4-0]]"
    separate_rc = "ReadStorage-:[-test[0:1], -test[1:2], -test[2:4-1], -test[3:4-0]]"
    assert str(storage) == separate, str(storage)
    assert str(storage.rc) == separate_rc, str(storage.rc)
    storage.mergeSegments(1)
    assert str(storage) == separate, str(storage)
    storage.mergeSegments()
    assert str(storage) == "ReadStorage+:[test[0:4-0]]", str(storage)
    assert str(storage.rc) == "ReadStorage-:[-test[0:4-0]]", str(storage.rc)

    contig = Contig("ACGTACGTACGTACGT", "test")
    storage = SegmentStorage()
    storage.add(contig.segment(0, 5))
    storage.add(contig.segment(10, 15))
    # (query bounds, bounds of the segment find is expected to return)
    lookups = [
        ((5, 10), (0, 5)),
        ((6, 10), (10, 15)),
        ((5, 9), (0, 5)),
        ((0, 16), (0, 5)),
    ]
    for query, expected in lookups:
        found = storage.find(contig.segment(query[0], query[1]))
        assert found == contig.segment(expected[0], expected[1]), str(found)
class NewLine(Contig):
    """Contig that owns alignment/segment storages and notifies listeners.

    Every line is created together with a reverse-complement twin (``rc``).
    The twin's storages and listeners are the ``rc`` views of this line's.
    All sequence-changing operations (extend, cut, correct) update both
    ``seq`` and ``rc.seq`` and notify the registered listeners before and
    after the change.
    """

    def __init__(self, seq, id, extension_handler, rc = None):
        # type: (str, str, ExtensionHandler, Optional[NewLine]) -> None
        """Initialise a line; build the twin unless ``rc`` is supplied."""
        self.extensionHandler = extension_handler
        self.seq = seq
        self.id = id # type: str
        self.circular = False
        self.name_printer = None
        self.max_extension = False
        if rc is None:
            self.initial = AlignmentStorage()
            self.correct_segments = SegmentStorage()
            self.completely_resolved = SegmentStorage()
            self.disjointig_alignments = AlignmentStorage()
            self.read_alignments = ReadAlignmentStorage()
            self.listeners = [self.initial, self.correct_segments, self.completely_resolved,
                              self.disjointig_alignments, self.read_alignments,
                              extension_handler] # type: List[LineListener]
            # The twin reads the storages and listeners set above.
            rc = NewLine(basic.RC(seq), basic.Reverse(self.id), extension_handler.rc, self) #type: NewLine
            # addListener touches self.rc.listeners, so rc is attached first.
            self.rc = rc
            self.addListener(ReadAlignmentListener(self))
        else:
            self.initial = rc.initial.rc # type: AlignmentStorage
            self.correct_segments = rc.correct_segments.rc # type: SegmentStorage
            self.completely_resolved = rc.completely_resolved.rc # type: SegmentStorage
            self.disjointig_alignments = rc.disjointig_alignments.rc # type: AlignmentStorage
            self.read_alignments = rc.read_alignments.rc # type: ReadAlignmentStorage
            self.listeners = [listener.rc for listener in rc.listeners] # type: List[LineListener]
        Contig.__init__(self, seq, id, rc)
        self.rc = rc #type: NewLine
        self.knot = None # type: Knot

    def updateCorrectSegments(self, seg, threshold = params.reliable_coverage):
        # type: (Segment, int) -> None
        """Add the parts of ``seg`` covered by at least ``threshold`` reads
        to ``correct_segments`` and merge the storage."""
        segs = AlignmentStorage().addAll(self.read_alignments.allInter(seg)).filterByCoverage(mi=threshold)
        self.correct_segments.addAll(segs)
        self.correct_segments.mergeSegments()

    def addReads(self, alignments):
        # type: (Iterable[AlignmentPiece]) -> None
        """Store read alignments and reset the ``max_extension`` flag."""
        self.read_alignments.addAll(alignments)
        self.max_extension = False

    def getReadAlignmentsTo(self, seg):
        # type: (Segment) -> Iterable[AlignmentPiece]
        """Return the stored read alignments to ``seg``."""
        return self.read_alignments.getAlignmentsTo(seg)

    def getPotentialAlignmentsTo(self, seg):
        # type: (Segment) -> Generator[AlignmentPiece]
        """Yield read alignments to ``seg`` composed through disjointigs.

        For every read, alignments are visited from the longest source
        segment to the shortest. An alignment is yielded only if its
        matching sequence shares no match with an alignment of the same
        read that was already yielded.
        """
        result = []
        for alDL in self.disjointig_alignments.getAlignmentsTo(seg):
            reduced = alDL.reduce(target=seg)
            dt = alDL.seg_from.contig # type: Disjointig
            for alRD in dt.getAlignmentsTo(reduced.seg_from):
                result.append(alRD.compose(alDL))
        result = sorted(result, key = lambda al: (al.seg_from.contig.id, -len(al.seg_from)))
        for _, group in itertools.groupby(result, key = lambda al: al.seg_from.contig):
            readRes = []
            for al in group:
                found = False
                for al1 in readRes:
                    inter = al.matchingSequence(True).inter(al1.matchingSequence(True))
                    if len(inter.matches) != 0:
                        found = True
                        # One overlap is enough; matches getRelevantAlignmentsFor.
                        break
                if not found:
                    yield al
                    readRes.append(al)

    def getRelevantAlignmentsFor(self, seg):
        # type: (Segment) -> Generator[AlignmentPiece]
        """Yield read alignments relevant for ``seg``.

        Disjointig alignments intersecting ``seg`` with a target of at
        least ``params.k`` are composed with the read alignments of the
        disjointig that overlap the disjointig side by more than
        ``8 * params.k / 10``. Compositions with a target shorter than
        ``params.k`` are dropped. The remaining alignments are filtered
        per read exactly as in ``getPotentialAlignmentsTo``.
        """
        sys.stdout.trace("Requesting read alignments for", seg)
        result = []
        if params.debug:
            print(self.disjointig_alignments)
            print(list(self.disjointig_alignments.allInter(seg)))
        for alDL in self.disjointig_alignments.allInter(seg):
            if len(alDL.seg_to) < params.k:
                continue
            reduced = alDL.reduce(target=seg)
            dt = alDL.seg_from.contig # type: Disjointig
            als = [al for al in dt.allInter(reduced.seg_from)
                   if al.seg_to.interSize(alDL.seg_from) > 8 * params.k / 10]
            compositions = alDL.massComposeBack(als)
            for al in compositions:
                if len(al.seg_to) >= params.k:
                    result.append(al)
        sys.stdout.trace("Request for read alignments for", seg, " collecting finished. Started filtering")
        result = sorted(result, key = lambda al: (al.seg_from.contig.id, -len(al.seg_from)))
        for _, group in itertools.groupby(result, key = lambda al: al.seg_from.contig): # type: AlignedRead, Generator[AlignmentPiece]
            readRes = []
            for al in group:
                found = False
                for al1 in readRes:
                    inter = al.matchingSequence(True).inter(al1.matchingSequence(True))
                    if len(inter.matches) != 0:
                        found = True
                        break
                if not found:
                    if params.debug:
                        print(al)
                    yield al
                    readRes.append(al)
        sys.stdout.trace("Request for read alignments for", seg, "finished")

    def position(self, pos):
        # type: (int) -> LinePosition
        """Return a ``LinePosition`` for ``pos`` on this line."""
        return LinePosition(self, pos)

    def extendRight(self, seq, relevant_als = None):
        # type: (str, List[AlignmentPiece]) -> None
        """Append ``seq`` to the right end of the line.

        Listeners are notified before and after the change; the line must
        not be tied (``knot`` is None). Correct segments are recomputed
        afterwards and ``max_extension`` is set to True.
        """
        sys.stdout.trace("Line operation Extend:", self, len(seq), relevant_als)
        assert self.knot is None
        if relevant_als is None:
            relevant_als = []
        new_seq = Contig(self.seq + seq, "TMP2_" + self.id)
        self.notifyBeforeExtendRight(new_seq, seq)
        self.seq = self.seq + seq
        self.rc.seq = basic.RC(seq) + self.rc.seq
        self.notifyAfterExtendRight(seq, relevant_als)
        self.updateCorrectSegments(self.asSegment())
        self.max_extension = True

    def notifyBeforeExtendRight(self, new_seq, seq):
        # type: (Contig, str) -> None
        """Call ``fireBeforeExtendRight`` on every listener."""
        for listener in self.listeners:
            listener.fireBeforeExtendRight(self, new_seq, seq)

    def notifyAfterExtendRight(self, seq, relevant_als):
        # type: (str, Optional[List[AlignmentPiece]]) -> None
        """Call ``fireAfterExtendRight`` on every listener."""
        for listener in self.listeners:
            listener.fireAfterExtendRight(self, seq, relevant_als)

    def cutRight(self, pos):
        # type: (int) -> None
        """Drop everything to the right of ``pos``; no-op when ``pos`` is
        the line length. Requires ``0 < pos <= len(self)``."""
        sys.stdout.trace("Line operation Cut:", self, pos)
        assert pos > 0 and pos <= len(self)
        cut_length = len(self) - pos
        if cut_length == 0:
            return
        new_seq = Contig(self.seq[:pos], "TMP3_" + self.id)
        self.notifyBeforeCutRight(new_seq, pos)
        self.seq = self.seq[:-cut_length]
        self.rc.seq = self.rc.seq[cut_length:]
        self.notifyAfterCutRight(pos)

    def notifyBeforeCutRight(self, new_seq, pos):
        # type: (Contig, int) -> None
        """Call ``fireBeforeCutRight`` on every listener."""
        for listener in self.listeners:
            listener.fireBeforeCutRight(self, new_seq, pos)

    def notifyAfterCutRight(self, pos):
        # type: (int) -> None
        """Call ``fireAfterCutRight`` on every listener."""
        for listener in self.listeners:
            listener.fireAfterCutRight(self, pos)

    def correctSequence(self, alignments):
        # type: (Iterable[AlignmentPiece]) -> None
        """Replace the line sequence according to ``alignments``.

        Alignments whose two sides already carry the same sequence are
        ignored; if none remain, nothing happens. Otherwise a correction is
        built and applied, with listeners notified before and after.
        """
        sys.stdout.trace("Line operation Correct:", alignments)
        alignments = [al.cutIdenticalEnds() for al in alignments if al.seg_from.Seq() != al.seg_to.Seq()]
        if len(alignments) == 0:
            sys.stdout.trace("Skipping trivial correction operation")
            return
        correction = Correction.constructCorrection(alignments)
        self.notifyBeforeCorrect(correction)
        old = Contig(self.seq, "old")
        self.seq = correction.seq_from.seq
        self.rc.seq = basic.RC(self.seq)
        correction.changeQT(self, old)
        self.notifyAfterCorrect(correction)

    def notifyBeforeCorrect(self, alignments):
        # type: (Correction) -> None
        """Call ``fireBeforeCorrect`` on every listener."""
        for listener in self.listeners:
            listener.fireBeforeCorrect(alignments)

    def notifyAfterCorrect(self, alignments):
        # type: (Correction) -> None
        """Call ``fireAfterCorrect`` on every listener."""
        for listener in self.listeners:
            listener.fireAfterCorrect(self, alignments)

    def addReadAlignment(self, al):
        # type: (AlignmentPiece) -> AlignmentPiece
        """Store one read alignment, reset ``max_extension``, return ``al``."""
        self.read_alignments.add(al)
        self.max_extension = False
        return al

    def addListener(self, listener):
        """Register ``listener`` here and ``listener.rc`` on the twin."""
        self.listeners.append(listener)
        self.rc.listeners.append(listener.rc)

    def removeListener(self, listener):
        """Unregister ``listener`` here and ``listener.rc`` on the twin."""
        self.listeners.remove(listener)
        self.rc.listeners.remove(listener.rc)

    def save(self, handler):
        # type: (TokenWriter) -> None
        """Write the id, the sequence and all five storages to ``handler``."""
        handler.writeTokenLine(self.id)
        handler.writeTokenLine(self.seq)
        self.initial.save(handler)
        self.correct_segments.save(handler)
        self.completely_resolved.save(handler)
        self.disjointig_alignments.save(handler)
        self.read_alignments.save(handler)

    def loadLine(self, handler, disjointigs, reads, contigs):
        # type: (TokenReader, DisjointigCollection, ReadCollection, ContigCollection) -> None
        """Read the state written by ``save`` back from ``handler``.

        ``contigs`` is accepted but not used by the visible code.
        """
        self.id = handler.readToken()
        self.seq = handler.readToken()
        self.rc.id = basic.Reverse(self.id)
        # Keep the twin in step with the loaded sequence, as every other
        # sequence-changing method of the class does.
        self.rc.seq = basic.RC(self.seq)
        n = handler.readInt()
        for i in range(n):
            # Of each saved initial alignment only the target segment is
            # used; the surrounding tokens are read and discarded.
            handler.readToken()
            handler.readToken()
            handler.readToken()
            seg = Segment.load(handler, self)
            handler.readToken()
            self.initial.add(AlignmentPiece.Identical(seg.asContig().asSegment(), seg))
        self.correct_segments.load(handler, self)
        self.completely_resolved.load(handler, self)
        self.disjointig_alignments.load(handler, disjointigs, self)
        self.read_alignments.load(handler, reads, self)
        for al in self.read_alignments:
            read = al.seg_from.contig #type: AlignedRead
            read.addAlignment(al)
        self.max_extension = False

    def __str__(self):
        """Return ``Line:<id>:[left:initial_left:initial_right:right]``,
        with ``NA`` in place of the initial bounds when ``initial`` is
        empty, or the result of ``name_printer`` when one is set."""
        if self.name_printer is not None:
            return self.name_printer(self)
        points = [self.left()]
        if len(self.initial) == 0:
            points.append("NA")
        else:
            points.append(self.initial[0].seg_to.left)
            points.append(self.initial[-1].seg_to.right)
        points.append(self.right())
        return "Line:" + str(self.id) + ":" + "[" + ":".join(map(str, points)) +"]"

    def __repr__(self):
        """Like ``__str__`` but listing every initial alignment."""
        points = [self.left()]
        points.extend(self.initial)
        points.append(self.right())
        return "Line:" + str(self.id) + ":" + "[" + ":".join(map(str, points)) +"]"

    def setCircular(self):
        """Mark this line and its twin as circular."""
        self.circular = True
        self.rc.circular = True

    def cleanReadAlignments(self):
        """Detach this line from the reads of all stored alignments, clear
        the read alignment storage and reset ``max_extension``."""
        for al in self.read_alignments:
            al.seg_from.contig.removeContig(self)
        self.read_alignments.clean()
        self.max_extension = False

    def tie(self, other, gap, gap_seq):
        """Create a knot from this line to ``other`` and attach its rc to
        ``other.rc``; a line tied to itself becomes circular."""
        self.knot = Knot(self, other, gap, gap_seq)
        other.rc.knot = self.knot.rc
        if self == other:
            self.setCircular()

    def unTie(self):
        """Remove the knot from this line and from the rc of the line it
        leads to; no-op when the line is not tied."""
        if self.knot is not None:
            self.knot.line_right.rc.knot = None
            self.knot = None
def markUniqueInLine(self, line):
    # type: (NewLine) -> None
    """Select resolved segments of ``line`` and re-recruit its reads.

    Positions where read alignments start (``inc``) or end (``out``) while
    more than 1000 positions of the read remain on that side are linked
    via ``self.link``. Those reaching ``params.min_k_mer_cov`` become
    events. A gap from an ``out`` event to the next ``inc`` event longer
    than ``params.max_allowed_unaligned`` is expanded and kept if it is at
    least ``params.k`` long; any other pair of consecutive events more than
    50000 apart contributes the shrunk segment between them.

    All read alignments are then removed from the line. Those intersecting
    a selected segment are added back unless they are contradicting or
    have identity below 0.85. Finally the correct segments are updated and
    the selected segments capped by them are stored in
    ``line.completely_resolved``. If no segment is selected the method
    returns early, leaving the line without read alignments.

    :param line: line to process; modified in place.
    """
    sys.stdout.info("Finding unique in", line)
    alignments = list(line.read_alignments) # type: List[AlignmentPiece]
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    sys.stdout.trace("Sorting finished")
    inc = self.link(line, [al.seg_to.left for al in alignments
                           if al.seg_from.left > 1000 and al.seg_to.left > 50], 20)
    # The last position of the line is added to inc as a sentinel.
    inc.append((line.segment(len(line) - 1, len(line)), params.min_k_mer_cov))
    alignments = sorted(alignments, key=lambda al: al.seg_to.right)
    out = self.link(line, [al.seg_to.right for al in alignments
                           if al.rc.seg_from.left > 1000 and al.rc.seg_to.left > 50], 20)
    sys.stdout.trace("Linking finished")
    # The first position of the line is added to out as a sentinel.
    out.insert(0, (line.segment(0, 1), params.min_k_mer_cov))
    sys.stdout.trace("inc:", inc)
    sys.stdout.trace("out:", out)
    events = []
    for seg, val in inc:
        if val >= params.min_k_mer_cov:
            events.append((seg.left, -1))
    for seg, val in out:
        if val >= params.min_k_mer_cov:
            events.append((seg.right, 1))
    events = sorted(events)
    sys.stdout.trace("Events collected and sorted", len(events))
    events = [(pos, direction) for pos, direction in events
              if (direction == -1 or pos < len(line) - 200) and (direction == 1 or pos > -200)]
    sys.stdout.trace(events)
    segs = SegmentStorage()
    for e1, e2 in zip(events[:-1], events[1:]):
        seg = line.segment(e1[0], e2[0])
        if e1[1] == 1 and e2[1] == -1:
            if len(seg) > params.max_allowed_unaligned:
                seg = seg.expand(params.k / 2).expandToSize(params.k + 50)
                if len(seg) >= params.k:
                    segs.add(seg)
        elif len(seg) > 50000:
            segs.add(seg.shrink(3000))
    sys.stdout.trace("Unique segments selected")
    line.cleanReadAlignments()
    line.read_alignments.clean()
    total = 0
    inter = 0
    contradicting = 0
    bad_quality = 0
    sys.stdout.trace("Unique segments:", segs)
    if len(segs) == 0:
        sys.stdout.trace("WARNING: line with no resolved segments. Removing", line)
        return
    for al in alignments:
        total += 1
        if segs.inter(al.seg_to, params.k):
            inter += 1
            if al.contradictingRTC(tail_size=params.bad_end_length):
                contradicting += 1
                sys.stdout.trace("Contradicting read alignment", al, str(al.seg_from.contig.alignments))
            elif al.percentIdentity() < 0.85:
                bad_quality += 1
                sys.stdout.trace("Read with bad alignment quality:", al)
            else:
                line.addReadAlignment(al)
    # No alignment may intersect the selected segments (e.g. a line with no
    # reads at all); report zero fractions instead of dividing by zero.
    contradicting_fraction = float(contradicting) / inter if inter else 0.0
    bad_quality_fraction = float(bad_quality) / inter if inter else 0.0
    sys.stdout.trace("Read recruitment results. All:", total, "In resolved regions:", inter,
                     "Contradicting:", contradicting_fraction,
                     "Bad quality", bad_quality_fraction)
    line.updateCorrectSegments(line.asSegment())
    segs = segs.cap(line.correct_segments, params.k)
    line.completely_resolved.addAll(segs)
    sys.stdout.trace("The end")