def polishSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish ``seg`` window by window and glue the partial results.

    The contig of ``seg`` is cut into windows of ``max(900, params.k)``
    positions, each widened by a 50-position margin on both sides (clipped to
    the contig bounds).  Every alignment in ``als`` is assigned to the windows
    its ``seg_to`` intersects, each window that intersects ``seg`` is polished
    with ``polishSmallSegment``, and the polished pieces are glued with
    ``AlignmentPiece.GlueOverlappingAlignments``.

    :param seg: segment to polish.
    :param als: alignments to distribute between windows; only ``al.seg_to``
        is inspected here.
    :return: the glued alignment reduced to target ``seg``.
    """
    sys.stdout.trace("Polishing segment", seg)
    window = max(900, params.k)
    margin = 50
    contig_len = len(seg.contig)
    # Explicit floor division: window indexes must stay integers for range()
    # whatever division semantics are in effect.
    first = seg.left // window
    last = min(seg.right + window - 1, contig_len) // window
    windows = [
        Segment(seg.contig,
                max(0, i * window - margin),
                min(contig_len, (i + 1) * window + margin))
        for i in range(first, last + 1)
    ]
    als_by_window = [[] for _ in windows]
    for al in als:
        lo = al.seg_to.left // window
        hi = al.seg_to.right // window + 1
        # Candidate range is widened by one window each way; the inter()
        # check below decides which windows actually receive the alignment.
        for i in range(max(0, lo - first - 1),
                       min(len(windows), hi - first + 2)):
            if al.seg_to.inter(windows[i]):
                als_by_window[i].append(al)
    polished = [
        self.polishSmallSegment(win, win_als)
        for win, win_als in zip(windows, als_by_window)
        if win.inter(seg)
    ]
    glued = AlignmentPiece.GlueOverlappingAlignments(polished)
    return glued.reduce(target=seg)
def attemptJump(self, rec):
    # type: (Record) -> bool
    """Try to move ``rec.resolved`` forward past the current resolved end.

    Target segments of alignments that start beyond the end tolerance on both
    the alignment and its ``rc`` are collected as "bad" from two sources:
    the alignments in ``rec`` (up to the bound returned by
    ``findAndFilterResolvedBound``) and the dot-plot alignments intersecting
    the window ``[rec.resolved.right - params.k, bound]`` of ``rec.line``.
    The remaining ("good") segments are offered to ``segmentsWithGoodCopies``;
    the first candidate at least ``params.k`` long that ends to the right of
    ``rec.resolved`` becomes the new resolved segment.

    :param rec: record whose resolved segment should be extended.
    :return: True if ``rec.setResolved`` was called, False otherwise.
    """
    bound = self.findAndFilterResolvedBound(rec, params.l)

    def starts_far_from_both_ends(al):
        # Same two-sided test the loops below share; the second comparison
        # is only evaluated when the first one holds.
        if al.seg_from.left <= min(params.bad_end_length, params.k / 2):
            return False
        return al.rc.seg_from.left > min(params.bad_end_length, params.k / 2)

    bad_segments = SegmentStorage()
    for al in rec:
        if al.seg_to.left > bound:
            break
        if starts_far_from_both_ends(al):
            bad_segments.add(al.seg_to)

    scan_window = rec.line.segment(rec.resolved.right - params.k, bound)
    for al in self.dot_plot.allInter(scan_window):
        if starts_far_from_both_ends(al):
            bad_segments.add(al.seg_to)

    bad_segments.mergeSegments(params.k - 200)
    sys.stdout.trace("Bad segments:", bad_segments)

    # NOTE(review): reverse() presumably yields the complement of the bad
    # segments on the line - confirm against SegmentStorage.
    inverted = bad_segments.reverse(rec.line, params.k - 100)
    good_segments = inverted.reduce(
        rec.line.segment(rec.resolved.right - params.k, bound))

    for good in good_segments:
        clipped = Segment(good.contig, max(0, good.left), good.right)
        candidates = self.segmentsWithGoodCopies(rec.resolved, clipped, params.k)
        for candidate in candidates:
            if len(candidate) < params.k:
                continue
            if candidate.right > rec.resolved.right:
                rec.setResolved(candidate)
                return True
    return False
def constructCorrection(alignments):
    # type: (List[AlignmentPiece]) -> Correction
    """Build a ``Correction`` that substitutes aligned query sequences into the target.

    The target contig is taken from the first alignment.  Walking the
    alignments in order of ``seg_to.left``, the new sequence is assembled from
    the untouched stretches of the target between alignments and the query
    sequence (``seg_from``) of each alignment.  Every alignment is then
    re-expressed with its query segment located on the new sequence.

    :param alignments: non-empty list of alignments onto one target contig.
    :return: ``Correction(new_seq, initial, new_als)``.
    :raises IndexError: if ``alignments`` is empty.
    """
    initial = alignments[0].seg_to.contig
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    # NOTE(review): the walk below assumes the alignments do not overlap on
    # the target - confirm with callers.

    # Pass 1: assemble the corrected sequence.
    sb = []
    pos = initial.left()
    for al in alignments:
        sb.append(initial.subSequence(pos, al.seg_to.left).seq)
        sb.append(al.seg_from.Seq())
        pos = al.seg_to.right
    # After the loop pos is the right end of the last alignment.
    sb.append(initial.segment(pos, initial.right()).Seq())
    new_seq = Contig("".join(sb), "TMP1_" + initial.id)

    # Pass 2: locate every query segment on the assembled sequence.
    new_als = []
    pos = initial.left()
    new_pos = 0
    for al in alignments:
        # Untouched target stretch copied verbatim before this alignment.
        new_pos += al.seg_to.left - pos
        query_len = len(al.seg_from)
        new_seg_from = Segment(new_seq, new_pos, new_pos + query_len)
        new_als.append(al.changeQuerySegment(new_seg_from))
        new_pos += query_len
        pos = al.seg_to.right
    return Correction(new_seq, initial, new_als)
def fireBeforeExtendRight(self, line, new_seq, seq):
    # type: (accurate_line.NewLine, Contig, str) -> None
    """Retarget every stored alignment before the line is extended rightwards.

    After canonicalizing the storage, each item is replaced by
    ``al.targetAsSegment(...)`` on the segment ``[0, len(line))`` of
    ``new_seq``.

    :param line: line about to be extended; only its length is used.
    :param new_seq: contig the alignments are retargeted to.
    :param seq: not used by this handler.
    """
    self.makeCanonical()
    retargeted = []
    for al in self.items:
        whole_old_line = Segment(new_seq, 0, len(line))
        retargeted.append(al.targetAsSegment(whole_old_line))
    self.items = retargeted
def fireBeforeCutRight(self, line, new_seq, pos):
    # type: (accurate_line.NewLine, Contig, int) -> None
    """Adapt the stored alignments before the line is cut on the right at ``pos``.

    * Alignments whose target ends at or before ``pos`` are moved to
      ``new_seq`` unchanged.
    * Alignments crossing ``pos`` are kept only if their target starts at or
      before ``pos - params.k``; they are reduced to the target segment
      ``[line.left(), pos)`` and then moved to ``new_seq``.
    * All other alignments are dropped.

    :param line: line being cut.
    :param new_seq: contig that replaces the line as alignment target.
    :param pos: cut position on ``line``.
    """
    self.makeCanonical()
    kept = []  # type: List[AlignmentPiece]
    for al in self.items:  # type: AlignmentPiece
        target = al.seg_to
        if target.right > pos:
            if target.left > pos - params.k:
                # Too little of the alignment survives the cut.
                continue
            al = al.reduce(target=Segment(line, line.left(), pos))
        kept.append(al.changeTargetContig(new_seq))
    self.items = kept
def mergeSegments(self, inter_size=0):
    # type: (int) -> None
    """Merge stored segments in place and drop the short ones.

    A non-canonical storage delegates to ``self.rc``.  Otherwise the items
    are sorted and scanned once: a segment is fused into the previously kept
    one when it starts at or before ``previous.right - inter_size`` or ends
    at or before ``previous.right``.  Finally only segments with
    ``len(seg) >= inter_size`` are retained.

    :param inter_size: overlap required for merging and minimal length of a
        retained segment.
    """
    if not self.isCanonical():
        self.rc.mergeSegments(inter_size)
        return
    self.sort()
    if not self.items:
        return
    merged = []  # type: List[Segment]
    for seg in self.items:
        if merged:
            last = merged[-1]
            if seg.left <= last.right - inter_size or seg.right <= last.right:
                merged[-1] = Segment(last.contig, last.left,
                                     max(last.right, seg.right))
                continue
        merged.append(seg)
    self.items = [seg for seg in merged if len(seg) >= inter_size]
def main(ref_file, segment, dir):
    """Cut a reference region into 500-position reads, mask it and align the reads back.

    Reads are generated every 500 positions across ``segment`` of contig
    ``"chr1"`` (or of its ``rc`` when the coordinates are negative), the
    region is then overwritten with ``N`` in the reference, and the reads are
    aligned to the masked reference.  For every read one line
    ``"<alignment count> <best percent identity>"`` is written to stdout.

    :param ref_file: path to the reference FASTA file.
    :param segment: ``(start, end)`` pair; a negative start selects the
        reverse-complement contig with both coordinates negated.
    :param dir: working directory handed to the aligner; created if missing.
    """
    # Close the reference file once it has been loaded instead of leaking it.
    with open(ref_file, "r") as ref_handle:
        ref = ContigCollection().loadFromFasta(ref_handle, False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    start, end = segment[0], segment[1]

    reads = ReadCollection()
    reads_list = []
    for i in range(start, end, 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)

    # Mask the region the reads were taken from, on both strands.
    chr1.seq = chr1.seq[:start] + "N" * (end - start) + chr1.seq[end:]
    chr1.rc.seq = basic.RC(chr1.seq)

    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)

    out = sys.stdout
    for read in reads_list:
        # A list comprehension keeps the concatenation valid where map()
        # does not return a list; 0 is the fallback for unaligned reads.
        identities = [0] + [al.percentIdentity() for al in read.alignments]
        out.write(str(len(read.alignments)) + " " + str(max(identities)) + "\n")
    # NOTE(review): this closes sys.stdout itself; kept as in the original.
    out.close()
def loadLine(self, handler, disjointigs, reads, contigs):
    # type: (TokenReader, DisjointigCollection, ReadCollection, ContigCollection) -> None
    """Restore this line from a token stream.

    Reads, in order: the id, the sequence, the number of initial records and
    the records themselves, then the correct segments, the completely
    resolved segments, the disjointig alignments and the read alignments.
    Every loaded read alignment is also registered with its read, and
    ``max_extension`` is reset to False.

    :param handler: token reader positioned at the start of the line record.
    :param disjointigs: collection used to resolve disjointig alignments.
    :param reads: collection used to resolve read alignments.
    :param contigs: not used by this method.
    """
    self.id = handler.readToken()
    self.seq = handler.readToken()
    self.rc.id = basic.Reverse(self.id)

    initial_count = handler.readInt()
    for _ in range(initial_count):
        # Each record stores three tokens before the segment and one after
        # it; they are read and discarded.
        for _skipped in range(3):
            handler.readToken()
        seg = Segment.load(handler, self)
        handler.readToken()
        identity = AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
        self.initial.add(identity)

    self.correct_segments.load(handler, self)
    self.completely_resolved.load(handler, self)
    self.disjointig_alignments.load(handler, disjointigs, self)
    self.read_alignments.load(handler, reads, self)
    for al in self.read_alignments:
        aligned_read = al.seg_from.contig  # type: AlignedRead
        aligned_read.addAlignment(al)
    self.max_extension = False
def load(self, handler, contig):
    # type: (TokenReader, NamedSequence) -> None
    """Read a counted list of segments from ``handler`` and add them.

    The stream first provides the number of segments, followed by that many
    records readable by ``Segment.load``.

    :param handler: token reader positioned at the segment count.
    :param contig: sequence passed to ``Segment.load`` for every segment.
    """
    remaining = handler.readInt()
    while remaining > 0:
        self.add(Segment.load(handler, contig))
        remaining -= 1
def main(argv):
    """Extract the reference regions around selected graph edges as FASTA.

    Arguments are taken from ``argv``:
    ``argv[1]`` dot file of the graph, ``argv[2]`` FASTA with edge sequences,
    ``argv[3]`` reference FASTA, ``argv[4]`` SAM file with edge alignments,
    ``argv[5]`` edge list parsed by ``ParseVertices``, ``argv[6]`` output file.

    Steps: collect the vertices reachable from the start vertices of the
    given edges within ``dist``, take the edges starting at those vertices
    (or their ``rc``), project the first ``dist`` positions of their
    alignments onto the reference, glue reference segments that are at most
    20000 positions apart, and write the glued segments to ``argv[6]`` and
    their N-joined concatenation to ``argv[6] + "1"``.

    NOTE(review): ``dist`` is not defined in this function; it is presumably
    a module-level constant - confirm.
    """
    sys.stdout.write("Started\n")
    dot_file = argv[1]
    edge_sequences = argv[2]
    reference_file = argv[3]
    alignment_file = argv[4]
    edges = ParseVertices(argv[5])
    output_file = argv[6]
    glue_gap = 20000

    sys.stdout.write("Loading dot\n")
    with open(dot_file, "r") as dot_handle:
        dot = DotParser(dot_handle).parse()
        with open(edge_sequences, "r") as edge_handle:
            edge_collection = ContigCollection().loadFromFasta(edge_handle, True)
        # The dot handle stays open until the graph is built in case parse()
        # produces its records lazily.
        graph = Graph().loadFromDot(edge_collection, dot)
    vertices = [graph.E[edge_id].start.id for edge_id in edges]
    graph.printToFile(sys.stdout)
    print(vertices)
    with open(reference_file, "r") as ref_handle:
        ref = ContigCollection().loadFromFasta(ref_handle, False)

    print("Looking for relevant")
    pq = PriorityQueue()
    for v in graph.V.values():
        if v.id in vertices:
            pq.push((0, v))
    # Kept as a list: the visiting order is printed below.
    visited = []
    while not pq.empty():
        d, v = pq.pop()
        if v in visited:
            continue
        visited.append(v)
        for e in v.inc:
            print("%s %s %s" % (e.id, e.start.id, e.end.id))
            if d + len(e) < dist:
                pq.push((d + len(e), e.start))
        for e in v.out:
            print("%s %s %s" % (e.id, e.start.id, e.end.id))
            if d + len(e) < dist:
                pq.push((d + len(e), e.end))
    print("Visited %s" % len(visited))
    print([str(v) for v in visited])

    relevant = []
    with open(edge_sequences, "r") as edge_handle:
        edge_alignments = ReadCollection().loadFromFasta(edge_handle).addAllRC()
    for edge in graph.E.values():
        if edge.start in visited or edge.start.rc in visited:
            relevant.append(edge_alignments[edge.id])

    print("Loading sam")
    with open(alignment_file, "r") as sam_handle:
        edge_alignments.fillFromSam(Samfile(sam_handle), ref)
    for rel in relevant:
        print(str(rel))

    print("Collecting segments")
    segments = []
    chr1 = ref["chr1"]
    for edge in relevant:
        for al in edge.alignments:
            print(al)
            if al.seg_from.inter(edge.prefix(dist)):
                span = dist - al.seg_from.left
                contig = al.seg_to.contig
                start = al.seg_to.left
                segments.append(
                    Segment(contig, start, min(start + span, len(contig))))
                print(segments[-1])

    print("Rotating")
    rotated = []
    for seg in segments:
        if seg.contig != chr1:
            seg = seg.RC()
        if seg.contig != chr1:
            print("WARNING %s" % (seg,))
        rotated.append(seg)
    segments = sorted(rotated, key=lambda seg: seg.left)
    print("All relevant segments")
    print("\n".join([str(seg) for seg in segments]))

    print("Gluing")
    cur_seg = None
    interesting_segments = []
    for seg in segments:
        if cur_seg is None:
            cur_seg = seg.copy()
            continue
        if cur_seg.right + glue_gap < seg.left:
            interesting_segments.append(cur_seg.copy())
            cur_seg = seg.copy()
        else:
            cur_seg.right = max(cur_seg.right, seg.right)
    if cur_seg is not None:
        interesting_segments.append(cur_seg.copy())

    alignments = []
    for edge in edge_alignments:
        for al in edge.alignments:
            # Report an alignment once even if it intersects several glued
            # segments (previously it was appended once per segment).
            if any(al.seg_to.inter(seg) for seg in interesting_segments):
                alignments.append(al)
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    print("All relevant alignments")
    print("\n".join([str(al) for al in alignments]))
    print("Interesting segments: %s %s" % (
        len(interesting_segments),
        sum([len(seg) for seg in interesting_segments])))
    for seg in interesting_segments:
        print(seg)

    sequences = []
    with open(output_file, "w") as f:
        for seg in interesting_segments:
            SeqIO.write(SeqIO.SeqRecord(seg.Seq(), str(seg)), f, "fasta")
            sequences.append(seg.Seq())
    # The concatenated record was previously written to a handle that was
    # never closed.
    with open(output_file + "1", "w") as f1:
        SeqIO.write(
            SeqIO.SeqRecord(("N" * glue_gap).join(sequences), "concat"),
            f1, "fasta")
def mapSegmentsDown(self, segments):
    # type: (Iterable[Segment]) -> List[Segment]
    """Translate segments into segments on ``self.seq_to``.

    For every segment the positions ``left`` and ``right - 1`` are mapped
    with ``mapPositionsDown`` (all left ends in one call, then all right
    ends), and a segment on ``self.seq_to`` is built from the mapped left
    position and the mapped right position plus one.

    :param segments: any iterable of segments; it is materialized once, so a
        generator is acceptable.
    :return: the mapped segments, in input order.
    """
    pieces = list(segments)
    mapped_starts = self.mapPositionsDown([piece.left for piece in pieces])
    # right - 1 is mapped and 1 is added back afterwards (presumably because
    # ``right`` is an exclusive bound - confirm against Segment).
    mapped_lasts = self.mapPositionsDown([piece.right - 1 for piece in pieces])
    result = []
    for start, last in zip(mapped_starts, mapped_lasts):
        result.append(Segment(self.seq_to, start, last + 1))
    return result
def mapSegmentsUp(self, segments):
    # type: (Iterable[Segment]) -> List[Segment]
    """Translate segments into segments on ``self.seq_from``.

    For every segment the positions ``left`` and ``right - 1`` are mapped
    with ``mapPositionsUp`` (all left ends in one call, then all right ends),
    and a segment on ``self.seq_from`` is built from the mapped left position
    and the mapped right position plus one.

    :param segments: any iterable of segments.
    :return: the mapped segments, in input order.
    """
    # Materialize first: the input is traversed twice, so a generator would
    # otherwise be exhausted after the left ends and yield an empty result.
    segments = list(segments)
    left = self.mapPositionsUp([seg.left for seg in segments])
    right = self.mapPositionsUp([seg.right - 1 for seg in segments])
    return [Segment(self.seq_from, l, r + 1) for l, r in zip(left, right)]