def polishAndAnalyse(self, reads, polishing_base, reliable_start = None):
    # type: (ReadCollection, Contig, Optional[int]) -> Consensus
    """Polish ``polishing_base`` with ``reads`` and compute per-position support.

    The reads are re-aligned to the polished sequence. Every alignment that
    is neither contradicting nor starting after ``reliable_start`` adds one
    unit of coverage over its target span.

    :param reads: reads used both for polishing and for the support profile.
    :param polishing_base: sequence that is polished.
    :param reliable_start: alignments whose target start lies beyond this
        position are counted as "late" and ignored; defaults to
        ``len(polishing_base)``.
    :return: ``Consensus`` built from the polished sequence and the coverage
        profile, which has ``len(sequence) + 1`` entries.
    """
    if reliable_start is None:
        reliable_start = len(polishing_base)
    polished = Contig(self.polish(reads, polishing_base), "contig")
    coverage = [0] * (len(polished) + 1)
    realigned = ReadCollection().extendClean(reads)
    self.aligner.alignReadCollection(realigned, [polished])
    n_contra, n_ok, n_late = 0, 0, 0
    for read in realigned:
        for al in read.alignmentsTo(polished.asSegment()):  # type: AlignmentPiece
            if al.contradicting(polished.asSegment()):
                n_contra += 1
                continue
            if al.seg_to.left > reliable_start:
                n_late += 1
                continue
            # Difference-array encoding: +1 where coverage starts, -1 where it ends.
            coverage[al.seg_to.left] += 1
            coverage[al.seg_to.right] -= 1
            n_ok += 1
    # Turn the difference array into actual coverage values (prefix sums).
    running = 0
    for pos, delta in enumerate(coverage):
        running += delta
        coverage[pos] = running
    sys.stdout.trace("Polyshed and analysed using", len(realigned), "reads. Ok:", n_ok, "late:", n_late, "contra:", n_contra)
    return Consensus(polished.seq, coverage)
def __init__(self, seq, id, extension_handler, rc = None):
    # type: (str, str, ExtensionHandler, Optional[NewLine]) -> None
    """Create a line and, when ``rc`` is not given, its reverse-complement twin.

    :param seq: nucleotide sequence of the line.
    :param id: identifier of the line.
    :param extension_handler: handler stored on the line and registered as
        one of its listeners; its ``rc`` attribute is handed to the twin.
    :param rc: already existing reverse-complement line. When ``None`` a new
        twin is constructed here and this line owns the freshly created
        storages; otherwise this line reuses the ``.rc`` views of the twin's
        storages and listeners.
    """
    self.extensionHandler = extension_handler
    self.seq = seq
    self.id = id # type: str
    self.circular = False
    self.name_printer = None
    self.max_extension = False
    if rc is None:
        # Primary line: create the storages first, because the twin built
        # below reads them back through ``rc.<storage>.rc``.
        self.initial = AlignmentStorage()
        self.correct_segments = SegmentStorage()
        self.completely_resolved = SegmentStorage()
        self.disjointig_alignments = AlignmentStorage()
        self.read_alignments = ReadAlignmentStorage()
        self.listeners = [self.initial, self.correct_segments, self.completely_resolved, self.disjointig_alignments, self.read_alignments, extension_handler] # type: List[LineListener]
        # Constructing the twin re-enters this constructor with rc=self,
        # so it takes the ``else`` branch.
        rc = NewLine(basic.RC(seq), basic.Reverse(self.id), extension_handler.rc, self) #type: NewLine
        self.rc = rc
        self.addListener(ReadAlignmentListener(self))
        # self.initial.add(AlignmentPiece.Identical(self.asSegment().asContig().asSegment(), self.asSegment()))
    else:
        # Twin line: share state with the primary line through the
        # reverse-complement views of its storages and listeners.
        self.initial = rc.initial.rc # type: AlignmentStorage
        self.correct_segments = rc.correct_segments.rc # type: SegmentStorage
        self.completely_resolved = rc.completely_resolved.rc # type: SegmentStorage
        self.disjointig_alignments = rc.disjointig_alignments.rc # type: AlignmentStorage
        self.read_alignments = rc.read_alignments.rc # type: ReadAlignmentStorage
        self.listeners = [listener.rc for listener in rc.listeners] # type: List[LineListener]
    Contig.__init__(self, seq, id, rc)
    # NOTE(review): re-assigned after the base constructor, presumably because
    # Contig.__init__ may set ``rc`` itself -- confirm against Contig.
    self.rc = rc #type: NewLine
    self.knot = None # type: Knot
def align(dir, contigs_file): CreateLog(dir) contigs = list(SeqIO.parse_fasta(open(contigs_file, "r"))) assert len(contigs) == 2 contigs = [ Contig(contigs[0].seq, contigs[0].id), Contig(contigs[1].seq, contigs[1].id) ] aligner = Aligner(DirDistributor(os.path.join(dir, "alignments"))) als = iter_align(aligner, contigs[0], contigs[1]) printVar(os.path.join(dir, "diff.txt"), als) for al in als: print al
def __init__(self, seq, id, rc=None):
    # type: (str, str, Optional[Disjointig]) -> None
    """Create a disjointig, building its reverse-complement twin if needed.

    When ``rc`` is ``None`` this object owns a fresh ``AlignmentStorage`` and
    constructs the twin; otherwise it reuses the ``.rc`` view of the twin's
    read alignment storage.
    """
    self.seq = seq
    self.id = id
    if rc is not None:
        self.rc = rc
        self.read_alignments = self.rc.read_alignments.rc  # type: AlignmentStorage
    else:
        # The storage must exist before the twin is built: the twin's
        # constructor reads it back through ``rc.read_alignments``.
        self.read_alignments = AlignmentStorage()  # type: AlignmentStorage
        rc = Disjointig(basic.RC(seq), basic.Reverse(id), self)  # type: Disjointig
        self.rc = rc
    Contig.__init__(self, seq, id, rc)
    self.rc = rc  # type: Disjointig
def main(reads_file, ref_file, dir, error_rate): sys.stderr.write("Reading reference" + "\n") ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=lambda rec: len(rec))[-1] ref = Contig(ref.seq, ref.id) refs = ContigCollection() for i in range(0, len(ref) - 500, 500): if random.random() > 0.95: tmp = list(ref.segment(i, i + 500).Seq()) for j in range(error_rate * 500 / 100): pos = random.randint(0, 499) tmp[pos] = basic.rc[tmp[pos]] refs.add( Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")")) refs.print_names(sys.stderr) sys.stderr.write("Reading reads" + "\n") reads = ReadCollection() reads.loadFromFasta(open(reads_file, "r")) sys.stderr.write("Aligning reads" + "\n") basic.ensure_dir_existance(dir) aligner = Aligner(DirDistributor(dir)) aligner.alignReadCollection(reads, refs) sys.stderr.write("Analysing alignments" + "\n") alignments = [] for read in reads: alignments.extend(read.alignments) alignments = filter(lambda al: len(al) > 450, alignments) alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id)) scorer = Scorer() scorer.scores.homo_score = 3 scorer.scores.ins_score = 5 scorer.scores.del_score = 5 cnt = 0 for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig): iter = list(iter) sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n") if len(iter) < 150: for al in iter: print scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius) cnt += 1 if cnt >= 5000: break if cnt >= 5000: break
def testManual(self):
    """Check gluing of overlapping alignments and query-side reduction."""
    query = Contig("ACGTACGTA", "from")
    target = Contig("ACTACGTACGTACAT", "to")
    first = AlignmentPiece(query.asSegment(), target.segment(0, 8), "2M1I6M")
    second = AlignmentPiece(query.segment(0, 8), target.segment(7, 15), "8M")
    glued = AlignmentPiece.GlueOverlappingAlignments([first, second])
    assert glued.cigar == "2M1I5M8M", str(glued) + " " + glued.cigar
    assert glued.seg_from.Seq() == "ACGTACGTACGTACGT", str(glued) + " " + glued.cigar
    # Reducing to a query prefix must cut the cigar at the matching position.
    for right, expected_cigar in ((2, "2M"), (3, "2M"), (4, "2M1I1M")):
        assert first.reduce(query=query.segment(0, right)).cigar == expected_cigar
def splitRepeat(aligner, seq, mult, all_reads_list, min_contig_length):
    """Try to split a repeat into ``mult`` copies using consecutive windows.

    Each full window of ``min_contig_length`` bases is passed to
    ``splitSegKmeans``; the first non-``None`` result is returned. If no
    window succeeds, the suffix of the sequence (of at most
    ``min_contig_length`` bases) is tried and its result returned as is.

    :param aligner: aligner forwarded to ``splitSegKmeans``.
    :param seq: repeat sequence.
    :param mult: expected multiplicity, forwarded to ``splitSegKmeans``.
    :param all_reads_list: reads forwarded to ``splitSegKmeans``.
    :param min_contig_length: window length.
    :return: whatever ``splitSegKmeans`` returns (possibly ``None``).
    """
    base = Contig(seq, "base")
    # Floor division: the number of windows must be an integer for range()
    # regardless of the interpreter's ``/`` semantics.
    for i in range(len(seq) // min_contig_length):
        window_start = i * min_contig_length
        res = splitSegKmeans(aligner, base.segment(window_start, window_start + min_contig_length), mult, all_reads_list)
        if res is not None:
            return res
    return splitSegKmeans(aligner, base.asSegment().suffix(length=min(min_contig_length, len(seq))), mult, all_reads_list)
def constructCorrection(alignments):
    # type: (List[AlignmentPiece]) -> Correction
    """Build a ``Correction`` that applies ``alignments`` to their target contig.

    The corrected sequence is assembled by copying the target contig and
    substituting, for every alignment (ordered by target start), the aligned
    query sequence for the aligned target span. Each alignment is then
    re-anchored so that its query segment lies on the corrected sequence.

    :param alignments: non-empty list of alignments to one contig; an empty
        list raises ``IndexError``. NOTE(review): the code presumes the
        target spans do not overlap -- confirm with callers.
    :return: ``Correction(new_seq, initial, new_als)``.
    """
    initial = alignments[0].seg_to.contig
    alignments = sorted(alignments, key=lambda al: al.seg_to.left)
    # First pass: stitch together the corrected sequence.
    sb = []
    pos = initial.left()
    for al in alignments:
        sb.append(initial.subSequence(pos, al.seg_to.left).seq)
        sb.append(al.seg_from.Seq())
        pos = al.seg_to.right
    sb.append(initial.segment(alignments[-1].seg_to.right, initial.right()).Seq())
    new_seq = Contig("".join(sb), "TMP1_" + initial.id)
    # Second pass: locate every query segment inside the corrected sequence.
    new_als = []
    pos = initial.left()
    new_pos = 0
    for al in alignments:
        new_pos += al.seg_to.left - pos
        query_len = len(al.seg_from)
        new_seg_from = Segment(new_seq, new_pos, new_pos + query_len)
        new_als.append(al.changeQuerySegment(new_seg_from))
        pos = al.seg_to.right
        new_pos += query_len
    return Correction(new_seq, initial, new_als)
def recruit(seqs, reads, k, dir): dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) params.k = k relevant_reads = ContigStorage() disjointigs = seqs for i in range(2): sys.stdout.info("Recruiting iteration", i) als = filter(lambda al: len(al) > k, aligner.localAlign(reads, disjointigs)) print len(als), "alignments" relevant_reads = alsToReads(als) l = sum(map(len, seqs.unique())) disjointigs = constructDisjointigs(relevant_reads, l, dd.nextDir()) print len(disjointigs), "disjointigs" print disjointigs disjointigs.writeToFasta(open(os.path.join(dir, "disjointigs.fasta"), "w")) relevant_reads.writeToFasta(open(os.path.join(dir, "reads.fasta"), "w")) sys.stdout.info("Aligning repeat sequences to disjointigs") als = list(aligner.localAlign(seqs, disjointigs)) print "\n".join(map(str, als)) starts = dict() for dis in disjointigs: starts[dis.id] = len(dis) for al in als: if len(al) > k: starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left) al = al.rc starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left) print "Starts:" for cid, val in starts.items(): print cid, val contigs = ContigStorage() cnt = 1 for dis in disjointigs: if starts[dis.id] > k and starts[dis.id] < len(dis): print cnt, dis.id, starts[dis.id] contigs.add(Contig(dis.prefix(starts[dis.id]).Seq(), str(cnt))) cnt += 1 for dis in disjointigs.unique(): if len(dis) > k and starts[dis.id] == len(dis): print cnt, dis.id contigs.add(Contig(dis.seq, str(cnt))) cnt += 1 contigs.writeToFasta(open(os.path.join(dir, "contigs.fasta"), "w")) fakeGraph(contigs, open(os.path.join(dir, "graph.gv"), "w"))
def testManual(self):
    """Check composition of alignments through an intermediate contig."""
    first = Contig("ACGTACGTACGT", "c1")
    middle = Contig("ACGTAGGTACGT", "c2")
    last = Contig("ACTTACGTACGT", "c3")
    first_to_middle = AlignmentPiece.Identical(first.asSegment(), middle.asSegment())
    middle_to_last = AlignmentPiece.Identical(middle.asSegment(), last.asSegment())
    expected_repr = "(c1[0:12-0]->c3[0:12-0]:0.92)"

    composed = first_to_middle.compose(middle_to_last)
    assert repr(composed) == expected_repr
    assert composed.cigar == "12M"

    # Same result when starting from the reversed alignment.
    middle_to_first = first_to_middle.reverse()
    composed_diff = middle_to_first.composeTargetDifference(middle_to_last)
    assert repr(composed_diff) == expected_repr
    assert composed_diff.cigar == "12M"
def alignReadsToSegments(self, reads, segments):
    # type: (ReadCollection, Iterable[Segment]) -> None
    """Align ``reads`` to the given segments and merge the results into ``reads``.

    Each segment is turned into a temporary contig named by its 1-based
    index; a clean copy of the reads is aligned to those contigs, the
    alignments are mapped back onto the original segments, and finally merged
    into ``reads``.

    :param reads: read collection that receives the new alignments.
    :param segments: segments to align to (consumed once).
    """
    segments = list(segments)
    seg_dict = dict()
    for i, seg in enumerate(segments):
        seg_dict[str(i + 1)] = seg
    # A comprehension replaces the tuple-unpacking lambda, which is not valid
    # syntax outside Python 2, and always yields a real list.
    contigs = [Contig(seg.Seq(), str(i + 1)) for i, seg in enumerate(segments)]
    read_collection = ReadCollection().extendClean(reads)
    self.alignReadCollection(read_collection, ContigCollection(contigs))
    read_collection.contigsAsSegments(seg_dict)
    reads.mergeAlignments(read_collection)
def cutRight(self, pos):
    """Trim the line so that only its first ``pos`` bases remain.

    Listeners are notified before and after the change; the reverse
    complement loses the same number of bases from its left end. Nothing
    happens (beyond the trace message) when ``pos == len(self)``.
    """
    sys.stdout.trace("Line operation Cut:", self, pos)
    assert 0 < pos <= len(self)
    removed = len(self) - pos
    if removed == 0:
        return
    shortened = Contig(self.seq[:pos], "TMP3_" + self.id)
    self.notifyBeforeCutRight(shortened, pos)
    self.seq = self.seq[:-removed]
    rc_line = self.rc
    rc_line.seq = rc_line.seq[removed:]
    self.notifyAfterCutRight(pos)
def extendRight(self, seq, relevant_als=None):
    # type: (str, List[AlignmentPiece]) -> None
    """Append ``seq`` to the right end of the line.

    Listeners are notified before and after the change and the reverse
    complement is extended on its left by ``RC(seq)``. The line must not be
    attached to a knot.

    :param seq: bases to append.
    :param relevant_als: alignments forwarded to the after-extend
        notification; defaults to an empty list.
    """
    sys.stdout.trace("Line operation Extend:", self, len(seq), relevant_als)
    assert self.knot is None
    relevant_als = [] if relevant_als is None else relevant_als
    extended = Contig(self.seq + seq, "TMP2_" + self.id)
    self.notifyBeforeExtendRight(extended, seq)
    self.seq += seq
    rc_line = self.rc
    rc_line.seq = basic.RC(seq) + rc_line.seq
    self.notifyAfterExtendRight(seq, relevant_als)
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    """Prolong contigs shorter than ``params.k + 500`` using read alignments.

    Alignments to short contigs are collected either from reads selected via
    the read dump file or, when ``read_dump`` is ``None``, by realigning all
    reads (keeping the best alignment per read). Each short contig is then
    extended on both ends with ``polisher.polishEnd``. Contigs that become
    longer than ``params.k + 500`` are re-added to ``contigs`` under the same
    id; the others are removed from ``contigs``.
    """
    sys.stdout.info("Extending short lines")
    min_len = params.k + 500
    short_contigs = ContigStorage()
    als = dict()  # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) >= min_len:
            continue
        short_contigs.add(contig)
        als[contig.id] = []
        als[contig.rc.id] = []

    if read_dump is None:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            # Only alignments reaching both ends of the contig are used.
            if al.seg_to.left > 20 or al.rc.seg_to.left > 20:
                continue
            forward = als[al.seg_to.contig.id]
            backward = als[al.seg_to.contig.rc.id]
            added = False
            for i, known in enumerate(forward):
                if known.seg_from.contig.id == al.seg_from.contig.id:
                    added = True
                    # Keep only the best alignment of each read.
                    if al.percentIdentity() > known.percentIdentity():
                        forward[i] = al
                        backward[i] = al.rc
                        break
            if not added:
                forward.append(al)
                backward.append(al.rc)
    else:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            for al in aligner.overlapAlign(relevant_reads[contig.id], ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)

    for contig in short_contigs.unique():
        contig_als = als[contig.id]
        if len(contig_als) > 0:
            # Extend to the right, then flip the alignments and extend the other end.
            tmp_contig, new_als = polisher.polishEnd(contig_als, params.reliable_coverage, max_extension=params.l - len(contig))
            r = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd([al.rc for al in new_als], params.reliable_coverage, max_extension=params.l - len(contig))
            l = len(tmp_contig) - len(contig) - r
        else:
            tmp_contig, new_als = contig, contig_als
            l = 0
            r = 0
        if len(tmp_contig) > min_len:
            sys.stdout.info("Prolonged contig", contig.id, "for", l, "and", r, "nucleotides from left and right")
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)
def test5(self):
    """Correct a line with an insertion between letters 'a' and 'b' and check the dot plot."""
    dataset = TestDataset("abcABC")
    name1 = dataset.addContig("abc")
    name2 = dataset.addContig("ABC")
    lines, dp, reads = dataset.genAll(self.aligner)
    line = lines[name1]
    sa = dataset.alphabet["a"].seq
    sb = dataset.alphabet["b"].seq
    insertion = "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
    tmp = Contig(sa + insertion + sb, "tmp")
    # Anchor the corrected sequence on the identical flanks 'a' and 'b'.
    left_anchor = AlignmentPiece.Identical(tmp.prefix(len=len(sa)), line.prefix(len=len(sa)))
    right_anchor = AlignmentPiece.Identical(tmp.asSegment().suffix(length=len(sb)), line.segment(len(sa), len(sa) + len(sb)))
    al = AlignmentPiece.MergeFittingAlignments([left_anchor, right_anchor])
    line.correctSequence([al])
    observed = str(list(dp.allInter(line.asSegment())))
    assert observed == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
def test1(self):
    """Correcting the target line must update the alignment stored in the dot plot."""
    lines = NewLineStorage(DisjointigCollection(), self.aligner)
    line1 = lines.addNew("ACGTAAAAGGGTACGT", "c1")
    line2 = lines.addNew("ACGTAAGGGGGTACGT", "c2")
    raw_al = AlignmentPiece.Identical(line1.asSegment(), line2.asSegment())
    al = self.scorer.polyshAlignment(raw_al, params.alignment_correction_radius)
    dp = LineDotPlot(lines, self.aligner)
    dp.addAlignment(al)
    patch = Contig("AGG", "tmp")
    correction_al = AlignmentPiece.Identical(patch.asSegment(), line2.segment(0, 3))
    line2.correctSequence([correction_al])
    observed = str(list(dp.alignmentsToFrom[line2.id][line1.id]))
    assert observed == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
def __init__(self, genome="", letter_size=550, error_rate=0.05, mutation_rate=0.005, seed=0):
    """Create a synthetic dataset over a random 26-letter alphabet.

    For each lowercase letter a random sequence of ``letter_size`` bases is
    generated; the matching uppercase letter is a mutated copy of it.
    ``self.matches`` stores the position matches returned by ``mutate`` for
    the lowercase letter and the swapped pairs for the uppercase one. The
    random generator is seeded first, so the construction is reproducible.

    :param genome: string of letters translated into the genome sequence.
    """
    random.seed(seed)
    self.reads = []  # type: List[NamedSequence]
    self.disjointigs = []  # type: List[NamedSequence]
    self.contigs = []  # type: List[NamedSequence]
    self.letter_size = letter_size
    self.error_rate = error_rate
    self.mutation_rate = mutation_rate
    self.alphabet = ContigStorage()
    self.matches = dict()
    for lower, upper in zip(ascii_lowercase, ascii_uppercase):
        # Order matters: generate() and mutate() both draw from the seeded RNG.
        original = self.generate(self.letter_size)
        self.alphabet.add(Contig(original, lower))
        mutated, matches = self.mutate(original, self.mutation_rate)
        self.alphabet.add(Contig(mutated, upper))
        self.matches[lower] = matches
        self.matches[upper] = [(b, a) for a, b in matches]
    self.genome = Contig(self.translate(genome), genome)
def test3(self):
    """Correcting a line must update its self-alignments in the dot plot."""
    lines = NewLineStorage(DisjointigCollection(), self.aligner)
    line = lines.addNew("ACGTACGTACGT", "c")
    dp = LineDotPlot(lines, self.aligner)
    shifted_by_4 = AlignmentPiece.Identical(line.segment(0, 8), line.segment(4, 12))
    shifted_by_8 = AlignmentPiece.Identical(line.segment(0, 4), line.segment(8, 12))
    for auto_al in (shifted_by_4, shifted_by_8):
        dp.addAlignment(auto_al)
    patch = Contig("TCC", "tmp")
    correction_al = AlignmentPiece.Identical(patch.asSegment(), line.segment(3, 6))
    line.correctSequence([correction_al])
    observed = str(list(dp.auto_alignments["c"]))
    assert observed == "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
def correctSequence(self, alignments):
    # type: (Iterable[AlignmentPiece]) -> None
    """Apply corrections described by ``alignments`` to this line.

    Alignments whose query and target sequences are equal are dropped; the
    remaining ones are trimmed with ``cutIdenticalEnds``. If nothing is left
    the call is a no-op. Otherwise a ``Correction`` is built, listeners are
    notified before and after, and both this line and its reverse complement
    receive the corrected sequence.
    """
    sys.stdout.trace("Line operation Correct:", alignments)
    nontrivial = []
    for al in alignments:
        if al.seg_from.Seq() != al.seg_to.Seq():
            nontrivial.append(al.cutIdenticalEnds())
    if not nontrivial:
        sys.stdout.trace("Skipping trivial correction operation")
        return
    correction = Correction.constructCorrection(nontrivial)
    self.notifyBeforeCorrect(correction)
    # Snapshot of the sequence before it is overwritten; passed to changeQT.
    previous = Contig(self.seq, "old")
    self.seq = correction.seq_from.seq
    self.rc.seq = basic.RC(self.seq)
    correction.changeQT(self, previous)
    self.notifyAfterCorrect(correction)
def polishMany(self, reads, sequences):
    # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
    """Polish several sequences at once with a single polishing job.

    The sequences and reads are written to a fresh working directory. If the
    directory distributor reports identical inputs, ``params.clean`` is not
    set and a polished file already exists, that result is reused; otherwise
    the polishing job is run.

    :param reads: reads used for polishing.
    :param sequences: sequences to polish.
    :return: list of polished contigs read from the job's output file.
    """
    work_dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"), (reads, "reads.fasta")])
    consensus_file_name = new_files[0]
    reads_file_name = new_files[1]
    args = FakePolishingArgs()
    basic.ensure_dir_existance(os.path.join(work_dir, "work"))
    job = JobPolishing(args, os.path.join(work_dir, "work"), os.path.join(work_dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
    polished_file = job.out_files["contigs"]
    if same and not params.clean and os.path.exists(polished_file):
        sys.stdout.trace("Polishing reused:", polished_file)
    else:
        sys.stdout.trace("Running polishing:", polished_file)
        job.run()
    # Materialise the records while the file is open so that the handle is
    # closed and the declared List[Contig] is returned on every interpreter.
    with open(polished_file, "r") as handle:
        return [Contig(rec.seq, rec.id) for rec in SeqIO.parse_fasta(handle)]
def polishSmallSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish a short segment using the reads aligned to it.

    The segment and the read parts aligned to it are flanked with two random
    200 bp sequences, polished, and the polished result is aligned back to
    the flanked base to obtain an alignment onto ``seg``.

    :param seg: segment to polish.
    :param als: alignments of reads to the contig of ``seg``.
    :return: alignment from the polished sequence to ``seg``; an identity
        alignment of ``seg`` to itself when no read covers it completely.
    :raises AssertionError: if the polished sequence does not align to the
        base end-to-end.
    """
    if not any(al.seg_to.contains(seg) for al in als):
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg, "has no covering reads")
        return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
    reads = []
    start = basic.randomSequence(200)
    end = basic.randomSequence(200)
    for al in als:
        new_seq = ""
        al = al.reduce(target=seg)
        # A flank is attached only when the read reaches that end of the segment.
        if al.seg_to.left < seg.left + 20:
            new_seq += start
        new_seq += al.seg_from.Seq()
        if al.seg_to.right > seg.right - 20:
            new_seq += end
        reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
    base = Contig(start + seg.Seq() + end, "base")
    polished = None
    try:
        polished = Contig(self.polish(reads, base), "polished")
    except PolishException:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "has a sequence very different from reads. Using reads to correct.")
        # Fall back to using a fully covering read as the polishing base.
        for al, read in zip(als, reads):
            if al.seg_to.contains(seg):
                try:
                    polished = Contig(self.polish(reads, Contig(read.seq, read.id)), "polished")
                    break
                except PolishException:
                    pass
    if polished is None:
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "could not be corrected even though some reads cover it.")
        polished = seg.asContig()
    als = list(self.aligner.overlapAlign([polished], ContigStorage([base])))
    for al in als:
        if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
            mapping = AlignmentPiece.Identical(base.segment(len(start), len(base) - len(end)), seg)
            return al.compose(mapping)
    # Raised explicitly: ``assert False`` would be stripped under ``python -O``
    # and the method would then silently return None.
    raise AssertionError("No alignment from polished to base: " + str(als))
def draw(contigs_file, output_dir, k): aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments"))) CreateLog(output_dir) print "Reading contigs" tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")), key=lambda contig: len(contig)) lens = map(len, tmp)[::-1] print lens contigs = ContigStorage() if lens[1::2] == lens[0::2]: tmp = tmp[0::2] print "Removed extra contigs" for i, contig in enumerate(tmp): print i, contig contigs.add(Contig(contig.seq, str(i))) print "Constructing components" componenets = ExtractRepeatComponents(contigs, aligner, k) print "Components:" for comp in componenets: print comp.segments print comp.alignments for cnt, comp in enumerate(componenets): print "Processing component", cnt print comp.segments # print comp.alignments print "Forming blocks" Block.id_cnt = 0 blocks = CreateBlocks(comp) if len(blocks) == 1: print "Skipping trivial repeat" continue for block in blocks: print "Block", block.id, ":", block.segs for block in blocks: for other in block.out: print block.id, "->", other.id print "Placing blocks on X axis" code = placeX(blocks) if code == 1: print "WARNING: component", cnt, "contains cycle. Aborting visualization." continue print "Placing blocks on Y axis" placeY(blocks, comp.segments) print "Printing figure" SimplePrinter().printBlocks(blocks, sys.stdout) print "Finished printing figure"
def testManual(self):
    """Check position mapping and alignment composition through a Correction."""
    contig1 = Contig("ACGTAAAAGGGTACGT", "c1")
    contig2 = Contig("ACGTAAGGGGGTACGT", "c2")
    raw_al = AlignmentPiece.Identical(contig1.segment(5, 12), contig2.segment(5, 12))
    al = self.scorer.polyshAlignment(raw_al, params.alignment_correction_radius)
    corr = Correction(contig1, contig2, [al])

    expected_up = [0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15]
    expected_down = [0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15]
    assert corr.mapPositionsUp(range(len(contig2))) == expected_up
    assert corr.mapPositionsDown(range(len(contig1))) == expected_down

    # Identity alignments on several sub-segments of contig2.
    identities = [AlignmentPiece.Identical(contig2.segment(left, right))
                  for left, right in ((0, 4), (6, 8), (6, 16), (7, 16))]
    observed = str(corr.composeQueryDifferences(identities))
    assert observed == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
def splitSegKmeans(aligner, seg, mult, all_reads_list): polisher = Polisher(aligner, aligner.dir_distributor) all_reads = ContigStorage() base = seg.asContig() tmp = [] rtv = readsToVectors(aligner, all_reads_list, base) kmeans = KMeans(n_clusters=mult, precompute_distances=True) recs = list(rtv.values()) result = kmeans.fit_predict(X=[rec.v for rec in recs]) print result clusters = dict() for i, c in enumerate(result): if c not in clusters: clusters[c] = [] clusters[c].append(recs[i].al) for c in clusters.values(): print str(c), ":", len(c) split_contigs = [] split_reads = [] for c in clusters.values(): split_contigs.append( Contig( polisher.polishSmallSegment(base.asSegment(), c).seg_from.Seq(), str(len(split_contigs)))) split_reads.append([al.seg_from.contig for al in c]) maxpi = 1 for i in range(mult): for j in range(mult): if i == j: sys.stdout.write("1.0 ") continue al = aligner.overlapAlign([split_contigs[i]], ContigStorage([split_contigs[j] ])).next() sys.stdout.write(str(al.percentIdentity()) + " ") maxpi = max(maxpi, al.percentIdentity()) print "" print "Maxpi:", maxpi if maxpi < 0.985: return zip(split_contigs, split_reads) else: return None
def correctSequence(self, alignments):
    # type: (Iterable[AlignmentPiece]) -> None
    """Apply corrections described by ``alignments`` to this line.

    Alignments whose query and target sequences are equal are skipped. If
    nothing is left the call is a no-op. Otherwise a ``Correction`` is built
    from the remaining alignments, listeners are notified before and after,
    and both this line and its reverse complement get the corrected sequence.
    """
    sys.stdout.trace("Line operation Correct:", alignments)
    new_alignments = []
    for al in alignments:
        if al.seg_from.Seq() == al.seg_to.Seq():
            sys.stdout.trace("Skipping trivial correction alignment", al)
        else:
            new_alignments.append(al)
    if len(new_alignments) == 0:
        sys.stdout.trace("Skipping trivial correction operation")
        return
    # Use the filtered list: alignments reported as skipped above must not
    # take part in the correction.
    correction = Correction.constructCorrection(new_alignments)
    self.notifyBeforeCorrect(correction)
    old = Contig(self.seq, "old")
    self.seq = correction.seq_from.seq
    self.rc.seq = basic.RC(self.seq)
    correction.changeQT(self, old)
    self.notifyAfterCorrect(correction)
def readsToVectors(aligner, reads_list, base): als = [] rtv = dict() polisher = Polisher(aligner, aligner.dir_distributor) for al in fixAlDir(aligner.overlapAlign(reads_list, ContigStorage([base])), base): if len(al.seg_to) < len(base) - 100: continue else: als.append(al) rtv[al.seg_from.contig.id] = ReadRecord(al).extend(toVector(al)) reads_list = [al.seg_from.contig for al in als] bases = [base] for base_al1, base_al2, base_al3 in zip(als[0::3], als[1::3], als[2::3]): base_candidate = Contig( polisher.polishSmallSegment( base.asSegment(), [base_al1, base_al2, base_al3]).seg_from.Seq(), str(len(bases))) rtr_als = [] read_ids = set() # base_candidate = base_al.seg_from.asContig() for al in fixAlDir( aligner.overlapAlign(reads_list, ContigStorage([base_candidate])), base_candidate): if len(al.seg_to) < len(base_candidate) - 100: continue else: rtr_als.append(al) read_ids.add(al.seg_from.contig.id) if len(read_ids) == len(als): bases.append(base_candidate) for al in rtr_als: rtv[al.seg_from.contig.id].extend(toVector(al)) if len(bases) > 10: break for rec in rtv.values(): print rec.read.id, len(rec.v), rec.v return rtv
def main(k, dir, contigs_file, reads_file): # type: (int, str, str, str) -> None basic.ensure_dir_existance(dir) CreateLog(dir) dd = DirDistributor(os.path.join(dir, "alignments")) aligner = Aligner(dd) params.k = k print "Loading contigs" tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"), False).unique(), key=lambda contig: len(contig)) cnt = 1 contigs = ContigStorage() for c1, c2 in zip(tmp[::2], tmp[1::2]): # if c1.seq == c2.rc.seq: contigs.add(Contig(c1.seq, str(cnt))) print cnt, c1.id, c2.id cnt += 1 # else: # contigs.add(Contig(c1.seq, str(cnt))) # print cnt, c1.id # cnt += 1 # contigs.add(Contig(c2.seq, str(cnt))) # print cnt, c2.id # cnt += 1 print "Loading reads" reads = ReadCollection().loadFromFasta(open(reads_file, "r")) print "Aligning reads" for al in aligner.localAlign(reads, contigs): if len(al) > k: read = al.seg_from.contig # type:AlignedRead read.addAlignment(al) res = open(os.path.join(dir, "reads.fasta"), "w") for read in reads: if not basic.isCanonocal(read.id): continue if len(read.alignments) > 1: SeqIO.write(read, res, "fasta") res.close()
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    """Select the initial set of unique contigs and polish them.

    Three modes are supported:

    * ``force_unique is None and not all_unique``: edges are read from the
      assembly graph and filtered by uniqueness flag, coverage relative to
      the average coverage, length, and similarity to alternative edges.
    * ``force_unique is not None``: contigs from ``contigs_file`` whose id is
      in ``force_unique`` are used.
    * otherwise all contigs from ``contigs_file`` are used.

    The contigs are polished unless ``all_unique`` is set and
    ``force_unique`` is ``None``.

    :return: ``ContigCollection`` with the selected (and possibly polished)
        contigs.
    """
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = [(e.len, e.cov) for e in graph.e.values()]
        # Average coverage is estimated on the longest edges that together
        # span half of the total edge length. The tuples are (length,
        # coverage), so the length is the FIRST component.
        tmp_cov = []
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        # Short "unique" edges that closely match an alternative edge leaving
        # the same vertex are considered non-unique.
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
    """Iteratively extend the right end of a contig using reads that overhang it.

    In every round the reads still reaching the last 100 bases of the current
    contig and overhanging it by more than 100 bases are used. A polishing
    base is built from an artificial prefix plus the overhang of one read,
    polished with all overhangs, and the contig is extended up to the last
    position that is still supported by enough reads.

    :param als: alignments of reads to the contig; the contig is taken from
        ``als[0].seg_to.contig``.
    :param min_cov: minimal number of supporting reads.
    :param min_cov_frac: ``1 - min_cov_frac`` is the tolerated fraction of
        contradicting reads among the supporting ones.
    :param max_extension: maximal number of bases to add; unlimited (a very
        large number) when ``None``.
    :return: the extended copy of the contig and the alignments of the reads
        to it (still relevant ones followed by finished ones).
    """
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    # Work on a copy so that the original contig is left untouched.
    new_contig = contig.asSegment().asContig()
    relevant_als = [
        al.changeTargetContig(new_contig) for al in als
        if al.rc.seg_to.left < 100
    ]
    finished_als = []
    while True:
        # Keep only reads that touch the last 100 bases of the contig and
        # still have more than 100 unaligned bases beyond their alignment.
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(
                    length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        # Common artificial prefix: fixed marker + random flank + contig suffix.
        start = "ACGTTCGA" + basic.randomSequence(
            params.flanking_size) + new_contig.asSegment().suffix(
                length=min(params.flanking_size, len(new_contig))).Seq()
        # Each reduced read is the common prefix followed by the part of the
        # read to the right of its alignment.
        reduced_read_list = [
            AlignedRead.new(
                start + al.seg_from.contig.asSegment().suffix(
                    pos=al.seg_from.right).Seq(),
                str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)
        ]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # The base consists of the artificial prefix (marker, random flank
            # and the end of the polished sequence) followed by a piece of the
            # read overhang of length at most max(params.window_size, params.k).
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(
                    len(base_al.seg_from.contig), base_al.seg_from.right +
                    max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base),
                                   "polished_base")
            for al in self.aligner.localAlign(
                    reduced_reads,
                    ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            # For every reduced read pick the alignment that starts at target
            # position 0 and reaches furthest to the right.
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (
                        (candidate_alignments[-1] is None
                         or candidate_alignments[-1].seg_to.right <
                         al.seg_to.right)):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            # Sweep over alignment ends from left to right; ``support`` is the
            # number of alignments not yet ended, ``contra`` collects ended
            # alignments that contradict the base on the right.
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments,
                             key=lambda al: al.seg_to.right):
                # Contradictions more than 50 bases behind are forgotten.
                while contra_index < len(contra) and contra[
                        contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (
                        1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index,
                                     (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:",
                             [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            # Accept the extension only if it adds more than 100 bases.
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                 "Extended for", cutoff_pos - len(start),
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
                found = True
                new_contig_candidate = Contig(
                    new_contig.seq + polished_base[len(start):cutoff_pos],
                    "candidate")
                # Maps the accepted part of the polished base onto the newly
                # added suffix of the candidate contig.
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(
                        pos=len(new_contig)))
                # Maps the overhang of each original read onto the matching
                # part of its reduced read.
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(
                        AlignmentPiece.Identical(seg_from, seg_to))
                # Alignments of original reads to the new suffix, or None
                # when the read does not reach into the extension.
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(
                                tmp.compose(embedding))
                corrected_relevant_alignments = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in relevant_als
                ]
                # Merge the old alignment with the alignment to the extension;
                # re-polish the merged alignment when the two pieces were at
                # least 10 bases apart on either sequence.
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments,
                                    embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            al = al1
                        elif al1.seg_from.dist(
                                al2.seg_from) >= 10 or al1.seg_to.dist(
                                    al2.seg_to) >= 10:
                            al = scorer.polyshAlignment(
                                al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in finished_als
                ]
                new_contig = new_contig_candidate
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al,
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
        if len(new_contig) >= max_len:
            break
        # No read produced a usable extension in this round.
        if not found:
            break
    return new_contig, relevant_als + finished_als
# Polishes the consensus with the reads and aligns the reads back to the result.
# NOTE(review): ``reads_file`` and ``consensus_file`` are not assigned in this
# fragment; they are presumably taken from sys.argv[2] and sys.argv[3] just
# before it -- confirm.
dir = sys.argv[1]
extra_params = sys.argv[4:]
CreateLog(dir)
dd = DirDistributor(dir)
aligner = Aligner(dd)
polisher = Polisher(aligner, dd)
# Input handles are closed as soon as the sequences are loaded.
with open(reads_file, "r") as reads_handle:
    reads = ContigStorage().loadFromFasta(reads_handle, num_names=False)
with open(consensus_file, "r") as consensus_handle:
    ref = ContigStorage().loadFromFasta(consensus_handle, num_names=False)
if "accurate" in extra_params:
    # Accurate mode: polish every canonical contig separately using only the
    # reads aligned to it.
    res = []
    als = sorted(aligner.overlapAlign(reads, ref), key=lambda al: al.seg_to.contig.id)
    for rid, rals in itertools.groupby(als, key=lambda al: al.seg_to.contig.id):
        if basic.isCanonocal(rid):
            contig = ref[rid]
            corrected_seq = polisher.polishSegment(contig.asSegment(), list(rals)).seg_from.Seq()
            res.append(Contig(corrected_seq, rid))
else:
    res = polisher.polishMany(reads, list(ref.unique()))
res_file = os.path.join(dir, "res.fasta")
# ``with`` guarantees the result is flushed to disk before it is aligned below.
with open(res_file, "w") as rf:
    for c in res:
        SeqIO.write(c, rf, "fasta")
aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap", os.path.join(dir, "res.sam"))