def testManual(self):
    """Check gluing of two overlapping alignments and reducing one to query prefixes."""
    query = Contig("ACGTACGTA", "from")
    target = Contig("ACTACGTACGTACAT", "to")
    first = AlignmentPiece(query.asSegment(), target.segment(0, 8), "2M1I6M")
    second = AlignmentPiece(query.segment(0, 8), target.segment(7, 15), "8M")

    glued = AlignmentPiece.GlueOverlappingAlignments([first, second])
    assert glued.cigar == "2M1I5M8M", str(glued) + " " + glued.cigar
    assert glued.seg_from.Seq() == "ACGTACGTACGTACGT", \
        str(glued) + " " + glued.cigar

    # Reducing the first alignment to a query prefix of the given length
    # must yield the listed CIGAR string.
    for prefix_len, expected_cigar in ((2, "2M"), (3, "2M"), (4, "2M1I1M")):
        reduced = first.reduce(query=query.segment(0, prefix_len))
        assert reduced.cigar == expected_cigar
def polishSmallSegment(self, seg, als):
    # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
    """Polish the sequence of ``seg`` with the reads aligned to it.

    Each alignment in ``als`` is reduced to ``seg``; the read pieces are
    padded with shared random flanks (when they reach within 20 positions of
    the corresponding segment border) and used to polish ``seg`` wrapped in
    the same flanks.  If polishing the segment itself raises
    ``PolishException``, the padded reads that cover the whole segment are
    tried as polishing bases one by one.

    :param seg: segment to polish.
    :param als: alignments whose ``seg_to`` lies on the contig of ``seg``.
    :return: alignment from the polished sequence to ``seg``; an identical
        alignment of a copy of ``seg`` when no read covers it.
    :raises AssertionError: if the polished sequence cannot be aligned back
        to the flanked base end-to-end (within 10 positions on both sides).
    """
    if not any(al.seg_to.contains(seg) for al in als):
        sys.stdout.log(common.log_params.LogPriority.warning, "Warning", seg,
                       "has no covering reads")
        return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)

    # Random flanks shared by the base and by the reads that reach the borders.
    start = basic.randomSequence(200)
    end = basic.randomSequence(200)
    reads = []
    for al in als:
        reduced = al.reduce(target=seg)
        left_flank = start if reduced.seg_to.left < seg.left + 20 else ""
        right_flank = end if reduced.seg_to.right > seg.right - 20 else ""
        reads.append(NamedSequence(
            left_flank + reduced.seg_from.Seq() + right_flank,
            reduced.seg_from.contig.id))

    base = Contig(start + seg.Seq() + end, "base")
    polished = None
    try:
        polished = Contig(self.polish(reads, base), "polished")
    except PolishException:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "has a sequence very different from reads. Using reads to correct.")
        # Fall back to polishing a covering read instead of the segment.
        for al, read in zip(als, reads):
            if not al.seg_to.contains(seg):
                continue
            try:
                polished = Contig(
                    self.polish(reads, Contig(read.seq, read.id)), "polished")
                break
            except PolishException:
                # Deliberate best effort: try the next covering read.
                pass
    if polished is None:
        sys.stdout.log(
            common.log_params.LogPriority.warning, "Warning", seg,
            "could not be corrected even though some reads cover it.")
        polished = seg.asContig()

    polished_als = list(
        self.aligner.overlapAlign([polished], ContigStorage([base])))
    for al in polished_als:
        # Accept only an alignment that spans the polished contig up to 10
        # positions of slack on each side.
        if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
            # Translate coordinates of the flanked base back onto seg.
            mapping = AlignmentPiece.Identical(
                base.segment(len(start), len(base) - len(end)), seg)
            return al.compose(mapping)
    # Raised explicitly (not via `assert False`) so the failure is not
    # stripped under `python -O`, which would make the method return None.
    raise AssertionError(
        "No alignment from polished to base: " + str(polished_als))
def testManual(self):
    """Check position mapping and alignment composition of a Correction built from one polished alignment."""
    c1 = Contig("ACGTAAAAGGGTACGT", "c1")
    c2 = Contig("ACGTAAGGGGGTACGT", "c2")
    seed = AlignmentPiece.Identical(c1.segment(5, 12), c2.segment(5, 12))
    polished = self.scorer.polyshAlignment(
        seed, params.alignment_correction_radius)
    corr = Correction(c1, c2, [polished])

    expected_up = [0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15]
    assert corr.mapPositionsUp(range(len(c2))) == expected_up
    expected_down = [0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15]
    assert corr.mapPositionsDown(range(len(c1))) == expected_down

    # Identical alignments on pieces of c2, to be composed with the correction.
    bounds = ((0, 4), (6, 8), (6, 16), (7, 16))
    pieces = [AlignmentPiece.Identical(c2.segment(left, right))
              for left, right in bounds]
    composed = str(corr.composeQueryDifferences(pieces))
    assert composed == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
def main(reads_file, ref_file, dir, error_rate):
    """Print alignment scores of reads against randomly mutated reference windows.

    The longest record of ``ref_file`` is cut into consecutive 500-base
    windows; each window is kept with probability ~0.05 and gets
    ``error_rate`` percent of its positions (drawn with replacement)
    replaced through ``basic.rc``.  Reads from ``reads_file`` are aligned to
    the kept windows, and for every window with fewer than 150 alignments
    longer than 450 the ``accurateScore`` of each alignment is printed to
    stdout, up to 5000 scores in total.  Progress goes to stderr.

    :param reads_file: path to the FASTA file with reads.
    :param ref_file: path to the FASTA file with the reference.
    :param dir: working directory for the aligner (created if missing).
        The name shadows the builtin but is part of the public signature.
    :param error_rate: percentage of positions to mutate in each window.
    :raises IndexError: if the reference file contains no records.
    """
    sys.stderr.write("Reading reference\n")
    # Close the handle deterministically; list() consumes the parser first.
    with open(ref_file, "r") as ref_handle:
        records = list(SeqIO.parse_fasta(ref_handle))
    # Stable sort + [-1] keeps the last of equally long records, as before.
    longest = sorted(records, key=len)[-1]
    ref = Contig(longest.seq, longest.id)

    refs = ContigCollection()
    # int() keeps range() valid for float rates and under true division.
    errors_per_window = int(error_rate * 500 / 100)
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            window = list(ref.segment(i, i + 500).Seq())
            for _ in range(errors_per_window):
                pos = random.randint(0, 499)
                window[pos] = basic.rc[window[pos]]
            refs.add(Contig("".join(window),
                            ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)

    sys.stderr.write("Reading reads\n")
    reads = ReadCollection()
    # NOTE(review): assumes loadFromFasta reads the whole file before
    # returning, so the handle can be closed right after - confirm.
    with open(reads_file, "r") as reads_handle:
        reads.loadFromFasta(reads_handle)

    sys.stderr.write("Aligning reads\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)

    sys.stderr.write("Analysing alignments\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = [al for al in alignments if len(al) > 450]
    # groupby needs its input ordered by target contig.
    alignments = sorted(
        alignments,
        key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))

    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5

    cnt = 0
    for contig, group in itertools.groupby(
            alignments, key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) < 150:
            for al in group:
                # Single-argument call form prints the same text under the
                # Python 2 print statement and the print function.
                print(scorer.accurateScore(
                    al.matchingSequence(),
                    params.alignment_correction_radius))
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
def splitRepeat(aligner, seq, mult, all_reads_list, min_contig_length):
    """Try to split a repeat by running ``splitSegKmeans`` on windows of ``seq``.

    Consecutive non-overlapping windows of ``min_contig_length`` bases are
    tried from left to right, and the first non-None result is returned.  If
    no full window succeeds (or ``seq`` is shorter than one window), the
    final ``min(min_contig_length, len(seq))`` bases are tried.

    :param aligner: aligner forwarded to ``splitSegKmeans``.
    :param seq: repeat sequence.
    :param mult: forwarded to ``splitSegKmeans`` unchanged.
    :param all_reads_list: forwarded to ``splitSegKmeans`` unchanged.
    :param min_contig_length: window length; must be non-zero.
    :return: the first non-None ``splitSegKmeans`` result, otherwise the
        result for the trailing window (which may be None).
    """
    base = Contig(seq, "base")
    # Explicit floor division: range() needs an int and must not depend on
    # Python 2 `/` semantics.
    for i in range(len(seq) // min_contig_length):
        left = i * min_contig_length
        res = splitSegKmeans(
            aligner, base.segment(left, left + min_contig_length),
            mult, all_reads_list)
        if res is not None:
            return res
    tail = base.asSegment().suffix(length=min(min_contig_length, len(seq)))
    return splitSegKmeans(aligner, tail, mult, all_reads_list)
def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
    # type: (List[AlignmentPiece], int, float, int) -> Tuple[Contig, List[AlignmentPiece]]
    """Iteratively extend the end of the contig targeted by ``als``.

    In every round the read sequence following each still-relevant alignment
    is glued to a common artificial prefix, one such read is polished with
    all the others, and the polished sequence is appended to the contig up
    to the last position that is still sufficiently supported.

    :param als: alignments to a single contig (``als[0].seg_to.contig``);
        must be non-empty.
    :param min_cov: minimal number of relevant alignments needed to keep
        extending, and minimal support required at the cutoff position.
    :param min_cov_frac: the count of recent contradicting alignments must
        not exceed ``(1 - min_cov_frac) * support`` for the cutoff to move.
    :param max_extension: extension is stopped once the contig has grown by
        at least this many bases; ``None`` means 10000000000.
    :return: the extended contig and the alignments (still relevant ones
        followed by finished ones) re-targeted to it.
    """
    if max_extension is None:
        max_extension = 10000000000
    scorer = Scorer()
    contig = als[0].seg_to.contig
    max_len = max_extension + len(contig)
    sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
    # Work on a separate contig object built from the full segment.
    new_contig = contig.asSegment().asContig()
    # NOTE(review): `al.rc.seg_to.left < 100` presumably selects alignments
    # ending within 100 positions of the contig end - confirm `rc` semantics.
    relevant_als = [
        al.changeTargetContig(new_contig) for al in als
        if al.rc.seg_to.left < 100
    ]
    finished_als = []
    while True:
        # Keep alignments that touch the last 100 positions of the current
        # contig and whose `rc.seg_from.left` exceeds 100 (presumably: more
        # than 100 read bases remain past the alignment); retire the rest.
        tmp = []
        for al in relevant_als:
            if al.seg_to.inter(new_contig.asSegment().suffix(
                    length=100)) and al.rc.seg_from.left > 100:
                tmp.append(al)
            else:
                finished_als.append(al)
        relevant_als = tmp
        if len(relevant_als) < min_cov:
            break
        # Common prefix: fixed 8-mer, a random flank, then the tail of the
        # current contig (at most params.flanking_size bases).
        start = "ACGTTCGA" + basic.randomSequence(
            params.flanking_size) + new_contig.asSegment().suffix(
                length=min(params.flanking_size, len(new_contig))).Seq()
        # One artificial read per relevant alignment: the common prefix
        # followed by the read sequence after the alignment's right end.
        reduced_read_list = [
            AlignedRead.new(
                start + al.seg_from.contig.asSegment().suffix(
                    pos=al.seg_from.right).Seq(),
                str(i) + "_" + al.seg_from.contig.id)
            for i, al in enumerate(relevant_als)
        ]
        reduced_reads = ReadCollection(reduced_read_list)
        found = False
        for base_al in relevant_als:
            if base_al.rc.seg_from.left < params.flanking_size:
                continue
            # The polishing base is `start` followed by the part of this read
            # located right after its alignment, at most
            # max(params.window_size, params.k) bases long.
            base_segment = base_al.seg_from.contig.segment(
                base_al.seg_from.right,
                min(
                    len(base_al.seg_from.contig), base_al.seg_from.right +
                    max(params.window_size, params.k)))
            base = Contig(start + base_segment.Seq(), "base")
            # NOTE(review): clean() presumably drops alignments stored by a
            # previous attempt - confirm.
            for read in reduced_read_list:
                read.clean()
            polished_base = Contig(self.polish(reduced_reads, base),
                                   "polished_base")
            for al in self.aligner.localAlign(
                    reduced_reads, ContigStorage().addAll([polished_base])):
                reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
            # For every artificial read pick the alignment that starts at
            # position 0 of the polished base and reaches furthest right.
            candidate_alignments = []
            for read in reduced_read_list:
                candidate_alignments.append(None)
                for al in read.alignmentsTo(polished_base.asSegment()):
                    if al.seg_to.left == 0 and (
                        (candidate_alignments[-1] is None or
                         candidate_alignments[-1].seg_to.right <
                         al.seg_to.right)):
                        candidate_alignments[-1] = al
            trimmedAlignments = []
            for i, al in enumerate(candidate_alignments):
                assert al is not None, reduced_read_list[i]
                trimmedAlignments.append(al.trimByQuality(0.4, 100))
            # Sweep over alignment end positions from left to right.
            # `support` counts alignments not yet passed; `contra` collects
            # passed alignments flagged by contradictingRTCRight(), and
            # contra_index skips those ending more than 50 positions before
            # the current end.
            contra_index = 0
            contra = []
            support = len(trimmedAlignments)
            cutoff_pos = len(start)
            for al in sorted(trimmedAlignments,
                             key=lambda al: al.seg_to.right):
                while contra_index < len(contra) and contra[
                        contra_index].seg_to.right < al.seg_to.right - 50:
                    contra_index += 1
                if support >= min_cov and len(contra) - contra_index <= (
                        1 - min_cov_frac) * support:
                    cutoff_pos = al.seg_to.right
                    support -= 1
                    if al.contradictingRTCRight():
                        contra.append(al)
                else:
                    sys.stdout.trace("Stopped at:", support, contra_index,
                                     (1 - min_cov_frac) * support)
                    break
            sys.stdout.trace("Positions:",
                             [al.seg_to.right for al in trimmedAlignments])
            sys.stdout.trace("Contra:", contra)
            # Accept this base only if it extends the contig by > 100 bases.
            if cutoff_pos > len(start) + 100:
                sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                 "Extended for", cutoff_pos - len(start),
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
                found = True
                new_contig_candidate = Contig(
                    new_contig.seq + polished_base[len(start):cutoff_pos],
                    "candidate")
                # Maps the accepted part of the polished base onto the newly
                # appended suffix of the candidate contig.
                embedding = AlignmentPiece.Identical(
                    polished_base.segment(len(start), cutoff_pos),
                    new_contig_candidate.asSegment().suffix(
                        pos=len(new_contig)))
                # Maps the tail of each original read onto the same-length
                # tail of its artificial read (the part after `start`).
                read_mappings = []
                for al1, al2 in zip(candidate_alignments, relevant_als):
                    seg_from = al2.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    seg_to = al1.seg_from.contig.asSegment().suffix(
                        length=len(al1.seg_from.contig) - len(start))
                    read_mappings.append(
                        AlignmentPiece.Identical(seg_from, seg_to))
                # Compose original read -> artificial read -> polished base
                # -> candidate contig; None when the alignment barely passes
                # `start` or begins within 10 positions of the embedding end.
                embedded_alignments = []
                for al1, al2 in zip(candidate_alignments, read_mappings):
                    if al1.seg_to.right <= len(start) + 10:
                        embedded_alignments.append(None)
                    else:
                        tmp = al2.compose(al1)
                        if tmp.seg_to.left > embedding.seg_from.right - 10:
                            embedded_alignments.append(None)
                        else:
                            embedded_alignments.append(
                                tmp.compose(embedding))
                # Re-target the old alignments to the old-contig prefix of
                # the candidate contig.
                corrected_relevant_alignments = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in relevant_als
                ]
                relevant_als = []
                for al1, al2 in zip(corrected_relevant_alignments,
                                    embedded_alignments):
                    if al2 is None:
                        al = al1
                    else:
                        al = al1.mergeDistant(al2)
                        if al is None:
                            al = al1
                        elif al1.seg_from.dist(
                                al2.seg_from) >= 10 or al1.seg_to.dist(
                                    al2.seg_to) >= 10:
                            # The pieces were at least 10 apart on the read
                            # or the contig: re-polish the merged alignment.
                            al = scorer.polyshAlignment(
                                al, params.alignment_correction_radius)
                    relevant_als.append(al)
                finished_als = [
                    al.targetAsSegment(
                        new_contig_candidate.asSegment().prefix(
                            len(new_contig))) for al in finished_als
                ]
                new_contig = new_contig_candidate
                # One successful extension per round; restart the while loop.
                break
            else:
                sys.stdout.trace("Could not prolong with read", base_al,
                                 "Alignments:")
                sys.stdout.trace(map(str, reduced_read_list))
        if len(new_contig) >= max_len:
            break
        if not found:
            break
    return new_contig, relevant_als + finished_als
def testManual(self):
    """Check listing, reverse complement, coverage and lookup of an AlignmentStorage."""
    def dump(items):
        # All expectations compare the string form of a materialized list.
        return str(list(items))

    src = Contig("ACGTACGTACGT", "from")
    dst = Contig("ACGTACGTACGT", "to")
    al1 = AlignmentPiece.Identical(src.segment(0, 4), dst.segment(0, 4))
    al2 = AlignmentPiece.Identical(src.segment(0, 4), dst.segment(4, 8))
    al3 = AlignmentPiece.Identical(src.segment(4, 8), dst.segment(8, 12))
    storage = AlignmentStorage()
    storage.addAll([al1, al2, al3])

    initial_listing = "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"
    assert dump(storage) == initial_listing
    assert dump(storage.rc) == "[(-from[4:12-4]->-to[0:4]:1.000), (-from[8:12-0]->-to[4:12-4]:1.000), (-from[8:12-0]->-to[8:12-0]:1.000)]"
    assert dump(storage.calculateCoverage()) == "[(to[0:12-0], 1)]"
    assert dump(storage.filterByCoverage(0, 1)) == "[]"
    assert dump(storage.filterByCoverage(1, 2)) == "[to[0:12-0]]"
    assert dump(storage.filterByCoverage(2)) == "[]"

    # Adding al3 again through addAndMergeRight must leave the listing as is.
    storage.addAndMergeRight(al3)
    assert dump(storage) == initial_listing

    al4 = AlignmentPiece.Identical(src.segment(2, 8), dst.segment(2, 8))
    al5 = AlignmentPiece.Identical(src.segment(4, 10), dst.segment(4, 10))
    storage.addAll([al4, al5])
    assert dump(storage.calculateCoverage()) == "[(to[0:2], 1), (to[2:4], 2), (to[4:12-4], 3), (to[8:12-2], 2), (to[10:12-0], 1)]"
    assert dump(storage.filterByCoverage(2, 3)) == "[to[2:4], to[8:12-2]]"
    assert dump(storage.filterByCoverage(2)) == "[to[2:12-2]]"
    assert dump(storage.getAlignmentsTo(dst.segment(2, 3))) == "[(from[0:4]->to[0:4]:1.000), (from[2:12-4]->to[2:12-4]:1.000)]"
    assert dump(storage.getAlignmentsTo(dst.segment(2, 6))) == "[(from[2:12-4]->to[2:12-4]:1.000)]"
def testManual(self):
    """Check string form, merging and lookup of a SegmentStorage."""
    contig = Contig("ACGT", "test")
    storage = SegmentStorage()
    # Four adjacent unit-length segments covering the whole contig.
    for left in range(4):
        storage.add(contig.segment(left, left + 1))

    unmerged = "ReadStorage+:[test[0:1], test[1:2], test[2:4-1], test[3:4-0]]"
    assert str(storage) == unmerged, str(storage)
    assert str(storage.rc) == "ReadStorage-:[-test[0:1], -test[1:2], -test[2:4-1], -test[3:4-0]]", \
        str(storage.rc)

    # mergeSegments(1) is expected to leave the four segments untouched.
    storage.mergeSegments(1)
    assert str(storage) == unmerged, str(storage)

    # mergeSegments() with default arguments collapses them into one.
    storage.mergeSegments()
    assert str(storage) == "ReadStorage+:[test[0:4-0]]", str(storage)
    assert str(storage.rc) == "ReadStorage-:[-test[0:4-0]]", str(storage.rc)

    contig = Contig("ACGTACGTACGTACGT", "test")
    storage = SegmentStorage()
    storage.add(contig.segment(0, 5))
    storage.add(contig.segment(10, 15))
    # (query bounds, bounds of the stored segment find() must return)
    lookups = (
        ((5, 10), (0, 5)),
        ((6, 10), (10, 15)),
        ((5, 9), (0, 5)),
        ((0, 16), (0, 5)),
    )
    for (q_left, q_right), (e_left, e_right) in lookups:
        assert storage.find(contig.segment(q_left, q_right)) == \
            contig.segment(e_left, e_right), \
            str(storage.find(contig.segment(q_left, q_right)))