Beispiel #1
0
 def testManual(self):
     """Exercise GlueOverlappingAlignments and reduce() on hand-built alignments."""
     query = Contig("ACGTACGTA", "from")
     target = Contig("ACTACGTACGTACAT", "to")
     head = AlignmentPiece(query.asSegment(), target.segment(0, 8), "2M1I6M")
     tail = AlignmentPiece(query.segment(0, 8), target.segment(7, 15), "8M")

     # Gluing the two pieces must yield one alignment with the expected cigar
     # and the expected query-side sequence.
     glued = AlignmentPiece.GlueOverlappingAlignments([head, tail])
     assert glued.cigar == "2M1I5M8M", str(glued) + " " + glued.cigar
     glued_query_seq = glued.seg_from.Seq()
     assert glued_query_seq == "ACGTACGTACGTACGT", str(glued) + " " + glued.cigar

     # Reducing the first piece to a query prefix ending at `right` must give
     # the listed cigar.
     for right, expected_cigar in ((2, "2M"), (3, "2M"), (4, "2M1I1M")):
         reduced = head.reduce(query=query.segment(0, right))
         assert reduced.cigar == expected_cigar
Beispiel #2
0
 def polishSmallSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
     """Polish the sequence under ``seg`` with the reads aligned to it.

     The segment sequence is wrapped in two random 200-nucleotide flanks and
     polished with the read pieces that align to ``seg``. A read piece gets
     the left/right flank attached when its reduced alignment reaches within
     20 positions of the corresponding border of ``seg``.

     :param seg: segment whose sequence should be polished.
     :param als: alignments of reads to the contig of ``seg``.
     :return: an alignment from the polished sequence to ``seg``. When no
         alignment in ``als`` covers ``seg`` entirely, an identical alignment
         of a copy of ``seg`` to ``seg`` is returned instead.
     :raises AssertionError: if the polished sequence cannot be aligned back
         to the flanked base sequence.
     """
     if not any(al.seg_to.contains(seg) for al in als):
         sys.stdout.log(common.log_params.LogPriority.warning, "Warning",
                        seg, "has no covering reads")
         return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)

     # Random flanks are drawn in this order (start, then end) on purpose:
     # swapping them would change the consumed random stream.
     start = basic.randomSequence(200)
     end = basic.randomSequence(200)
     reads = []
     for al in als:
         reduced = al.reduce(target=seg)
         new_seq = ""
         if reduced.seg_to.left < seg.left + 20:
             new_seq += start
         new_seq += reduced.seg_from.Seq()
         if reduced.seg_to.right > seg.right - 20:
             new_seq += end
         reads.append(NamedSequence(new_seq, reduced.seg_from.contig.id))

     base = Contig(start + seg.Seq() + end, "base")
     polished = None
     try:
         polished = Contig(self.polish(reads, base), "polished")
     except PolishException:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "has a sequence very different from reads. Using reads to correct."
         )
         # Fall back to using a read that covers the whole segment as the
         # polishing base; try covering reads one by one until one works.
         for al, read in zip(als, reads):
             if not al.seg_to.contains(seg):
                 continue
             try:
                 polished = Contig(
                     self.polish(reads, Contig(read.seq, read.id)),
                     "polished")
             except PolishException:
                 # This read is not a usable base either; try the next one.
                 continue
             break
     if polished is None:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "could not be corrected even though some reads cover it.")
         polished = seg.asContig()

     als = list(self.aligner.overlapAlign([polished],
                                          ContigStorage([base])))
     # Maps the un-flanked core of `base` back onto the original segment.
     core_to_seg = AlignmentPiece.Identical(
         base.segment(len(start), len(base) - len(end)), seg)
     for al in als:
         # Accept only an alignment that starts within 10 positions of the
         # polished contig's start on both the forward and rc views.
         if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
             return al.compose(core_to_seg)
     # Explicit raise instead of `assert False`, which is stripped under -O
     # and would make this method silently return None.
     raise AssertionError("No alignment from polished to base: " + str(als))
Beispiel #3
0
 def testManual(self):
     """Check Correction position mapping and composeQueryDifferences on two short contigs."""
     first = Contig("ACGTAAAAGGGTACGT", "c1")
     second = Contig("ACGTAAGGGGGTACGT", "c2")
     seed = AlignmentPiece.Identical(first.segment(5, 12),
                                     second.segment(5, 12))
     al = self.scorer.polyshAlignment(seed,
                                      params.alignment_correction_radius)
     corr = Correction(first, second, [al])

     expected_up = [0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15]
     expected_down = [0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15]
     assert corr.mapPositionsUp(range(len(second))) == expected_up
     assert corr.mapPositionsDown(range(len(first))) == expected_down

     # Identity alignments over several segments of the second contig.
     bounds = ((0, 4), (6, 8), (6, 16), (7, 16))
     query_als = [
         AlignmentPiece.Identical(second.segment(left, right))
         for left, right in bounds
     ]
     composed = corr.composeQueryDifferences(query_als)
     assert str(
         composed
     ) == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
def main(reads_file, ref_file, dir, error_rate):
    """Align reads to mutated reference windows and print alignment scores.

    The longest record of ``ref_file`` is cut into 500-nucleotide windows.
    Each window is kept when ``random.random() > 0.95``; a kept window gets
    ``error_rate * 500 // 100`` randomly chosen positions replaced through
    ``basic.rc``. Reads from ``reads_file`` are aligned to the kept windows,
    alignments longer than 450 are grouped by target contig, and for every
    group with fewer than 150 alignments the accurate score of each
    alignment is printed to stdout, up to 5000 scores in total. Progress
    messages go to stderr.

    :param reads_file: path to the FASTA file with reads.
    :param ref_file: path to the FASTA file with the reference.
    :param dir: working directory handed to the aligner (created if missing).
    :param error_rate: substitutions per 100 nucleotides in a kept window.
    """
    sys.stderr.write("Reading reference" + "\n")
    # Materialize the records inside the `with` so the handle is closed
    # deterministically instead of being leaked.
    with open(ref_file, "r") as ref_handle:
        ref_records = list(SeqIO.parse_fasta(ref_handle))
    ref = sorted(ref_records, key=len)[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    # Floor division plus int() keeps the count a valid range() argument
    # whether error_rate is an int or a float, and on Python 2 or 3.
    errors_per_window = int(error_rate * 500 // 100)
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for _ in range(errors_per_window):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    # NOTE(review): assumes loadFromFasta consumes the handle eagerly, so it
    # is safe to close it right after the call -- confirm.
    with open(reads_file, "r") as reads_handle:
        reads.loadFromFasta(reads_handle)

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = [al for al in alignments if len(al) > 450]
    # groupby below only groups adjacent items, so sort by target first.
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, group in itertools.groupby(alignments,
                                           key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) < 150:
            for al in group:
                # Single-argument print(...) behaves the same on Python 2
                # and Python 3.
                print(scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius))
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
Beispiel #5
0
def splitRepeat(aligner, seq, mult, all_reads_list, min_contig_length):
    """Try to split a repeat by running splitSegKmeans on successive windows.

    ``seq`` is scanned left to right in consecutive, non-overlapping windows
    of ``min_contig_length``; the first window for which ``splitSegKmeans``
    returns something other than ``None`` decides the result. If no window
    succeeds, the suffix of length ``min(min_contig_length, len(seq))`` is
    tried as a last attempt.

    :param aligner: aligner passed through to ``splitSegKmeans``.
    :param seq: sequence of the repeat.
    :param mult: passed through to ``splitSegKmeans`` unchanged.
    :param all_reads_list: passed through to ``splitSegKmeans`` unchanged.
    :param min_contig_length: window length used for scanning.
    :return: the first non-``None`` result of ``splitSegKmeans``, otherwise
        whatever the final suffix attempt returns (possibly ``None``).
    """
    base = Contig(seq, "base")
    # Explicit floor division: same value as Python 2 integer "/", and still
    # an int (hence a valid range() argument) where "/" is true division.
    for i in range(len(seq) // min_contig_length):
        left = i * min_contig_length
        res = splitSegKmeans(aligner,
                             base.segment(left, left + min_contig_length),
                             mult, all_reads_list)
        if res is not None:
            return res
    tail = base.asSegment().suffix(length=min(min_contig_length, len(seq)))
    return splitSegKmeans(aligner, tail, mult, all_reads_list)
Beispiel #6
0
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, int, int) -> Tuple[Contig, List[AlignmentPiece]]
     """Iteratively extend the right end of a contig using overhanging reads.

     The contig is taken from ``als[0].seg_to.contig``. In every round the
     reads that still overhang the current end are cut to their overhanging
     part, prefixed with a common artificial anchor, polished against a base
     built from one of them, and the consensus is appended to the contig up
     to a cutoff position chosen from read support.

     :param als: alignments of reads to the contig; must be non-empty since
         ``als[0]`` is read unconditionally.
     :param min_cov: minimal number of active alignments needed to keep
         extending, and minimal remaining support while choosing the cutoff.
     :param min_cov_frac: used as ``(1 - min_cov_frac) * support`` to bound
         how many contradicting alignments are tolerated at the cutoff.
     :param max_extension: the loop stops once the contig has grown by at
         least this much; ``None`` is replaced by 10000000000.
     :return: tuple of the extended contig and the list of alignments
         (still-active ones followed by finished ones) retargeted to it.
     """
     if max_extension is None:
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     # Work on a fresh contig object built from the full segment of the original.
     new_contig = contig.asSegment().asContig()
     # Keep only alignments with rc.seg_to.left < 100 -- presumably those
     # ending within 100 positions of the contig's right end (TODO confirm).
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         # Split alignments into those that can still extend the contig
         # (they intersect its last 100 positions and have rc.seg_from.left
         # > 100) and those that are finished.
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         # Common anchor put in front of every overhang: a fixed 8-mer, a
         # random flank of params.flanking_size and the current contig tail
         # (at most params.flanking_size long).
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         # One artificial read per active alignment: anchor + the part of the
         # read that follows the end of its alignment.
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         # Try each active read as the polishing base until one of them
         # yields a sufficiently long extension.
         for base_al in relevant_als:
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # The base is the anchor `start` followed by the piece of this
             # read that comes right after its alignment end, at most
             # max(params.window_size, params.k) long.
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             # Reset the artificial reads before re-aligning them to a new base.
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             # For every artificial read pick the alignment that starts at
             # position 0 of the polished base and reaches furthest right.
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 # Every artificial read is expected to have such an alignment.
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             # Choose the cutoff: walk alignments by increasing right end.
             # `support` counts alignments not yet passed; `contra` collects
             # passed alignments flagged by contradictingRTCRight(), and
             # `contra_index` skips those ending more than 50 positions
             # before the current one.
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             # Accept the extension only if it adds more than 100 positions
             # beyond the anchor.
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 # Identity mapping from the accepted part of the polished
                 # base onto the newly appended suffix of the candidate.
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 # Identity mappings from each original read's suffix onto
                 # the non-anchor part of its artificial read.
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 # Compose read -> artificial read -> polished base ->
                 # candidate contig; None marks reads whose alignment does
                 # not reach usefully into the accepted extension.
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         # NOTE: `tmp` is reused here as a scratch alignment;
                         # the list built at the top of the round was already
                         # stored in relevant_als.
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 # Retarget the existing alignments onto the old-contig
                 # prefix of the candidate.
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 # Merge each old alignment with its extension part; fall
                 # back to the old alignment when there is no extension or
                 # the merge fails, and re-polish the merged alignment when
                 # the two parts are at least 10 apart on either side.
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         al = al1.mergeDistant(al2)
                         if al is None:
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 # relevant_als was rebuilt above, so stop iterating over
                 # base reads and start a new round.
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         # Stop when the extension limit is reached or no base read managed
         # to prolong the contig in this round.
         if len(new_contig) >= max_len:
             break
         if not found:
             break
     return new_contig, relevant_als + finished_als
Beispiel #7
0
 def testManual(self):
     """Check listing, coverage and lookup of AlignmentStorage on tiny contigs."""

     def shown(items):
         # All expectations compare the string form of a materialized list.
         return str(list(items))

     query = Contig("ACGTACGTACGT", "from")
     target = Contig("ACGTACGTACGT", "to")

     def identical(q_left, q_right, t_left, t_right):
         return AlignmentPiece.Identical(query.segment(q_left, q_right),
                                         target.segment(t_left, t_right))

     first = identical(0, 4, 0, 4)
     second = identical(0, 4, 4, 8)
     third = identical(4, 8, 8, 12)
     storage = AlignmentStorage()
     storage.addAll([first, second, third])

     initial_listing = "[(from[0:4]->to[0:4]:1.000), (from[0:4]->to[4:12-4]:1.000), (from[4:12-4]->to[8:12-0]:1.000)]"
     assert shown(storage) == initial_listing
     assert shown(
         storage.rc
     ) == "[(-from[4:12-4]->-to[0:4]:1.000), (-from[8:12-0]->-to[4:12-4]:1.000), (-from[8:12-0]->-to[8:12-0]:1.000)]"
     assert shown(storage.calculateCoverage()) == "[(to[0:12-0], 1)]"
     assert shown(storage.filterByCoverage(0, 1)) == "[]"
     assert shown(storage.filterByCoverage(1, 2)) == "[to[0:12-0]]"
     assert shown(storage.filterByCoverage(2)) == "[]"

     # Adding an alignment that is already stored must leave the listing as is.
     storage.addAndMergeRight(third)
     assert shown(storage) == initial_listing

     fourth = identical(2, 8, 2, 8)
     fifth = identical(4, 10, 4, 10)
     storage.addAll([fourth, fifth])
     assert shown(
         storage.calculateCoverage()
     ) == "[(to[0:2], 1), (to[2:4], 2), (to[4:12-4], 3), (to[8:12-2], 2), (to[10:12-0], 1)]"
     assert shown(storage.filterByCoverage(2, 3)) == "[to[2:4], to[8:12-2]]"
     assert shown(storage.filterByCoverage(2)) == "[to[2:12-2]]"
     assert shown(
         storage.getAlignmentsTo(target.segment(2, 3))
     ) == "[(from[0:4]->to[0:4]:1.000), (from[2:12-4]->to[2:12-4]:1.000)]"
     assert shown(storage.getAlignmentsTo(target.segment(
         2, 6))) == "[(from[2:12-4]->to[2:12-4]:1.000)]"
Beispiel #8
0
 def testManual(self):
     """Check string form, merging and find() of SegmentStorage."""
     contig = Contig("ACGT", "test")
     storage = SegmentStorage()
     # Four adjacent single-position segments covering the whole contig.
     for left in range(4):
         storage.add(contig.segment(left, left + 1))

     unmerged = "ReadStorage+:[test[0:1], test[1:2], test[2:4-1], test[3:4-0]]"
     unmerged_rc = "ReadStorage-:[-test[0:1], -test[1:2], -test[2:4-1], -test[3:4-0]]"
     assert str(storage) == unmerged, str(storage)
     assert str(storage.rc) == unmerged_rc, str(storage.rc)

     # mergeSegments(1) is expected to leave the storage unchanged ...
     storage.mergeSegments(1)
     assert str(storage) == unmerged, str(storage)
     # ... while the default call collapses everything into one segment.
     storage.mergeSegments()
     assert str(storage) == "ReadStorage+:[test[0:4-0]]", str(storage)
     assert str(storage.rc) == "ReadStorage-:[-test[0:4-0]]", str(
         storage.rc)

     contig = Contig("ACGTACGTACGTACGT", "test")
     storage = SegmentStorage()
     storage.add(contig.segment(0, 5))
     storage.add(contig.segment(10, 15))
     # (query bounds, bounds of the stored segment find() must return)
     lookups = (
         ((5, 10), (0, 5)),
         ((6, 10), (10, 15)),
         ((5, 9), (0, 5)),
         ((0, 16), (0, 5)),
     )
     for query_bounds, expected_bounds in lookups:
         assert storage.find(
             contig.segment(*query_bounds)) == contig.segment(
                 *expected_bounds), str(
                     storage.find(contig.segment(*query_bounds)))