Example #1
 def polishAndAnalyse(self, reads, polishing_base, reliable_start = None):
     # type: (ReadCollection, Contig, Optional[int]) -> Consensus
     if reliable_start is None:
         reliable_start = len(polishing_base)
     seq = Contig(self.polish(reads, polishing_base), "contig")
     res = [0] * (len(seq) + 1)
     alignment = ReadCollection().extendClean(reads)
     self.aligner.alignReadCollection(alignment, [seq])
     contra = 0
     ok = 0
     late = 0
     for read in alignment:
         for al in read.alignmentsTo(seq.asSegment()):# type: AlignmentPiece
             if al.contradicting(seq.asSegment()):
                 contra += 1
             elif al.seg_to.left > reliable_start:
                 late += 1
             else:
                 res[al.seg_to.left] += 1
                 res[al.seg_to.right] -= 1
                 ok += 1
     for i in range(1, len(res)):
         res[i] += res[i - 1]
     sys.stdout.trace("Polyshed and analysed using", len(alignment), "reads. Ok:", ok, "late:", late, "contra:", contra)
     # if contra > 10 or contra > ok / 2:
     #     for read in alignment:
     #         print read
     #         for al in read.alignmentsTo(seq.asSegment()):
     #             if al.contradictingRTC(seq.asSegment()):
     #                 print "contra_al:", al
     #             elif al.seg_to.left > reliable_start:
     #                 print "late_al:", al
     #             else:
     #                 print "ok_al:", al
     return Consensus(seq.seq, res)
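
Note: the res array above is a difference array: each consistent alignment adds +1 at its start and -1 just past its end, and the prefix-sum loop then turns these deltas into per-position read coverage. A minimal self-contained sketch of the same trick, assuming plain integer intervals instead of AlignmentPiece objects:

    def coverage(intervals, length):
        res = [0] * (length + 1)
        for left, right in intervals:
            res[left] += 1      # coverage rises at the interval start
            res[right] -= 1     # and falls just past the interval end
        for i in range(1, len(res)):
            res[i] += res[i - 1]  # prefix sum turns deltas into depth
        return res[:length]

    # coverage([(0, 4), (2, 6)], 8) == [1, 1, 2, 2, 1, 1, 0, 0]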
Example #2
 def __init__(self, seq, id, extension_handler, rc = None):
     # type: (str, str, ExtensionHandler, Optional[NewLine]) -> None
     self.extensionHandler = extension_handler
     self.seq = seq
     self.id = id # type: str
     self.circular = False
     self.name_printer = None
     self.max_extension = False
     if rc is None:
         self.initial = AlignmentStorage()
         self.correct_segments = SegmentStorage()
         self.completely_resolved = SegmentStorage()
         self.disjointig_alignments = AlignmentStorage()
         self.read_alignments = ReadAlignmentStorage()
         self.listeners = [self.initial, self.correct_segments, self.completely_resolved, self.disjointig_alignments, self.read_alignments, extension_handler] # type: List[LineListener]
         rc = NewLine(basic.RC(seq), basic.Reverse(self.id), extension_handler.rc, self) #type: NewLine
         self.rc = rc
         self.addListener(ReadAlignmentListener(self))
         # self.initial.add(AlignmentPiece.Identical(self.asSegment().asContig().asSegment(), self.asSegment()))
     else:
         self.initial = rc.initial.rc # type: AlignmentStorage
         self.correct_segments = rc.correct_segments.rc # type: SegmentStorage
         self.completely_resolved = rc.completely_resolved.rc # type: SegmentStorage
         self.disjointig_alignments = rc.disjointig_alignments.rc # type: AlignmentStorage
         self.read_alignments = rc.read_alignments.rc # type: ReadAlignmentStorage
         self.listeners = [listener.rc for listener in rc.listeners] # type: List[LineListener]
     Contig.__init__(self, seq, id, rc)
     self.rc = rc #type: NewLine
     self.knot = None # type: Knot
Example #3
def align(dir, contigs_file):
    CreateLog(dir)
    contigs = list(SeqIO.parse_fasta(open(contigs_file, "r")))
    assert len(contigs) == 2
    contigs = [
        Contig(contigs[0].seq, contigs[0].id),
        Contig(contigs[1].seq, contigs[1].id)
    ]
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    als = iter_align(aligner, contigs[0], contigs[1])
    printVar(os.path.join(dir, "diff.txt"), als)
    for al in als:
        print al
Example #4
 def __init__(self, seq, id, rc=None):
     # type: (str, str, Optional[Disjointig]) -> None
     self.seq = seq
     self.id = id
     if rc is None:
         self.read_alignments = AlignmentStorage()  # type: AlignmentStorage
         rc = Disjointig(basic.RC(seq), basic.Reverse(id),
                         self)  # type: Disjointig
         self.rc = rc
     else:
         self.rc = rc
         self.read_alignments = self.rc.read_alignments.rc  # type: AlignmentStorage
     Contig.__init__(self, seq, id, rc)
     self.rc = rc  # type:Disjointig
Example #5
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))),
                 key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, group in itertools.groupby(alignments,
                                           key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) < 150:
            for al in group:
                print scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
Example #6
 def testManual(self):
     contig1 = Contig("ACGTACGTA", "from")
     contig2 = Contig("ACTACGTACGTACAT", "to")
     al1 = AlignmentPiece(contig1.asSegment(), contig2.segment(0, 8),
                          "2M1I6M")
     al2 = AlignmentPiece(contig1.segment(0, 8), contig2.segment(7, 15),
                          "8M")
     glued = AlignmentPiece.GlueOverlappingAlignments([al1, al2])
     assert glued.cigar == "2M1I5M8M", str(glued) + " " + glued.cigar
     assert glued.seg_from.Seq() == "ACGTACGTACGTACGT", str(glued) + " " + glued.cigar
     assert al1.reduce(query=contig1.segment(0, 2)).cigar == "2M"
     assert al1.reduce(query=contig1.segment(0, 3)).cigar == "2M"
     assert al1.reduce(query=contig1.segment(0, 4)).cigar == "2M1I1M"
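
The glued CIGAR above follows standard SAM semantics: M consumes both sequences, I only the query, D only the target. A hedged sketch (not the repo's API) that computes the spans implied by a CIGAR string:

    import re

    def cigar_spans(cigar):
        query, target = 0, 0
        for n, op in re.findall(r"(\d+)([MID])", cigar):
            n = int(n)
            if op in "MI":
                query += n   # M and I consume query bases
            if op in "MD":
                target += n  # M and D consume target bases
        return query, target

    # cigar_spans("2M1I6M") == (9, 8): al1 maps all 9bp of contig1 onto contig2[0:8]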
Example #7
def splitRepeat(aligner, seq, mult, all_reads_list, min_contig_length):
    base = Contig(seq, "base")
    for i in range(len(seq) / min_contig_length):
        res = splitSegKmeans(
            aligner,
            base.segment(i * min_contig_length,
                         i * min_contig_length + min_contig_length), mult,
            all_reads_list)
        if res is not None:
            return res
    res = splitSegKmeans(
        aligner,
        base.asSegment().suffix(length=min(min_contig_length, len(seq))), mult,
        all_reads_list)
    return res
Example #8
 def constructCorrection(alignments):
     # type: (List[AlignmentPiece]) -> Correction
     initial = alignments[0].seg_to.contig
     alignments = sorted(alignments, key=lambda al: al.seg_to.left)
     sb = []
     pos = initial.left()
     new_pos = 0
     for al in alignments:
         sb.append(initial.subSequence(pos, al.seg_to.left).seq)
         new_pos += al.seg_to.left - pos
         pos = al.seg_to.left
         sb.append(al.seg_from.Seq())
         new_pos += al.seg_from.__len__()
         pos = al.seg_to.right
     sb.append(
         initial.segment(alignments[-1].seg_to.right,
                         initial.right()).Seq())
     new_pos += initial.right() - alignments[-1].seg_to.right
     new_seq = Contig("".join(sb), "TMP1_" + initial.id)
     new_als = []
     pos = initial.left()
     new_pos = 0
     for al in alignments:
         new_pos += al.seg_to.left - pos
         new_seg_from = Segment(new_seq, new_pos,
                                new_pos + al.seg_from.__len__())
         new_als.append(al.changeQuerySegment(new_seg_from))
         pos = al.seg_to.right
         new_pos += al.seg_from.__len__()
     return Correction(new_seq, initial, new_als)
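
constructCorrection splices the query sequences of the alignments into the initial contig, keeping the untouched stretches in between. A minimal sketch of that splice, assuming sorted, non-overlapping (start, end, replacement) patches on a plain string:

    def splice(seq, patches):
        sb, pos = [], 0
        for start, end, rep in patches:
            sb.append(seq[pos:start])  # untouched stretch before the patch
            sb.append(rep)             # replacement takes the interval's place
            pos = end
        sb.append(seq[pos:])
        return "".join(sb)

    # splice("AAAATTTT", [(2, 4, "G")]) == "AAGTTTT"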
Example #9
def recruit(seqs, reads, k, dir):
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    relevant_reads = ContigStorage()
    disjointigs = seqs
    for i in range(2):
        sys.stdout.info("Recruiting iteration", i)
        als = filter(lambda al: len(al) > k, aligner.localAlign(reads, disjointigs))
        print len(als), "alignments"
        relevant_reads = alsToReads(als)
        l = sum(map(len, seqs.unique()))
        disjointigs = constructDisjointigs(relevant_reads, l, dd.nextDir())
        print len(disjointigs), "disjointigs"
        print disjointigs
    disjointigs.writeToFasta(open(os.path.join(dir, "disjointigs.fasta"), "w"))
    relevant_reads.writeToFasta(open(os.path.join(dir, "reads.fasta"), "w"))
    sys.stdout.info("Aligning repeat sequences to disjointigs")
    als = list(aligner.localAlign(seqs, disjointigs))
    print "\n".join(map(str, als))
    starts = dict()
    for dis in disjointigs:
        starts[dis.id] = len(dis)
    for al in als:
        if len(al) > k:
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
            al = al.rc
            starts[al.seg_to.contig.id] = min(starts[al.seg_to.contig.id], al.seg_to.left)
    print "Starts:"
    for cid, val in starts.items():
        print cid, val
    contigs = ContigStorage()
    cnt = 1
    for dis in disjointigs:
        if starts[dis.id] > k and starts[dis.id] < len(dis):
            print cnt, dis.id, starts[dis.id]
            contigs.add(Contig(dis.prefix(starts[dis.id]).Seq(), str(cnt)))
            cnt += 1
    for dis in disjointigs.unique():
        if len(dis) > k and starts[dis.id] == len(dis):
            print cnt, dis.id
            contigs.add(Contig(dis.seq, str(cnt)))
            cnt += 1
    contigs.writeToFasta(open(os.path.join(dir, "contigs.fasta"), "w"))
    fakeGraph(contigs, open(os.path.join(dir, "graph.gv"), "w"))
Example #10
 def testManual(self):
     contig1 = Contig("ACGTACGTACGT", "c1")
     contig2 = Contig("ACGTAGGTACGT", "c2")
     contig3 = Contig("ACTTACGTACGT", "c3")
     al1 = AlignmentPiece.Identical(contig1.asSegment(),
                                    contig2.asSegment())
     al2 = AlignmentPiece.Identical(contig2.asSegment(),
                                    contig3.asSegment())
     al3 = al1.compose(al2)
     assert al3.__repr__() == "(c1[0:12-0]->c3[0:12-0]:0.92)"
     assert al3.cigar == "12M"
     al4 = al1.reverse()
     al5 = al4.composeTargetDifference(al2)
     assert al5.__repr__() == "(c1[0:12-0]->c3[0:12-0]:0.92)"
     assert al5.cigar == "12M"
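
compose chains two alignments that share a middle contig: a position on c1 is first mapped to c2 via al1, then on to c3 via al2. A toy sketch of the same idea with alignments reduced to plain position maps (assumption: the real AlignmentPiece also interpolates around indels):

    def compose(map_ab, map_bc):
        # position i on A maps to map_bc[map_ab[i]] on C where both maps are defined
        return dict((i, map_bc[j]) for i, j in map_ab.items() if j in map_bc)

    a_to_b = dict((i, i) for i in range(12))
    b_to_c = dict((i, i) for i in range(12))
    assert compose(a_to_b, b_to_c) == dict((i, i) for i in range(12))  # the 12M case above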
Example #11
 def alignReadsToSegments(self, reads, segments):
     # type: (ReadCollection, Iterable[Segment]) -> None
     segments = list(segments)
     seg_dict = dict()
     for i, seg in enumerate(segments):
         seg_dict[str(i + 1)] = seg
     contigs = map(lambda (i, seg): Contig(seg.Seq(), str(i + 1)), enumerate(segments))
     read_collection = ReadCollection().extendClean(reads)
     self.alignReadCollection(read_collection, ContigCollection(contigs))
     read_collection.contigsAsSegments(seg_dict)
     reads.mergeAlignments(read_collection)
Example #12
 def cutRight(self, pos):
     sys.stdout.trace("Line operation Cut:", self, pos)
     assert pos > 0 and pos <= len(self)
     cut_length = len(self) - pos
     if cut_length == 0:
         return
     new_seq = Contig(self.seq[:pos], "TMP3_" + self.id)
     self.notifyBeforeCutRight(new_seq, pos)
     self.seq = self.seq[:-cut_length]
     self.rc.seq = self.rc.seq[cut_length:]
     self.notifyAfterCutRight(pos)
Example #13
 def extendRight(self, seq, relevant_als=None):
     # type: (str, Optional[List[AlignmentPiece]]) -> None
     sys.stdout.trace("Line operation Extend:", self, len(seq),
                      relevant_als)
     assert self.knot is None
     if relevant_als is None:
         relevant_als = []
     new_seq = Contig(self.seq + seq, "TMP2_" + self.id)
     self.notifyBeforeExtendRight(new_seq, seq)
     self.seq = self.seq + seq
     self.rc.seq = basic.RC(seq) + self.rc.seq
     self.notifyAfterExtendRight(seq, relevant_als)
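
extendRight keeps a line and its reverse complement in sync: appending seq on the right means prepending basic.RC(seq) on the rc twin. A small sketch of that invariant, assuming basic.RC behaves like the standard reverse-complement helper below:

    COMP = {"A": "T", "T": "A", "G": "C", "C": "G"}

    def RC(seq):
        return "".join(COMP[c] for c in reversed(seq))

    seq, ext = "ACGT", "GG"
    assert RC(seq + ext) == RC(ext) + RC(seq)  # why rc.seq gets RC(seq) prepended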
Example #14
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
    sys.stdout.info("Extending short lines")
    short_contigs = ContigStorage()
    als = dict() # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) < params.k + 500:
            short_contigs.add(contig)
            als[contig.id] = []
            als[contig.rc.id] = []

    if read_dump is not None:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            for al in aligner.overlapAlign(relevant_reads[contig.id], ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            if al.seg_to.left <= 20 and al.rc.seg_to.left <= 20:
                added = False
                for i, al1 in enumerate(als[al.seg_to.contig.id]):
                    if al1.seg_from.contig.id == al.seg_from.contig.id:
                        added = True
                        if al.percentIdentity() > al1.percentIdentity():
                            als[al.seg_to.contig.id][i] = al
                            als[al.seg_to.contig.rc.id][i] = al.rc
                        break
                if not added:
                    als[al.seg_to.contig.id].append(al)
                    als[al.seg_to.contig.rc.id].append(al.rc)
    for contig in short_contigs.unique():
        if len(als[contig.id]) > 0:
            tmp_contig, new_als = polisher.polishEnd(als[contig.id], params.reliable_coverage, max_extension=params.l - len(contig))
            r = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd([al.rc for al in new_als], params.reliable_coverage, max_extension=params.l - len(contig))
            l = len(tmp_contig) - len(contig) - r
        else:
            tmp_contig, new_als = contig, als[contig.id]
            l = 0
            r = 0
#        if l > params.k / 2 and r > params.k / 2:
#            tmp_contig.seq = tmp_contig.seq[l - params.k / 2:-r + params.k / 2]
#        else:
#            tmp_contig.seq = tmp_contig.seq[max(0, l - params.k):-max(1, r - params.k)]
        if len(tmp_contig) > params.k + 500:
            sys.stdout.info("Prolonged contig", contig.id, "for", l, "and", r, "nucleotides from left and right")
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)
Example #15
 def test5(self):
     dataset = TestDataset("abcABC")
     name1 = dataset.addContig("abc")
     name2 = dataset.addContig("ABC")
     lines, dp, reads = dataset.genAll(self.aligner)
     line = lines[name1]
     sa = dataset.alphabet["a"].seq
     sb = dataset.alphabet["b"].seq
     tmp = Contig(
         sa +
         "ACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGAACGACAGTAACTTGA"
         + sb, "tmp")
     al1 = AlignmentPiece.Identical(tmp.prefix(len=len(sa)),
                                    line.prefix(len=len(sa)))
     al2 = AlignmentPiece.Identical(
         tmp.asSegment().suffix(length=len(sb)),
         line.segment(len(sa),
                      len(sa) + len(sb)))
     al = AlignmentPiece.MergeFittingAlignments([al1, al2])
     line.correctSequence([al])
     assert str(
         list(dp.allInter(line.asSegment()))
     ) == "[(C0_abc[0:1755-0]->C0_abc[0:1755-0]:1.000), (C1_ABC[0:1652-0]->C0_abc[0:1755-0]:0.94)]"
Example #16
 def test1(self):
     lines = NewLineStorage(DisjointigCollection(), self.aligner)
     line1 = lines.addNew("ACGTAAAAGGGTACGT", "c1")
     line2 = lines.addNew("ACGTAAGGGGGTACGT", "c2")
     al = self.scorer.polyshAlignment(
         AlignmentPiece.Identical(line1.asSegment(), line2.asSegment()),
         params.alignment_correction_radius)
     dp = LineDotPlot(lines, self.aligner)
     dp.addAlignment(al)
     alignment = AlignmentPiece.Identical(
         Contig("AGG", "tmp").asSegment(), line2.segment(0, 3))
     line2.correctSequence([alignment])
     assert str(list(dp.alignmentsToFrom[line2.id][
         line1.id])) == "[(c1[0:16-0]->c2[0:16-0]:0.81)]"
Example #17
 def __init__(self,
              genome="",
              letter_size=550,
              error_rate=0.05,
              mutation_rate=0.005,
              seed=0):
     random.seed(seed)
     self.reads = []  # type: List[NamedSequence]
     self.disjointigs = []  # type: List[NamedSequence]
     self.contigs = []  # type: List[NamedSequence]
     self.letter_size = letter_size
     self.error_rate = error_rate
     self.mutation_rate = mutation_rate
     self.alphabet = ContigStorage()
     self.matches = dict()
     for c1, c2 in zip(ascii_lowercase, ascii_uppercase):
         seq = self.generate(self.letter_size)
         self.alphabet.add(Contig(seq, c1))
         seq, matches = self.mutate(seq, self.mutation_rate)
         self.alphabet.add(Contig(seq, c2))
         self.matches[c1] = matches
         self.matches[c2] = [(b, a) for a, b in matches]
     self.genome = Contig(self.translate(genome), genome)
Example #18
 def test3(self):
     lines = NewLineStorage(DisjointigCollection(), self.aligner)
     line = lines.addNew("ACGTACGTACGT", "c")
     dp = LineDotPlot(lines, self.aligner)
     al1 = AlignmentPiece.Identical(line.segment(0, 8), line.segment(4, 12))
     al2 = AlignmentPiece.Identical(line.segment(0, 4), line.segment(8, 12))
     dp.addAlignment(al1)
     dp.addAlignment(al2)
     alignment = AlignmentPiece.Identical(
         Contig("TCC", "tmp").asSegment(), line.segment(3, 6))
     line.correctSequence([alignment])
     assert str(
         list(dp.auto_alignments["c"])
     ) == "[(c[1:12-4]->c[5:12-0]:0.86), (c[0:4]->c[8:12-0]:1.000), (c[5:12-0]->c[1:12-4]:0.86), (c[8:12-0]->c[0:4]:1.000), (c[0:12-0]->c[0:12-0]:1.000)]"
Example #19
 def correctSequence(self, alignments):
     # type: (Iterable[AlignmentPiece]) -> None
     sys.stdout.trace("Line operation Correct:", alignments)
     alignments = [al.cutIdenticalEnds() for al in alignments if al.seg_from.Seq() != al.seg_to.Seq()]
     if len(alignments) == 0:
         sys.stdout.trace("Skipping trivial correction operation")
         return
     assert len(alignments) > 0
     correction = Correction.constructCorrection(alignments)
     self.notifyBeforeCorrect(correction)
     old = Contig(self.seq, "old")
     self.seq = correction.seq_from.seq
     self.rc.seq = basic.RC(self.seq)
     correction.changeQT(self, old)
     self.notifyAfterCorrect(correction)
Example #20
 def polishMany(self, reads, sequences):
     # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
     dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(dir, "work"))
     job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     return map(lambda rec: Contig(rec.seq, rec.id), SeqIO.parse_fasta(open(polished_file, "r")))
Example #21
 def polishSmallSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
     ok = False
     for al in als:
         if al.seg_to.contains(seg):
             ok = True
     if not ok:
         sys.stdout.log(common.log_params.LogPriority.warning, "Warning",
                        seg, "has no covering reads")
         return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
     reads = []
     start = basic.randomSequence(200)
     end = basic.randomSequence(200)
     for al in als:
         new_seq = ""
         al = al.reduce(target=seg)
         if al.seg_to.left < seg.left + 20:
             new_seq += start
         new_seq += al.seg_from.Seq()
         if al.seg_to.right > seg.right - 20:
             new_seq += end
         reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
     base = Contig(start + seg.Seq() + end, "base")
     polished = None
     try:
         polished = Contig(self.polish(reads, base), "polished")
     except PolishException:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "has a sequence very different from reads. Using reads to correct."
         )
         for al, read in zip(als, reads):
             if al.seg_to.contains(seg):
                 try:
                     polished = Contig(
                         self.polish(reads, Contig(read.seq, read.id)),
                         "polished")
                     break
                 except PolishException:
                     pass
     if polished is None:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "could not be corrected even though some reads cover it.")
         polished = seg.asContig()
     als = list(self.aligner.overlapAlign([polished],
                                          ContigStorage([base])))
     for al in als:
         if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
             mapping = AlignmentPiece.Identical(
                 base.segment(len(start),
                              len(base) - len(end)), seg)
             return al.compose(mapping)
     assert False, "No alignment from polished to base: " + str(als)
Example #22
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")),
                 key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
Example #23
 def testManual(self):
     contig1 = Contig("ACGTAAAAGGGTACGT", "c1")
     contig2 = Contig("ACGTAAGGGGGTACGT", "c2")
     al = self.scorer.polyshAlignment(
         AlignmentPiece.Identical(contig1.segment(5, 12),
                                  contig2.segment(5, 12)),
         params.alignment_correction_radius)
     corr = Correction(contig1, contig2, [al])
     assert corr.mapPositionsUp(range(len(contig2))) == [
         0, 1, 2, 3, 4, 5, 8, 9, 9, 9, 10, 11, 12, 13, 14, 15
     ]
     assert corr.mapPositionsDown(range(len(contig1))) == [
         0, 1, 2, 3, 4, 5, 6, 6, 6, 9, 10, 11, 12, 13, 14, 15
     ]
     al2 = AlignmentPiece.Identical(contig2.segment(0, 4))
     al3 = AlignmentPiece.Identical(contig2.segment(6, 8))
     al4 = AlignmentPiece.Identical(contig2.segment(6, 16))
     al5 = AlignmentPiece.Identical(contig2.segment(7, 16))
     assert str(
         corr.composeQueryDifferences([al2, al3, al4, al5])
     ) == "[(c2[0:4]->c1[0:4]:1.000), (c2[6:7]->c1[8:9]:1.000), (c2[6:16-0]->c1[8:16-0]:0.80), (c2[9:16-0]->c1[9:16-0]:1.000)]"
Example #24
def splitSegKmeans(aligner, seg, mult, all_reads_list):
    polisher = Polisher(aligner, aligner.dir_distributor)
    all_reads = ContigStorage()
    base = seg.asContig()
    tmp = []
    rtv = readsToVectors(aligner, all_reads_list, base)
    kmeans = KMeans(n_clusters=mult, precompute_distances=True)
    recs = list(rtv.values())
    result = kmeans.fit_predict(X=[rec.v for rec in recs])
    print result
    clusters = dict()
    for i, c in enumerate(result):
        if c not in clusters:
            clusters[c] = []
        clusters[c].append(recs[i].al)
    for c in clusters.values():
        print str(c), ":", len(c)
    split_contigs = []
    split_reads = []
    for c in clusters.values():
        split_contigs.append(
            Contig(
                polisher.polishSmallSegment(base.asSegment(),
                                            c).seg_from.Seq(),
                str(len(split_contigs))))
        split_reads.append([al.seg_from.contig for al in c])
    maxpi = 1
    for i in range(mult):
        for j in range(mult):
            if i == j:
                sys.stdout.write("1.0 ")
                continue
            al = aligner.overlapAlign([split_contigs[i]],
                                      ContigStorage([split_contigs[j]
                                                     ])).next()
            sys.stdout.write(str(al.percentIdentity()) + " ")
            maxpi = max(maxpi, al.percentIdentity())
        print ""
    print "Maxpi:", maxpi
    if maxpi < 0.985:
        return zip(split_contigs, split_reads)
    else:
        return None
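
Note on the clustering call above: the precompute_distances argument was deprecated and then removed from scikit-learn's KMeans (gone as of scikit-learn 1.0), so a current sketch of the same step would drop it. The vectors below are stand-ins for the per-read feature vectors built by readsToVectors:

    import numpy as np
    from sklearn.cluster import KMeans

    vectors = np.array([[0.0, 0.1], [0.0, 0.2], [1.0, 0.9], [1.0, 1.1]])
    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(vectors)
    # reads with equal labels are grouped into one repeat copy, as in `clusters` above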
Example #25
 def correctSequence(self, alignments):
     # type: (Iterable[AlignmentPiece]) -> None
     sys.stdout.trace("Line operation Correct:", alignments)
     alignments = list(alignments)
     new_alignments = []
     for al in alignments:
         if al.seg_from.Seq() == al.seg_to.Seq():
             sys.stdout.trace("Skipping trivial correction alignment", al)
         else:
             new_alignments.append(al)
     if len(new_alignments) == 0:
         sys.stdout.trace("Skipping trivial correction operation")
         return
     assert len(new_alignments) > 0
     correction = Correction.constructCorrection(new_alignments)
     self.notifyBeforeCorrect(correction)
     old = Contig(self.seq, "old")
     self.seq = correction.seq_from.seq
     self.rc.seq = basic.RC(self.seq)
     correction.changeQT(self, old)
     self.notifyAfterCorrect(correction)
Example #26
def readsToVectors(aligner, reads_list, base):
    als = []
    rtv = dict()
    polisher = Polisher(aligner, aligner.dir_distributor)
    for al in fixAlDir(aligner.overlapAlign(reads_list, ContigStorage([base])),
                       base):
        if len(al.seg_to) < len(base) - 100:
            continue
        else:
            als.append(al)
            rtv[al.seg_from.contig.id] = ReadRecord(al).extend(toVector(al))
    reads_list = [al.seg_from.contig for al in als]
    bases = [base]
    for base_al1, base_al2, base_al3 in zip(als[0::3], als[1::3], als[2::3]):
        base_candidate = Contig(
            polisher.polishSmallSegment(
                base.asSegment(),
                [base_al1, base_al2, base_al3]).seg_from.Seq(),
            str(len(bases)))
        rtr_als = []
        read_ids = set()
        #        base_candidate = base_al.seg_from.asContig()
        for al in fixAlDir(
                aligner.overlapAlign(reads_list,
                                     ContigStorage([base_candidate])),
                base_candidate):
            if len(al.seg_to) < len(base_candidate) - 100:
                continue
            else:
                rtr_als.append(al)
                read_ids.add(al.seg_from.contig.id)
        if len(read_ids) == len(als):
            bases.append(base_candidate)
            for al in rtr_als:
                rtv[al.seg_from.contig.id].extend(toVector(al))
            if len(bases) > 10:
                break
    for rec in rtv.values():
        print rec.read.id, len(rec.v), rec.v
    return rtv
Example #27
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #28
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):#type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
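
The coverage estimate above is a length-weighted average over the longest edges that together span half of the total edge length (halved, presumably because each edge appears with its reverse complement), which keeps short repeat edges out of the average. A standalone sketch of just that estimate:

    def estimate_avg_cov(covs):  # covs: list of (length, coverage) pairs
        total = sum(l for l, c in covs) / 2.0
        tmp = []
        for l, c in sorted(covs)[::-1]:  # longest edges first
            if total < 0:
                break
            tmp.append((l, c))
            total -= l
        return float(sum(l * c for l, c in tmp)) / sum(l for l, c in tmp)

    # estimate_avg_cov([(100, 30.0), (50, 28.0), (10, 300.0)]) == 30.0:
    # the short, high-coverage repeat edge never enters the estimate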
Example #29
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
     if max_extension is None:
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     new_contig = contig.asSegment().asContig()
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         for base_al in relevant_als:
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # The base consists of 500 random nucleotides, the last 500 nucleotides of the polished sequence, and a segment of the read of length at most 500
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         al = al1.mergeDistant(al2)
                         if al is None:
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         if len(new_contig) >= max_len:
             break
         if not found:
             break
     return new_contig, relevant_als + finished_als
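
polishEnd decides how far to trust the polished extension by a vote: walking alignment right-ends left to right, it keeps extending while support stays above min_cov and the contradicting alignments stay below (1 - min_cov_frac) of the remaining support. A simplified sketch of that cutoff rule (it drops the 50bp window that lets old contradictions expire):

    def choose_cutoff(ends_with_flags, min_cov=4, min_cov_frac=0.7):
        # ends_with_flags: (right_end, stops_for_internal_reason) per alignment
        support = len(ends_with_flags)
        cutoff, contra = 0, 0
        for end, is_contra in sorted(ends_with_flags):
            if support >= min_cov and contra <= (1 - min_cov_frac) * support:
                cutoff = end    # safe to extend at least this far
                support -= 1    # this alignment supports nothing past its end
                if is_contra:
                    contra += 1  # it ended by contradicting, not by running out of read
            else:
                break
        return cutoff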
Example #30
    dir = sys.argv[1]
    reads_file = sys.argv[2]  # assumed: this assignment was truncated from the snippet
    consensus_file = sys.argv[3]  # assumed: this assignment was truncated from the snippet
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"),
                                          num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"),
                                        num_names=False)
    if "accurate" in extra_params:
        res = []
        als = sorted(aligner.overlapAlign(reads, ref),
                     key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als,
                                           key=lambda al: al.seg_to.contig.id):
            if basic.isCanonocal(rid):
                contig = ref[rid]
                corrected_seq = polisher.polishSegment(
                    contig.asSegment(), list(rals)).seg_from.Seq()
                res.append(Contig(corrected_seq, rid))
    else:
        res = polisher.polishMany(reads, list(ref.unique()))
    res_file = os.path.join(dir, "res.fasta")
    rf = open(res_file, "w")
    for c in res:
        SeqIO.write(c, rf, "fasta")
    rf.close()
    aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap",
                        os.path.join(dir, "res.sam"))