コード例 #1
0
 def __init__(self, aligner, knotter, disjointigs, dot_plot):
     # type: (Aligner, LineMerger, DisjointigCollection, LineDotPlot) -> None
     """Keep the given collaborators and create a polisher and a scorer.

     The polisher is built from ``aligner`` and ``aligner.dir_distributor``;
     the scorer is a fresh ``Scorer`` with default settings.
     """
     # Objects supplied by the caller are stored unchanged.
     self.aligner, self.knotter = aligner, knotter
     self.disjointigs, self.dot_plot = disjointigs, dot_plot
     # Helpers owned by this instance.
     self.polisher = Polisher(aligner, aligner.dir_distributor)
     self.scorer = Scorer()
コード例 #2
0
 def add(self, al):
     # type: (AlignmentPiece) -> None
     """Add an alignment, polishing and re-splitting it if it is fragmented.

     The alignment is first split with ``al.split(100)``. When that yields
     at most one piece, ``al`` is passed to ``innerAdd`` unchanged.
     Otherwise the alignment is polished with ``Scorer().polyshAlignment``
     and each piece of the polished alignment is added on its own.
     """
     pieces = list(al.split(100))
     if len(pieces) <= 1:
         self.innerAdd(al)
         return
     polished = Scorer().polyshAlignment(
         al, params.alignment_correction_radius)
     for piece in polished.split(100):
         # Add the individual piece, not the whole polished alignment.
         self.innerAdd(piece)
コード例 #3
0
 def __init__(self, aligner, knotter, disjointigs, dot_plot, reads,
              recruiter):
     # type: (Aligner, LineMerger, DisjointigCollection, LineDotPlot, ReadCollection, PairwiseReadRecruiter) -> None
     """Keep the given collaborators and create a polisher and a scorer.

     The polisher is built from ``aligner`` and ``aligner.dir_distributor``;
     the scorer is a fresh ``Scorer`` with default settings.
     """
     # Objects supplied by the caller are stored unchanged.
     self.aligner, self.knotter = aligner, knotter
     self.disjointigs, self.dot_plot = disjointigs, dot_plot
     self.reads, self.recruiter = reads, recruiter
     # Helpers owned by this instance.
     self.polisher = Polisher(aligner, aligner.dir_distributor)
     self.scorer = Scorer()
コード例 #4
0
def main(reads_file, ref_file, dir, error_rate):
    """Print alignment scores of reads against mutated reference windows.

    The longest record of ``ref_file`` is cut into 500-letter windows. About
    5% of the windows (chosen with ``random.random()``) are kept; in each,
    ``error_rate`` percent of the positions are picked at random and replaced
    through ``basic.rc``. Reads from ``reads_file`` are aligned to these
    windows. For every window that received fewer than 150 alignments longer
    than 450, the accurate score of each alignment is printed. At most 5000
    scores are printed.

    Args:
        reads_file: Path of the FASTA file with reads.
        ref_file: Path of the FASTA file with reference records.
        dir: Working directory handed to the aligner.
        error_rate: Percentage of positions per window to replace; the
            resulting count is truncated to an integer.
    """
    window = 500
    max_printed = 5000

    sys.stderr.write("Reading reference" + "\n")
    with open(ref_file, "r") as ref_handle:
        # sorted(...)[-1] picks the last of several equally long records.
        longest = sorted(SeqIO.parse_fasta(ref_handle), key=len)[-1]
    ref = Contig(longest.seq, longest.id)
    refs = ContigCollection()
    # int() keeps range() valid when error_rate is a float.
    errors_per_window = int(error_rate * window / 100)
    for i in range(0, len(ref) - window, window):
        if random.random() > 0.95:
            letters = list(ref.segment(i, i + window).Seq())
            for _ in range(errors_per_window):
                # The same position may be drawn more than once.
                pos = random.randint(0, window - 1)
                letters[pos] = basic.rc[letters[pos]]
            refs.add(
                Contig("".join(letters),
                       ref.id + "(" + str(i) + "," + str(i + window) + ")"))
    refs.print_names(sys.stderr)

    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    # NOTE(review): assumes loadFromFasta consumes the handle before
    # returning (its result is not used and reads are aligned right after).
    with open(reads_file, "r") as reads_handle:
        reads.loadFromFasta(reads_handle)

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)

    sys.stderr.write("Analysing alignments" + "\n")
    long_alignments = [
        al for read in reads for al in read.alignments if len(al) > 450
    ]
    # groupby below needs alignments of one target contig to be adjacent.
    long_alignments = sorted(
        long_alignments,
        key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    printed = 0
    for contig, group in itertools.groupby(long_alignments,
                                           key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) >= 150:
            continue
        for al in group:
            print(scorer.accurateScore(al.matchingSequence(),
                                       params.alignment_correction_radius))
            printed += 1
            if printed >= max_printed:
                return
コード例 #5
0
 def testManual(self):
     """Compare Scorer.maxInRange with a brute-force windowed maximum."""
     random.seed(0)
     radius = 50
     pairs = [(random.randint(0, 100), idx) for idx in range(1000)]
     values = [value for value, _ in pairs]
     windowed = Scorer().maxInRange(pairs, radius)
     for pos, got in enumerate(windowed):
         # Window of +-radius positions around pos, clipped to the array.
         lo = max(pos - radius, 0)
         hi = min(pos + radius + 1, len(pairs))
         assert got == max(values[lo:hi])
コード例 #6
0
def toVector(al):
    """Score an alignment window by window along its target contig.

    The target contig is cut into consecutive windows of ``w`` positions
    (``w`` is a module-level value). Each match ``(a, b)`` of the alignment
    goes to the window containing ``b``; the first match of the following
    window is appended to each window. The two first windows and the last
    ones (``range(2, len(contig) // w - 2)``) are skipped.

    Returns:
        A list with one value per scored window: ``w // 2`` when the window
        holds fewer than two matches, otherwise
        ``Scorer().accurateScore`` of the window's matches with radius 10.
    """
    contig = al.seg_to.contig
    matching = al.matchingSequence(True)
    # Explicit floor division: the results are list indices and range bounds.
    window_count = len(contig) // w
    buckets = [[] for _ in range(window_count + 1)]
    for a, b in matching.matches:
        buckets[b // w].append((a, b))
    # buckets has window_count + 1 entries, so buckets[i + 1] always exists.
    for i in range(window_count):
        if buckets[i + 1]:
            buckets[i].append(buckets[i + 1][0])
    res = []
    for i in range(2, window_count - 2):
        if len(buckets[i]) < 2:
            res.append(w // 2)
        else:
            ms = MatchingSequence(al.seg_from.contig.seq,
                                  al.seg_to.contig.seq, buckets[i])
            res.append(Scorer().accurateScore(ms, 10))
    return res
コード例 #7
0
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    """Print how contigs and initial sequences align to a reference.

    Three FASTA files are loaded. For every contig a list of "interesting"
    segments is built, and for each local alignment of a contig to the
    reference the part of the alignment inside an interesting segment is
    reported. Overlap alignments of the initial sequences to the reference
    are reported as well.

    Args:
        dir: Working directory; passed to ``basic.ensure_dir_existance`` and
            ``CreateLog``, and its "alignments" subdirectory is given to the
            aligner.
        contigs_file: Path of the FASTA file with the contigs.
        initial_file: Path of the FASTA file with the initial sequences.
        ref_file: Path of the FASTA file with the reference.
    """
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    # NOTE(review): the three file handles opened here are never closed
    # explicitly.
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    # Target segments of initial-to-contig overlap alignments. When the
    # target contig id is not canonical (per basic.isCanonocal), the segment
    # of the reverse-complement alignment is used instead.
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    # Sort by normalized contig id so groupby below sees each contig once.
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    # Default: the whole contig is interesting.
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        # presumably the complement of the covered segments within the
        # contig — TODO confirm against SegmentStorage.reverse
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            # Report when the alignment (expanded by 500) contains the
            # segment, or when they share more than 40000 positions.
            if al.seg_from.expand(500).contains(
                    seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # NOTE(review): the result of polyshMatching is discarded and
                # `events` is not bound anywhere in this function; unless a
                # module-level `events` exists, the print below raises
                # NameError.
                scorer.polyshMatching(tmp_al.matchingSequence(),
                                      params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        # NOTE(review): same unbound `events` as in the loop above.
        scorer.polyshMatching(al.matchingSequence(),
                              params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
コード例 #8
0
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
     """Extend the right end of the target contig of ``als`` using reads.

     Works on a copy of the contig that ``als[0]`` is aligned to. In each
     round, the reads still reaching the current end are collected, one of
     them supplies a short "base" sequence that is polished with all of
     them, and the supported prefix of the polished sequence is appended
     to the contig copy. Rounds stop when too few reads reach the end, no
     read yields an extension of more than 100 positions, or the contig
     has grown by ``max_extension``.

     Args:
         als: Read alignments; the target contig is taken from ``als[0]``
             (assumes all alignments share it — TODO confirm).
         min_cov: Minimal number of reads reaching the end needed to start
             a round, and minimal support needed to move the cutoff.
         min_cov_frac: The number of contradicting alignments tolerated at
             a cutoff is ``(1 - min_cov_frac) * support``.
         max_extension: Maximal total growth; ``None`` means 10**10.

     Returns:
         The extended contig and the alignments (still relevant ones
         followed by finished ones) re-targeted to it.
     """
     if max_extension is None:
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     # Work on a copy so the input contig is not modified here.
     new_contig = contig.asSegment().asContig()
     # Keep alignments whose reverse-complement target segment starts within
     # 100 positions of the start (presumably: alignments ending near the
     # right end of the contig — TODO confirm).
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         # Split alignments into those that still touch the last 100
         # positions of the contig and have more than 100 read positions
         # left (by al.rc.seg_from.left), and those that are finished.
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         # Common prefix of all sequences built below: a fixed 8-letter tag,
         # a random sequence of params.flanking_size letters, and the last
         # (up to) params.flanking_size letters of the current contig.
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         # One artificial read per relevant alignment: the common prefix
         # followed by the part of the read after the aligned segment.
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         for base_al in relevant_als:
             # Skip reads with fewer than params.flanking_size positions
             # left (by base_al.rc.seg_from.left).
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # The base is the common prefix followed by the read segment that
             # starts right after the aligned part and is at most
             # max(params.window_size, params.k) long.
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             # Drop alignments left over from a previous base candidate
             # (presumably what read.clean() does — TODO confirm).
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             # For each artificial read pick the alignment that starts at
             # position 0 of the polished base and reaches furthest right.
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 # Every artificial read is expected to align from position 0.
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             # Sweep alignments by their right end on the polished base. The
             # cutoff advances while enough reads still support it and the
             # contradicting alignments that ended within the last 50
             # positions stay under the allowed fraction.
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             # Accept the extension only if it adds more than 100 positions
             # beyond the common prefix.
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 # Identity mapping from the used part of the polished base
                 # to the appended tail of the candidate contig.
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 # Identity mapping from each original read's tail to the
                 # tail of its artificial read (both of the same length).
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 # Compose read -> artificial read -> polished base ->
                 # candidate contig; None where the read barely reaches past
                 # the prefix or starts too close to the cutoff.
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 # Re-target the existing alignments to the old part of the
                 # candidate contig.
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         # Join the old alignment with its extension; fall
                         # back to the old one if they cannot be merged, and
                         # re-polish when the two parts are 10 or more apart.
                         al = al1.mergeDistant(al2)
                         if al is None:
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 # One successful extension per round; start the next round.
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         if len(new_contig) >= max_len:
             break
         if not found:
             break
     return new_contig, relevant_als + finished_als
コード例 #9
0
class LineExtender:
    def __init__(self, aligner, knotter, disjointigs, dot_plot, reads,
                 recruiter):
        # type: (Aligner, LineMerger, DisjointigCollection, LineDotPlot, ReadCollection, PairwiseReadRecruiter) -> None
        """Keep the given collaborators and create a polisher and a scorer.

        The polisher is built from ``aligner`` and
        ``aligner.dir_distributor``; the scorer is a fresh ``Scorer``.
        """
        # Objects supplied by the caller are stored unchanged.
        self.aligner, self.knotter = aligner, knotter
        self.disjointigs, self.dot_plot = disjointigs, dot_plot
        self.reads, self.recruiter = reads, recruiter
        # Helpers owned by this instance.
        self.polisher = Polisher(aligner, aligner.dir_distributor)
        self.scorer = Scorer()

    def checkAlignments(self, seg, als):
        # type: (Segment,List[AlignmentPiece]) -> None
        rids = set([al.seg_from.contig.id for al in als])
        for al in self.aligner.localAlign(self.reads,
                                          ContigStorage([seg.contig])):
            if al.seg_to.interSize(
                    seg) > params.k and al.seg_from.contig.id not in rids:
                print "Missing alignment", al

    def processLine(self, line):
        # type: (NewLine) -> int
        """Recruit reads along ``line`` until it merges or nothing is left.

        Resolved segments are visited from the left end of the line. For
        each one, read recruitment is attempted; after a successful
        recruitment the structures are updated and a merge to the right is
        tried.

        Returns:
            The number of newly recruited reads, plus one if the line was
            merged with another line.
        """
        line.completely_resolved.mergeSegments(params.k)
        # Created before any update of the line, as in the original order.
        bound = LinePosition(line, line.left())
        recruited = 0
        merged = self.knotter.tryMergeRight(line)
        if merged is not None:
            self.updateAllStructures(list(merged.completely_resolved))
            return 1
        self.updateAllStructures(line.completely_resolved)
        while True:
            target = line.completely_resolved.find(bound.suffix(), params.k)
            if target is None:
                break
            if line.knot is not None and target.right == len(line):
                break
            gained = 0
            # Recruitment is only attempted for segments reaching more than
            # params.k past the left end of the first initial alignment.
            if target.right > line.initial[0].seg_to.left + params.k:
                result = self.attemptCleanResolution(target)
                gained = sum(len(found) for _, found in result)
                recruited += gained
            if gained == 0:
                # Nothing recruited here: move on past this segment.
                bound = LinePosition(line, target.right - params.k + 1)
                continue
            self.updateAllStructures([segment for segment, _ in result])
            merged = self.knotter.tryMergeRight(line)
            if debugger.debugger is not None:
                debugger.debugger.dump()
            if merged is not None:
                self.updateAllStructures(list(merged.completely_resolved))
                return recruited + 1
        return recruited

    # input: a collection of segments that had reads recruited to.
    def updateAllStructures(self, interesting_segments):
        # type: (Iterable[Segment]) -> None
        """Correct sequences around the segments and update resolved parts.

        Runs three steps in order: ``correctSequences`` on the given
        segments, ``collectRecords`` on the corrected segments, and
        ``updateResolved`` on the collected records. Each record is traced
        in between.
        """
        segments = list(interesting_segments)
        sys.stdout.trace("Updating structures:", segments)
        # Step 1: correct contig sequences; get the corrected segments back.
        fixed_segments = self.correctSequences(segments)
        # Step 2: gather a record for every relevant resolved segment.
        sys.stdout.trace("Expanding resolved segments:")
        collected = self.collectRecords(
            fixed_segments)  # type: List[LineExtender.Record]
        for record in collected:
            sys.stdout.trace("Record:", record.line, record.correct,
                             record.resolved)
            sys.stdout.trace("Reads from record:")
            for piece in record:
                sys.stdout.trace(piece, piece.seg_from.contig.alignments)
            sys.stdout.trace(record.reads)
            sys.stdout.trace(record.potential_good)
        # Step 3: prolong and register the resolved segments.
        self.updateResolved(collected)

    def updateResolved(self, records):
        # type: (List[LineExtender.Record]) -> None
        ok = True
        while ok:
            sys.stdout.trace("Good reads:")
            rec = records[0]  # type: LineExtender.Record
            for read_name in rec.good_reads:
                sys.stdout.trace(read_name, rec.read_bounds[read_name])
            ok = False
            for rec in records:
                if self.attemptProlongResolved(rec):
                    sys.stdout.trace("Successfully prolonged resolved:",
                                     rec.line, rec.line.initial, rec.resolved,
                                     rec.line.completely_resolved)
                    ok = True
        for rec in records:
            line = rec.resolved.contig  # type: NewLine
            line.completely_resolved.add(rec.resolved)
            for seg in rec.old_resolved:
                line.completely_resolved.add(seg)
            line.completely_resolved.mergeSegments(params.k - 1)

    def collectRecords(self, corrected):
        # type: (List[Segment]) -> List[LineExtender.Record]
        """Build one record per resolved segment affected by ``corrected``.

        For each corrected segment (expanded to the left by ``params.k``),
        every dot-plot alignment intersecting it is followed to its source
        line. For each resolved segment of that line lying in a correct
        segment that intersects the alignment, reads relevant for a "focus"
        window are re-aligned to the line and a record is created.

        Returns:
            The created records, one per distinct resolved segment.
        """
        sys.stdout.trace("Collecting records", corrected)
        # read_bounds and good_reads are shared by all records created here.
        read_bounds = dict()
        records = dict()  # type: Dict[Segment, LineExtender.Record]
        good_reads = set()
        for seg in corrected:
            sys.stdout.trace("Oppa initial:", seg)
            seg = seg.expandLeft(params.k)
            sys.stdout.trace("Alignments relevant for", seg,
                             list(self.dot_plot.allInter(seg)))
            for al in self.dot_plot.allInter(seg):
                # presumably seg mapped through the alignment onto the source
                # line — TODO confirm against mapSegUp
                seg1 = al.matchingSequence().mapSegUp(al.seg_from.contig, seg)
                line = al.seg_from.contig  # type:NewLine
                for seg_correct in line.correct_segments.allInter(al.seg_from):
                    for seg_resolved in line.completely_resolved.allInter(
                            seg_correct):
                        # Each resolved segment gets at most one record.
                        if seg_resolved in records:
                            continue
                        # next_start: left end of the next resolved segment
                        # found after this one, or the line length if there
                        # is none.
                        if seg_resolved.right == len(line):
                            next_start = len(line)
                        else:
                            # NOTE: `next` shadows the builtin in this scope.
                            next = line.completely_resolved.find(
                                line.asSegment().suffix(
                                    pos=seg_resolved.right), 1)
                            if next is None:
                                next_start = len(line)
                            else:
                                next_start = next.left
                        # Never closer than 200 positions to the line end.
                        next_start = min(next_start, len(line) - 200)
                        # Window of the line for which reads are collected.
                        focus = line.segment(
                            max(seg_resolved.left,
                                min(seg_resolved.right - params.k, seg1.left)),
                            min(seg_correct.right, next_start + params.k))
                        if self.recruiter is None:
                            als = list(line.getRelevantAlignmentsFor(focus))
                        else:
                            als = list(
                                self.recruiter.getRelevantAlignments(
                                    focus, params.k))
                        if params.check_alignments:
                            self.checkAlignments(focus, als)
                        # Re-align the reads behind those alignments to the
                        # whole line.
                        reads = ContigStorage()
                        # NOTE: this loop and the one below rebind `al`, the
                        # variable of the enclosing dot-plot loop; the outer
                        # iteration itself is unaffected.
                        for al in als:
                            reads.add(al.seg_from.contig)
                        als = list(
                            self.aligner.localAlign(reads.unique(),
                                                    ContigStorage([line])))
                        final_als = []
                        sys.stdout.trace("Focus:", focus, seg_resolved)
                        sys.stdout.trace(als)
                        for al in als:
                            # Express alignments to the reverse complement of
                            # the line in terms of the line itself.
                            if al.seg_to.contig == line.rc:
                                al = al.rc
                            if al.seg_to.interSize(focus) >= params.k - 100:
                                final_als.append(al)
                        sys.stdout.trace(final_als)
                        sys.stdout.trace("Finished realignment of reads")
                        records[seg_resolved] = self.createRecord(
                            seg_resolved, next_start, seg_correct, final_als,
                            good_reads, read_bounds)
        records = list(records.values())  # type: List[LineExtender.Record]
        return records

    def correctSequences(self, interesting_segments):
        # type: (Iterable[Segment]) -> List[Segment]
        """Polish line regions to the right of the given segments.

        For each segment, the region from shortly before the end of its
        correct segment to shortly after the start of the next correct
        segment (or the line end) is scheduled. Regions are grouped per line
        (both orientations together), the line is extended at either end
        when a region lies within 200 positions of that end, and the merged
        regions are polished.

        Returns:
            The scheduled segments in forward and backward orientation, as
            tracked through the line changes by the line's listeners.
        """
        interesting_segments = list(interesting_segments)
        to_correct = []
        for seg in interesting_segments:
            line = seg.contig  # type: NewLine
            # NOTE(review): `correct` is used without a None check, while
            # attemptCleanResolution checks the result of the same `find`
            # call for None.
            correct = line.correct_segments.find(seg)
            # NOTE: `next` shadows the builtin in this scope.
            next = line.correct_segments.find(line.suffix(correct.right), 1)
            if next is None:
                right = len(line)
            else:
                right = min(len(line), next.left + params.k / 2)
            # NOTE(review): `params.k / 2` yields an integer only under
            # Python 2 division with an int params.k.
            to_correct.append(line.segment(correct.right - params.k / 2,
                                           right))
        # Sort by normalized line id so groupby sees both orientations of a
        # line as one group.
        to_correct = sorted(to_correct,
                            key=lambda seg:
                            (basic.Normalize(seg.contig.id), seg.left))
        corrected = []
        for line_id, it in itertools.groupby(
                to_correct, key=lambda seg: basic.Normalize(
                    seg.contig.id)):  # type: NewLine, Iterable[Segment]
            it = list(it)
            line = None  # type: NewLine
            forward = SegmentStorage()
            backward = SegmentStorage()
            # Segments whose contig id equals the normalized id go to
            # `forward`; the others go to `backward`, and `line` is taken as
            # the reverse complement of their contig.
            for seg in it:
                if seg.contig.id != line_id:
                    backward.add(seg)
                    line = seg.contig.rc
                else:
                    forward.add(seg)
                    line = seg.contig
            to_polysh = SegmentStorage()
            to_polysh.addAll(forward).addAll(backward.rc)
            to_polysh.mergeSegments()
            # Register the storages as listeners while the line may change
            # (presumably so their coordinates follow extension and
            # polishing — TODO confirm).
            line.addListener(to_polysh)
            line.addListener(forward)
            line.rc.addListener(backward)
            sys.stdout.trace("Polishing:", to_polysh)
            # Try to extend the right end when the last region lies within
            # 200 positions of it and max_extension is not set on the line.
            if (not line.max_extension) and to_polysh[-1].RC().left < 200:
                l = to_polysh[-1].right
                if self.attemptExtend(line):
                    to_polysh.add(line.asSegment().suffix(pos=l))
                    forward.add(line.asSegment().suffix(pos=l))
            # Same for the left end, done on the reverse complement.
            if (not line.rc.max_extension) and to_polysh[0].left < 200:
                l = to_polysh[0].RC().right
                if self.attemptExtend(line.rc):
                    to_polysh.rc.add(line.rc.asSegment().suffix(pos=l))
                    backward.add(line.rc.asSegment().suffix(pos=l))
            to_polysh.mergeSegments()
            forward.mergeSegments()
            backward.mergeSegments()
            # to_polysh is detached before polishing; forward and backward
            # stay attached until polishing is done.
            line.removeListener(to_polysh)
            # NOTE: new_segments is not used afterwards.
            new_segments = self.polyshSegments(line, to_polysh)
            line.removeListener(forward)
            line.rc.removeListener(backward)
            corrected.extend(forward)
            corrected.extend(backward)
            line.updateCorrectSegments(line.asSegment())
        return corrected

    def attemptCleanResolution(self, resolved):
        # type: (Segment) -> List[Tuple[Segment, List[AlignmentPiece]]]
        """Recruit reads to the lines that align to a resolved segment.

        Only the last ``2 * params.k`` positions of ``resolved`` are used.
        Lines aligned to that part are collected from the dot plot, reads
        relevant to each of them are gathered, and for every read a
        tournament between its alignments decides which line, if any, the
        read is added to.

        Returns:
            A list of ``(segment, alignments)`` pairs: the recruited
            alignments grouped by the correct segment they were recruited
            to. An empty list is returned as soon as one of the line
            alignments has no correct segment.
        """
        # Find all lines that align to at least k nucls of resolved segment. Since this segment is resolve we get all
        sys.stdout.trace("Attempting recruitment:", resolved, resolved.contig,
                         resolved.contig.correct_segments)
        resolved = resolved.suffix(length=min(len(resolved), params.k * 2))
        sys.stdout.trace("Considering resolved subsegment:", resolved)
        # Keep dot-plot alignments of at least params.k target length that
        # share more than params.k - 30 positions with the resolved part.
        line_alignments = filter(
            lambda al: len(al.seg_to) >= params.k and resolved.
            interSize(al.seg_to) > params.k - 30,
            self.dot_plot.allInter(resolved))  # type: List[AlignmentPiece]
        # Keep identical alignments, and those whose ends on both lines lie
        # at least params.k + 20 past the end of the first initial alignment.
        line_alignments = [
            al for al in line_alignments if
            (al.seg_from.right >= al.seg_from.contig.initial[0].seg_to.right +
             params.k + 20
             and al.seg_to.right >= al.seg_to.contig.initial[0].seg_to.right +
             params.k + 20) or al.isIdentical()
        ]
        sys.stdout.trace("Alternative lines:", map(str, line_alignments))
        for al in line_alignments:
            if not al.isIdentical():
                sys.stdout.trace(al)
                sys.stdout.trace("\n".join(al.asMatchingStrings()))
        # Restrict every alignment to the resolved part.
        line_alignments = [
            al.reduce(target=resolved) for al in line_alignments
        ]
        read_alignments = []  # type: List[Tuple[AlignmentPiece, Segment]]
        correct_segments = []
        # Correct segments whose line alignment has identity above 0.95;
        # only these may receive recruited reads.
        active_segments = set()
        for ltl in line_alignments:
            line = ltl.seg_from.contig  # type: NewLine
            new_copy = line.correct_segments.find(ltl.seg_from)
            # assert new_copy is not None and new_copy.interSize(ltl.seg_from) >= max(len(ltl.seg_from) - 20, params.k), str([ltl, new_copy, str(line.correct_segments)])
            # assert new_copy is not None, str([ltl, line.correct_segments])
            if new_copy is None:
                return []
            if not new_copy.contains(ltl.seg_from):
                sys.stdout.trace(
                    "Warning: alignment of resolved segment to uncorrected segment"
                )
                sys.stdout.trace(ltl, new_copy, line.correct_segments)
            correct_segments.append(new_copy)
            if ltl.percentIdentity() > 0.95:
                active_segments.add(new_copy)
            if self.recruiter is None:
                relevant_alignments = list(
                    line.getRelevantAlignmentsFor(ltl.seg_from))
            else:
                relevant_alignments = list(
                    self.recruiter.getRelevantAlignments(
                        ltl.seg_from, params.k))
            if params.check_alignments:
                self.checkAlignments(ltl.seg_from, relevant_alignments)
            # Pair every read alignment with the correct segment of its line.
            read_alignments.extend(
                zip(relevant_alignments,
                    itertools.cycle([correct_segments[-1]])))
        # Sort by read id so groupby yields one group per read.
        read_alignments = sorted(read_alignments,
                                 key=lambda al: al[0].seg_from.contig.id)
        alignments_by_read = itertools.groupby(
            read_alignments, lambda al: al[0].seg_from.contig.id)
        new_recruits = []
        sys.stdout.trace("Starting read recruitment to",
                         map(str, line_alignments))
        for name, it in alignments_by_read:
            als = list(it)  # type: List[Tuple[AlignmentPiece, Segment]]
            read = als[0][0].seg_from.contig  # type: AlignedRead
            sys.stdout.trace("Recruiting read:", read, als)
            # The read must overlap the resolved part by at least params.k
            # in one of its alignments.
            ok = False
            for al in als:
                if al[0].seg_to.interSize(resolved) >= params.k:
                    ok = True
                    break
            if not ok:
                sys.stdout.trace("Read does not overlap with resolved",
                                 resolved)
                continue

            # Skip reads that already have an alignment intersecting one of
            # the candidate target segments.
            skip = False
            for al1 in als:
                for al2 in read.alignments:
                    if al1[0].seg_to.inter(al2.seg_to):
                        sys.stdout.trace("Read already recruited", al1, al2)
                        skip = True
                        break
                if skip:
                    break
            if skip:
                continue
            # Polish the alignments that do not contradict their target.
            new_als = []
            for al in als:
                if not al[0].contradictingRTC(tail_size=params.bad_end_length):
                    new_als.append((self.scorer.polyshAlignment(
                        al[0], params.alignment_correction_radius), al[1]))
            if len(new_als) == 0:
                sys.stdout.warn("No noncontradicting alignments of a read")
                winner = None
                seg = None
            else:
                winner, seg = self.tournament(
                    new_als)  #type: AlignmentPiece, Segment
            if winner is None:
                sys.stdout.trace("No winner")
            else:
                sys.stdout.trace("Winner for", winner.seg_from.contig.id, ":",
                                 winner, seg)
            # The winner is recruited only to an active segment and only if
            # its identity is at least 0.85.
            if winner is not None:
                if seg not in active_segments:
                    sys.stdout.trace(
                        "Winner ignored since winning segment is too different from investigated segment"
                    )
                elif winner.percentIdentity() < 0.85:
                    sys.stdout.trace(
                        "Winner ignored since it is too different from winning line"
                    )
                else:
                    line = winner.seg_to.contig  # type: NewLine
                    line.addReadAlignment(winner)
                    new_recruits.append((seg, winner))
        # Sort so that groupby below puts all recruits of a segment together.
        new_recruits = sorted(new_recruits,
                              key=lambda rec:
                              (rec[0].contig.id, rec[0].left, rec[0].right))
        sys.stdout.info("Recruited " + str(len(new_recruits)) + " new reads")
        return [(seg, [al for seg, al in it])
                for seg, it in itertools.groupby(new_recruits,
                                                 key=lambda rec: rec[0])]

    def fight(self, c1, c2):
        # type: (Tuple[AlignmentPiece, Segment], Tuple[AlignmentPiece, Segment]) -> Optional[Tuple[AlignmentPiece, Segment]]
        """Choose between two candidate alignments of the same read.

        Scores come from ``self.scorer.scoreInCorrectSegments`` as
        ``(s1, s2, s12)``. Without ``s12`` the first candidate wins unless
        ``s1`` is missing. Otherwise there is no winner when ``s12`` is
        below 25 or the gap between ``s1`` and ``s2`` is small relative to
        ``s12``; else the candidate with the lower score wins, ``c1`` on
        equal scores.

        Returns:
            ``c1``, ``c2``, or ``None`` when there is no winner.
        """
        assert c1[0].seg_from.contig == c2[0].seg_from.contig
        s1, s2, s12 = self.scorer.scoreInCorrectSegments(
            c1[0], c1[1], c2[0], c2[1])
        # diff is only traced; the decision below recomputes the gap lazily.
        diff = abs(s1 - s2) if (s1 is not None and s2 is not None) else None
        if s12 is None:
            winner = c2 if s1 is None else c1
        elif (s12 < 25
              or (s12 < 40 and abs(s1 - s2) < s12 * 0.8)
              or (s12 < 100 and abs(s1 - s2) < s12 * 0.5)
              or abs(s1 - s2) < s12 * 0.3):
            winner = None
        elif s1 > s2:
            winner = c2
        else:
            winner = c1
        verdict = ("No winner",) if winner is None else ("Winner:", winner)
        sys.stdout.trace("Fight:", c1, c2, "Comparison results:", diff,
                         s12, s1, s2, *verdict)
        return winner

    def tournament(self, candidates):
        # type: (List[Tuple[AlignmentPiece, Segment]]) -> Tuple[Optional[AlignmentPiece], Optional[Segment]]
        """Pick the single candidate that beats every other one, if it exists.

        The candidates are first chained through ``self.fight`` in order,
        keeping the current champion.  When more than two candidates were
        supplied, the final champion is then fought against every candidate it
        has not already beaten since it took the lead; any draw or loss voids
        the whole tournament.

        :return: the winning candidate, or ``(None, None)`` when there are no
            candidates or no candidate dominates.
        """
        champion = None
        champion_idx = None
        beaten = []  # indices the current champion has already defeated
        for idx, challenger in enumerate(candidates):
            if champion is None:
                # Either the very first candidate or the one right after a draw.
                champion, champion_idx = challenger, idx
                continue
            champion = self.fight(challenger, champion)
            if champion is None:
                champion_idx, beaten = None, []
            elif champion == candidates[champion_idx]:
                beaten.append(idx)
            else:
                champion_idx, beaten = idx, []
        if champion is None:
            return None, None
        if len(candidates) > 2:
            for idx, challenger in enumerate(candidates):
                if idx != champion_idx and idx not in beaten:
                    outcome = self.fight(challenger, champion)
                    if outcome is None or outcome != champion:
                        return None, None
        return champion

    def attemptExtend(self, line):
        # type: (NewLine) -> bool
        """Try to grow ``line`` to the right using the reads that cover its end.

        Nothing is done when the line already has a knot, when no read
        alignment intersects the line's tail, or when polishing the end yields
        a contig of the same length as the line.

        :return: ``True`` if the line was extended, ``False`` otherwise.
        """
        sys.stdout.trace("Attempting to extend:", line)
        if line.knot is not None:
            sys.stdout.trace("Blocked by knot")
            return False
        tail = line.asSegment().suffix(length=min(params.k, len(line) - 20))
        relevant_reads = list(line.read_alignments.allInter(tail))
        sys.stdout.trace("Relevant reads for extending", relevant_reads)
        if not relevant_reads:
            return False
        new_contig, relevant_als = self.polisher.polishEnd(relevant_reads)
        # Remember the length before extendRight changes it.
        old_length = len(line)
        if len(new_contig) == old_length:
            return False
        # The polished contig must start with the existing line sequence.
        assert line.seq == new_contig.prefix(len=old_length).Seq()
        gained = len(new_contig) - old_length
        sys.stdout.trace("Extending", line, "for", gained)
        appended = new_contig.suffix(pos=old_length).Seq()
        line.extendRight(appended, relevant_als)
        sys.stdout.info("Extended contig", line, "for", gained)
        sys.stdout.trace("Correct:", line.correct_segments)
        sys.stdout.trace("Reads:")
        new_tail = line.asSegment().suffix(length=min(len(line), 2000))
        sys.stdout.trace(list(line.read_alignments.allInter(new_tail)))
        sys.stdout.trace("Sequence:")
        sys.stdout.trace(line.seq)
        return True

    def polyshSegments(self, line, to_polysh):
        # type: (NewLine, Iterable[Segment]) -> List[Segment]
        """Polish the given segments of ``line`` and apply the corrections.

        The segments are merged and sorted, each one is polished against the
        read alignments intersecting it, and all resulting corrections are
        applied to the line in a single ``correctSequence`` call.

        :return: the merged segments as held by the storage after the
            correction was applied.
        """
        tracked = SegmentStorage()
        fixes = AlignmentStorage()
        # The storage listens to the line while the sequence is corrected
        # (presumably so its coordinates follow the edit - confirm in NewLine).
        line.addListener(tracked)
        tracked.addAll(to_polysh)
        tracked.mergeSegments()
        tracked.sort()
        for seg in tracked:
            supporting = list(line.read_alignments.allInter(seg))
            fixes.add(self.polisher.polishSegment(seg, supporting))
        line.correctSequence(list(fixes))
        line.removeListener(tracked)
        return list(tracked)

    def updateCorrectSegments(self, line):
        # type: (NewLine) -> None
        """Ask ``line`` to refresh its correct segments over its whole length."""
        whole_line = line.asSegment()
        line.updateCorrectSegments(whole_line)

    class Record:
        """State kept while the resolved part of one line is being prolonged.

        A record holds the currently resolved segment, the correct segment it
        lives in, the start of the next resolved segment and two stacks of read
        alignments (``reads`` and ``potential_good``).  Both stacks are kept
        sorted by descending ``seg_to.left`` so that the leftmost alignment is
        always at the end and can be popped cheaply.
        """

        def __init__(self, resolved, next, correct, good_reads, read_bounds):
            # type: (Segment, int, Segment, Set[str], Dict[str, int]) -> None
            """Create an empty record.

            :param resolved: currently resolved segment; its contig is the line.
            :param next: start position of the next resolved segment.
            :param correct: correct segment that contains ``resolved``.
            :param good_reads: ids of reads already accepted as good; the set
                is stored by reference and updated in place.
            :param read_bounds: per-read bound positions keyed by read id; the
                dict is stored by reference and updated in place.
            """
            self.line = resolved.contig  # type: NewLine
            self.resolved = resolved
            self.old_resolved = []
            self.next_resolved_start = next
            self.correct = correct
            self.good_reads = good_reads
            self.read_bounds = read_bounds
            self.reads = []  # type: List[AlignmentPiece]
            self.sorted = True
            self.potential_good = []

        def setResolved(self, seg):
            # type: (Segment) -> None
            """Replace or extend the resolved segment, then refresh good reads.

            If ``seg`` overlaps the current resolved segment by at least
            ``params.k - 1`` the two are united; otherwise the current segment
            is archived in ``old_resolved`` and ``seg`` takes its place.
            """
            if seg.interSize(self.resolved) >= params.k - 1:
                self.resolved = self.resolved.cup(seg)
            else:
                self.old_resolved.append(self.resolved)
                self.resolved = seg
            self.updateGood()

        def add(self, al):
            # type: (AlignmentPiece) -> None
            """Register an alignment, splitting it when ``split(100)`` says so.

            An alignment that splits into several pieces is first re-polished
            and every piece of the polished alignment is stored on its own.
            """
            pieces = list(al.split(100))
            if len(pieces) > 1:
                al = Scorer().polyshAlignment(
                    al, params.alignment_correction_radius)
                for piece in al.split(100):
                    # Store the piece itself; storing ``al`` here would add the
                    # whole alignment once per piece.
                    self.innerAdd(piece)
            else:
                self.innerAdd(al)

        def innerAdd(self, al):
            # type: (AlignmentPiece) -> None
            """Store one alignment and update the bound of its read.

            Alignments starting within ``params.bad_end_length`` of the read
            start go to ``potential_good``; all others go to ``reads``.
            """
            if al.seg_from.left < params.bad_end_length:
                self.potential_good.append(al)
            else:
                self.reads.append(al)
            read = al.seg_from.contig  # type: AlignedRead
            if read.id not in self.read_bounds:
                self.read_bounds[read.id] = len(read)
            # Tighten the bound when the reverse-complement alignment starts
            # within 50 positions of the start of its target.
            if al.rc.seg_to.left < 50:
                self.read_bounds[read.id] = min(self.read_bounds[read.id],
                                                al.seg_from.right)
            self.sorted = False

        def addAll(self, als):
            # type: (Iterator[AlignmentPiece]) -> None
            """Register every alignment of ``als`` through :meth:`add`."""
            for al in als:
                self.add(al)

        def sort(self):
            """Sort both stacks by descending ``seg_to.left`` if they are dirty."""
            if not self.sorted:
                self.reads = sorted(self.reads, key=lambda al: -al.seg_to.left)
                self.potential_good = sorted(self.potential_good,
                                             key=lambda al: -al.seg_to.left)
                self.sorted = True

        def _lacksSupport(self, al, inter_size):
            # type: (AlignmentPiece, int) -> bool
            """Tell whether ``al`` is not backed by a known good read.

            True when the read is not in ``good_reads``, or when the position
            ``al.seg_from.left + inter_size + 100`` (capped by the read length)
            lies beyond the bound recorded for the read.
            """
            read = al.seg_from.contig
            if read.id not in self.good_reads:
                return True
            necessary_contig_support = min(
                len(read), al.seg_from.left + inter_size + 100)
            return necessary_contig_support > self.read_bounds[read.id]

        def get(self, num=None, right=None, min_inter=0):
            # type: (Optional[int], Optional[int], int) -> List[AlignmentPiece]
            """Return up to ``num`` unsupported alignments starting left of ``right``.

            ``num`` defaults to the number of stored reads and ``right`` to the
            right end of the resolved segment.  Only alignments whose target
            segment is at least ``min_inter`` long are returned.  Unsupported
            alignments stay in ``reads``; alignments that turn out to be
            supported are removed from ``reads`` as a side effect.
            """
            self.sort()
            if num is None:
                num = len(self.reads)
            if right is None:
                right = self.resolved.right
            popped = []
            res = []
            while len(res) < num and len(
                    self.reads) > 0 and self.reads[-1].seg_to.left < right:
                al = self.reads.pop()
                if self._lacksSupport(al, params.k):
                    popped.append(al)
                    if len(al.seg_to) >= min_inter:
                        res.append(al)
            # Put the unsupported alignments back in their original order.
            self.reads.extend(popped[::-1])
            return res

        def __iter__(self):
            """Iterate over unsupported alignments using ``params.k`` as overlap."""
            return self.unsupportedAlignments(params.k)

        def unsupportedAlignments(self, inter_size):
            # type: (int) -> Iterator[AlignmentPiece]
            """Yield stored reads lacking support, from the end of ``reads`` backwards."""
            for al in self.reads[::-1]:
                if self._lacksSupport(al, inter_size):
                    yield al

        def _absorbCovered(self, stack, new_good_message):
            # type: (List[AlignmentPiece], str) -> None
            """Pop alignments starting at least ``params.k`` before the resolved end.

            A popped alignment that overlaps the resolved segment by at least
            ``params.k`` marks its read as good; any other popped alignment is
            only reported in the trace log.
            """
            while len(stack) > 0 and stack[
                    -1].seg_to.left <= self.resolved.right - params.k:
                al = stack.pop()
                if al.seg_to.interSize(self.resolved) >= params.k:
                    if al.seg_from.contig.id not in self.good_reads:
                        sys.stdout.trace(new_good_message, al)
                        self.good_reads.add(al.seg_from.contig.id)
                else:
                    sys.stdout.trace("Read does not overlap resolved", al,
                                     self.resolved)

        def updateGood(self):
            """Promote reads covered by the resolved segment to ``good_reads``."""
            self.sort()
            self._absorbCovered(self.reads, "New good read:")
            self._absorbCovered(self.potential_good,
                                "New good read from potential:")

        def pop(self):
            """Remove and return the last alignment of ``reads``."""
            return self.reads.pop()

        def __str__(self):
            return str([
                self.resolved, self.correct, self.next_resolved_start,
                self.reads
            ])

    def createRecord(self, resolved, next_start, correct, als, good_reads,
                     read_bounds):
        # type: (Segment, int, Segment, List[AlignmentPiece], Set[str], Dict[str, int]) -> Record
        """Build a ``Record`` for ``resolved``, load ``als`` into it and refresh good reads.

        :param resolved: resolved segment the record is created for.
        :param next_start: start position of the next resolved segment.
        :param correct: correct segment containing ``resolved``.
        :param als: alignments to register in the new record.
        :param good_reads: shared set of good read ids.
        :param read_bounds: shared dict of per-read bounds.
        """
        line = resolved.contig  # type: NewLine
        # NOTE(review): this segment is built but never used afterwards; the
        # call is retained so behaviour stays exactly as before.
        focus_left = resolved.right - params.k
        focus_right = min(correct.right, next_start + params.k)
        line.segment(focus_left, focus_right)
        record = self.Record(resolved, next_start, correct, good_reads,
                             read_bounds)
        record.addAll(als)
        record.updateGood()
        return record

    def findResolvedBound(self, rec, inter_size):
        # type: (Record, int) -> int
        """Find the position where enough unsupported reads cluster together.

        Unsupported alignments of ``rec`` whose target segment is at least
        ``inter_size`` long are collected in a sliding window.  As soon as the
        window holds ``params.min_contra_for_break`` alignments whose start
        positions differ by at most 50, the start of the first one is the bound.

        :return: the bound, or ``len(rec.line)`` when no such cluster exists.
        """
        required = params.min_contra_for_break
        window = []
        for al in rec.unsupportedAlignments(inter_size):
            if len(al.seg_to) >= inter_size:
                window.append(al)
            if len(window) < required:
                continue
            if window[-1].seg_to.left - window[0].seg_to.left <= 50:
                break
            # Too spread out: drop the oldest alignment and keep scanning.
            window = window[1:]
        if len(window) >= required:
            bound = window[0].seg_to.left
            sys.stdout.trace("Resolved bound for", rec.resolved, ":", bound)
            sys.stdout.trace("Bound caused by read alignments:",
                             map(str, window))
            return bound
        sys.stdout.trace("No resolved bound for", rec.resolved)
        return len(rec.line)

    def attemptProlongResolved(self, rec):
        # type: (Record) -> bool
        """Try to move the right end of ``rec.resolved`` further to the right.

        :return: ``True`` if the resolved segment was prolonged, ``False`` if
            the computed bound does not lie beyond its current right end.
        """
        sys.stdout.trace("Working on prolonging", rec.resolved)
        new_right = self.findAndFilterResolvedBound(rec, params.k)
        if new_right > rec.resolved.right:
            sys.stdout.trace("Prolonged", rec.resolved, "to", new_right)
            current = rec.resolved
            rec.setResolved(current.contig.segment(current.left, new_right))
            return True
        sys.stdout.trace("No luck with", rec.resolved,
                         rec.line.correct_segments)
        return False

    def findAndFilterResolvedBound(self, rec, sz):
        # type: (Record, int) -> int
        """Compute how far ``rec.resolved`` may be prolonged for overlap size ``sz``.

        The read-based bound is padded by nine tenths of ``params.k`` and then
        capped by the end of the correct segment and by the start of the next
        resolved segment.  If that leaves room to the right, the candidate
        region is checked with ``segmentsWithGoodCopies``; only a returned
        segment that starts exactly at the left edge of the checked region and
        ends beyond the current resolved end can move the bound.

        :return: the new right bound; equals ``rec.resolved.right`` when no
            prolongation is possible.
        """
        raw_bound = self.findResolvedBound(rec, sz) + params.k * 9 / 10
        limit = min(rec.correct.right, rec.next_resolved_start + sz - 1,
                    raw_bound)
        resolved_right = rec.resolved.right
        result = resolved_right
        if limit > resolved_right:
            sys.stdout.trace("Checking resolved bound against known copies")
            window_left = max(0, resolved_right - sz)
            candidates = self.segmentsWithGoodCopies(
                rec.resolved, rec.line.segment(window_left, limit), sz)
            sys.stdout.trace("Candidates:", candidates)
            for candidate in candidates:
                anchored = candidate.left == window_left
                if anchored and candidate.right > resolved_right:
                    result = candidate.right
        sys.stdout.trace("Final resolved bound for", rec.resolved, " and k =",
                         sz, ":", result)
        return result

    def attemptJump(self, rec):
        # type: (Record) -> bool
        """Try to move the resolved segment to a later position on the line.

        Target segments of read alignments and dot-plot alignments that start
        away from both ends of their query are collected as bad segments.  The
        complement of those segments inside the window from
        ``rec.resolved.right - params.k`` to the bound is then searched with
        ``segmentsWithGoodCopies``; the first returned segment that is at least
        ``params.k`` long and ends beyond the current resolved end becomes the
        new resolved segment.

        :return: ``True`` if ``rec`` received a new resolved segment.
        """
        bound = self.findAndFilterResolvedBound(rec, params.l)
        margin = min(params.bad_end_length, params.k / 2)

        def starts_inside(al):
            # Neither the alignment nor its reverse complement begins within
            # ``margin`` of the start of its query.
            return al.seg_from.left > margin and al.rc.seg_from.left > margin

        bad_segments = SegmentStorage()
        for al in rec:
            if al.seg_to.left > bound:
                break
            if starts_inside(al):
                bad_segments.add(al.seg_to)
        window = rec.line.segment(rec.resolved.right - params.k, bound)
        for al in self.dot_plot.allInter(window):
            if starts_inside(al):
                bad_segments.add(al.seg_to)
        bad_segments.mergeSegments(params.k - 200)
        sys.stdout.trace("Bad segments:", bad_segments)
        good_segments = bad_segments.reverse(rec.line,
                                             params.k - 100).reduce(window)
        for seg in good_segments:
            clipped = Segment(seg.contig, max(0, seg.left), seg.right)
            for free in self.segmentsWithGoodCopies(rec.resolved, clipped,
                                                    params.k):
                if len(free) >= params.k and free.right > rec.resolved.right:
                    rec.setResolved(free)
                    return True
        return False

    def segmentsWithGoodCopies(self, resolved, seg, inter_size):
        # type: (Segment, Segment, int) -> List[Segment]
        """Return the parts of ``seg`` not blocked by dot-plot alignments.

        For every qualifying dot-plot alignment intersecting ``seg``, the parts
        of the aligned line that are outside its correct segments are mapped
        onto ``seg.contig`` and recorded as blocked.  An alignment whose
        reverse complement starts within 50 positions of its query start and
        whose target reaches ``resolved.right - 100`` additionally blocks the
        target contig beyond the alignment end.  The result is the complement
        of the blocked segments, restricted to ``seg``.
        """
        def qualifies(al):
            return (al.seg_from.left > 20 or al.rc.seg_to.left > 20
                    or al.isIdentical())

        als = [al for al in self.dot_plot.allInter(seg) if qualifies(al)]
        blocked = SegmentStorage()
        for al in als:
            line = al.seg_from.contig  # type: NewLine
            if len(al.seg_to) < inter_size:
                continue
            initial_left = line.initial[0].seg_to.left
            if al.seg_from.right <= initial_left:
                continue
            cap = al.seg_from.cap(line.suffix(pos=initial_left))
            incorrect = line.correct_segments.reverse(
                line, inter_size - 1).reduce(cap)
            matching = al.matchingSequence()
            sys.stdout.trace("Incorrect: ", line, cap, incorrect)
            for unpolished in incorrect:
                mapped = matching.mapSegDown(seg.contig, unpolished,
                                             mapIn=False)
                sys.stdout.trace(
                    "Relevant unpolished k-mer segment alignment:",
                    unpolished, mapped)
                blocked.add(mapped)
            reaches_resolved = al.seg_to.right >= resolved.right - 100
            if al.rc.seg_from.left < 50 and reaches_resolved:
                incoming = al.seg_to.contig.suffix(
                    pos=al.seg_to.right).expand(inter_size + 100)
                blocked.add(incoming)
                sys.stdout.trace("Incoming line:", resolved, seg, al,
                                 incoming)
        blocked.mergeSegments(inter_size - 1)
        min_free = inter_size - 1 - min(100, inter_size / 10)
        return list(blocked.reverse(seg.contig, min_free).reduce(seg))
コード例 #10
0
sys.path.append("py")

from common import basic, params
from common.basic import CreateLog

from alignment.align_tools import Aligner, DirDistributor
from common.line_align import Scorer
from common.sequences import ContigStorage

if __name__ == "__main__":
    basic.ensure_dir_existance(sys.argv[1])
    CreateLog(sys.argv[1])
    reads = ContigStorage().loadFromFile(sys.argv[2])
    contigs = ContigStorage().loadFromFile(sys.argv[3])
    scorer = Scorer()
    dd = DirDistributor(sys.argv[1])
    aligner = Aligner(dd)
    for read in reads.unique():
        print "Processing read", read
        als = [
            scorer.polyshAlignment(al, params.alignment_correction_radius)
            for al in aligner.localAlign([read], contigs)
        ]
        for al1 in als:
            for al2 in als:
                if al1.seg_to.contig == al2.seg_to.contig:
                    continue
                print al1, "vs", al2
                scorer.scoreInCorrectSegments(al1,
                                              al1.seg_to.contig.asSegment(),
コード例 #11
0
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
    # Experimental driver: starting from two seed sets of read ids, iteratively
    # polish two versions of one contig and reassign reads to whichever version
    # they score better against, then write both versions and their read sets
    # to `dir`.
    # NOTE(review): parameter `k` is not used anywhere in this function.
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    # NOTE(review): the file handles opened for loadFromFasta here and below
    # are never closed explicitly.
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    # NOTE(review): `cnt` is only incremented; it is read solely by the
    # commented-out condition below.
    cnt = 0
    # Seed the two read sets from the caller-supplied id collections.
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        # Each iteration polishes the ORIGINAL contig with the current read
        # sets (not the contig1/contig2 produced by the previous iteration).
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        # Re-align ALL reads to both polished versions and keep only the
        # alignments that cover all but fewer than 100 positions of the target.
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        # The read sets are rebuilt from scratch in every iteration.
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        # NOTE(review): `dp` is only referenced by the commented-out code below.
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        # Pair up, per read id, the alignment to contig1 with the one to
        # contig2; a later alignment of the same read to contig1 replaces the
        # earlier one.
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            # Skip reads that do not have exactly one alignment on each side.
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        # Quartile thresholds of the score differences.
        # NOTE(review): raises IndexError when no read aligned to both contigs
        # (empty `diffs`).
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        # Reads below the lower threshold go to set 1, reads above the upper
        # threshold go to set 2, the rest are left unassigned.
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
        # NOTE(review): `diff` here is the loop variable of the loop above
        # (the difference of the last pair), not an accumulated total.
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    # Report which of the seed reads ended up in their original set.
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    # `prolong` is defined elsewhere in the file; presumably it extends each
    # contig using its read set - TODO confirm.
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    # Write both contig versions, then each read set, as FASTA files in `dir`.
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"
コード例 #12
0
 def __init__(self):
     """Start with a fresh ``Scorer`` and no aligner attached yet."""
     self.scorer = Scorer()
     self.aligner = None  # type: Aligner
コード例 #13
0
 def __init__(self, seq_from, seq_to, alignments):
     # type: (Contig, Contig, List[AlignmentPiece]) -> None
     """Store both sequences and the alignments between them, plus a ``Scorer``."""
     self.seq_from, self.seq_to = seq_from, seq_to
     self.alignments = alignments
     self.scorer = Scorer()
コード例 #14
0
class Correction:
    """Coordinate mapping between a corrected sequence and its original.

    ``seq_from`` is the corrected sequence and ``seq_to`` the sequence it was
    derived from (see :meth:`constructCorrection`).  ``alignments`` are the
    pieces that differ, each going from ``seq_from`` to ``seq_to``; the
    mapping methods walk them in the stored order.  "Up" methods translate
    ``seq_to`` coordinates into ``seq_from`` coordinates, "Down" methods do
    the opposite.
    """

    def __init__(self, seq_from, seq_to, alignments):
        # type: (Contig, Contig, List[AlignmentPiece]) -> None
        self.seq_from = seq_from
        self.seq_to = seq_to
        self.alignments = alignments
        self.scorer = Scorer()

    def changeQT(self, contig_from, contig_to):
        """Re-target every stored alignment to the given query and target contigs."""
        self.alignments = [
            al.changeTargetContig(contig_to).changeQueryContig(contig_from)
            for al in self.alignments
        ]

    def isSubstantial(self):
        # type: () -> bool
        """Tell whether any alignment represents more than a minor edit.

        An alignment counts as substantial when ``splitRef`` yields more than
        one piece, when it is longer than 150 with identity below 0.98, or
        when its query segment is more than 50 longer than its target segment.
        """
        for al in self.alignments:
            if len(list(al.splitRef())) > 1 or (
                    len(al) > 150 and al.percentIdentity() < 0.98
            ) or (len(al.seg_from) - len(al.seg_to)) > 50:
                return True
        return False

    def mapSegmentsUp(self, segments):
        # type: (List[Segment]) -> List[Segment]
        """Map segments of ``seq_to`` onto ``seq_from``.

        Both inclusive ends of every segment are mapped as positions, hence
        the ``right - 1`` / ``+ 1`` adjustment.
        """
        left = self.mapPositionsUp([seg.left for seg in segments])
        right = self.mapPositionsUp([seg.right - 1 for seg in segments])
        return [Segment(self.seq_from, l, r + 1) for l, r in zip(left, right)]

    def mapSegmentsDown(self, segments):
        # type: (Iterable[Segment]) -> List[Segment]
        """Map segments of ``seq_from`` onto ``seq_to`` (inverse of ``mapSegmentsUp``)."""
        segments = list(segments)
        left = self.mapPositionsDown([seg.left for seg in segments])
        right = self.mapPositionsDown([seg.right - 1 for seg in segments])
        return [Segment(self.seq_to, l, r + 1) for l, r in zip(left, right)]

    def mapPositionsUp(self, positions, none_on_miss=False):
        # type: (List[int], bool) -> List[Optional[int]]
        """Translate ``seq_to`` positions into ``seq_from`` positions.

        Positions are processed in sorted order but the result keeps the
        order of ``positions``.  A position at or before the start of the
        next alignment is shifted by its distance to that start; a position
        inside an alignment takes the query coordinate of the first matching
        pair at or after it; positions after the last alignment keep their
        distance to the sequence end.

        :param none_on_miss: when true, a position inside an alignment that is
            not itself part of a matching pair maps to ``None``.
        """
        tmp = sorted((pos, i) for i, pos in enumerate(positions))
        total = len(tmp)
        res = [0] * total
        cur_pos = 0
        for al in self.alignments:
            while cur_pos < total and tmp[cur_pos][0] <= al.seg_to.left:
                res[tmp[cur_pos][1]] = al.seg_from.left - (al.seg_to.left -
                                                           tmp[cur_pos][0])
                cur_pos += 1
            for p1, p2 in al.matchingPositions(equalOnly=True):
                while cur_pos < total and tmp[cur_pos][0] <= p2:
                    if tmp[cur_pos][0] == p2 or not none_on_miss:
                        res[tmp[cur_pos][1]] = p1
                    else:
                        res[tmp[cur_pos][1]] = None
                    cur_pos += 1
        while cur_pos < total:
            res[tmp[cur_pos][1]] = len(
                self.seq_from) - (len(self.seq_to) - tmp[cur_pos][0])
            cur_pos += 1
        return res

    def mapPositionsDown(self, positions, none_one_miss=False):
        # type: (List[int], bool) -> List[Optional[int]]
        """Translate ``seq_from`` positions into ``seq_to`` positions.

        Mirror image of :meth:`mapPositionsUp`.
        NOTE(review): this direction asks for ``matchingPositions`` with
        ``equalOnly=False`` while the Up direction uses ``equalOnly=True`` -
        confirm the asymmetry is intended.

        :param none_one_miss: when true, a position inside an alignment that
            is not itself part of a matching pair maps to ``None``.
        """
        tmp = sorted((pos, i) for i, pos in enumerate(positions))
        total = len(tmp)
        res = [0] * total
        cur_pos = 0
        for al in self.alignments:
            while cur_pos < total and tmp[cur_pos][0] <= al.seg_from.left:
                res[tmp[cur_pos][1]] = al.seg_to.left - (al.seg_from.left -
                                                         tmp[cur_pos][0])
                cur_pos += 1
            for p1, p2 in al.matchingPositions(equalOnly=False):
                while cur_pos < total and tmp[cur_pos][0] <= p1:
                    if tmp[cur_pos][0] == p1 or not none_one_miss:
                        res[tmp[cur_pos][1]] = p2
                    else:
                        res[tmp[cur_pos][1]] = None
                    cur_pos += 1
        while cur_pos < total:
            res[tmp[cur_pos][1]] = len(
                self.seq_to) - (len(self.seq_from) - tmp[cur_pos][0])
            cur_pos += 1
        return res

    def continuousMapping(self, map_function, iter):
        # type: (Callable[[List[int]], List[int]], Iterator[int]) -> Generator[int]
        """Lazily apply ``map_function`` to a stream of items in chunks.

        Items are buffered and handed to ``map_function`` once the buffer
        holds more than 100000 of them; the mapped values are yielded in order.
        NOTE(review): ``map_function`` is applied to each chunk independently,
        so the stream presumably has to be chunk-wise compatible with it
        (``mapPositionsUp`` sorts within a chunk) - confirm with callers.
        """
        chunk = []
        for item in iter:
            chunk.append(item)
            if len(chunk) > 100000:
                for res in map_function(chunk):
                    yield res
                chunk = []
        for res in map_function(chunk):
            yield res

    # This method may change the order of alignments. But they will be sorted by start.
    def composeQueryDifferences(self, als):
        # type: (List[AlignmentPiece]) -> List[AlignmentPiece]
        """Re-express alignments targeting ``seq_to`` as alignments to ``seq_from``.

        Alignments that do not touch any correction only need their target
        segment translated.  Alignments that intersect a correction have every
        matched target position translated, unmapped positions dropped, and
        the resulting matching re-polished by the scorer.

        :return: a list parallel to ``als`` with the translated alignments.
        """
        order = sorted(range(len(als)), key=lambda i: als[i].seg_to.left)
        # Sorting alignments into those that intersect corrections (hard) and those that do not (easy)
        easy = []  # type: List[int]
        hard = []  # type: List[int]
        cur = 0
        for al in self.alignments:
            while cur < len(als) and als[
                    order[cur]].seg_to.left < al.seg_to.left:
                if als[order[cur]].seg_to.right >= al.seg_to.left:
                    hard.append(order[cur])
                else:
                    easy.append(order[cur])
                cur += 1
            while cur < len(als) and als[
                    order[cur]].seg_to.left < al.seg_to.right:
                hard.append(order[cur])
                cur += 1
        while cur < len(als):
            easy.append(order[cur])
            cur += 1

        res = [None] * len(als)  # type: List[AlignmentPiece]
        # Mapping alignments that do not intersect corrections
        new_easy_segs = self.mapSegmentsUp([als[i].seg_to for i in easy])
        for seg, i in zip(new_easy_segs, easy):
            res[i] = als[i].changeTargetSegment(seg)
        # Mapping alignments that intersect corrections
        func = lambda items: self.mapPositionsUp(items, True)
        matchings = [als[i].matchingSequence(True) for i in hard]
        positions = [[pair[1] for pair in matching] for matching in matchings]
        generator = self.continuousMapping(
            func, itertools.chain.from_iterable(positions))
        for i, matching in zip(hard, matchings):
            al = als[i]
            new_pairs = []
            for pos_from, pos_to in matching.matches:
                # Builtin next() instead of the Python-2-only .next() method.
                new_pos = next(generator)
                if new_pos is not None:
                    new_pairs.append((pos_from, new_pos))
            new_matching = MatchingSequence(matching.seq_from,
                                            self.seq_from.seq, new_pairs)
            corrected_matching = self.scorer.polyshMatching(
                new_matching, params.alignment_correction_radius)
            res[i] = corrected_matching.asAlignmentPiece(
                al.seg_from.contig, self.seq_from)
        return res

    @staticmethod
    def constructCorrection(alignments):
        # type: (List[AlignmentPiece]) -> Correction
        """Build a ``Correction`` from alignments that all target one contig.

        The corrected sequence is the target contig with the target span of
        every alignment replaced by that alignment's query sequence.  The
        alignments are re-anchored onto the corrected sequence.

        :param alignments: non-empty list of alignments to the same contig;
            an empty list raises ``IndexError``.
        """
        initial = alignments[0].seg_to.contig
        alignments = sorted(alignments, key=lambda al: al.seg_to.left)
        # First pass: stitch the corrected sequence together.
        sb = []
        pos = initial.left()
        for al in alignments:
            sb.append(initial.subSequence(pos, al.seg_to.left).seq)
            sb.append(al.seg_from.Seq())
            pos = al.seg_to.right
        sb.append(
            initial.segment(alignments[-1].seg_to.right,
                            initial.right()).Seq())
        new_seq = Contig("".join(sb), "TMP1_" + initial.id)
        # Second pass: locate every alignment's query inside the new sequence.
        new_als = []
        pos = initial.left()
        new_pos = 0
        for al in alignments:
            new_pos += al.seg_to.left - pos
            new_seg_from = Segment(new_seq, new_pos,
                                   new_pos + len(al.seg_from))
            new_als.append(al.changeQuerySegment(new_seg_from))
            pos = al.seg_to.right
            new_pos += len(al.seg_from)
        return Correction(new_seq, initial, new_als)
コード例 #15
0
ファイル: knotter.py プロジェクト: AntonBankevich/mosaic
 def tryMergeRight(self, line):
     # type: (NewLine) -> Optional[NewLine]
     assert line.read_alignments.checkLine(line), str(line.read_alignments)
     if line.circular or line.knot is not None:
         return None
     read_alignments = line.read_alignments.allInter(
         line.asSegment().suffix(length=1000))
     candidates = []  # type: List[LineMerger.Record]
     for al1 in read_alignments:
         read = al1.seg_from.contig  # type: AlignedRead
         if al1.contradictingRTC():
             continue
         for al2 in read.alignments:
             if al2.contradictingRTC() or (al1.canMergeTo(al2)
                                           and al1.deepInter(al2)):
                 continue
             new_rec = self.Record(al1, al2)
             if len(line) + new_rec.other.initial[
                     0].seg_to.left + new_rec.gap > line.initial[
                         0].seg_to.left:
                 candidates.append(new_rec)
     #             (al2.seg_from.contig, gap, read, al1, al2)
     candidates = sorted(candidates, key=lambda rec: rec.other.id)
     final_candidates = []
     for other_line, iter in itertools.groupby(
             candidates, lambda rec: rec.other
     ):  # type: NewLine, Iterator[LineMerger.Record]
         recs = list(iter)
         if recs[-1].gap - recs[0].gap > min(100, abs(recs[-1].gap) / 8):
             sys.stdout.trace("\n".join(map(str, candidates)))
             sys.stdout.warn("WARNING: Ambiguous knotting to the same line")
             if len(recs) >= 5 and recs[-2].gap - recs[1].gap < min(
                     10,
                     abs(recs[-2].gap) / 10):
                 recs = recs[1:-1]
             else:
                 return None
             # assert False, "Ambiguous knotting to the same line" + str(recs[0])
         avg = sum([rec.gap for rec in recs]) / len(recs)
         avg_initial = sum([rec.initial_gap for rec in recs]) / len(recs)
         if recs[0].other == line.rc:
             sys.stdout.warn(
                 "WARNING: Ignoring connection to reverse-compliment line")
             continue
         final_candidates.append((avg, recs[0].other, recs, avg_initial))
     if len(final_candidates) == 0:
         return None
     final_candidates = sorted(final_candidates,
                               key=lambda candidate: candidate[-1])
     final = final_candidates[0]
     if len(final_candidates) > 1:
         sys.stdout.warn("Extra candidates")
         sys.stdout.trace("\n".join(map(str, candidates)))
     # for candidate in final_candidates[1:]:
     #     if final[0] + len(final[1]) > candidate[0]:
     #         print "\n".join(map(str, candidates))
     #         assert False, "Contradicting candidates" + str(final[0]) + " " + str(final[1]) + " " + str(candidate[0]) + " " + str(candidate[1])
     if final[0] > 0:
         sys.stdout.trace("Positive gap. Can not merge line", line)
         sys.stdout.trace(final)
         return None
     elif len(final[2]) <= 1:
         sys.stdout.trace("Insufficient support to merge line", line)
         sys.stdout.trace(final)
         return None
     else:
         sys.stdout.info("Merging", line, "with", final[1], "with gap",
                         final[0])
         sys.stdout.trace("Alignments:")
         sys.stdout.trace("\n".join(map(str, final[2])))
         other = final[1]
         assert line != other.rc
         assert other.rc.knot is None
         line_alignment = final[2][0].al1.composeTargetDifference(
             final[2][0].al2)
         sys.stdout.trace("Alignment:", line_alignment)
         sys.stdout.trace("\n".join(line_alignment.asMatchingStrings()))
         sys.stdout.trace(line_alignment.cigar)
         sys.stdout.trace(
             list(self.dot_plot.getAlignmentsToFrom(other, line)))
         tmp = None
         if final[0] < -params.k - 100:
             for al in self.dot_plot.getAlignmentsToFrom(other, line):
                 if len(
                         list(al.matchingSequence().common(
                             line_alignment.matchingSequence()))) > 0:
                     if tmp is None or len(tmp) < len(al):
                         tmp = al
             if tmp is None:
                 sys.stdout.warn(
                     "No good line alignment found. Alignment based on reads will be used."
                 )
             else:
                 sys.stdout.trace("Switched to line alignment:", tmp)
             if (tmp.seg_to.left < 20 and tmp.rc.seg_to.left < 20) or \
                     (tmp.seg_from.left < 20 and tmp.rc.seg_from.left < 20) or \
                     (tmp.seg_from.left < 20 and tmp.rc.seg_to.left < 20):
                 sys.stdout.warn("One line is substring of another.",
                                 str(line_alignment) + " " + str(tmp))
             elif tmp.seg_to.left > 30 or tmp.rc.seg_from.left > 30:
                 sys.stdout.warn("Line alignment is not overlap!", tmp)
                 if params.strict_merging_alignment:
                     assert tmp.seg_to.left < 30 and tmp.rc.seg_from.left < 30, str(
                         line_alignment) + " " + str(tmp)
             line_alignment = tmp
         pref = line_alignment.seg_from.left
         suff = len(
             line_alignment.seg_to.contig) - line_alignment.seg_to.right
         line_alignment = Scorer().polyshAlignment(
             line_alignment, params.alignment_correction_radius)
         sys.stdout.trace("Polished alignment:", line_alignment)
         sys.stdout.trace("\n".join(line_alignment.asMatchingStrings()))
         sys.stdout.trace(line_alignment.cigar)
         if line == other:
             gap = -line_alignment.rc.seg_from.right - line_alignment.seg_to.left + line.correct_segments[
                 0].left + line.rc.correct_segments[0].left
             if gap > 0:
                 sys.stdout.trace(
                     "Line is circular but not ready for completion. Skipping."
                 )
                 return None
             line.cutRight(line.correct_segments[-1].right)
             line.rc.cutRight(line.rc.correct_segments[-1].right)
             line.tie(line, gap, "")
             sys.stdout.info(line, "is circular")
             return line
         new_line = self.storage.mergeLines(line_alignment, params.k)
         seg = new_line.segment(pref, len(new_line) - suff)
         correction = self.polisher.polishSegment(
             seg, list(new_line.read_alignments.allInter(seg)))
         new_line.correctSequence([correction])
         new_line.updateCorrectSegments(
             new_line.segment(pref,
                              len(new_line) - suff).expand(100))
         return new_line
コード例 #16
0
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1),
                                                 al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)