Example #1
import itertools
import os

# Align the initial sequences to the contigs to find "interesting" segments of each
# contig, then align the contigs (restricted to those segments) and the initial
# sequences to the reference, scoring each matching with Scorer.polyshMatching.
# Names such as basic, CreateLog, DirDistributor, Aligner, ContigStorage,
# SegmentStorage, Scorer and params are provided by the surrounding project.
def evaluatePI(dir, contigs_file, initial_file, ref_file):
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(
                    seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # "events" is undefined in the original snippet; we assume it is
                # meant to capture the result returned by polyshMatching.
                events = scorer.polyshMatching(tmp_al.matchingSequence(),
                                               params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        # As above, assume "events" captures the result returned by polyshMatching.
        events = scorer.polyshMatching(al.matchingSequence(),
                                       params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
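
A minimal usage sketch for the function above. The command-line wrapper and argument order are assumptions for illustration; the only grounded facts are that evaluatePI takes an output directory and three FASTA files (contigs, initial sequences, reference).

import sys

if __name__ == "__main__":
    # Hypothetical driver: output directory followed by three FASTA files
    # (contigs, initial sequences, reference).
    out_dir, contigs_file, initial_file, ref_file = sys.argv[1:5]
    evaluatePI(out_dir, contigs_file, initial_file, ref_file)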
Example #2
# Relates two versions of a sequence: seq_from (the new, corrected sequence) and
# seq_to (the original), via a list of alignments of pieces of seq_from onto seq_to.
# Provides translation of positions and segments between the two coordinate systems.
class Correction:
    def __init__(self, seq_from, seq_to, alignments):
        # type: (Contig, Contig, List[AlignmentPiece]) -> None
        self.seq_from = seq_from
        self.seq_to = seq_to
        self.alignments = alignments
        self.scorer = Scorer()

    def changeQT(self, contig_from, contig_to):
        # Re-attach all stored alignments to a new query contig and target contig.
        self.alignments = [
            al.changeTargetContig(contig_to).changeQueryContig(contig_from)
            for al in self.alignments
        ]

    def isSubstantial(self):
        # type: () -> bool
        # True if any alignment represents a nontrivial change: it splits the target,
        # has identity below 98% over more than 150 bases, or its query segment is
        # more than 50 bases longer than its target segment.
        for al in self.alignments:
            if len(list(al.splitRef())) > 1 or (
                    len(al) > 150 and al.percentIdentity() < 0.98
            ) or (len(al.seg_from) - len(al.seg_to)) > 50:
                return True
        return False

    def mapSegmentsUp(self, segments):
        # type: (List[Segment]) -> List[Segment]
        # Translate segments given in seq_to coordinates into seq_from coordinates.
        left = self.mapPositionsUp([seg.left for seg in segments])
        right = self.mapPositionsUp([seg.right - 1 for seg in segments])
        return [Segment(self.seq_from, l, r + 1) for l, r in zip(left, right)]

    def mapSegmentsDown(self, segments):
        # type: (Iterable[Segment]) -> List[Segment]
        # Translate segments given in seq_from coordinates into seq_to coordinates.
        segments = list(segments)
        left = self.mapPositionsDown([seg.left for seg in segments])
        right = self.mapPositionsDown([seg.right - 1 for seg in segments])
        return [Segment(self.seq_to, l, r + 1) for l, r in zip(left, right)]

    def mapPositionsUp(self, positions, none_on_miss=False):
        # type: (List[int], bool) -> List[Optional[int]]
        # Translate positions on seq_to into positions on seq_from. Positions outside
        # the aligned regions are shifted by the surrounding offset; with
        # none_on_miss=True, positions that do not hit a matching pair map to None.
        tmp = [(pos, i) for i, pos in enumerate(positions)]
        tmp = sorted(tmp)
        res = [0] * len(positions)
        cur_pos = 0
        for al in self.alignments:
            while cur_pos < len(tmp) and tmp[cur_pos][0] <= al.seg_to.left:
                res[tmp[cur_pos][1]] = al.seg_from.left - (al.seg_to.left -
                                                           tmp[cur_pos][0])
                cur_pos += 1
            for p1, p2 in al.matchingPositions(equalOnly=True):
                while cur_pos < len(positions) and tmp[cur_pos][0] <= p2:
                    if tmp[cur_pos][0] == p2 or not none_on_miss:
                        res[tmp[cur_pos][1]] = p1
                    else:
                        res[tmp[cur_pos][1]] = None
                    cur_pos += 1
        while cur_pos < len(positions):
            res[tmp[cur_pos][1]] = len(
                self.seq_from) - (len(self.seq_to) - tmp[cur_pos][0])
            cur_pos += 1
        return res

    def mapPositionsDown(self, positions, none_one_miss=False):
        # type: (List[int], bool) -> List[Optional[int]]
        # Translate positions on seq_from into positions on seq_to (the inverse of
        # mapPositionsUp). With none_one_miss=True, positions that do not hit a
        # matching pair map to None.
        tmp = [(pos, i) for i, pos in enumerate(positions)]
        tmp = sorted(tmp)
        res = [0] * len(positions)
        cur_pos = 0
        for al in self.alignments:
            while cur_pos < len(tmp) and tmp[cur_pos][0] <= al.seg_from.left:
                res[tmp[cur_pos][1]] = al.seg_to.left - (al.seg_from.left -
                                                         tmp[cur_pos][0])
                cur_pos += 1
            for p1, p2 in al.matchingPositions(equalOnly=False):
                while cur_pos < len(positions) and tmp[cur_pos][0] <= p1:
                    if tmp[cur_pos][0] == p1 or not none_one_miss:
                        res[tmp[cur_pos][1]] = p2
                    else:
                        res[tmp[cur_pos][1]] = None
                    cur_pos += 1
        while cur_pos < len(positions):
            res[tmp[cur_pos][1]] = len(
                self.seq_to) - (len(self.seq_from) - tmp[cur_pos][0])
            cur_pos += 1
        return res

    def continuousMapping(self, map_function, iter):
        # type: (Callable[[List[int]], List[int]], Iterator[int]) -> Generator[int]
        # Apply map_function to the values from iter in chunks (about 100000 values
        # at a time) and yield the mapped results one by one.
        chunk = []
        for item in iter:
            chunk.append(item)
            if len(chunk) > 100000:
                for res in map_function(chunk):
                    yield res
                chunk = []
        for res in map_function(chunk):
            yield res

    # This method may change the order of alignments, but the result is sorted by
    # start position.
    def composeQueryDifferences(self, als):
        # type: (List[AlignmentPiece]) -> List[AlignmentPiece]
        order = sorted(range(len(als)), key=lambda i: als[i].seg_to.left)
        # Sorting alignments into those that intersect corrections (complex) and those that do not (easy)
        easy = []  # type: List[int]
        complex = []  # type: List[int]
        cur = 0
        for al in self.alignments:
            while cur < len(als) and als[
                    order[cur]].seg_to.left < al.seg_to.left:
                if als[order[cur]].seg_to.right >= al.seg_to.left:
                    complex.append(order[cur])
                else:
                    easy.append(order[cur])
                cur += 1
            while cur < len(als) and als[
                    order[cur]].seg_to.left < al.seg_to.right:
                complex.append(order[cur])
                cur += 1
        while cur < len(als):
            easy.append(order[cur])
            cur += 1

        res = [None] * len(als)  # type: List[AlignmentPiece]
        # Mapping alignments that do not intersect corrections
        new_easy_segs = self.mapSegmentsUp([als[i].seg_to for i in easy])
        for seg, i in zip(new_easy_segs, easy):
            res[i] = als[i].changeTargetSegment(seg)
        # Mapping alignments that intersect corrections
        func = lambda items: self.mapPositionsUp(items, True)
        matchings = [als[i].matchingSequence(True) for i in complex]
        positions = map(lambda matching: map(lambda pair: pair[1], matching),
                        matchings)
        generator = self.continuousMapping(
            func, itertools.chain.from_iterable(positions))
        for i, matching in zip(complex, matchings):
            al = als[i]
            new_pairs = []
            for pos_from, pos_to in matching.matches:
                new_pos = generator.next()
                if new_pos is not None:
                    new_pairs.append((pos_from, new_pos))
            new_matching = MatchingSequence(matching.seq_from,
                                            self.seq_from.seq, new_pairs)
            corrected_matching = self.scorer.polyshMatching(
                new_matching, params.alignment_correction_radius)
            res[i] = corrected_matching.asAlignmentPiece(
                al.seg_from.contig, self.seq_from)
        return res

    @staticmethod
    def constructCorrection(alignments):
        # type: (List[AlignmentPiece]) -> Correction
        # Build a Correction from alignments of corrected pieces onto a common target
        # contig: splice the aligned pieces into the target to compose the corrected
        # sequence, then re-map the alignments onto that new sequence.
        initial = alignments[0].seg_to.contig
        alignments = sorted(alignments, key=lambda al: al.seg_to.left)
        sb = []
        pos = initial.left()
        new_pos = 0
        for al in alignments:
            sb.append(initial.subSequence(pos, al.seg_to.left).seq)
            new_pos += al.seg_to.left - pos
            pos = al.seg_to.left
            sb.append(al.seg_from.Seq())
            new_pos += al.seg_from.__len__()
            pos = al.seg_to.right
        sb.append(
            initial.segment(alignments[-1].seg_to.right,
                            initial.right()).Seq())
        new_pos += initial.right() - alignments[-1].seg_to.right
        new_seq = Contig("".join(sb), "TMP1_" + initial.id)
        new_als = []
        pos = initial.left()
        new_pos = 0
        for al in alignments:
            new_pos += al.seg_to.left - pos
            new_seg_from = Segment(new_seq, new_pos,
                                   new_pos + al.seg_from.__len__())
            new_als.append(al.changeQuerySegment(new_seg_from))
            pos = al.seg_to.right
            new_pos += al.seg_from.__len__()
        return Correction(new_seq, initial, new_als)
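
A short usage sketch for the class above, assuming piece_alignments is a list of AlignmentPiece objects whose targets all lie on one contig raw_contig (both names are hypothetical); only methods shown in the snippet are used.

# Hypothetical usage: piece_alignments map corrected pieces onto raw_contig.
correction = Correction.constructCorrection(piece_alignments)
if correction.isSubstantial():
    # Translate the whole original contig (seq_to coordinates) into coordinates
    # of the corrected sequence (seq_from), then map the result back down.
    corrected_segs = correction.mapSegmentsUp([raw_contig.asSegment()])
    original_segs = correction.mapSegmentsDown(corrected_segs)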