Esempio n. 1
0
 def __init__(self, *args):
     """Initialise the test case and load the shared alignment fixture.

     Reads ``data/test_data/sequences/alignment.fa`` through ``DataLoader``
     and stores both aligned sequences, their annotations, and the
     ``Fasta.alnToSeq`` conversion of each sequence on the instance.
     """
     unittest.TestCase.__init__(self, *args)
     alignment_path = "data/test_data/sequences/alignment.fa"
     loaded = DataLoader().loadSequence(alignment_path)
     # The first field of the loader result is not used by the tests.
     _, seq_x, ann_x, seq_y, ann_y = loaded
     self.seq_x, self.ann_x = seq_x, ann_x
     self.seq_y, self.ann_y = seq_y, ann_y
     # NOTE(review): alnToSeq presumably strips the alignment gaps -- confirm
     # against the Fasta module.
     self.seq_xs = Fasta.alnToSeq(self.seq_x)
     self.seq_ys = Fasta.alnToSeq(self.seq_y)
    def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
        """Prepare classifier input for every column of a pairwise alignment.

        Each column gets a code: ``1`` when only ``sequence_y`` has a gap
        (``'-'``), ``2`` when only ``sequence_x`` has a gap, and ``0``
        otherwise (both residues, or both gaps).  Code-0 columns contribute
        a constant ``0``; code-1 and code-2 columns are handed to
        ``self.clf.multi_prepare_predict`` together with the
        ``Fasta.alnToSeq`` form of both sequences and the per-sequence
        offsets of the column.

        Returns:
            Whatever ``merge`` builds from the column codes and the three
            per-code streams.
        """
        gap = '-'

        def column_code(idx):
            x_is_gap = sequence_x[idx] == gap
            y_is_gap = sequence_y[idx] == gap
            if x_is_gap == y_is_gap:
                # Two residues and two gaps share code 0.
                return 0
            return 2 if x_is_gap else 1

        def sequence_offsets():
            # For each column, record how many non-gap characters of each
            # sequence precede it.
            offsets = []
            offset_x = offset_y = 0
            for idx in xrange(len(sequence_x)):
                offsets.append((offset_x, offset_y))
                if sequence_x[idx] != gap:
                    offset_x += 1
                if sequence_y[idx] != gap:
                    offset_y += 1
            return offsets

        def columns_with(code):
            return [idx for idx in xrange(length) if column_code(idx) == code]

        assert len(sequence_y) == len(sequence_x)
        length = len(sequence_x)

        sequence_xs = Fasta.alnToSeq(sequence_x)
        sequence_ys = Fasta.alnToSeq(sequence_y)

        positions = sequence_offsets()

        match_stream = (0 for _ in columns_with(0))

        insert_x_stream = self.clf.multi_prepare_predict(
            (sequence_xs, positions[idx][0], ann_x,
             sequence_ys, positions[idx][1], ann_y)
            for idx in columns_with(1)
        )

        insert_y_stream = self.clf.multi_prepare_predict(
            (sequence_ys, positions[idx][1], ann_y,
             sequence_xs, positions[idx][0], ann_x)
            for idx in columns_with(2)
        )

        return merge(
            (column_code(idx) for idx in xrange(length)),
            match_stream,
            insert_x_stream,
            insert_y_stream,
        )
Esempio n. 3
0
 def __init__(self, *args):
     """Create the test case and cache the alignment used by the tests.

     The alignment file ``data/test_data/sequences/alignment.fa`` is read
     with a fresh ``DataLoader``; the aligned sequences and annotations are
     kept on the instance, along with ``Fasta.alnToSeq`` of each sequence.
     """
     unittest.TestCase.__init__(self, *args)
     loader = DataLoader()
     # Only the last four fields of the loader result are kept.
     (
         _,
         self.seq_x,
         self.ann_x,
         self.seq_y,
         self.ann_y,
     ) = loader.loadSequence("data/test_data/sequences/alignment.fa")
     converted_x = Fasta.alnToSeq(self.seq_x)
     self.seq_xs = converted_x
     converted_y = Fasta.alnToSeq(self.seq_y)
     self.seq_ys = converted_y
Esempio n. 4
0
    def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
        """Build the per-column classifier input for two aligned sequences.

        A column is coded ``1`` if only ``sequence_y`` holds a gap (``'-'``),
        ``2`` if only ``sequence_x`` holds a gap, and ``0`` in every other
        case.  Columns coded ``0`` yield the constant ``0``; the other two
        groups are passed to ``self.clf.multi_prepare_predict`` with the
        ``Fasta.alnToSeq`` sequences and the column's offset in each of them.

        Returns:
            The value produced by ``merge`` from the stream of column codes
            and the three per-code streams.
        """
        assert len(sequence_y) == len(sequence_x)
        total = len(sequence_x)

        # Lookup keyed by (x has a gap, y has a gap).
        code_for = {
            (False, False): 0,
            (False, True): 1,
            (True, False): 2,
            (True, True): 0,
        }

        def code_at(column):
            return code_for[sequence_x[column] == '-',
                            sequence_y[column] == '-']

        stripped_x = Fasta.alnToSeq(sequence_x)
        stripped_y = Fasta.alnToSeq(sequence_y)

        # coords[c] = number of non-gap characters before column c, for x
        # and for y.
        coords = []
        seen_x, seen_y = 0, 0
        for column in xrange(total):
            coords.append((seen_x, seen_y))
            step_x = 0 if sequence_x[column] == '-' else 1
            step_y = 0 if sequence_y[column] == '-' else 1
            seen_x, seen_y = seen_x + step_x, seen_y + step_y

        # Split the column indices by code in one pass.
        match_cols, insert_x_cols, insert_y_cols = [], [], []
        buckets = {0: match_cols, 1: insert_x_cols, 2: insert_y_cols}
        for column in xrange(total):
            buckets[code_at(column)].append(column)

        matches = (0 for _ in match_cols)

        inserts_x = self.clf.multi_prepare_predict(
            (stripped_x, coords[column][0], ann_x, stripped_y,
             coords[column][1], ann_y)
            for column in insert_x_cols)

        inserts_y = self.clf.multi_prepare_predict(
            (stripped_y, coords[column][1], ann_y, stripped_x,
             coords[column][0], ann_x)
            for column in insert_y_cols)

        codes = (code_at(column) for column in xrange(total))
        return merge(codes, matches, inserts_x, inserts_y)
Esempio n. 5
0
def main(input_files, output_file):
    """Draw the alignments found in ``input_files`` into one image file.

    Every existing input file is loaded with ``Fasta.load`` and only its
    first alignment is used.  That alignment is drawn as a polyline through
    the ``(x, y)`` pairs returned by ``getCoordPairs()``, scaled by the
    module-level ``width``; a small square is outlined around each pair
    whose annotation character is ``'R'``.  Missing or empty files are
    skipped.

    Args:
        input_files: Paths of the alignment files to draw.
        output_file: Destination handed to ``Image.save``.

    Raises:
        ValueError: If no input file yields an alignment, so the canvas
            size cannot be determined.
    """
    global width  # only read here
    alignments = [
        list(
            Fasta.load(name, '', Alignment,
                       ['^sequence1', '^sequence2', '^[av].*']))
        if os.path.exists(name) else None for name in input_files
    ]
    # Size the canvas from the first alignment that actually loaded: the
    # first input may be missing or empty just like any other one.
    reference = next((loaded[0] for loaded in alignments if loaded), None)
    if reference is None:
        raise ValueError('no alignment could be loaded from the input files')
    x_len = len(Fasta.alnToSeq(reference.sequences[0]))
    y_len = len(Fasta.alnToSeq(reference.sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 0, 255),
              (0, 255, 255)]
    for i, loaded in enumerate(alignments):
        if not loaded:
            # File did not exist (None) or contained no alignment.
            continue
        aln = loaded[0]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print(coords)
            # Shift each alignment slightly so overlapping paths stay visible.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            # Cycle through the palette; indexing with ``i`` directly raised
            # IndexError past the last colour, which the handler below
            # swallowed, so that alignment was silently not drawn.
            color = colors[i % len(colors)]
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords],
                   fill=color)
            if annotation is not None:
                half = width / 4
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    center_x = x * width + x_shift
                    center_y = y * width + y_shift
                    D.rectangle([(center_x - half, center_y - half),
                                 (center_x + half, center_y + half)],
                                outline=color)
        except (IndexError, IOError):
            # Best effort: an alignment that lacks the expected data or
            # fails on I/O is skipped instead of aborting the whole image.
            pass
    del D
    I.save(output_file)
Esempio n. 6
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Build classifier training data from two aligned (gapped) sequences.

        Positive examples come from ``prepare_positive_data`` and negative
        ones from ``prepare_negative_data``; the first three components of
        both results are concatenated pairwise.

        Returns:
            A 3-tuple whose k-th item is the positive k-th component followed
            by the negative k-th component.
        """
        assert len(sequence_y) == len(sequence_x)

        sequence_xs = Fasta.alnToSeq(sequence_x)
        sequence_ys = Fasta.alnToSeq(sequence_y)

        # Both helpers start with the same six arguments.
        shared_args = (
            sequence_x,
            sequence_xs,
            annotations_x,
            sequence_y,
            sequence_ys,
            annotations_y,
        )

        positives, matched_pos, seq_size, weights_set = \
            self.prepare_positive_data(*shared_args)

        negative_args = shared_args + (matched_pos, seq_size, weights_set)
        negatives = self.prepare_negative_data(*negative_args)

        return tuple(positives[k] + negatives[k] for k in range(3))
Esempio n. 7
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Turn a gapped sequence pair into training data for the classifier.

        The positive part is produced by ``prepare_positive_data``; its
        extra return values (matched positions, sequence size, weights set)
        are forwarded to ``prepare_negative_data`` for the negative part.
        The result joins components 0, 1 and 2 of both parts.
        """
        assert len(sequence_y) == len(sequence_x)

        converted_x = Fasta.alnToSeq(sequence_x)
        converted_y = Fasta.alnToSeq(sequence_y)

        positive_result = self.prepare_positive_data(
            sequence_x, converted_x, annotations_x,
            sequence_y, converted_y, annotations_y,
        )
        positive_part, matched_pos, seq_size, weights_set = positive_result

        negative_part = self.prepare_negative_data(
            sequence_x, converted_x, annotations_x,
            sequence_y, converted_y, annotations_y,
            matched_pos, seq_size, weights_set,
        )

        # Positive entries first, negative entries after, per component.
        combined = []
        for component in (0, 1, 2):
            combined.append(positive_part[component] + negative_part[component])
        return tuple(combined)
Esempio n. 8
0
def main(input_files, output_file):
    """Render the first alignment of each input file into a single image.

    Input files that exist are loaded with ``Fasta.load``; files that do not
    exist or contain no alignment are skipped.  Each alignment is drawn as a
    line through the ``(x, y)`` pairs from ``getCoordPairs()``, scaled by the
    module-level ``width``, and every pair whose annotation character is
    ``'R'`` gets an outlined square.  The image is written with
    ``Image.save``.

    Args:
        input_files: Paths of the alignment files to draw.
        output_file: Path the resulting image is saved to.

    Raises:
        ValueError: If none of the inputs provides an alignment to size the
            canvas from.
    """
    global width  # only read here
    patterns = ['^sequence1', '^sequence2', '^[av].*']
    alignments = [
        list(Fasta.load(name, '', Alignment, patterns))
        if os.path.exists(name) else None
        for name in input_files
    ]

    # The canvas used to be sized from alignments[0][0], which crashed when
    # the first file was missing or empty although the loop below tolerates
    # both cases; use the first alignment that is really available instead.
    reference = None
    for loaded in alignments:
        if loaded:
            reference = loaded[0]
            break
    if reference is None:
        raise ValueError('no alignment could be loaded from the input files')

    x_len = len(Fasta.alnToSeq(reference.sequences[0]))
    y_len = len(Fasta.alnToSeq(reference.sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50), (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 0, 255), (0, 255, 255)]

    for i, loaded in enumerate(alignments):
        if not loaded:
            # Missing file (None) or file without any alignment.
            continue
        aln = loaded[0]
        # Reuse the palette cyclically; ``colors[i]`` raised IndexError for
        # the seventh and later inputs, and the handler below hid that, so
        # those alignments were silently left out of the image.
        color = colors[i % len(colors)]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print(coords)
            # Per-alignment shift keeps overlapping paths distinguishable.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=color)
            if annotation is not None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    left = x * width + x_shift - width / 4
                    top = y * width + y_shift - width / 4
                    right = x * width + x_shift + width / 4
                    bottom = y * width + y_shift + width / 4
                    D.rectangle([(left, top), (right, bottom)], outline=color)
        except IndexError:
            # Best effort: skip an alignment that lacks the expected data.
            pass
        except IOError:
            # Best effort: skip an alignment that fails on I/O.
            pass
    del D
    I.save(output_file)
Esempio n. 9
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Build insert / match training examples from an aligned pair.

        ``self.insert_sequence`` selects the reference sequence: ``0`` uses
        ``sequence_x``, any other value uses ``sequence_y``; the remaining
        sequence is the one scanned for gaps (``'-'``).  Every column where
        the reference has a residue and the other sequence has a gap yields
        an example labelled ``1``.  Columns where both have a residue are
        then sampled, as many as there are label-1 examples or all of them
        if fewer exist, and labelled ``0``.  Examples for which
        ``prepare_data`` returns ``None`` are left out.

        Returns:
            A tuple ``(examples, labels, weights)`` of three parallel lists;
            every weight is ``1.0``.
        """
        train_data = (list(), list(), list())

        if self.insert_sequence == 0:
            reference = sequence_x
            annotations_r = annotations_x
            space = sequence_y
            annotations_s = annotations_y
        else:
            reference = sequence_y
            annotations_r = annotations_y
            space = sequence_x
            annotations_s = annotations_x

        sequence_rs = Fasta.alnToSeq(reference)
        sequence_ss = Fasta.alnToSeq(space)

        # Offsets into the two converted sequences for the current column.
        pos_s, pos_r = 0, 0

        match_pos = set()
        for i, bs in enumerate(space):
            br = reference[i]
            if bs != '-':
                if br != '-':
                    # Residue in both sequences: candidate negative example.
                    match_pos.add((pos_r, pos_s))
                    pos_r += 1
                pos_s += 1
                continue
            if br == '-':
                # Gap in both sequences: nothing to learn from this column.
                continue

            # Residue in the reference opposite a gap: positive example.
            # The trailing 0 is forwarded as-is; its meaning is defined by
            # prepare_data.
            d = self.prepare_data(
                sequence_rs,
                pos_r,
                annotations_r,
                sequence_ss,
                pos_s,
                annotations_s,
                0,
            )
            if d is not None:
                train_data[0].append(d)
                train_data[1].append(1)
                train_data[2].append(1.0)
            pos_r += 1

        # Never request more matches than exist; sample() raises ValueError
        # when the sample is larger than the population.
        wanted = min(len(train_data[0]), len(match_pos))
        # NOTE(review): assuming `sample` is random.sample -- it rejects sets
        # on Python 3.11+, so pass a list.
        matches = sample(list(match_pos), wanted)
        for x, y in matches:
            d = self.prepare_data(
                sequence_rs,
                x,
                annotations_r,
                sequence_ss,
                y,
                annotations_s,
                0,
            )
            if d is None:
                # Same filter as for the positive examples above; a None
                # entry must not end up in the training data.
                continue
            train_data[0].append(d)
            train_data[1].append(0)
            train_data[2].append(1.0)

        return train_data
Esempio n. 10
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Collect labelled insert and match examples from an alignment.

        With ``self.insert_sequence == 0`` the reference is ``sequence_x``
        and the scanned sequence is ``sequence_y``; otherwise the roles are
        swapped.  A column with a reference residue opposite a gap (``'-'``)
        produces an example labelled ``1``.  Columns with a residue in both
        sequences are collected and a random subset of them, no larger than
        the number of label-1 examples, is labelled ``0``.  Whenever
        ``prepare_data`` returns ``None`` the example is dropped.

        Returns:
            ``(examples, labels, weights)`` as a tuple of three parallel
            lists; all weights are ``1.0``.
        """
        train_data = (list(), list(), list())

        if self.insert_sequence == 0:
            reference = sequence_x
            annotations_r = annotations_x
            space = sequence_y
            annotations_s = annotations_y
        else:
            reference = sequence_y
            annotations_r = annotations_y
            space = sequence_x
            annotations_s = annotations_x

        sequence_rs = Fasta.alnToSeq(reference)
        sequence_ss = Fasta.alnToSeq(space)

        # Running offsets into the two converted sequences.
        pos_s, pos_r = 0, 0

        match_pos = set()
        for i, bs in enumerate(space):
            br = reference[i]
            if bs != '-':
                if br != '-':
                    # Both sequences have a residue: remember as a match.
                    match_pos.add((pos_r, pos_s))
                    pos_r += 1
                pos_s += 1
                continue
            if br == '-':
                # Both sequences have a gap: skip the column.
                continue

            # Reference residue opposite a gap: positive example.  The
            # trailing 0 is forwarded as-is; prepare_data defines its meaning.
            d = self.prepare_data(
                sequence_rs,
                pos_r,
                annotations_r,
                sequence_ss,
                pos_s,
                annotations_s,
                0,
            )
            if d is not None:
                train_data[0].append(d)
                train_data[1].append(1)
                train_data[2].append(1.0)
            pos_r += 1

        # Clamp the sample size: asking sample() for more items than the
        # population holds raises ValueError.
        wanted = min(len(train_data[0]), len(match_pos))
        # NOTE(review): assuming `sample` is random.sample -- sets are no
        # longer accepted there on Python 3.11+, hence the list copy.
        matches = sample(list(match_pos), wanted)
        for x, y in matches:
            d = self.prepare_data(
                sequence_rs,
                x,
                annotations_r,
                sequence_ss,
                y,
                annotations_s,
                0,
            )
            if d is None:
                # Mirror the positive branch: never store a None example.
                continue
            train_data[0].append(d)
            train_data[1].append(0)
            train_data[2].append(1.0)

        return train_data