def __init__(self, *args):
     """Initialise the test case and load the shared alignment fixture.

     Loads ``data/test_data/sequences/alignment.fa`` through ``DataLoader``
     and stores the two sequences and their annotations on the instance,
     together with the result of ``Fasta.alnToSeq`` for each sequence.
     """
     unittest.TestCase.__init__(self, *args)
     fixture_path = "data/test_data/sequences/alignment.fa"
     record = DataLoader().loadSequence(fixture_path)
     # The first field of the loaded record is not used by the tests.
     _, self.seq_x, self.ann_x, self.seq_y, self.ann_y = record
     self.seq_xs, self.seq_ys = (
         Fasta.alnToSeq(self.seq_x),
         Fasta.alnToSeq(self.seq_y),
     )
 def __init__(self, *args):
     """Set up the test case with data from the alignment fixture file.

     The record read from ``data/test_data/sequences/alignment.fa`` supplies
     ``seq_x``/``ann_x`` and ``seq_y``/``ann_y``; ``seq_xs`` and ``seq_ys``
     hold what ``Fasta.alnToSeq`` returns for the two sequences.
     """
     unittest.TestCase.__init__(self, *args)
     loader = DataLoader()
     # Only the last four fields of the five-field record are kept.
     (
         _,
         self.seq_x,
         self.ann_x,
         self.seq_y,
         self.ann_y,
     ) = loader.loadSequence("data/test_data/sequences/alignment.fa")
     self.seq_xs = Fasta.alnToSeq(self.seq_x)
     self.seq_ys = Fasta.alnToSeq(self.seq_y)
def load_data(preparer):
    """Collect training samples and targets from the simulated training set.

    Every record loaded from 'data/sequences/model_train_seq/simulated' is
    passed to ``preparer.prepare_training_data``; the first two items of each
    result are concatenated across all records and the third is discarded.

    @param preparer: object providing
        ``prepare_training_data(s_x, a_x, s_y, a_y)`` that returns a 3-tuple
    @return: tuple ``(data, target)`` of the accumulated lists
    """
    data, target = [], []
    sequences = DataLoader().loadDirectory(
        'data/sequences/model_train_seq/simulated'
    )
    # Alternative input directory kept from the original code:
    # 'data/sequences/train_sequences'
    for _, s_x, a_x, s_y, a_y in sequences:
        # The third returned item is not needed here, so no list is kept for it.
        samples, labels, _ = preparer.prepare_training_data(s_x, a_x, s_y, a_y)
        data.extend(samples)
        target.extend(labels)
    return data, target
def load_data(preparer):
    """Build the ``(data, target)`` training lists from the simulated sequences.

    Loads every record in 'data/sequences/model_train_seq/simulated' and feeds
    it to ``preparer.prepare_training_data``. The first item of each result is
    appended to ``data`` and the second to ``target``; the third item is
    ignored.

    @param preparer: object providing
        ``prepare_training_data(s_x, a_x, s_y, a_y)`` that returns a 3-tuple
    @return: tuple ``(data, target)`` of the accumulated lists
    """
    loader = DataLoader()
    data = []
    target = []
    # Alternative input directory kept from the original code:
    # 'data/sequences/train_sequences'
    for _, s_x, a_x, s_y, a_y in loader.loadDirectory(
        'data/sequences/model_train_seq/simulated'
    ):
        # No accumulator is kept for the third item: it is never returned.
        prepared_data, prepared_target, _ = preparer.prepare_training_data(
            s_x, a_x, s_y, a_y
        )
        data.extend(prepared_data)
        target.extend(prepared_target)
    return data, target
Example #5
0
    def __init__(
        self,
        preparer,
        filename="data/clf/randomforest.clf",
        training_data_dir="data/sequences/train_sequences",
        params=None,
        autotrain=True,
        memoization=False,
        inverted=False,
        use_global_classifier=False,
    ):
        """
        Set up the classifier: reuse the shared one, load it, or train it.

        @param preparer: data preparer; its C{prepare_training_data} is used
            when training
        @param filename: path the classifier is loaded from / saved to
        @param training_data_dir: directory handed to
            C{DataLoader.loadDirectory} when training
        @param params: classifier parameters; when None,
            C{config.classifiers[config.classifier_index][2]} is used
        @param autotrain: when true, load from C{filename} if that path
            exists, otherwise train on C{training_data_dir} and save the
            result to C{filename}
        @param memoization: stored as C{self.memoization}
        @param inverted: stored as C{self.inverted}
        @param use_global_classifier: when true, reuse the module-level
            C{_global_classifier} if it is set, and publish this instance's
            classifier to it otherwise
        @rtype : PairClassifier
        """
        global _global_classifier
        self._preparer = None
        self.preparer = preparer
        self.default_filename = filename
        self.training_data_dir = training_data_dir
        if params is None:
            self.params = config.classifiers[config.classifier_index][2]
        else:
            self.params = params
        self.mem = dict()
        self.memoization = memoization
        self.inverted = inverted

        if use_global_classifier and _global_classifier is not None:
            # A shared classifier already exists: reuse it, skip load/train.
            self.classifier = _global_classifier
            return

        if autotrain and path.exists(self.default_filename):
            # NOTE(review): a path that exists but is not a regular file is
            # skipped and nothing assigns self.classifier in this branch --
            # confirm this is intended.
            if path.isfile(self.default_filename):
                self.load(self.default_filename)
        else:
            self.classifier = self._get_classifier()
            if autotrain:
                sys.stderr.write('Training clasifier\n')
                data, target, weights = [], [], []
                sequences = DataLoader().loadDirectory(self.training_data_dir)
                for _, s_x, a_x, s_y, a_y in sequences:
                    d, t, w = self.preparer.prepare_training_data(
                        s_x, a_x, s_y, a_y
                    )
                    data.extend(d)
                    target.extend(t)
                    weights.extend(w)
                self.fit(data, target, array(weights))
                self.save(self.default_filename)
        if use_global_classifier:
            _global_classifier = self.classifier
Example #6
0
    def __init__(
        self,
        preparer,
        filename="data/clf/randomforest.clf",
        training_data_dir="data/sequences/train_sequences",
        params=None,
        autotrain=True,
        memoization=False,
        inverted=False,
        use_global_classifier=False,
    ):
        """
        Create the classifier, taking it from the shared global, a file, or
        a fresh training run.

        @param preparer: data preparer; its C{prepare_training_data} is used
            when training
        @param filename: path the classifier is loaded from / saved to
        @param training_data_dir: directory handed to
            C{DataLoader.loadDirectory} when training
        @param params: classifier parameters; when None,
            C{config.classifiers[config.classifier_index][2]} is used
        @param autotrain: when true, load from C{filename} if that path
            exists, otherwise train on C{training_data_dir} and save the
            result to C{filename}
        @param memoization: stored as C{self.memoization}
        @param inverted: stored as C{self.inverted}
        @param use_global_classifier: when true, reuse the module-level
            C{_global_classifier} if it is set, and publish this instance's
            classifier to it otherwise
        @rtype : PairClassifier
        """
        global _global_classifier
        self._preparer = None
        self.preparer = preparer
        self.default_filename = filename
        self.training_data_dir = training_data_dir
        if params is None:
            self.params = config.classifiers[config.classifier_index][2]
        else:
            self.params = params
        self.mem = dict()
        self.memoization = memoization
        self.inverted = inverted

        if _global_classifier is None or not use_global_classifier:
            if autotrain and path.exists(self.default_filename):
                # NOTE(review): when the path exists but is not a regular
                # file, nothing assigns self.classifier in this branch --
                # confirm this is intended.
                if path.isfile(self.default_filename):
                    self.load(self.default_filename)
            else:
                self.classifier = self._get_classifier()
                if autotrain:
                    sys.stderr.write('Training clasifier\n')
                    loader = DataLoader()
                    data, target, weights = [], [], []
                    for _, s_x, a_x, s_y, a_y in loader.loadDirectory(
                        self.training_data_dir
                    ):
                        d, t, w = self.preparer.prepare_training_data(
                            s_x, a_x, s_y, a_y
                        )
                        data.extend(d)
                        target.extend(t)
                        weights.extend(w)
                    self.fit(data, target, array(weights))
                    self.save(self.default_filename)
            if use_global_classifier:
                # Publish the classifier so later instances can share it.
                _global_classifier = self.classifier
        else:
            self.classifier = _global_classifier
def compare_with_source(source_fname, realigned_fname, seq1, seq2):
    """Return the share of indices on which two alignments agree.

    Both files are read with ``DataLoader.getSequences`` for the names
    ``seq1 + '$'`` and ``seq2 + '$'``. Each result is turned into a function
    by ``create_alignemnt_function``; an index counts as agreeing when both
    functions return equal values for it. Indices are taken over the shorter
    of the two first sequences, and the count is divided by the length of
    the longer one.

    @return: float ratio of agreeing indices to the longer length
    """
    loader = DataLoader()
    original = loader.getSequences(source_fname, [seq1 + '$', seq2 + '$'])
    realigned = loader.getSequences(realigned_fname, [seq1 + '$', seq2 + '$'])

    original_at = create_alignemnt_function(original)
    realigned_at = create_alignemnt_function(realigned)

    longest = max(len(original[0]), len(realigned[0]))
    overlap = min(len(original[0]), len(realigned[0]))
    agreeing = sum(
        1 for pos in range(overlap) if original_at(pos) == realigned_at(pos)
    )

    return float(agreeing) / longest
def score(fname):
    """Load every pairwise alignment of the three sequences and compare them.

    @param fname: format template with two positional placeholders for the
        sequence names, e.g.
        'data/sequences/simulated_alignment.{}_{}.realigned.fa'
    @return: whatever ``compare`` returns for the loaded pairs, passed as
        positional arguments in the order (1, 2), (1, 3), (2, 3)
    """
    names = ['sequence1', 'sequence2', 'sequence3']
    loader = DataLoader()

    pairs = []
    for idx, first in enumerate(names[:-1]):
        for second in names[idx + 1:]:
            if first == second:
                continue
            pairs.append(
                loader.getSequences(
                    fname.format(first, second), [first + '$', second + '$']
                )
            )

    return compare(*pairs)
Example #9
0
def main(preparer_index):
    """Plot and save test graphs for both preparers of one config entry.

    ``config.preparers[preparer_index]`` is read as a triple: item 0 is
    called with ``constants.window_size``, item 1 with
    ``(0, constants.window_size)``, and item 2 is a name fragment used in the
    classifier file names under ``data/clf/``. Data prepared from the
    simulated alignment is handed to ``plot_clf`` for each preparer, and each
    figure is saved next to its classifier file with a ``"_test"`` plus
    ``pic_suffix`` ending before all figures are shown.

    @param preparer_index: index into ``config.preparers``
    """
    entry = config.preparers[preparer_index]
    window = constants.window_size

    dp = entry[0](window)
    clf_fname = 'data/clf/{}{}{}.clf'.format(
        PairClassifier.get_name(), entry[2], window
    )
    idp = entry[1](0, window)
    iclf_fname = 'data/clf/{}{}{}{}.clf'.format(
        PairClassifier.get_name(), entry[2], window, '_indel'
    )

    alignment_path = path.join(
        "data/", 'sequences/model_train_seq/simulated/simulated_alignment.fa'
    )
    _, s_x, a_x, s_y, a_y = DataLoader().loadSequence(alignment_path)
    px, py, _ = dp.prepare_training_data(s_x, a_x, s_y, a_y)
    ipx, ipy, _ = idp.prepare_training_data(s_x, a_x, s_y, a_y)

    # Same plot-then-save step for each classifier, in the original order.
    for preparer, xs, ys, fname in (
        (dp, px, py, clf_fname),
        (idp, ipx, ipy, iclf_fname),
    ):
        plot_clf(preparer, xs, ys, fname)
        plt.savefig(
            path.splitext(fname)[0] + "_test" + pic_suffix,
            transparent=True,
            bbox_inches='tight',
        )
    plt.show()
Example #10
0
def main(preparer_index):
    """Draw, save and show the test plots for one configured preparer pair.

    The entry ``config.preparers[preparer_index]`` supplies two factories
    (item 0, called with ``constants.window_size``; item 1, called with
    ``(0, constants.window_size)``) and a name fragment (item 2) that goes
    into the classifier file names. Both preparers process the simulated
    alignment; ``plot_clf`` is called for each, and the current figure is
    written beside the classifier file using the ``"_test"`` + ``pic_suffix``
    ending.

    @param preparer_index: index into ``config.preparers``
    """
    preparer_cfg = config.preparers[preparer_index]
    size = constants.window_size
    data_root = "data/"

    dp = preparer_cfg[0](size)
    clf_fname = 'data/clf/{}{}{}.clf'.format(
        PairClassifier.get_name(),
        preparer_cfg[2],
        size,
    )
    idp = preparer_cfg[1](0, size)
    iclf_fname = 'data/clf/{}{}{}{}.clf'.format(
        PairClassifier.get_name(),
        preparer_cfg[2],
        size,
        '_indel',
    )

    loader = DataLoader()
    record = loader.loadSequence(
        path.join(data_root, 'sequences/model_train_seq/simulated/simulated_alignment.fa')
    )
    _, s_x, a_x, s_y, a_y = record
    px, py, _ = dp.prepare_training_data(s_x, a_x, s_y, a_y)
    ipx, ipy, _ = idp.prepare_training_data(s_x, a_x, s_y, a_y)

    save_options = dict(transparent=True, bbox_inches='tight')

    plot_clf(dp, px, py, clf_fname)
    plt.savefig(path.splitext(clf_fname)[0] + "_test" + pic_suffix, **save_options)

    plot_clf(idp, ipx, ipy, iclf_fname)
    plt.savefig(path.splitext(iclf_fname)[0] + "_test" + pic_suffix, **save_options)

    plt.show()
Example #11
0
 def train(self, dirname):
     """Train on everything ``DataLoader.loadDirectory`` returns for ``dirname``.

     The loaded sequences are passed to ``self.train_multi`` and its result
     is returned unchanged.
     """
     return self.train_multi(DataLoader().loadDirectory(dirname))