def dfs(self, X, g=2, l=1, kernel=None, show=1, name=None):
        """Depth-first expansion of the gappy trie over the vocabulary.

        Recursively grows one child per vocabulary symbol, prunes dead or
        empty branches, and folds each surviving full kmer into the Gram
        matrix via ``update_kernel``.

        Returns a tuple ``(kernel, n_kmers, alive)``.
        """
        vocab_size = len(self.vocab)
        if kernel is None:
            kernel = np.zeros((len(X), len(X)))
        kmer_count = 0
        alive = self.process_node(X, g, l)
        if not alive:
            return kernel, kmer_count, alive

        if l == 0:
            # Leaf node: this path spells out one complete kmer.
            kmer_count += 1
            self.update_kernel(kernel)
            return kernel, kmer_count, alive

        if show > 0:
            desc = "Trie DFS" if name is None else "Trie DFS for " + name
            iterator = self.vrange(vocab_size, desc=desc)
        else:
            iterator = range(vocab_size)

        for idx in iterator:
            Logger.indent()
            node = GappyTrie(label=self.vocab[idx], parent=self)
            kernel, sub_kmers, sub_alive = node.dfs(X,
                                                    g,
                                                    l - 1,
                                                    kernel=kernel,
                                                    show=show - 1)
            if node.is_empty():
                self.delete_child(node)
            if sub_alive:
                kmer_count += sub_kmers
            Logger.dindent()

        return kernel, kmer_count, alive
    def __call__(self, parameters=None, verbose=True):
        """Generate synthetic regression train/test splits.

        Returns a one-element list holding a dict with ``"train"`` and
        ``"test"`` Dataset objects.
        """
        Logger.log(verbose, "Loading datasets...")
        Logger.indent()
        params = Parameters(parameters, self.defaultParameters)
        dataset = {}
        for split in ("train", "test"):
            # NOTE(review): generated labels are discarded; Dataset only
            # receives the data — confirm this is intentional.
            data, labels = gen_reg_data(params.n, params.m)
            Logger.log(verbose, "synthetic " + split + " data generated")
            dataset[split] = Dataset(params, data, verbose=verbose)

        Logger.dindent()
        Logger.log(verbose, "datasets loaded!\n")
        return [dataset]
    def _dfs(self,
             X,
             k=2,
             m=1,
             kernel=None,
             show=1,
             name=None,
             wildcard=False,
             mismatch=False):
        """Depth-first construction shared by wildcard and mismatch tries.

        Exactly one trie flavour must be requested (``wildcard`` takes
        precedence when both flags are set). Returns a tuple
        ``(kernel, n_kmers, alive)``.
        """
        assert (wildcard or mismatch)
        vocab_size = len(self.vocab)
        if kernel is None:
            kernel = np.zeros((len(X), len(X)))
        kmer_count = 0
        alive = self.process_node(X, k, m)
        if not alive:
            return kernel, kmer_count, alive

        if k == 0:
            # Leaf node: this path spells out one complete kmer.
            kmer_count += 1
            self.update_kernel(kernel)
            return kernel, kmer_count, alive

        if show > 0:
            desc = "Trie DFS" if name is None else "Trie DFS for " + name
            iterator = self.vrange(vocab_size, desc=desc)
        else:
            iterator = range(vocab_size)

        for idx in iterator:
            Logger.indent()
            # The assert above guarantees mismatch is True whenever
            # wildcard is False, so a plain else is equivalent.
            if wildcard:
                node = WildcardTrie(la=self.la,
                                    label=self.vocab[idx],
                                    parent=self)
            else:
                node = MismatchTrie(label=self.vocab[idx], parent=self)
            kernel, sub_kmers, sub_alive = node.dfs(X,
                                                    k - 1,
                                                    m,
                                                    kernel=kernel,
                                                    show=show - 1)
            if node.is_empty():
                self.delete_child(node)
            if sub_alive:
                kmer_count += sub_kmers
            Logger.dindent()

        return kernel, kmer_count, alive
# ---- Beispiel #4 (0) — scraped example separator ----
    def __call__(self, parameters=None, verbose=True):
        """Load the train/test sequence datasets from disk.

        Returns a one-element list holding a dict with ``"train"`` and
        ``"test"`` Dataset objects.
        """
        Logger.log(verbose, "Loading datasets...")
        Logger.indent()
        params = Parameters(parameters, self.defaultParameters)
        dataset = {}
        for split in ("train", "test"):
            data, names = load_data(split,
                                    k=params.k,
                                    mat=params.mat,
                                    small=params.small,
                                    nsmall=params.nsmall,
                                    givename=True)
            joined = "(" + " and ".join(names) + ")"
            Logger.log(verbose, split + " data loaded! " + joined)

            dataset[split] = Dataset(params, *data, verbose=verbose)

        Logger.dindent()
        Logger.log(verbose, "datasets loaded!\n")
        return [dataset]
# ---- Beispiel #5 (0) — scraped example separator ----
    def __call__(self, parameters=None, verbose=True):
        """Load the sequence datasets for k = 0, 1, 2.

        Returns a list of three loaded datasets, one per k value.
        """
        params = Parameters(parameters, self.defaultParameters)
        Logger.log(verbose, "Loading datasets...")
        Logger.indent()
        datasets = []
        for k in range(3):
            params.k = k
            datasets.append(SeqData(params, verbose=verbose)[0])

        Logger.dindent()
        Logger.log(verbose, "datasets loaded!\n")
        return datasets
# ---- Beispiel #6 (0) — scraped example separator ----
    def sanity_check(self):
        """Predict on 20 shuffled training samples and log summary stats.

        Logs min, max and five sample predictions in scientific notation,
        giving a quick eyeball check that the fitted model is not degenerate.
        """
        order = np.arange(self.n)
        np.random.shuffle(order)

        preds = self.predict_array(self.data[order][:20],
                                   binaire=False,
                                   desc="Sanity check")

        # Bound method replaces the original inner helper function.
        fmt = "{0:.2e}".format

        sample_strings = [fmt(p) for p in preds[:5]]

        self._log("Sanity check:")
        Logger.indent()
        self._log("Min: " + fmt(min(preds)))
        self._log("Max: " + fmt(max(preds)))
        self._log("Random values:", sample_strings)
        Logger.dindent()

        self._log("")
# ---- Beispiel #7 (0) — scraped example separator ----
    def score_recall_precision(self, dataset, nsmall=None):
        """Score the classifier on a shuffled (optionally subsampled)
        training set and log the timed result.

        When ``nsmall`` is given, only that many samples are scored.
        Returns the Score object.
        """
        order = np.arange(dataset.n)
        np.random.shuffle(order)
        if nsmall is None:
            set_desc = "training set"
        else:
            order = order[:nsmall]
            set_desc = "training set ({} samples)".format(nsmall)

        timer = Timer()
        timer.start()
        predictions = self.predict_array(dataset.data[order],
                                         binaire=True,
                                         desc="Computing train set score")
        score = Score(predictions, dataset.labels[order])
        timer.stop()
        self._log("Results of the {} (computed in {})".format(set_desc,
                                                              timer))
        Logger.indent()
        self._log(score)
        Logger.dindent()
        self._log("")
        return score
# ---- Beispiel #8 (0) — scraped example separator ----
        def f(self, dataset=None, labels=None, K=None, w=None):
            """Wrapped fit: time the call, resolve the Gram matrix, log.

            Closure variables ``name``, ``pca``, ``wkrr`` and ``fitfunc``
            come from the enclosing decorator factory (not visible here) —
            presumably the method name, PCA flag, weighted-KRR flag and the
            raw fit function; confirm against the factory.
            """
            timer = Timer()
            self._log("Fitting {}..".format(name))
            Logger.indent()
            timer.start()

            if K is None:
                # Load the data first so the kernel matrix exists.
                self.load_dataset(dataset, labels)
                K = self.kernel.KC if pca else self.kernel.K

            # Weighted variants take the sample weights as an extra argument.
            result = fitfunc(self, K, w) if wkrr else fitfunc(self, K)

            timer.stop()
            Logger.dindent()
            self._log("Fitting done! (computed in {})\n".format(timer))

            return result
# ---- Beispiel #9 (0) — scraped example separator ----
    def fit(self, onekernel=True):
        """Grid-search kernel/classifier parameters with cross-validation.

        With ``onekernel=True`` the kernel is computed once (from the first
        kernel-parameter set) and reused across every classifier setting;
        otherwise a fresh kernel and CV are built per parameter pair. CV
        stats are collected into ``self.scores`` keyed by trial index.
        """
        self.scores = {}
        self.parameters_to_test = self.get_params_to_test(
            self.kernel_parameters, self.clf_parameters, self.parameter_grid)
        if onekernel:
            CV = CrossValidation(self.dataset, kfolds=self.kfold,
                                 verbose=True)
            kernel_param = self.parameters_to_test[0][0]
            print('Testing kernel parameters :', kernel_param)
            kernels = CV.constant_kernel(self.kernel, kernel_param)

        for idx, trial in enumerate(self.parameters_to_test):
            kernel_params, clf_params = trial[0], trial[1]
            if onekernel:
                kernel_to_try = kernels[0]
            else:
                kernel_to_try = self.kernel(self.dataset,
                                            parameters=kernel_params)
                print('Testing kernel parameters :', kernel_params)
            temp_clf = self.clf(kernel_to_try,
                                parameters=clf_params,
                                verbose=False)
            print('Testing clf classifiers : ', clf_params)
            Logger.indent()
            if onekernel:
                CV.fit_K(temp_clf, kernels)
            else:
                CV = CrossValidation(self.dataset,
                                     temp_clf,
                                     kfolds=self.kfold,
                                     verbose=True)
            Logger.log(True, CV)
            Logger.log(True, "")
            Logger.dindent()
            self.scores[idx] = CV.stats
def EasyTest(kernels,
             data="seq",
             methods=None,
             dparams=None,
             kparams=None,
             mparams=None,
             pcadim=3,
             show=False,
             dopredictions=False,
             verbose=True):
    """Run one fit-and-score experiment per loaded dataset.

    ``kernels``/``methods`` and the ``*params`` arguments are broadcast to
    one entry per dataset via ``find_more_or_one``. For each dataset the
    kernel and method are built, fitted, sanity-checked and scored on a
    200-sample training subset. Optionally projects the train set with
    KPCA (``show``) and predicts the test set (``dopredictions``).

    Returns the list of train scores, or ``(scores, predictions, Ids)``
    when ``dopredictions`` is True.
    """
    Datasets = findData(data)(dparams, verbose)
    ndatas = len(Datasets)

    Kernels = find_more_or_one(kernels, findKernel, ndatas)
    KMethods = find_more_or_one(methods, findMethod, ndatas)
    Kparams = find_more_or_one(kparams, lambda x: x, ndatas)
    Mparams = find_more_or_one(mparams, lambda x: x, ndatas)

    predictions = []
    Ids = []
    scores = []

    Logger.indent()
    for Dataset, Kernel, KMethod, Kparam, Mparam in zip(
            Datasets, Kernels, KMethods, Kparams, Mparams):
        # Reset log indentation before each experiment banner.
        Logger.dindent()
        Logger.log(verbose, "Experiment on:")
        Logger.indent()

        train = Dataset["train"]
        kernel = Kernel(train, parameters=Kparam)
        method = KMethod(kernel, parameters=Mparam)

        Logger.log(verbose, kernel)
        Logger.log(verbose, method)
        Logger.log(verbose, train)
        Logger.log(verbose, "")

        method.fit()

        # Check the predicted values to see if the fit is sane.
        method.sanity_check()
        # Compute the score of the train set on a small subsample.
        score = method.score_recall_precision(train, nsmall=200)
        scores.append(score)
        if show:
            Logger.log(verbose, "Show the trainset in the feature space..")
            Logger.indent()

            kpca = KPCA(kernel, parameters={"dim": pcadim})
            proj = kpca.project()
            predict = method.predict_array(train.data, desc="Projections")

            Logger.dindent()
            kernel.dataset.show_pca(proj, predict, dim=pcadim)

        if dopredictions:
            # Predict on the test set and save the result.
            test = Dataset["test"]
            test.labels = method.predict_array(test.data)
            test.transform_label()
            predictions.append(test.labels)

            Ids.append(test.Id)

    Logger.dindent()

    Logger.log(verbose, "Score remainder:")
    Logger.indent()
    # Plain loop, not a comprehension: logging is a side effect and the
    # list the comprehension built was discarded.
    for s in scores:
        Logger.log(verbose, s)
    Logger.dindent()

    if dopredictions:
        return scores, predictions, Ids
    return scores
def KernelTest(kernelname, parameters, synth=False):
    """Instantiate a kernel on synthetic or real sequence data and log K.

    For each parameter set in ``parameters``, builds the kernel on the
    training data and logs its Gram matrix for inspection. Fixes two
    defects of the original: the expensive ``findData("allseq")`` load ran
    even when ``synth=True`` (its result was then discarded), and the
    loaded ``Dataset`` name was shadowed by the class import of the same
    name inside the ``synth`` branch.
    """
    if synth:
        import numpy as np
        # Alias the class so it cannot be confused with a loaded dataset.
        from src.data.dataset import Dataset as DatasetClass
        from src.tools.utils import Parameters

        defaultParameters = {
            "k": 0,
            "mat": False,
            "shuffle": False,
            "small": True,
            "nsmall": 200,
            "labels_change": True,
            "name": "seq",
            "nclasses": 2
        }

        p = Parameters(None, defaultParameters)

        # Two tiny hand-written sequences keep the synthetic check fast.
        train = DatasetClass(p, np.array(['ATTA', 'AAAA']),
                             np.array([0, 1]))
    else:
        # Only load the real dataset when it is actually used.
        Dataset = findData("allseq")()[0]
        train = Dataset["train"]

    Kernel = findKernel(kernelname)

    Logger.log(True, "Test the " + kernelname + " kernel.")
    Logger.indent()
    kernels = []
    for params in parameters:
        Logger.log(True, "Test with these parameters: " + str(params))
        Logger.indent()
        kernel = Kernel(train, params)
        kernels.append(kernel)
        Logger.log(True, kernel.K)
        Logger.dindent()

    Logger.dindent()