def dfs(self, X, g=2, l=1, kernel=None, show=1, name=None):
    """Depth-first traversal of the gappy trie, accumulating the Gram matrix.

    Args:
        X: dataset of sequences; only len(X) is used here directly, the
            actual matching happens in process_node/update_kernel.
        g: gap budget forwarded unchanged to process_node and to children.
            (Exact semantics live in process_node — TODO confirm.)
        l: remaining letters to place; counts down to 0, at which point the
            node represents a complete k-mer.
        kernel: (len(X), len(X)) accumulator; created on the root call.
        show: progress-display depth — only levels with show > 0 get a
            vrange progress description.
        name: optional label appended to the progress description.

    Returns:
        (kernel, n_kmers, alive): updated Gram matrix, number of surviving
        k-mers found in this subtree, and whether this node matched
        anything (per process_node).
    """
    length = len(self.vocab)
    if kernel is None:
        # Root call: start from an all-zero Gram matrix.
        kernel = np.zeros((len(X), len(X)))
    n_kmers = 0
    # Dead nodes prune their whole subtree.
    alive = self.process_node(X, g, l)
    if alive:
        if l == 0:
            # Complete k-mer reached: contribute it to the kernel.
            n_kmers += 1
            self.update_kernel(kernel)
        else:
            if show > 0:
                if name is not None:
                    desc = "Trie DFS for " + name
                else:
                    desc = "Trie DFS"
                erange = self.vrange(length, desc=desc)
            else:
                erange = range(length)
            # Expand one child per vocabulary letter.
            for j in erange:
                Logger.indent()
                child = GappyTrie(label=self.vocab[j], parent=self)
                kernel, child_kmers, child_alive = child.dfs(
                    X, g, l - 1, kernel=kernel, show=show - 1)
                if child.is_empty():
                    # Drop children that matched nothing to keep the trie small.
                    self.delete_child(child)
                n_kmers += child_kmers if child_alive else 0
                Logger.dindent()
    return kernel, n_kmers, alive
def __call__(self, parameters=None, verbose=True):
    """Generate synthetic regression train/test datasets.

    Returns a one-element list holding a dict that maps "train" and
    "test" to Dataset objects built from gen_reg_data(p.n, p.m).
    """
    Logger.log(verbose, "Loading datasets...")
    Logger.indent()
    p = Parameters(parameters, self.defaultParameters)
    splits = {}
    for split in ("train", "test"):
        # NOTE(review): the generated labels are discarded — only the data
        # array reaches Dataset. Confirm this is intentional.
        data, _labels = gen_reg_data(p.n, p.m)
        Logger.log(verbose, "synthetic " + split + " data generated")
        splits[split] = Dataset(p, data, verbose=verbose)
    Logger.dindent()
    Logger.log(verbose, "datasets loaded!\n")
    return [splits]
def _dfs(self, X, k=2, m=1, kernel=None, show=1, name=None, wildcard=False, mismatch=False):
    """Shared depth-first traversal for wildcard and mismatch tries.

    Args:
        X: dataset of sequences; len(X) sizes the Gram matrix, matching
            itself happens in process_node/update_kernel.
        k: remaining k-mer length; counts down to 0 at a complete k-mer.
        m: mismatch/wildcard budget forwarded unchanged to process_node
            and children. (Exact semantics live in process_node — TODO
            confirm.)
        kernel: (len(X), len(X)) accumulator; created on the root call.
        show: progress-display depth; only levels with show > 0 get a
            vrange progress description.
        name: optional label appended to the progress description.
        wildcard: build WildcardTrie children (takes precedence if both
            flags are set).
        mismatch: build MismatchTrie children.

    Returns:
        (kernel, n_kmers, alive) as in the gappy-trie dfs.
    """
    # NOTE(review): assert is stripped under `python -O`; a raised
    # ValueError would be a sturdier guard — confirm callers.
    assert (wildcard or mismatch)
    length = len(self.vocab)
    if kernel is None:
        # Root call: start from an all-zero Gram matrix.
        kernel = np.zeros((len(X), len(X)))
    n_kmers = 0
    # Dead nodes prune their whole subtree.
    alive = self.process_node(X, k, m)
    if alive:
        if k == 0:
            # Complete k-mer reached: contribute it to the kernel.
            n_kmers += 1
            self.update_kernel(kernel)
        else:
            if show > 0:
                if name is not None:
                    desc = "Trie DFS for " + name
                else:
                    desc = "Trie DFS"
                erange = self.vrange(length, desc=desc)
            else:
                erange = range(length)
            for j in erange:
                Logger.indent()
                # Child type follows the flags; wildcard wins when both set.
                if wildcard:
                    child = WildcardTrie(la=self.la, label=self.vocab[j], parent=self)
                elif mismatch:
                    child = MismatchTrie(label=self.vocab[j], parent=self)
                kernel, child_kmers, child_alive = child.dfs(
                    X, k - 1, m, kernel=kernel, show=show - 1)
                if child.is_empty():
                    # Drop children that matched nothing to keep the trie small.
                    self.delete_child(child)
                n_kmers += child_kmers if child_alive else 0
                Logger.dindent()
    return kernel, n_kmers, alive
def __call__(self, parameters=None, verbose=True):
    """Load the real train/test sequence datasets from disk.

    Returns a one-element list holding a dict that maps "train" and
    "test" to Dataset objects built from load_data.
    """
    Logger.log(verbose, "Loading datasets...")
    Logger.indent()
    p = Parameters(parameters, self.defaultParameters)
    splits = {}
    for split in ("train", "test"):
        data, files = load_data(split, k=p.k, mat=p.mat, small=p.small,
                                nsmall=p.nsmall, givename=True)
        files = "(" + " and ".join(files) + ")"
        Logger.log(verbose, split + " data loaded! " + files)
        splits[split] = Dataset(p, *data, verbose=verbose)
    Logger.dindent()
    Logger.log(verbose, "datasets loaded!\n")
    return [splits]
def __call__(self, parameters=None, verbose=True):
    """Load the three sequence datasets (k = 0, 1, 2) as a flat list."""
    p = Parameters(parameters, self.defaultParameters)
    Logger.log(verbose, "Loading datasets...")
    Logger.indent()
    all_sets = []
    for k in range(3):
        # NOTE(review): the same Parameters object is mutated and reused
        # for every SeqData call — fine if SeqData copies what it needs;
        # confirm it does not keep a live reference.
        p.k = k
        all_sets.append(SeqData(p, verbose=verbose)[0])
    Logger.dindent()
    Logger.log(verbose, "datasets loaded!\n")
    return all_sets
def sanity_check(self):
    """Log a quick summary of raw predictions on 20 random samples.

    Predicts non-binary scores for a random subset of the training data
    and logs their min, max and five sample values in scientific
    notation, so gross training failures are visible at a glance.
    """
    mask = np.arange(self.n)
    np.random.shuffle(mask)
    # Slice the index array *before* fancy-indexing: only the 20 selected
    # rows get copied, instead of copying all n rows and keeping 20.
    # Result is identical to self.data[mask][:20].
    preds = self.predict_array(self.data[mask[:20]], binaire=False,
                               desc="Sanity check")

    def form(number):
        # Scientific notation keeps wildly-scaled outputs readable.
        return "{0:.2e}".format(number)

    strings = [form(pred) for pred in preds[:5]]
    self._log("Sanity check:")
    Logger.indent()
    self._log("Min: " + form(min(preds)))
    self._log("Max: " + form(max(preds)))
    self._log("Random values:", strings)
    Logger.dindent()
    self._log("")
def score_recall_precision(self, dataset, nsmall=None):
    """Evaluate the fitted model on (a random subset of) a dataset.

    Args:
        dataset: object exposing .n, .data and .labels.
        nsmall: if given, score only this many randomly chosen samples.

    Returns:
        Score object comparing binary predictions with the true labels.
    """
    mask = np.arange(dataset.n)
    np.random.shuffle(mask)
    if nsmall is None:
        stringset = "training set"
    else:
        mask = mask[:nsmall]
        stringset = "training set ({} samples)".format(nsmall)
    t = Timer()
    t.start()
    predictions = self.predict_array(dataset.data[mask], binaire=True,
                                     desc="Computing train set score")
    score = Score(predictions, dataset.labels[mask])
    t.stop()
    self._log("Results of the {} (computed in {})".format(stringset, t))
    Logger.indent()
    self._log(score)
    Logger.dindent()
    self._log("")
    return score
def f(self, dataset=None, labels=None, K=None, w=None):
    """Timed wrapper around the enclosing scope's fitfunc.

    Uses the closure variables `name`, `pca`, `wkrr` and `fitfunc`.
    If K is not supplied, the dataset is loaded and the Gram matrix is
    taken from the kernel (centered variant when pca is set).
    """
    timer = Timer()
    self._log("Fitting {}..".format(name))
    Logger.indent()
    timer.start()
    if K is None:
        self.load_dataset(dataset, labels)
        # Centered Gram matrix for PCA-based methods, raw otherwise.
        K = self.kernel.KC if pca else self.kernel.K
    # Weighted variants additionally receive the sample weights w.
    result = fitfunc(self, K, w) if wkrr else fitfunc(self, K)
    timer.stop()
    Logger.dindent()
    self._log("Fitting done! (computed in {})\n".format(timer))
    return result
def fit(self, onekernel=True):
    """Grid-search kernel and classifier parameters with k-fold CV.

    When onekernel is True, the kernel (one Gram matrix per fold, via
    CV.constant_kernel) is computed once from the FIRST kernel-parameter
    combination and reused for every classifier setting; only the
    classifier parameters actually vary in that mode. Otherwise a fresh
    kernel and a fresh CrossValidation are built per combination.

    Side effects: fills self.parameters_to_test and self.scores
    (combination index -> CV stats).
    """
    self.scores = {}
    self.parameters_to_test = self.get_params_to_test(
        self.kernel_parameters, self.clf_parameters, self.parameter_grid)
    if onekernel:
        CV = CrossValidation(self.dataset, kfolds=self.kfold, verbose=True)
        # Only the first kernel-parameter set is honoured in this mode.
        kernel_param = self.parameters_to_test[0][0]
        print('Testing kernel parameters :', kernel_param)
        kernels = CV.constant_kernel(self.kernel, kernel_param)
    for j, l in enumerate(self.parameters_to_test):
        kernel_params = l[0]
        clf_params = l[1]
        if not onekernel:
            kernel_to_try = self.kernel(self.dataset, parameters=kernel_params)
            print('Testing kernel parameters :', kernel_params)
        else:
            # Reuse the precomputed kernel of the first fold.
            kernel_to_try = kernels[0]
        temp_clf = self.clf(kernel_to_try, parameters=clf_params, verbose=False)
        print('Testing clf classifiers : ', clf_params)
        Logger.indent()
        if not onekernel:
            CV = CrossValidation(self.dataset, temp_clf, kfolds=self.kfold, verbose=True)
        else:
            # Fit against the per-fold Gram matrices computed up front.
            CV.fit_K(temp_clf, kernels)
        Logger.log(True, CV)
        Logger.log(True, "")
        Logger.dindent()
        temp_report = CV.stats
        self.scores[j] = temp_report
def EasyTest(kernels, data="seq", methods=None, dparams=None, kparams=None, pcadim=3, show=False, dopredictions=False, verbose=True, mparams=None):
    """Run one experiment (kernel + method) per loaded dataset.

    For each dataset: build the kernel and method, fit, sanity-check,
    and score 200 random training samples. Optionally project the train
    set with KPCA (show) and/or predict the test set (dopredictions).

    Returns:
        scores, or (scores, predictions, Ids) when dopredictions is set.
    """
    Datasets = findData(data)(dparams, verbose)
    ndatas = len(Datasets)
    # Broadcast single values to one entry per dataset.
    Kernels = find_more_or_one(kernels, findKernel, ndatas)
    KMethods = find_more_or_one(methods, findMethod, ndatas)
    Kparams = find_more_or_one(kparams, lambda x: x, ndatas)
    Mparams = find_more_or_one(mparams, lambda x: x, ndatas)
    predictions = []
    Ids = []
    scores = []
    Logger.indent()
    for Dataset, Kernel, KMethod, Kparam, Mparam in zip(
            Datasets, Kernels, KMethods, Kparams, Mparams):
        # NOTE(review): this dindent at the top of the loop cancels the
        # indent just above on the first pass — the indent/dindent pairing
        # looks unbalanced; verify against Logger's semantics.
        Logger.dindent()
        Logger.log(verbose, "Experiment on:")
        Logger.indent()
        train = Dataset["train"]
        # train._show_gen_class_data()
        kernel = Kernel(train, parameters=Kparam)
        method = KMethod(kernel, parameters=Mparam)
        Logger.log(verbose, kernel)
        Logger.log(verbose, method)
        Logger.log(verbose, train)
        Logger.log(verbose, "")
        method.fit()
        # Logger.log(verbose, method.alpha)
        # Check the value to see if it is alright
        method.sanity_check()
        # Compute the score of the train set:
        score = method.score_recall_precision(train, nsmall=200)
        scores.append(score)
        if show:
            Logger.log(verbose, "Show the trainset in the feature space..")
            Logger.indent()
            kpca = KPCA(kernel, parameters={"dim": pcadim})
            proj = kpca.project()
            predict = method.predict_array(train.data, desc="Projections")
            Logger.dindent()
            kernel.dataset.show_pca(proj, predict, dim=pcadim)
        if dopredictions:
            # Predictict on the test set and save the result
            test = Dataset["test"]
            test.labels = method.predict_array(test.data)
            test.transform_label()
            predictions.append(test.labels)
            Ids.append(test.Id)
    Logger.dindent()
    Logger.log(verbose, "Score remainder:")
    Logger.indent()
    [Logger.log(verbose, s) for s in scores]
    Logger.dindent()
    if dopredictions:
        return scores, predictions, Ids
    else:
        return scores
def KernelTest(kernelname, parameters, synth=False):
    """Smoke-test a kernel: build it for each parameter set and print K.

    Args:
        kernelname: registry name resolved through findKernel.
        parameters: iterable of parameter dicts to try, one kernel each.
        synth: if True, use a tiny hand-made 2-sequence dataset instead
            of the real "allseq" training split.
    """
    # NOTE(review): `Dataset` is rebound by the local import below when
    # synth is True (container vs class). The two branches never mix,
    # but the shadowing is fragile — consider distinct names.
    Dataset = findData("allseq")()[0]
    if synth:
        import numpy as np
        from src.data.dataset import Dataset
        defaultParameters = {
            "k": 0,
            "mat": False,
            "shuffle": False,
            "small": True,
            "nsmall": 200,
            "labels_change": True,
            "name": "seq",
            "nclasses": 2
        }
        from src.tools.utils import Parameters
        p = Parameters(None, defaultParameters)
        # Two toy sequences with opposite labels.
        train = Dataset(p, np.array(['ATTA', 'AAAA']), np.array([0, 1]))
    else:
        train = Dataset["train"]
    Kernel = findKernel(kernelname)
    Logger.log(True, "Test the " + kernelname + " kernel.")
    Logger.indent()
    kernels = []
    for params in parameters:
        Logger.log(True, "Test with these parameters: " + str(params))
        Logger.indent()
        kernel = Kernel(train, params)
        kernels.append(kernel)
        # Print the Gram matrix so the tester can eyeball it.
        Logger.log(True, kernel.K)
        Logger.dindent()
        # ipdb.set_trace()
    Logger.dindent()