def __init__(self, name, datasets, clog=None, fLOG=noLOG, path_to_images=".",
             cache_file=None, progressbar=None, graphx=None, graphy=None,
             **params):
    """
    Initializes the benchmark.

    @param name name of the test
    @param datasets list of dictionary of dataframes
    @param clog see @see cl CustomLog or string
    @param fLOG logging function
    @param path_to_images path to images and intermediate results
    @param cache_file cache file
    @param progressbar relies on *tqdm*, example *tnrange*
    @param graphx list of variables to use as X axis
    @param graphy list of variables to use as Y axis
    @param params extra parameters

    If *cache_file* is specified, the class will store the results of the
    method :meth:`bench
    <pyquickhelper.benchhelper.benchmark.GridBenchMark.bench>`.
    On a second run, the function load the cache and run modified or
    new run (in *param_list*).

    *datasets* should be a dictionary with dataframes a values
    with the following keys:

    * ``'X'``: features
    * ``'Y'``: labels (optional)
    """
    # Axes to plot are kept locally; everything else is handled by the parent.
    self._xaxis = graphx
    self._yaxis = graphy
    GridBenchMark.__init__(self, name=name, datasets=datasets,
                           clog=clog, fLOG=fLOG, cache_file=cache_file,
                           path_to_images=path_to_images,
                           progressbar=progressbar, **params)
def preprocess_dataset(self, dsi, **params):
    """
    Splits the dataset into train and test.

    @param dsi dataset index
    @param params additional parameters
    @return dataset (like info), dictionary for metrics
    """
    ds, appe, params = GridBenchMark.preprocess_dataset(
        self, dsi, **params)

    # The dataset may opt out of the split explicitly via the
    # 'no_split' key; in that case the same dict serves as both sides.
    if ds.get("no_split", False):
        self.fLOG("[MlGridBenchMark.preprocess_dataset] no split")
        return (ds, ds), appe, params

    self.fLOG("[MlGridBenchMark.preprocess_dataset] split train test")
    # Split only the matrices actually present, in a fixed order so
    # that the train/test halves can be matched back to their names.
    spl = ["X", "Y", "weight", "group"]
    names = [_ for _ in spl if _ in ds]
    if not names:
        # ValueError is more precise than a bare Exception and stays
        # backward-compatible for callers catching Exception.
        raise ValueError("No dataframe or matrix was found.")
    mats = [ds[n] for n in names]

    # Pull the split options out of params so they are not propagated
    # further down the benchmark pipeline.
    pars = {"train_size", "test_size"}
    options = {k: v for k, v in params.items() if k in pars}
    for k in pars:
        params.pop(k, None)

    res = train_test_split(*mats, **options)
    # train_test_split interleaves its outputs:
    # train_0, test_0, train_1, test_1, ...
    train = {n: res[i * 2] for i, n in enumerate(names)}
    test = {n: res[i * 2 + 1] for i, n in enumerate(names)}
    self.fLOG("[MlGridBenchMark.preprocess_dataset] done")
    return (train, test), appe, params