def run_demo_with_sampling(self, dts_name, model_data, features):
    '''Score a family of models that differ only in the C value.

    Parameters
    ----------
    dts_name : str
    model_data : dict
        Required keys: ['model_name', 'sampler_name', 'pca_bool',
        'n_estim', 'box_type']. Almost everything get_model needs.
    features : list of int
        Only real values are allowed (-1 is not valid).

    Returns
    -------
    (train, test) : tuple of list of dict
        The scores for many models equal in everything except in C
        for linear_svc.
    '''
    dataset = get_data(dts_name, n_ins=self.n_ins)
    train_results = []
    test_results = []
    for c_value in self.valores_C:
        clf = get_model(C=c_value, **model_data)
        tr, te = get_sampling_model_scores(clf, dataset, features)
        tr['label'] = 'C = {}'.format(c_value)
        te['label'] = 'C = {}'.format(c_value)
        train_results.append(tr)
        test_results.append(te)
    return train_results, test_results
def run_demo_with_sampling(self, model, features):
    '''Score `model` on every dataset in self.all_datasets_names.

    Parameters
    ----------
    model : abstract model
        Something get_sampling_model_scores can fit and score.
    features : list of int
        Feature counts to evaluate.

    Returns
    -------
    (train, test) : tuple of list of dict
        One score dict per dataset, labelled with the dataset name.
    '''
    train_results = []
    test_results = []
    for name in self.all_datasets_names:
        ds = get_data(name)
        tr, te = get_sampling_model_scores(model, ds, features)
        tr['label'] = name
        te['label'] = name
        train_results.append(tr)
        test_results.append(te)
    return train_results, test_results
def run_demo_with_sampling(self, dts_name, dts_size, model_data, features):
    '''Compare two orderings of the same pipeline: sampler-then-PCA
    versus PCA-then-sampler.

    Parameters
    ----------
    dts_name : str
    dts_size : int
    model_data : dict
        Data needed to generate a model. Required keys:
        ['model_name', 'sampler_name', 'n_estim', 'box_type']
    features : list of int
        A list with real features to test with. Values of -1 are not
        allowed.

    Returns
    -------
    (train, test) : tuple of list of dict
        The scores for the two models, one sampler-first, the other
        PCA-first. Keys of dict: ['absi', 'ord', 'label']
    '''
    dataset = get_data(dts_name, n_ins=dts_size)

    sampler_first = get_model(pca_bool=True, **model_data)
    # TODO: this function no longer exists — call the generic one with
    # the parameter that indicates the order.
    pca_first = get_model_first_pca(pca_bool=True, **model_data)

    tr_sampler, te_sampler = \
        get_sampling_model_scores(sampler_first, dataset, features)
    tr_sampler['label'] = 'Sampler >> PCA'
    te_sampler['label'] = 'Sampler >> PCA'

    tr_pca, te_pca = \
        get_sampling_model_scores(pca_first, dataset, features)
    tr_pca['label'] = 'PCA >> Sampler'
    te_pca['label'] = 'PCA >> Sampler'

    return [tr_sampler, tr_pca], [te_sampler, te_pca]
def run_demo(self, dts_name, dts_size):
    '''Score three non-sampling baseline models (dt, logit, linear_svc).

    Parameters
    ----------
    dts_name : str
        Name of the dataset to test. Must be one of SUPPORTED_DATASETS.
    dts_size : int
        Number of instances to load.

    Returns
    -------
    (train_scores, test_scores) : tuple of list of dict
        Dicts have keys: ['absi', 'ord', 'label'], and 'absi' are
        valid numbers, -1 is not valid.
    '''
    info_run = """
- Dataset: **{0}**
- Size: **{1}**
"""
    self.run_specific = info_run.format(dts_name, dts_size)
    # self.title = dts_name
    train_results = []
    test_results = []
    for m_name in ('dt', 'logit', 'linear_svc'):
        dataset = get_data(dts_name, n_ins=dts_size)
        clf = get_model(model_name=m_name, sampler_name='identity',
                        pca_bool=False, box_type='none')
        tr, te = get_non_sampling_model_scores(clf, dataset)
        # A scalar score is plotted as a horizontal segment over [0, 10].
        train_results.append({
            'absi': [0, 10],
            'ord': [tr, tr],
            'label': m_name,
        })
        test_results.append({
            'absi': [0, 10],
            'ord': [te, te],
            'label': m_name,
        })
    return train_results, test_results
def fun(dts_name):
    '''Train and score a plain 50-tree random forest on `dts_name`,
    timing the fit, then dump the result record to a JSON file under
    experimental_results/random_forest/.
    '''
    out_path = f'experimental_results/random_forest/{dts_name}.json'

    # mnist / fashion_mnist have dedicated loaders; everything else goes
    # through the generic get_data with a fixed size and train proportion.
    if dts_name == 'mnist':
        data = get_mnist_data()
    elif dts_name == 'fashion_mnist':
        data = get_fashion_data()
    else:
        data = get_data(dataset_name=dts_name, prop_train=2 / 3, n_ins=5000)

    data_train = data['data_train']
    data_test = data['data_test']
    target_train = data['target_train']
    target_test = data['target_test']

    clf = RandomForestClassifier(n_estimators=50)

    print('Empieza el experimento')
    start = time.perf_counter()
    clf.fit(data_train, target_train)
    elapsed = time.perf_counter() - start
    print('Termina el experimento')

    train_score = clf.score(data_train, target_train)
    test_score = clf.score(data_test, target_test)

    info = {
        "box_name": "none",
        "description": f"A normal RF ({dts_name})",
        "gamma": None,
        "label": "RF ",
        "model_name": "rf",
        "model_param": {},
        "test_score": test_score,
        "time": elapsed,
        "train_score": train_score,
    }
    print(info)
    with open(out_path, 'w') as f:
        json.dump([info], f, indent=4, sort_keys=True)
def run_demo_non_sampling(self, model):
    # run_demo will call this one or the other depending on the type.
    '''Score `model` on every dataset in self.all_datasets_names.

    Parameters
    ----------
    model : abstract model
        Something on which you can call fit and score

    Returns
    -------
    (train_scores, test_scores) : tuple of list of dict
        dict with keys ['absi', 'ord', 'labels']
    '''
    train_dicts = []
    test_dicts = []
    # Single pass: score each dataset and build its plot dict right away
    # (scalar scores become horizontal segments over [0, 1]).
    for name in self.all_datasets_names:
        ds = get_data(name)
        tr, te = get_non_sampling_model_scores(model, ds)
        train_dicts.append({
            'absi': [0, 1],
            'ord': [tr, tr],
            'label': name,
        })
        test_dicts.append({
            'absi': [0, 1],
            'ord': [te, te],
            'label': name,
        })
    return train_dicts, test_dicts
def run_demo_non_sampling(self, dts_name, model_data):
    # run_demo will call this one or the other depending on the type.
    '''Score a family of non-sampling models differing only in C.

    Parameters
    ----------
    dts_name : str
    model_data : dict
        Required keys: ['model_name', 'sampler_name', 'pca_bool',
        'n_estim', 'box_type']. Almost everything get_model needs.

    Returns
    -------
    (train_scores, test_scores) : tuple of list of dict
        dict with keys ['absi', 'ord', 'labels']
    '''
    dataset = get_data(dts_name, n_ins=self.n_ins)
    train_dicts = []
    test_dicts = []
    for c_value in self.valores_C:
        clf = get_model(C=c_value, **model_data)
        tr, te = get_non_sampling_model_scores(clf, dataset)
        lab = 'C = {}'.format(c_value)
        train_dicts.append({
            'absi': [0, 1],
            'ord': [tr, tr],
            'label': lab,
        })
        test_dicts.append({
            'absi': [0, 1],
            'ord': [te, te],
            'label': lab,
        })
    return train_dicts, test_dicts
def run_demo_with_sampling(self, dts_name, dts_size, model_data, hparams,
                           features):
    '''Score one model per gamma in self.gammas, plus a sampler-less
    baseline, and store the results on the instance.

    All models are identical (as specified by `model_data`) except for
    the sampler gamma. Results — dicts with keys
    ['absi', 'ord', 'label'] — are left in self.train_scores and
    self.test_scores; nothing is returned.

    Parameters
    ----------
    dts_name : str
    dts_size : int
    model_data : dict
        Data needed to generate a model. Required keys:
        ['model_name', 'sampler_name', 'pca_bool', 'n_estim',
        'box_type']
    hparams : dict
        Hyper-parameters, consumed through self.get_hparams.
    features : list of int
        A list with real features to test with. Values of -1 are not
        allowed.

    Returns
    -------
    None
    '''
    dataset = get_data(dts_name, n_ins=dts_size)
    # model_data carries 'model_name', 'sampler_name', 'pca_bool',
    # 'n_estim' and 'box_type'; the per-model hyper-parameters are
    # resolved once, outside the loop.
    model_params = self.get_hparams(model_data['model_name'], hparams)

    train_results = []
    test_results = []
    for gamma in self.gammas:
        clf = get_model(rbfsampler_gamma=gamma, nystroem_gamma=gamma,
                        model_params=model_params, **model_data)
        # NOTE(review): the per-model `errors` are discarded here.
        tr, te, errors = get_sampling_model_scores(clf, dataset, features)
        tr['label'] = 'gamma {}'.format(gamma)
        te['label'] = 'gamma {}'.format(gamma)
        train_results.append(tr)
        test_results.append(te)

    # Also run the sampler-less case as a flat reference line.
    baseline_data = dict(model_data)
    baseline_data['sampler_name'] = 'identity'
    # `gamma` still holds the last loop value; presumably the identity
    # sampler ignores it — confirm in get_model.
    clf = get_model(rbfsampler_gamma=gamma, nystroem_gamma=gamma,
                    model_params=model_params, **baseline_data)
    tr_score, te_score = get_non_sampling_model_scores(clf, dataset)
    train_results.append({
        'absi': [features[0], features[-1]],
        'ord': [tr_score, tr_score],
        'label': 'No sampler'
    })
    test_results.append({
        'absi': [features[0], features[-1]],
        'ord': [te_score, te_score],
        'label': 'No sampler'
    })

    self.train_scores = train_results
    self.test_scores = test_results
def run_demo(self, dts_name, dts_size, features_range, models):
    '''
    Just reading from the arguments, returns a pair of list of
    dictionarys, with the scores of the demo. Pair is (train, test)

    Parameters
    ----------
    dts_name : str
    dts_size : int
    features_range : list of int
        shape: [2], increasing order
    models : list of dict
        each dict is a model. Required keys: ['model_name', 'sampler',
        'box_type', 'n_estim', 'pca']
        Values of 'sampler' and 'box_type' are str or None

    Returns
    -------
    (train_scores, test_scores) : tuple of list of dict
        Dict with keys ['absi', 'ord', 'label']
    '''
    info_run = '''
- Dataset: **{0}**
- Size: **{1}**
'''
    self.run_specific = info_run.format(dts_name, dts_size)
    dataset = get_data(dts_name, n_ins=dts_size)
    train_scores = []
    test_scores = []
    for m in models:
        model_name = m['model_name']
        sampler_name = m['sampler_name']
        box_type = m['box_type']
        n_estim = m['n_estim']
        pca = m['pca']
        # A box of type 'none' takes no estimator count.
        if box_type == 'none':
            n_estim = None
        clf = get_model(model_name=model_name,
                        sampler_name=sampler_name,
                        pca_bool=pca,
                        n_estim=n_estim,
                        box_type=box_type)
        # Cap the number of tested feature counts; when the range is
        # wider, sample it evenly instead of testing every value.
        n_splits_features = 30
        features = list(range(*features_range))
        if (features_range[1] - features_range[0]) > n_splits_features:
            # BUG FIX: dtype was np.int, an alias removed in NumPy 1.24
            # (deprecated since 1.20); the builtin int is the documented
            # replacement.
            features = np.linspace(*features_range,
                                   num=n_splits_features,
                                   dtype=int).tolist()
        if sampler_name == 'identity':
            features = None
        # BUG FIX: this comparison used `is 'identity'`, which tests
        # object identity against a str literal — implementation-
        # dependent and a SyntaxWarning since Python 3.8. Use equality.
        if sampler_name == 'identity':
            # train_score and test_score are floats here
            train_score, test_score =\
                get_non_sampling_model_scores(clf, dataset)
            lab = self.get_label(model_name, sampler_name, box_type,
                                 n_estim, pca)
            train_score = {
                'absi': features_range,
                'ord': [train_score, train_score],
                'label': lab,
            }
            test_score = {
                'absi': features_range,
                'ord': [test_score, test_score],
                'label': lab,
            }
        else:
            # train_score and test_score are dictionaries here
            train_score, test_score =\
                get_sampling_model_scores(clf, dataset, features)
            lab = self.get_label(model_name, sampler_name, box_type,
                                 n_estim, pca)
            train_score['label'] = lab
            test_score['label'] = lab
        train_scores.append(train_score)
        test_scores.append(test_score)
    return train_scores, test_scores
def run_demo(self, dts_name, dts_size, features_range, models, hparams,
             rbfsampler_gamma, nystroem_gamma):
    '''
    First it clears self.train_scores and self.test_scores, and then
    runs the demo, appending to those the results of each of the
    models. The results are in the shape of a dictionary, with keys
    ['absi', 'ord', 'label']

    It is resistant to failing of some of the models. If that happens,
    a warning is raised, self.ERRORS is filled with some info, and the
    execution continues with the rest of the models.

    Parameters
    ----------
    dts_name : str
    dts_size : int
    features_range : list of int
        shape: [2], increasing order
    models : list of dict
        each dict is a model. Required keys: ['model_name', 'sampler',
        'box_type', 'n_estim', 'pca']
        Values of 'sampler' and 'box_type' are str or None
    hparams : dict
        Required keys: ['dt', 'logit', 'linearsvc']

    Returns
    -------
    None
    '''
    # Summary of the run, shown through the run-specific widget.
    # (A dead markdown-template assignment that was immediately
    # overwritten by this dict has been removed.)
    info_run = {
        'Dataset': dts_name,
        'Size': dts_size,
        'RFF gamma': rbfsampler_gamma,
        'Nystroem gamma': nystroem_gamma,
        'DT max. depth': hparams['dt']['max_depth'],
        'DT min. samples split': hparams['dt']['min_samples_split'],
        'DT min. samples leaf': hparams['dt']['min_samples_leaf'],
        'DT min. weight fraction leaf':
            hparams['dt']['min_weight_fraction_leaf'],
        'DT max. leaf nodes': hparams['dt']['max_leaf_nodes'],
        'DT min. impurity decrease':
            hparams['dt']['min_impurity_decrease'],
        'Logit C': hparams['logit']['C'],
        'Linear SVC': hparams['linear_svc']['C'],
    }
    self.run_specific = self.get_run_specific_widget(info_run)
    dataset = get_data(dts_name, n_ins=dts_size)
    self.train_scores.clear()
    self.test_scores.clear()
    for m in models:
        model_name = m['model_name']
        sampler_name = m['sampler_name']
        box_type = m['box_type']
        n_estim = m['n_estim']
        pca = m['pca']
        pca_first = m['pca_first']
        model_params = self.get_hparams(model_name, hparams)
        # A box of type 'none' takes no estimator count.
        if box_type == 'none':
            n_estim = None
        clf = get_model(model_name=model_name, model_params=model_params,
                        sampler_name=sampler_name, pca_bool=pca,
                        pca_first=pca_first, n_estim=n_estim,
                        box_type=box_type,
                        rbfsampler_gamma=rbfsampler_gamma,
                        nystroem_gamma=nystroem_gamma)
        # Cap the number of tested feature counts; when the range is
        # wider, sample it evenly instead of testing every value.
        n_splits_features = 30
        features = list(range(*features_range))
        if (features_range[1] - features_range[0]) > n_splits_features:
            # BUG FIX: dtype was np.int, an alias removed in NumPy 1.24
            # (deprecated since 1.20); the builtin int is the documented
            # replacement.
            features = np.linspace(*features_range,
                                   num=n_splits_features,
                                   dtype=int).tolist()
        if sampler_name == 'identity':
            features = None
        if sampler_name == 'identity':
            # train_score and test_score are floats here
            train_score, test_score =\
                get_non_sampling_model_scores(clf, dataset)
            lab = self.get_label(model_name, sampler_name, box_type,
                                 n_estim, pca, pca_first)
            train_score = {
                'absi': features_range,
                'ord': [train_score, train_score],
                'label': lab,
            }
            test_score = {
                'absi': features_range,
                'ord': [test_score, test_score],
                'label': lab,
            }
        else:
            # train_score and test_score are dictionaries here
            train_score, test_score, errors =\
                get_sampling_model_scores(clf, dataset, features)
            self.ERRORS.extend(errors)
            lab = self.get_label(model_name, sampler_name, box_type,
                                 n_estim, pca, pca_first)
            train_score['label'] = lab
            test_score['label'] = lab
        self.train_scores.append(train_score)
        self.test_scores.append(test_score)
def exp4_1(dts_name):
    '''Experiment 4.1: plain DT vs DT with RFF vs DT with Nystroem,
    tuning min_impurity_decrease, on dataset `dts_name`.'''
    exp_code = '4_1'
    tunning_params = {'min_impurity_decrease': [0, .1, .2, .5, 1]}

    # The three models share everything except the sampler.
    base = {
        'model_name': 'dt',
        'model_params': {},
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'pca_bool': False,
        'pca_first': None,
        'n_estim': None,
        'box_type': 'none',
    }
    model1_info = dict(base, sampler_name='identity')
    model2_info = dict(base, sampler_name='rbf')
    model3_info = dict(base, sampler_name='nystroem')

    data = get_data(dataset_name=dts_name, prop_train=2 / 3, n_ins=5000)
    common = dict(tunning_params=tunning_params,
                  data_train=data['data_train'],
                  data_test=data['data_test'],
                  target_train=data['target_train'],
                  target_test=data['target_test'])

    d1 = exp(model_info=model1_info,
             description=f'DT ({dts_name})', **common)
    d2 = exp(model_info=model2_info,
             description=f'DT with RFF ({dts_name})', **common)
    d3 = exp(model_info=model3_info,
             description=f'DT with Nystroem ({dts_name})', **common)
    store_exp(d1, d2, d3, exp_code=exp_code, dts_name=dts_name)
def exp1_1(dts_name):
    '''Experiment 1.1: RBF-SVC baseline vs linear-SVC with RFF and with
    Nystroem, tuning C, on dataset `dts_name`.'''
    exp_code = '1_1'
    tunning_params = {'C': [0.5, 1, 5, 20, 50]}

    # The three models share everything except model_name and sampler.
    base = {
        'model_params': {},
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'pca_bool': False,
        'pca_first': None,
        'n_estim': None,
        'box_type': 'none',
    }
    model1_info = dict(base, model_name='rbf_svc', sampler_name='identity')
    model2_info = dict(base, model_name='linear_svc', sampler_name='rbf')
    model3_info = dict(base, model_name='linear_svc',
                       sampler_name='nystroem')

    data = get_data(dataset_name=dts_name, prop_train=2 / 3, n_ins=5000)
    common = dict(tunning_params=tunning_params,
                  data_train=data['data_train'],
                  data_test=data['data_test'],
                  target_train=data['target_train'],
                  target_test=data['target_test'])

    d1 = exp(model_info=model1_info,
             description=f'A normal RBF-SVC with gamest ({dts_name})',
             **common)
    d2 = exp(model_info=model2_info,
             description=f'A normal linear-SVC with RFF ({dts_name})',
             **common)
    d3 = exp(model_info=model3_info,
             description=f'A normal linear-SVC with Nystroem ({dts_name})',
             **common)
    store_exp(d1, d2, d3, exp_code=exp_code, dts_name=dts_name)
def exp3_4(dts_name):
    '''Experiment 3.4: linear-SVC (n_estim=1) with RFF/Nystroem in
    black_bag and black_ens boxes, tuning C, on dataset `dts_name`.'''
    exp_code = '3_4'
    tunning_params = {'C': [10**i for i in range(4)]}

    # The four models share everything except sampler and box type.
    base = {
        'model_name': 'linear_svc',
        'model_params': {},
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'pca_bool': False,
        'pca_first': None,
        'n_estim': 1,
    }
    model1_info = dict(base, sampler_name='rbf', box_type='black_bag')
    model2_info = dict(base, sampler_name='nystroem', box_type='black_bag')
    model3_info = dict(base, sampler_name='rbf', box_type='black_ens')
    model4_info = dict(base, sampler_name='nystroem', box_type='black_ens')

    data = get_data(dataset_name=dts_name, prop_train=2 / 3, n_ins=5000)
    common = dict(tunning_params=tunning_params,
                  data_train=data['data_train'],
                  data_test=data['data_test'],
                  target_train=data['target_train'],
                  target_test=data['target_test'])

    d1 = exp(
        model_info=model1_info,
        description=f'Linear-SVC black_bag with RFF without regul. ({dts_name})',
        **common)
    d2 = exp(
        model_info=model2_info,
        description=f'Linear-SVC black_bag with Nys without regul. ({dts_name})',
        **common)
    d3 = exp(
        model_info=model3_info,
        description=f'Linear-SVC black_ens with RFF without regul. ({dts_name})',
        **common)
    d4 = exp(
        model_info=model4_info,
        description=f'Linear-SVC black_ens with Nys without regul. ({dts_name})',
        **common)
    store_exp(d1, d2, d3, d4, exp_code=exp_code, dts_name=dts_name)
def exp2_1(dts_name):
    '''Experiment 2.1: logit without regularization (fixed C=1000),
    plain vs with RFF vs with Nystroem, on dataset `dts_name`.'''
    exp_code = '2_1'
    # No grid search here: C is pinned at 1000 through model_params.
    tunning_params = {}

    # The three models share everything except the sampler.
    base = {
        'model_name': 'logit',
        'model_params': {'C': 1000},
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'pca_bool': False,
        'pca_first': None,
        'n_estim': None,
        'box_type': 'none',
    }
    model1_info = dict(base, sampler_name='identity')
    model2_info = dict(base, sampler_name='rbf')
    model3_info = dict(base, sampler_name='nystroem')

    data = get_data(dataset_name=dts_name, prop_train=2 / 3, n_ins=5000)
    common = dict(tunning_params=tunning_params,
                  data_train=data['data_train'],
                  data_test=data['data_test'],
                  target_train=data['target_train'],
                  target_test=data['target_test'])

    d1 = exp(model_info=model1_info,
             description=f'Normal logit without regularization ({dts_name})',
             **common)
    d2 = exp(model_info=model2_info,
             description=f'Logit with RFF without regularization ({dts_name})',
             **common)
    d3 = exp(
        model_info=model3_info,
        description=f'Logit with Nystroem without regularization ({dts_name})',
        **common)
    store_exp(d1, d2, d3, exp_code=exp_code, dts_name=dts_name)
def exp2_8(dts_name):
    '''Experiment 2.8: linear SVM baseline (tuned over C) vs grey_ens
    boxes (n_estim=50) with RFF / Nystroem at a fixed C=1000.

    The baseline is grid-searched over `tunning_params`; the boxed
    models run with tunning_params={} so they keep C=1000 from
    model_params.
    '''
    exp_code = '2_8'
    model_name = 'linear_svc'
    C_value = {'C': 1000}
    box_type = 'grey_ens'
    n_estim = 50
    C_values = [10**i for i in range(4)]
    tunning_params = {'C': C_values}
    model1_info = {
        'model_name': model_name,
        'model_params': {},
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'sampler_name': 'identity',
        'pca_bool': False,
        'pca_first': None,
        'n_estim': None,
        'box_type': 'none',
    }
    model2_info = {
        'model_name': model_name,
        'model_params': C_value,
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'sampler_name': 'rbf',
        'pca_bool': False,
        'pca_first': None,
        'n_estim': n_estim,
        'box_type': box_type,
    }
    model3_info = {
        'model_name': model_name,
        'model_params': C_value,
        'rbfsampler_gamma': None,
        'nystroem_gamma': None,
        'sampler_name': 'nystroem',
        'pca_bool': False,
        'pca_first': None,
        'n_estim': n_estim,
        'box_type': box_type,
    }
    data = get_data(dataset_name=dts_name, prop_train=2 / 3, n_ins=5000)
    data_train = data['data_train']
    data_test = data['data_test']
    target_train = data['target_train']
    target_test = data['target_test']
    d1 = exp(model_info=model1_info, tunning_params=tunning_params,
             data_train=data_train, data_test=data_test,
             target_train=target_train, target_test=target_test,
             description=f'Linear SVM ({dts_name})')
    # BUG FIX: the descriptions below said "Black Bag", but box_type is
    # 'grey_ens' — the stored records were mislabeled.
    d2 = exp(model_info=model2_info, tunning_params={},
             data_train=data_train, data_test=data_test,
             target_train=target_train, target_test=target_test,
             description=f'Linear SVM with RFF Grey Ens ({dts_name})')
    d3 = exp(model_info=model3_info, tunning_params={},
             data_train=data_train, data_test=data_test,
             target_train=target_train, target_test=target_test,
             description=f'Linear SVM with Nystroem Grey Ens ({dts_name})')
    store_exp(d1, d2, d3, exp_code=exp_code, dts_name=dts_name)