def _build_model(self, x):
    if self.dataset == 'cifar10':
        transform = transforms.Compose(
            [transforms.ToTensor(),  # convert PIL Image to a tensor in [0, 1]
             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]  # normalize to [-1, 1]
        )

        self.trainset = torchvision.datasets.CIFAR10(root=os.path.join(data_dir(), 'cifar10'),
                                                     train=True, download=True, transform=transform)
        self.trainset.data = self.trainset.data[:1000]
        self.trainset.targets = self.trainset.targets[:1000]
        if self.verbose > 0:
            print(self.trainset)
        self.trainloader = torch.utils.data.DataLoader(self.trainset, batch_size=self.batch_size,
                                                       shuffle=True, num_workers=2)

        testset = torchvision.datasets.CIFAR10(root=os.path.join(data_dir(), 'cifar10'),
                                               train=False, download=True, transform=transform)
        testset.data = testset.data[:100]
        testset.targets = testset.targets[:100]
        if self.verbose > 0:
            print(testset)
        self.testloader = torch.utils.data.DataLoader(testset, batch_size=self.batch_size,
                                                      shuffle=False, num_workers=2)

        self.classes = ('plane', 'car', 'bird', 'cat', 'deer',
                        'dog', 'frog', 'horse', 'ship', 'truck')
    else:
        raise NotImplementedError

    self.net = ConvNet().to(self.device)
    if self.verbose > 0:
        print(self.net)
    self.criterion = nn.CrossEntropyLoss()
    self.optz = optim.SGD(self.net.parameters(), lr=self.learning_rate, momentum=self.momentum)
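# A minimal sketch (not part of the original class) of how the objects built by
# `_build_model` might be consumed for one training epoch. `_fit_one_epoch` is a
# hypothetical helper name; the loop relies only on the standard PyTorch API.
def _fit_one_epoch(self):
    running_loss = 0.0
    for inputs, labels in self.trainloader:
        inputs, labels = inputs.to(self.device), labels.to(self.device)
        self.optz.zero_grad()                    # reset accumulated gradients
        outputs = self.net(inputs)               # forward pass through ConvNet
        loss = self.criterion(outputs, labels)   # cross-entropy loss
        loss.backward()                          # backpropagate
        self.optz.step()                         # SGD update
        running_loss += loss.item()
    return running_loss / max(1, len(self.trainloader))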
def test_visualization_2d(
        create_obj_func,
        show=False,
        block_figure_on_end=False,
        freq_predict_display=5,
        show_loss_display=False,
        grid_size=100,
        marker_size=10):
    file_name = os.path.join(data_dir(), 'demo/synthetic_2D_data_train.libsvm')
    x_train, y_train = load_svmlight_file(file_name)
    x_train = x_train.toarray()
    print('num_samples: {}'.format(x_train.shape[0]))

    predict_display = Display(
        freq=freq_predict_display,
        dpi='auto',
        show=show,
        block_on_end=block_figure_on_end,
        monitor=[{'metrics': ['predict'],
                  'title': "Visualization",
                  'xlabel': "X1",
                  'ylabel': "X2",
                  'grid_size': grid_size,
                  'marker_size': marker_size,
                  'left': None,
                  'right': None,
                  'top': None,
                  'bottom': None,
                  }]
    )
    loss_display = Display(
        freq=1,
        dpi=72,
        show=show,
        block_on_end=block_figure_on_end,
        monitor=[{'metrics': ['train_loss'],
                  'type': 'line',
                  'title': "Learning losses",
                  'xlabel': "data points",
                  'ylabel': "loss",
                  }]
    )

    callbacks = [predict_display]
    if show_loss_display:
        callbacks.append(loss_display)

    users_params = {
        'callbacks': callbacks,
        'metrics': ['train_loss'],
    }
    learner = create_obj_func(users_params)
    learner.fit(x_train, y_train)

    y_train_pred = learner.predict(x_train)
    print("Training error = %.4f" % (1 - metrics.accuracy_score(y_train, y_train_pred)))
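# A hedged sketch of a `create_obj_func` factory compatible with the call above.
# It assumes the KMM constructor used elsewhere in this module; the hyperparameter
# values are illustrative, not tuned.
def create_kmm_obj_func(users_params):
    params = dict(model_name="KMM_hinge", D=10, gamma=0.5, loss='hinge',
                  num_epochs=10, random_state=random_seed())
    params.update(users_params)  # merge in 'callbacks' and 'metrics' from the caller
    return KMM(**params)

# Example: test_visualization_2d(create_kmm_obj_func, show=True)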
def test_kmm_syn2d(show=False, block_figure_on_end=False):
    print("========== Test KMM on 2D data ==========")

    np.random.seed(random_seed())

    (x_train, y_train), (x_test, y_test) = demo.load_synthetic_2d()

    idx_train, idx_test = next(
        iter(StratifiedShuffleSplit(n_splits=1, test_size=40,
                                    random_state=random_seed()).split(x_train, y_train)))
    x0 = x_train[idx_train]
    y0 = y_train[idx_train]
    x1 = x_train[idx_test]
    y1 = y_train[idx_test]
    x = np.vstack([x0, x1])
    y = np.concatenate([y0, y1])

    early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
    filepath = os.path.join(model_dir(), "male/KMM/syn2d_data_{epoch:04d}_{val_err:.6f}.pkl")
    checkpoint = ModelCheckpoint(filepath,
                                 mode='min',
                                 monitor='val_err',
                                 verbose=0,
                                 save_best_only=True)

    display = Display(layout=(3, 1),
                      dpi='auto',
                      show=show,
                      block_on_end=block_figure_on_end,
                      monitor=[{'metrics': ['loss', 'val_loss'],
                                'type': 'line',
                                'labels': ["training loss", "validation loss"],
                                'title': "Learning losses",
                                'xlabel': "epoch",
                                'ylabel': "loss",
                                },
                               {'metrics': ['err', 'val_err'],
                                'type': 'line',
                                'title': "Learning errors",
                                'xlabel': "epoch",
                                'ylabel': "error",
                                },
                               {'metrics': ['err'],
                                'type': 'line',
                                'labels': ["training error"],
                                'title': "Learning errors",
                                'xlabel': "epoch",
                                'ylabel': "error",
                                },
                               ])

    clf = KMM(model_name="KMM_hinge",
              D=10,
              lbd=0.0,
              gamma=0.5,
              mode='batch',
              loss='hinge',
              num_kernels=4,
              batch_size=4,
              temperature=0.1,
              num_epochs=10,
              num_nested_epochs=0,
              learning_rate=0.001,
              learning_rate_mu=0.0,
              learning_rate_gamma=0.001,
              learning_rate_alpha=0.001,
              metrics=['loss', 'err'],
              callbacks=[display, early_stopping, checkpoint],
              cv=[-1] * x0.shape[0] + [0] * x1.shape[0],
              random_state=random_seed())

    clf.fit(x, y)

    train_err = 1.0 - clf.score(x_train, y_train)
    print("Training error = %.4f" % train_err)

    if block_figure_on_end:  # save predictions
        y_test_pred = clf.predict(x_test)
        x_test[x_test == 0] = 1e-4
        dump_svmlight_file(x_test, y_test_pred,
                           os.path.join(data_dir(), "demo/synthetic_2D_data_test_predict.libsvm"),
                           zero_based=False)
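# A small hedged sketch (hypothetical helper, not in the original module) that reads
# back the predictions dumped by test_kmm_syn2d for inspection; it assumes only
# sklearn's load_svmlight_file and the path written above.
def inspect_syn2d_predictions():
    pred_path = os.path.join(data_dir(), "demo/synthetic_2D_data_test_predict.libsvm")
    x_pred, y_pred = load_svmlight_file(pred_path)
    labels, counts = np.unique(y_pred, return_counts=True)
    print("predicted label distribution:", dict(zip(labels.tolist(), counts.tolist())))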
def test_rsrbm_cd_on_full_20newsgroups_dataset(show_figure=False, block_figure_on_end=False):
    print("========== Test ReplicatedSoftmaxRBM using Contrastive Divergence ==========")

    np.random.seed(random_seed())

    from sklearn.metrics import accuracy_score
    from sklearn.neighbors import KNeighborsClassifier
    import os
    from sklearn.datasets import load_svmlight_file
    from male.configs import data_dir

    train_path = os.path.join(
        data_dir(), "20newsgroups/20news_bydate/libsvm/"
                    "20news_bydate_5Kwordcount_in_entire_data_countfeat_train.libsvm")
    test_path = os.path.join(
        data_dir(), "20newsgroups/20news_bydate/libsvm/"
                    "20news_bydate_5Kwordcount_in_entire_data_countfeat_test.libsvm")
    x_train, y_train = load_svmlight_file(train_path, n_features=5000)
    x_test, y_test = load_svmlight_file(test_path, n_features=5000)
    x_train = x_train.toarray()
    x_test = x_test.toarray()

    x_train, y_train = demo.shuffle(x_train, y_train, randseed=random_seed())

    x = np.vstack([x_train, x_test])
    y = np.concatenate([y_train, y_test])

    learning_display = Display(title="Learning curves",
                               dpi='auto',
                               layout=(2, 1),
                               freq=1,
                               show=show_figure,
                               block_on_end=block_figure_on_end,
                               monitor=[{'metrics': ['recon_err', 'val_recon_err'],
                                         'type': 'line',
                                         'labels': ["training recon error",
                                                    "validation recon error"],
                                         'title': "Reconstruction Errors",
                                         'xlabel': "epoch",
                                         'ylabel': "error",
                                         },
                                        {'metrics': ['free_energy', 'val_free_energy'],
                                         'type': 'line',
                                         'title': "Free Energies",
                                         'xlabel': "epoch",
                                         'ylabel': "energy",
                                         },
                                        ])

    filter_display = Display(title="Receptive Fields",
                             dpi='auto',
                             layout=(1, 1),
                             figsize=(8, 8),
                             freq=1,
                             show=show_figure,
                             block_on_end=block_figure_on_end,
                             monitor=[{'metrics': ['filters'],
                                       'title': "Receptive Fields",
                                       'type': 'img',
                                       'num_filters': 10,
                                       'disp_dim': (25, 20),
                                       'tile_shape': (5, 2),
                                       },
                                      ])

    hidden_display = Display(title="Hidden Activations",
                             dpi='auto',
                             layout=(1, 1),
                             figsize=(8, 8),
                             freq=1,
                             show=show_figure,
                             block_on_end=block_figure_on_end,
                             monitor=[{'metrics': ['hidden_activations'],
                                       'title': "Hidden Activations",
                                       'type': 'img',
                                       'data': x_train[:200],
                                       },
                                      ])

    model = ReplicatedSoftmaxRBM(num_hidden=100,
                                 num_visible=5000,
                                 batch_size=128,
                                 num_epochs=100,
                                 # sparse_weight=0.3,
                                 # sparse_level=0.1,
                                 learning_rate=0.01,
                                 learning_rate_hidden=0.0001,
                                 momentum_method='sudden',
                                 weight_cost=2e-4,
                                 metrics=['recon_err', 'free_energy'],
                                 callbacks=[learning_display, filter_display, hidden_display],
                                 cv=[-1] * x_train.shape[0] + [0] * x_test.shape[0],
                                 random_state=random_seed(),
                                 verbose=1)

    model.fit(x)

    x_train1 = model.transform(x_train)
    x_test1 = model.transform(x_test)

    clf = KNeighborsClassifier(n_neighbors=1)
    clf.fit(x_train1, y_train)

    test_pred_err = 1.0 - accuracy_score(y_test, clf.predict(x_test1))
    print("RSRBM->kNN: test error = %.4f" % test_pred_err)
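# A hedged helper (hypothetical, not in the original module) that sweeps the number
# of neighbors for the kNN evaluation above; it assumes only the sklearn API and the
# latent features produced by ReplicatedSoftmaxRBM.transform.
def evaluate_knn_on_features(x_train_feat, y_train, x_test_feat, y_test, ks=(1, 3, 5)):
    from sklearn.metrics import accuracy_score
    from sklearn.neighbors import KNeighborsClassifier
    for k in ks:
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(x_train_feat, y_train)
        err = 1.0 - accuracy_score(y_test, clf.predict(x_test_feat))
        print("kNN (k=%d): test error = %.4f" % (k, err))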
def test_real_dataset(create_obj_func, data_name=None, show=False, block_figure_on_end=False):
    if data_name is None:
        if len(sys.argv) > 2:
            data_name = sys.argv[2]
        else:
            raise Exception('Dataset not specified')

    np.random.seed(1234)

    print("========== Test on real data ==========")
    train_file_name = os.path.join(data_dir(), data_name + '_train.libsvm')
    test_file_name = os.path.join(data_dir(), data_name + '_test.libsvm')
    print(train_file_name)
    print(test_file_name)
    if not os.path.exists(train_file_name):
        raise Exception('File ' + train_file_name + ' not found')
    if not os.path.exists(test_file_name):
        raise Exception('File ' + test_file_name + ' not found')

    x_train, y_train = load_svmlight_file(train_file_name)
    x_test, y_test = load_svmlight_file(test_file_name, n_features=x_train.shape[1])

    users_params = dict()
    users_params = parse_arguments(users_params)
    print('users_params:', users_params)

    if 'sparse' not in users_params.keys():
        x_train = x_train.toarray()
        x_test = x_test.toarray()
        x = np.vstack((x_train, x_test))
    else:
        x = scipy.sparse.vstack((x_train, x_test))
    y = np.concatenate((y_train, y_test))
    cv = [-1] * x_train.shape[0] + [1] * x_test.shape[0]

    loss_display = Display(
        freq=1,
        dpi=72,
        show=show,
        block_on_end=block_figure_on_end,
        monitor=[{'metrics': ['train_loss', 'valid_loss'],
                  'type': 'line',
                  'title': "Learning losses in " + data_name,
                  'xlabel': "data points",
                  'ylabel': "loss",
                  }]
    )

    users_params['callbacks'] = [loss_display]
    users_params['metrics'] = ['train_loss', 'valid_loss']
    users_params['cv'] = cv

    learner = create_obj_func(users_params)
    learner.fit(x, y)

    y_train_pred = learner.predict(x_train)
    print('y_train:', np.unique(y_train))
    print('y_train_pred:', np.unique(y_train_pred))
    y_test_pred = learner.predict(x_test)
    print('y_test:', np.unique(y_test))
    print('y_test_pred:', np.unique(y_test_pred))
    print("Training acc = %.4f" % (metrics.accuracy_score(y_train, y_train_pred)))
    print("Testing acc = %.4f" % (metrics.accuracy_score(y_test, y_test_pred)))
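# A hedged convenience wrapper (hypothetical, not in the original module) that runs
# test_real_dataset over several datasets in sequence; it assumes each
# <name>_train.libsvm / <name>_test.libsvm pair already exists under data_dir(),
# and the dataset names below are placeholders.
def run_real_dataset_suite(create_obj_func, data_names=('a9a', 'w8a')):
    for name in data_names:
        print('>>> dataset: {}'.format(name))
        test_real_dataset(create_obj_func, data_name=name)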
def run_grid_search_multicore(
        create_obj_func, params_gridsearch, attribute_names, dataset=None, num_workers=4,
        file_config=None, num_runs=3, cross=0, num_features=None, full_dataset=None,
        keep_vars=[], ind_test=None, max_torrance=1, online=False):
    if dataset is None:
        if len(sys.argv) > 2:
            dataset = sys.argv[2]
        else:
            raise Exception('Dataset not specified')

    params_gridsearch = parse_arguments(params_gridsearch, True)
    # print(params_gridsearch)
    file_config, params_gridsearch = extract_param('file_config', file_config, params_gridsearch)
    num_workers, params_gridsearch = extract_param('num_workers', num_workers, params_gridsearch)
    num_runs, params_gridsearch = extract_param('num_runs', num_runs, params_gridsearch)
    cross, params_gridsearch = extract_param('cross', cross, params_gridsearch)
    num_features, params_gridsearch = extract_param('num_features', num_features, params_gridsearch)
    full_dataset, params_gridsearch = extract_param('full_dataset', full_dataset, params_gridsearch)
    ind_test, params_gridsearch = extract_param('ind_test', ind_test, params_gridsearch)
    max_torrance, params_gridsearch = extract_param('max_torrance', max_torrance, params_gridsearch)
    # `online` was referenced below but never bound in the original; it is now a
    # keyword argument extracted like the other parameters above.
    online, params_gridsearch = extract_param('online', online, params_gridsearch)

    if ind_test is not None:
        if full_dataset is None:
            ind_test = dataset
        else:
            ind_test = full_dataset
    if full_dataset is None:
        full_dataset = dataset
    if not os.path.exists(os.path.join(data_dir(), full_dataset + '_train.libsvm')):
        dataset_info = data_info(full_dataset)
        get_file(full_dataset, origin=dataset_info['origin'],
                 untar=True, md5_hash=dataset_info['md5_hash'])

    candidate_params_lst = list(ParameterGrid(params_gridsearch))
    grid_search = True
    if len(candidate_params_lst) == 1:
        grid_search = False
    pool = mp.Pool(num_workers)  # maximum number of workers
    result_lst = []
    for candidate_params in candidate_params_lst:
        result = pool.apply_async(
            run_one_candidate,
            args=(create_obj_func, candidate_params, dataset, attribute_names, file_config,
                  num_runs, cross, num_features, keep_vars, ind_test, grid_search,
                  max_torrance, online),
            callback=log_result
        )
        result_lst.append(result)

    for result in result_lst:
        result.get()
    pool.close()
    pool.join()

    if len(candidate_params_lst) > 1:
        print("========== FINAL RESULT ==========")
        # testid_lst, train_acc_lst, test_acc_lst, mistake_rate_lst and run_param_lst
        # are module-level accumulators appended to by the log_result callback
        if online:
            idx_best = np.argmin(np.array(mistake_rate_lst))
        else:
            idx_best = np.argmax(np.array(test_acc_lst))
        print('Data set: {}'.format(dataset))
        print('Best testid: {}'.format(testid_lst[idx_best]))
        if online:
            print('Best mistake rate: {}'.format(mistake_rate_lst[idx_best]))
        else:
            print('Best err on training set: {}'.format(1 - train_acc_lst[idx_best]))
            print('Best err on valid set: {}'.format(1 - test_acc_lst[idx_best]))
        print('Best params: {}'.format(run_param_lst[idx_best]))

        if cross > 0:
            print('Run the best one')
            num_runs_for_best = num_runs
            if num_runs < 3:
                num_runs_for_best = 3
            best_result = run_one_candidate(
                create_obj_func, run_param_lst[idx_best], full_dataset, attribute_names,
                file_config, num_runs_for_best, cross=0, num_features=num_features,
                keep_vars=keep_vars, online=online)
            # best_result['gridsearch_time'] = np.sum(np.array(time_lst))
            log_result(best_result)
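# A hedged example of driving the grid search above. The parameter grid and attribute
# names are illustrative and assume the KMM constructor used elsewhere in this module;
# ParameterGrid expands the dict into one candidate per combination (4 here), each
# dispatched to a worker process.
def example_grid_search():
    params_gridsearch = {'gamma': [0.1, 0.5], 'learning_rate': [0.001, 0.01]}
    run_grid_search_multicore(
        create_obj_func=create_kmm_obj_func,
        params_gridsearch=params_gridsearch,
        attribute_names=['gamma', 'learning_rate'],
        dataset='a9a',  # placeholder; expects a9a_train.libsvm / a9a_test.libsvm under data_dir()
        num_workers=2,
        num_runs=1)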
def run_one_candidate(
        create_obj_func, candidate_params, data_name, attribute_names, file_config, num_runs,
        cross, num_features, keep_vars, ind_test=None, grid_search=True, max_torrance=1,
        online=False):
    # print('OS:', os.name)
    if (os.name == "posix") and file_config is not None:
        print(file_config)
        file = open(file_config, 'r')
        cpu_config = file.read()
        file.close()
        os.system("taskset -p " + cpu_config + " %d" % os.getpid())

    np.random.seed(random_seed())

    train_acc_avg = 0
    test_acc_avg = 0
    train_time_avg = 0
    mistake_rate_avg = 0
    log_lst = []
    total_runs = num_runs
    if cross > 0:
        total_runs = num_runs * cross
    for ri in range(total_runs):
        print('----------------------------------')
        if cross > 0:
            if ri % num_runs == 0:
                np.random.seed(1010)
            crossid = ri // num_runs  # kept as int so the 1-based print below is valid
            print('Run #{0} - Cross #{1}:'.format(ri + 1, crossid + 1))
            train_file_name = os.path.join(data_dir(), data_name + '_' + str(crossid) + '.train.txt')
            test_file_name = os.path.join(data_dir(), data_name + '_' + str(crossid) + '.test.txt')
            if ind_test is not None:
                ind_test_file_name = os.path.join(data_dir(), ind_test + '_test.libsvm')
            print('Train file:', train_file_name)
            print('Valid file:', test_file_name)
            if ind_test is not None:
                print(ind_test_file_name)
        else:
            print('Run #{0}:'.format(ri + 1))
            train_file_name = os.path.join(data_dir(), data_name + '_train.libsvm')
            test_file_name = os.path.join(data_dir(), data_name + '_test.libsvm')
        if not os.path.exists(train_file_name):
            print('File ' + train_file_name + ' not found')
            raise Exception('File ' + train_file_name + ' not found')
        if not os.path.exists(test_file_name):
            raise Exception('File ' + test_file_name + ' not found')
        if ind_test is not None:
            if not os.path.exists(ind_test_file_name):
                raise Exception('File ' + ind_test_file_name + ' not found')

        if num_features is None:
            x_train, y_train = load_svmlight_file(train_file_name)
            x_test, y_test = load_svmlight_file(test_file_name)
            if ind_test is not None:
                x_ind_test, y_ind_test = load_svmlight_file(ind_test_file_name)
        else:
            x_train, y_train = load_svmlight_file(train_file_name, n_features=num_features)
            x_test, y_test = load_svmlight_file(test_file_name, n_features=num_features)
            if ind_test is not None:
                x_ind_test, y_ind_test = load_svmlight_file(ind_test_file_name,
                                                            n_features=num_features)

        if grid_search:
            print('Trial params:', dict2string(candidate_params))

        learner = create_obj_func(candidate_params)
        if not hasattr(learner, 'sparse') or not learner.sparse:
            x_train = x_train.toarray()
            x_test = x_test.toarray()
            if ind_test is not None:
                x_ind_test = x_ind_test.toarray()

        if online:
            x_total = np.vstack((x_train, x_test))
            y_total = np.concatenate((y_train, y_test))
            print('Num total samples: {}'.format(x_total.shape[0]))
            print('Running ...')
            learner.fit(x_total, y_total)
        else:
            print('Num samples: {}'.format(x_train.shape[0]))
            print('Training ...')
            learner.fit(x_train, y_train)

        if online:
            mistake_rate = learner.mistake_rate
            mistake_rate_avg += mistake_rate
        else:
            y_train_pred = learner.predict(x_train)
            y_test_pred = learner.predict(x_test)
            train_labels, train_ycount = np.unique(y_train, return_counts=True)
            train_acc = metrics.accuracy_score(y_train, y_train_pred)
            train_acc_detail = np.diagonal(
                metrics.confusion_matrix(y_train, y_train_pred, labels=train_labels)) / train_ycount
            test_labels, test_ycount = np.unique(y_test, return_counts=True)
            test_acc_detail = np.diagonal(
                metrics.confusion_matrix(y_test, y_test_pred, labels=test_labels)) / test_ycount
            test_acc = metrics.accuracy_score(y_test, y_test_pred)
            if ind_test is not None:
                y_ind_test_pred = learner.predict(x_ind_test)
                ind_test_labels, ind_test_ycount = np.unique(y_ind_test, return_counts=True)
                ind_test_acc_detail = \
                    np.diagonal(metrics.confusion_matrix(
                        y_ind_test, y_ind_test_pred, labels=ind_test_labels)) / ind_test_ycount
                ind_test_acc = metrics.accuracy_score(y_ind_test, y_ind_test_pred)

        train_time = learner.train_time

        if online:
            print('Mistake rate: {0:.2f}%, Training time: {1} seconds'.format(
                mistake_rate * 100, int(train_time)))
        else:
            if grid_search:
                print('Err on valid set: {0:.2f}%, Err on training set: {1:.2f}%, '
                      'Training time: {2} seconds'.format(
                          100 - test_acc * 100, 100 - train_acc * 100, int(train_time)))
            else:
                print('Err on testing set: {0:.2f}%, Err on training set: {1:.2f}%, '
                      'Training time: {2} seconds'.format(
                          100 - test_acc * 100, 100 - train_acc * 100, int(train_time)))
            train_acc_avg += train_acc
            test_acc_avg += test_acc
        train_time_avg += train_time

        log_lst.append({k: learner.__dict__[k] for k in attribute_names})
        log_lst[-1]['dataset'] = data_name
        log_lst[-1]['train_time'] = train_time
        if online:
            log_lst[-1]['mistake_rate'] = mistake_rate
        else:
            log_lst[-1]['train_acc'] = train_acc
            log_lst[-1]['test_acc'] = test_acc
            log_lst[-1]['train_acc_detail'] = train_acc_detail
            log_lst[-1]['test_acc_detail'] = test_acc_detail
            if ind_test is not None:
                log_lst[-1]['independent_test_acc'] = ind_test_acc
                log_lst[-1]['independent_test_acc_detail'] = ind_test_acc_detail

        for key in keep_vars:
            candidate_params[key] = learner.__dict__[key]

        if not online:
            if (1 - test_acc) > max_torrance:  # abort remaining runs once the error exceeds the tolerance
                total_runs = 1
                break

    if online:
        return mistake_rate_avg / total_runs, train_time_avg / total_runs, log_lst, \
            grid_search, candidate_params
    else:
        return \
            train_acc_avg / total_runs, test_acc_avg / total_runs, train_time_avg / total_runs, \
            log_lst, grid_search, candidate_params
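# A hedged sketch of the log_result callback consumed by run_grid_search_multicore's
# pool.apply_async, with the module-level accumulators it fills. The tuple layouts
# follow run_one_candidate's return values; branching on tuple length to tell online
# from batch results, and the sequential testid scheme, are assumptions.
train_acc_lst, test_acc_lst, mistake_rate_lst = [], [], []
time_lst, run_param_lst, testid_lst = [], [], []

def log_result(result):
    if len(result) == 5:  # online: (mistake_rate, train_time, log_lst, grid_search, params)
        mistake_rate, train_time, log_lst, grid_search, params = result
        mistake_rate_lst.append(mistake_rate)
    else:  # batch: (train_acc, test_acc, train_time, log_lst, grid_search, params)
        train_acc, test_acc, train_time, log_lst, grid_search, params = result
        train_acc_lst.append(train_acc)
        test_acc_lst.append(test_acc)
    time_lst.append(train_time)
    run_param_lst.append(params)
    testid_lst.append(len(testid_lst))  # simple sequential id for the candidate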