def make_dataframe_from_datasets(datasets: Dict[str, sklearn.utils.Bunch],
                                 force_good: bool = False) -> pd.DataFrame:
    """
    Create a dataframe from the given datasets.

    Parameters
    ----------
    datasets: Dict[str, sklearn.utils.Bunch], required
        Mapping from subset name to dataset. See
        https://scikit-learn.org/stable/modules/generated/sklearn.utils.Bunch.html
    force_good: bool, optional, default: False
        If True, all dataset samples are marked as good, and they will skip
        some pipeline steps such as dewarping and contrast augmentation.

    Returns
    -------
    df
        A new dataframe with the following columns:
        * 'filename', the path to the image or pdf file,
        * 'target', the encoded values for the target names,
        * 'target_name', the target name,
        * 'is_grayscale', whether the image is in grayscale,
        * 'is_good', whether the image is good, i.e., it does not require dewarping,
        * 'is_pdf', whether the image file is in pdf format,
        * 'subset', the dataset subset, one of 'train', 'test'.

        An auxiliary 'is_valid' column is used internally and dropped before
        returning.
    """
    df = pd.DataFrame(columns=[
        'filename', 'target', 'target_name', 'is_grayscale', 'is_good',
        'is_pdf', 'subset', 'is_valid'
    ])
    for subset, dataset in datasets.items():
        subset_df = pd.DataFrame(
            data={
                'filename': dataset.filenames,
                'target': dataset.target,
                'target_name':
                [dataset.target_names[target] for target in dataset.target],
                'is_grayscale': [False for _ in dataset.filenames],
                'is_good': [
                    is_good(filename, force_good)
                    for filename in dataset.filenames
                ],
                'is_pdf': [is_pdf(filename) for filename in dataset.filenames],
                'is_valid':
                [is_valid(filename) for filename in dataset.filenames],
                'subset': [subset for _ in dataset.filenames]
            })
        # DataFrame.append was removed in pandas 2.0; concat is equivalent here.
        df = pd.concat([df, subset_df], ignore_index=True)
    drop_invalid_values(df)
    drop_isvalid_column(df)
    return df
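# A minimal usage sketch (assumptions: the hypothetical 'data/train' and
# 'data/test' folders follow the one-folder-per-class layout expected by
# sklearn.datasets.load_files, which returns a Bunch exposing the .filenames,
# .target and .target_names attributes this function reads):
from sklearn.datasets import load_files

datasets = {
    'train': load_files('data/train', load_content=False),
    'test': load_files('data/test', load_content=False),
}
df = make_dataframe_from_datasets(datasets, force_good=False)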
def main(args):
    data_path = 'data/'
    all_models_path = 'models/sk-rf/nature/'
    log_file_path = 'logs/train_regular_rf_all.log'
    logging.basicConfig(
        filename=log_file_path,
        filemode='a',
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    logger = logging.getLogger('regular.rf')
    logger.info("Starting training regular sklearn RF models...")
    # command for training one model:
    '''
    python train_rf_one.py --train data/binary_mnist0 --test data/binary_mnist0.t -m models/rf/nature/sklearn_nature_binary_mnist.pickle -b -z -c gini -n 784 --nt 60 -d 8
    '''
    for dataset, fname in datasets.items():
        # track the time spent on each dataset
        start = time.time()
        train, test = fname
        train_path = data_path + train
        test_path = data_path + test
        model_path = all_models_path + 'sklearn_nature_' + dataset + '.pickle'
        options = ''
        if binary_class[dataset]:
            options += '-b '
        if zero_based[dataset]:
            options += '-z '
        # add number of features for the dataset
        options += '-n %s ' % n_feat[dataset]
        # use the regular 'best' splitter
        options += '-s best '
        # use gini for now
        options += '-c gini '
        # set the number of trees and the max depth
        options += '--nt %s -d %s' % (tree_size[dataset][0],
                                      tree_size[dataset][1])
        cmd = 'python train_rf_one.py --train %s --test %s -m %s %s' \
            % (train_path, test_path, model_path, options)
        logger.info(cmd)
        os.system(cmd)
        end = time.time()
        logger.info('time in seconds: %f' % (end - start))
    return
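# A hedged sketch of the module-level lookup tables the loop above assumes
# (the names are taken from the code; the concrete values here are
# illustrative only, mirroring the single-model example command in the
# docstring above):
datasets = {'binary_mnist': ('binary_mnist0', 'binary_mnist0.t')}
binary_class = {'binary_mnist': True}
zero_based = {'binary_mnist': True}
n_feat = {'binary_mnist': 784}
tree_size = {'binary_mnist': (60, 8)}  # (number of trees, max depth)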
def test_do_dummy_prediction(self):
    datasets = {
        'breast_cancer': BINARY_CLASSIFICATION,
        'wine': MULTICLASS_CLASSIFICATION,
        'diabetes': REGRESSION,
    }

    for name, task in datasets.items():
        backend_api = self._create_backend('test_do_dummy_prediction')

        X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
        datamanager = XYDataManager(
            X_train, Y_train,
            X_test, Y_test,
            task=task,
            dataset_name=name,
            feat_type=None,
        )

        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25,
            metric=accuracy,
        )
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend.save_datamanager(datamanager)
        D = backend_api.load_datamanager()

        # Check if the data manager is correctly loaded
        self.assertEqual(D.info['task'], datamanager.info['task'])
        auto._do_dummy_prediction(D, 1)

        # Ensure that the dummy predictions are not in the current working
        # directory, but in the temporary directory.
        self.assertFalse(
            os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
        self.assertTrue(
            os.path.exists(
                os.path.join(backend_api.temporary_directory, '.auto-sklearn',
                             'predictions_ensemble',
                             'predictions_ensemble_1_1_0.0.npy')))

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
def loop_cv_score_datasets(clf, datasets, scoring="average_precision", cv=3):
    """
    Loop over a set of training datasets (dropped rows, simple imputation,
    etc.) and compute the cross_val_score of clf for each one, e.g.:

        datasets = {'drop': (X_train_drop, y_train_drop),
                    'simpl.imp.': (X_train_imp, y_train)}
    """
    score_list = []
    for k, v in datasets.items():
        print(k)
        score_list.append(
            score_classifier(clf, v[0], v[1], k, scoring=scoring, cv=cv))
    return pd.concat(score_list, axis=1)
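# A minimal, self-contained calling sketch (the classifier and the two toy
# dataset variants are illustrative only; score_classifier is assumed to be
# the project-local helper used above):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_a, y_a = make_classification(n_samples=200, random_state=0)
X_b, y_b = make_classification(n_samples=200, random_state=1)
clf = LogisticRegression(max_iter=1000)
scores_df = loop_cv_score_datasets(clf,
                                   {'variant A': (X_a, y_a),
                                    'variant B': (X_b, y_b)},
                                   scoring='average_precision', cv=3)
print(scores_df)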
params = {
    'knn': {'n_neighbors': [1, 3, 5, 7, 10, 20]},
    'tree': {'max_depth': [None, 3, 5, 10]},
    'neural': {'max_iter': [50, 100, 200], 'hidden_layer_sizes': [(15,)]},
    'forest': {'n_estimators': [10, 20, 50, 100]}
}

# 'media' = mean, 'dp' = standard deviation (Portuguese column labels)
std_frame = pd.DataFrame(index=classifiers.keys(),
                         columns=['media', 'dp', 'scores'])
dataset_frames = {
    'iris': std_frame.copy(),
    'digits': std_frame.copy(),
    'wine': std_frame.copy(),
    'breast-cancer': std_frame.copy()
}

for dataset_name, dataset in datasets.items():
    for classifier_name, classifier in classifiers.items():
        gs = GridSearchCV(estimator=classifier,
                          param_grid=params[classifier_name],
                          scoring='accuracy',
                          n_jobs=-1,
                          cv=4)
        scores = cross_val_score(estimator=gs,
                                 X=dataset.data,
                                 y=dataset.target,
                                 scoring='accuracy',
                                 cv=10)
        dataset_frames[dataset_name].loc[classifier_name] = [
            scores.mean(), scores.std(), scores
        ]

for dataset_name, frame in dataset_frames.items():
    print(dataset_name)
    print(frame, end='\n\n')
    boxplot = pd.DataFrame()
    for classifier_name, classifier in classifiers.items():
        boxplot[classifier_name] = frame.at[classifier_name, 'scores']
    sns.boxplot(data=boxplot, showmeans=True)
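# The loop above is nested cross-validation: the inner 4-fold GridSearchCV
# tunes hyper-parameters, while the outer 10-fold cross_val_score measures
# how well the whole tuning procedure generalizes. A self-contained sketch of
# just that pattern (iris and knn chosen only for illustration):
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

data = load_iris()
inner = GridSearchCV(KNeighborsClassifier(),
                     param_grid={'n_neighbors': [1, 3, 5, 7, 10, 20]},
                     scoring='accuracy',
                     cv=4)
outer_scores = cross_val_score(inner, data.data, data.target,
                               scoring='accuracy', cv=10)
print(outer_scores.mean(), outer_scores.std())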
def main():
    # results_dir = "./nonstationary"
    results_dir = "./prova"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    logging.basicConfig(filename=os.path.join(results_dir, 'results.log'),
                        level=logging.INFO)

    # Build four unit squares translated to the corners of a 2x2 grid,
    # one class per square.
    n_samples = 200
    x_square = np.random.uniform(0, 1, (n_samples, 2))
    x_square_1 = x_square.copy()
    x_square_1[:, 1] = x_square[:, 1] + 2
    x_square_2 = x_square.copy()
    x_square_2[:, 0] = x_square[:, 0] + 2
    x_square_3 = x_square.copy()
    x_square_3[:, 0] = x_square[:, 0] + 2
    x_square_3[:, 1] = x_square[:, 1] + 2
    x_square = np.concatenate([x_square, x_square_1, x_square_2, x_square_3])
    y_square = np.zeros((x_square.shape[0], ), dtype='int')
    L = len(x_square_1)
    y_square[L:2 * L] = 1
    y_square[2 * L:3 * L] = 2
    y_square[3 * L:4 * L] = 3
    # plt.figure()
    # plt.scatter(x_square[:, 0], x_square[:, 1], c=y_square)
    # plt.show()
    # return

    datasets = {
        "square": (x_square, y_square),
    }

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data
        X = StandardScaler().fit_transform(X)

        N = 30
        num_epochs = 400
        lr_dual = 0.0008
        lr_standard = 0.008
        lmb_dual = 0.01
        lmb_standard = 0.01
        repetitions = 10

        kmeans_losses = []
        mlp_losses = []
        mlp_loss_Q = []
        mlp_loss_E = []
        mlp_time = []
        mlp_nodes = []
        trans_losses = []
        trans_loss_Q = []
        trans_loss_E = []
        trans_time = []
        trans_nodes = []
        steps = []
        for i in range(repetitions):
            model_mlp = DeepCompetitiveLayerNonStationary(
                verbose=False,
                lmb=lmb_standard,
                N=N,
                num_epochs=num_epochs,
                lr=lr_standard)
            start_time = time.time()
            model_mlp.fit(X, y)
            mlp_time.append(time.time() - start_time)
            model_mlp.compute_graph()
            model_mlp.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_{i}_standard.pdf"))
            model_mlp.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_standard.png"))
            mlp_losses.extend(model_mlp.loss_vals)
            mlp_loss_Q.extend(model_mlp.loss_Q_)
            mlp_loss_E.extend(model_mlp.loss_E_)
            mlp_nodes.extend(model_mlp.node_list_)

            model_trans = DeepTopologicalNonstationaryClustering(
                verbose=False,
                lmb=lmb_dual,
                N=N,
                num_epochs=num_epochs,
                lr=lr_dual)
            start_time = time.time()
            model_trans.fit(X, y)
            trans_time.append(time.time() - start_time)
            model_trans.compute_graph()
            model_trans.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_{i}_dual.pdf"))
            model_trans.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_dual.png"))
            trans_losses.extend(model_trans.loss_vals)
            trans_loss_Q.extend(model_trans.loss_Q_)
            trans_loss_E.extend(model_trans.loss_E_)
            trans_nodes.extend(model_trans.node_list_)
            steps.extend(np.arange(0, len(model_trans.loss_vals)))

        losses = pd.DataFrame({
            'epoch': steps,
            'standard': mlp_losses,
            'dual': trans_losses,
        })
        sns.set_style('whitegrid')
        plt.figure(figsize=[4, 3])
        sns.lineplot(x='epoch', y='standard', data=losses, label='standard',
                     ci=99)
        sns.lineplot(x='epoch', y='dual', data=losses, label='dual', ci=99)
        # plt.yscale('log')
        plt.ylabel('loss')
        plt.title(f'{dataset}')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss.png'))
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss.pdf'))
        plt.show()

        losses_Q = pd.DataFrame({
            'epoch': steps,
            'standard': mlp_loss_Q,
            'dual': trans_loss_Q,
        })
        sns.set_style('whitegrid')
        plt.figure(figsize=[4, 3])
        sns.lineplot(x='epoch', y='standard', data=losses_Q, label='standard',
                     ci=99)
        sns.lineplot(x='epoch', y='dual', data=losses_Q, label='dual', ci=99)
        # plt.yscale('log')
        plt.ylabel('quantization error')
        plt.title(f'{dataset}')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_q.png'))
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_q.pdf'))
        plt.show()

        losses_E = pd.DataFrame({
            'epoch': steps,
            'standard': mlp_loss_E,
            'dual': trans_loss_E,
        })
        sns.set_style('whitegrid')
        plt.figure(figsize=[4, 3])
        sns.lineplot(x='epoch', y='standard', data=losses_E, label='standard',
                     ci=99)
        sns.lineplot(x='epoch', y='dual', data=losses_E, label='dual', ci=99)
        # plt.yscale('log')
        plt.ylabel('topological complexity')
        plt.title(f'{dataset}')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_e.png'))
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_e.pdf'))
        plt.show()

        # nodes = pd.DataFrame({
        #     'epoch': steps,
        #     'standard': mlp_nodes,
        #     'dual': trans_nodes,
        # })
        # sns.set_style('whitegrid')
        # plt.figure(figsize=[4, 3])
        # sns.lineplot(x='epoch', y='standard', data=nodes, label='standard',
        #              ci=99)
        # sns.lineplot(x='epoch', y='dual', data=nodes, label='dual', ci=99)
        # # plt.yscale('log')
        # plt.ylabel('number of prototypes')
        # plt.title(f'{dataset}')
        # plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        # plt.tight_layout()
        # plt.savefig(os.path.join(results_dir, f'{dataset}_nodes.png'))
        # plt.savefig(os.path.join(results_dir, f'{dataset}_nodes.pdf'))
        # plt.show()

        logging.info(f'Analyzing dataset: {dataset}')
        logging.info(
            f'Standard competitive layer elapsed time: '
            f'{np.mean(mlp_time):.2f} +- {np.std(mlp_time):.2f}')
        logging.info(
            f'Dual layer elapsed time: '
            f'{np.mean(trans_time):.2f} +- {np.std(trans_time):.2f}')
def main():
    logging.basicConfig(filename='log',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    numpy.random.seed(0)

    options = dict(cv=10,
                   scoring=[
                       'accuracy', 'average_precision', 'f1', 'f1_micro',
                       'f1_macro', 'f1_weighted', 'neg_log_loss', 'precision',
                       'recall', 'roc_auc'
                   ],
                   verbose=2,
                   logbook=dict(log=True,
                                filename="results/Log-%d.xlsx" % (time.time()),
                                scores_worksheet="Scores",
                                optimizations_worksheet="Optimizations",
                                estimator_file="models/%s-%s-%s.pkl"))

    logbook = xl.Workbook()
    scores_worksheet = logbook.create_sheet(
        title=options['logbook']['scores_worksheet'])
    optimizations_worksheet = logbook.create_sheet(
        title=options['logbook']['optimizations_worksheet'])
    scores_worksheet.append([
        'Time', 'Dataset', 'Classifier', 'Scoring', 'Scores', 'Mean Score',
        'Std Dev'
    ])
    optimizations_worksheet.append([
        'Time', 'Dataset', 'Classifier', 'Scoring', 'Best Score',
        'Best Params', 'Best Estimator', 'File'
    ])

    datasets = {
        'Heart Disease': {
            'filename': 'datasets/cleveland.csv',
            'drop_na': True,
        },
        'Diabetes': {
            'filename': 'datasets/diabetes.csv',
            'replace_zero_values':
            ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
        },
    }

    classifiers = {
        'Logistic Regression': {
            'clf': linear_model.LogisticRegression(random_state=37,
                                                   C=0.13,
                                                   penalty='l1'),
            'cv_params': dict(C=numpy.arange(0.1, 5, 0.01).tolist(),
                              penalty=['l1', 'l2']),
            'optimize': True,
            'random': False
        },
        'Linear SVC': {
            'clf': svm.LinearSVC(random_state=37, C=18.09),
            'cv_params': dict(C=numpy.arange(0.1, 50, 0.01).tolist()),
            'optimize': True,
            'random': False
        },
        'Naive Bayes': {
            'clf': naive_bayes.GaussianNB(),
            'cv_params': dict(),
            'optimize': True,
            'random': False
        },
        'K-Nearest Neighbors': {
            'clf': neighbors.KNeighborsClassifier(algorithm='brute',
                                                  n_jobs=-1,
                                                  n_neighbors=13,
                                                  weights='uniform'),
            'cv_params': dict(n_neighbors=numpy.arange(1, 50).tolist(),
                              weights=['uniform', 'distance']),
            'optimize': True,
            'random': False
        },
        'MLP Classifier': {
            'clf': neural_network.MLPClassifier(
                random_state=37,
                learning_rate_init=0.026958815931057856,
                learning_rate='constant',
                hidden_layer_sizes=(29, 26, 5),
                activation='identity',
                alpha=16.681005372000556,
                max_iter=5000),
            'cv_params': dict(
                hidden_layer_sizes=GenerateNeurons(2),
                activation=['identity', 'logistic', 'tanh', 'relu'],
                learning_rate=['constant', 'invscaling', 'adaptive'],
                alpha=numpy.logspace(-5, 3, 5),
                learning_rate_init=stats.uniform(0.001, 0.05)),
            'optimize': True,
            'random': True,
            'random_iterations': 10000
        },
    }

    try:
        for dataset_name, dataset_options in datasets.items():
            logging.info("Dataset: %s" % (dataset_name))
            dataframe = LoadDataset(dataset_options['filename'])
            standard_scalar, features, predictions = ProcessData(
                dataframe, dataset_options)
            for classifier_name, classifier in classifiers.items():
                logging.info("--- Processing: %s ---" % (classifier_name))
                if not classifier['optimize']:
                    scoring_options = (options['scoring']
                                       if isinstance(options['scoring'], list)
                                       else [options['scoring']])
                    for scoring in scoring_options:
                        # 'is' compares identity, not string equality; use '=='.
                        if scoring == 'neg_log_loss' and not hasattr(
                                classifier['clf'], 'predict_proba'):
                            logging.info("--- --- Skipping %s for %s" %
                                         (scoring, classifier_name))
                            continue
                        scores = model_selection.cross_val_score(
                            classifier['clf'],
                            features,
                            predictions,
                            cv=options['cv'],
                            scoring=scoring)
                        scores_worksheet.append([
                            time.time(), dataset_name, classifier_name,
                            scoring, str(scores), scores.mean(), scores.std()
                        ])
                        logging.info("--- --- %.2f%% %s" %
                                     (scores.mean() * 100, scoring))
                else:
                    logging.info("--- --- Optimizing Hyper-Parameters")
                    scoring = (options['scoring'][0]
                               if isinstance(options['scoring'], list)
                               else options['scoring'])
                    if not classifier['random']:
                        grid = model_selection.GridSearchCV(
                            classifier['clf'],
                            classifier['cv_params'],
                            cv=options['cv'],
                            scoring=scoring,
                            n_jobs=-1,
                            verbose=options['verbose'])
                    else:
                        grid = model_selection.RandomizedSearchCV(
                            classifier['clf'],
                            classifier['cv_params'],
                            n_iter=classifier['random_iterations'],
                            cv=options['cv'],
                            scoring=scoring,
                            n_jobs=-1,
                            verbose=options['verbose'])
                    grid.fit(features, predictions)
                    dump_file = options['logbook']['estimator_file'] % (
                        dataset_name, classifier_name, time.time())
                    joblib.dump(grid.best_estimator_, dump_file)
                    optimizations_worksheet.append([
                        time.time(), dataset_name, classifier_name, scoring,
                        grid.best_score_,
                        str(grid.best_params_),
                        str(grid.best_estimator_), dump_file
                    ])
                    logging.info(
                        "--- --- %.2f%% %s with %s" %
                        (grid.best_score_ * 100, scoring, classifier_name))
                    for key, value in grid.best_params_.items():
                        logging.info("--- --- -- %s: %s" % (key, value))
            logging.info("--- Finished Processing")
    except KeyboardInterrupt:
        logging.error("XXXXX Keyboard Interrupt XXXXX")
    finally:
        logging.info("Saving Worksheet %s" % (options['logbook']['filename']))
        logbook.save(filename=options['logbook']['filename'])
                                                 n_components=n_components)
    methods['MDS'] = manifold.MDS(n_components=n_components,
                                  max_iter=100,
                                  n_init=1)

    k_methods = dict()
    k_methods["K_Isomap"] = drm.k_isomap
    k_methods["K_LLE"] = drm.k_lle
    k_methods["K_LE"] = drm.k_le
    k_methods["K_MDS"] = drm.k_mds

    for label, method in methods.items():
        Y_conv = method.fit_transform(X)
        kstr = "K_" + label
        Y_k = k_methods[kstr](X, n_components, n_neighbors)
        curve_data, area, cname = quality_curve(X, Y_conv, n_neighbors, 'r',
                                                False)
        curve_data2, area2, _ = quality_curve(X, Y_k, n_neighbors, 'r', False)
        draw_curve(curve_data, area, curve_data2, area2, cname, label,
                   data_name, n_neighbors)


# methods
for data_name, X in datasets.items():
    nsamples = len(X)
    n_components = 2
    n_neighbors = 20
    preform_methods(X, data_name, nsamples, n_components, n_neighbors)
def main():
    results_dir = "cole"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    n_samples = 30
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, centers=3, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    y = np.asarray([i for i in range(1000)])
    x = np.asarray([i % 4 for i in range(1000)])
    X_gabri = np.vstack([x, y]).T
    y_gabri = x
    # plt.figure()
    # sns.scatterplot(X_gabri[:, 0], X_gabri[:, 1], hue=y)
    # plt.show()

    theta = np.radians(np.linspace(30, 360 * 4, n_samples))
    r = theta**2
    x_2 = r * np.cos(theta)
    y_2 = r * np.sin(theta)
    X_spiral = np.vstack([x_2, y_2]).T
    y_spiral = np.zeros(len(X_spiral))
    # plt.figure()
    # plt.scatter(x_2, y_2)
    # plt.show()

    (x_train, y_train), (x_test, y_test) = load_data()
    x_test = np.reshape(x_test,
                        (x_test.shape[0], x_test.shape[1] * x_test.shape[1]))
    x_digits, y_digits = load_digits(return_X_y=True)

    datasets = {
        # "Spiral": (X_spiral, y_spiral),
        # "digits": (x_digits, y_digits),
        # "mnist": (x_test[:5000], y_test[:5000]),
        # "gabri": (X_gabri, y_gabri),
        # "noisy_circles": noisy_circles,
        # "noisy_moons": noisy_moons,
        "blobs": blobs,
        # "aniso": aniso,
        # "varied": varied,
    }

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data
        # if dataset == 'mnist':
        #     mnist_file = os.path.join(results_dir, 'mnist.csv')
        #     if not os.path.isfile(mnist_file):
        #         X2 = X / 255
        #         X3 = np.expand_dims(X2, axis=3)
        #         X4 = np.append(X3, X3, axis=3)
        #         X4 = np.append(X4, X3, axis=3)
        #         XZ = np.zeros((X4.shape[0], 32, 32, 3))
        #         XZ[:, 2:30, 2:30, :] = X4
        #         IMG_SHAPE = (32, 32, 3)
        #         pretrained_model = tf.keras.applications.MobileNet(
        #             input_shape=IMG_SHAPE,
        #             include_top=False,
        #             weights='imagenet')
        #         # Unfreeze the base model
        #         pretrained_model.trainable = True
        #         inputs = Input(shape=IMG_SHAPE)
        #         x = pretrained_model(inputs, training=True)
        #         x = GlobalAveragePooling2D()(x)
        #         outputs = Dense(len(set(y)))(x)
        #         pre_model = Model(inputs, outputs)
        #         pre_model.compile(
        #             optimizer=tf.keras.optimizers.Adam(1e-5),  # Very low learning rate
        #             loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        #             metrics=[tf.keras.metrics.CategoricalAccuracy()])
        #         # Train end-to-end. Be careful to stop before you overfit!
        #         pre_model.fit(XZ, to_categorical(y), epochs=70, batch_size=200)
        #         preds = pretrained_model.predict(XZ)
        #         X = np.reshape(preds, (preds.shape[0], preds.shape[-1]))
        #         Xpd = pd.DataFrame(X)
        #         Xpd.to_csv(mnist_file)
        #     else:
        #         X = pd.read_csv(mnist_file, index_col=0).values
        #
        # # Freeze base model
        # # pretrained_model.trainable = False
        # # Create new model on top.
        # # inputs = Input(shape=IMG_SHAPE)
        # # x = pretrained_model(inputs, training=False)
        # # x = GlobalAveragePooling2D()(x)
        # # outputs = Flatten()(x)
        # # X = XZ

        X = StandardScaler().fit_transform(X)

        n = X.shape[0]
        d = X.shape[1]
        latent_dim = 2
        k = 3
        lr = 0.008
        epochs = 800
        lbd = 0.01

        # inputs = Input(shape=(d,), name='input')
        # outputs = inputs
        # model = BaseModel(n_features=d, k_prototypes=k, inputs=inputs,
        #                   outputs=outputs, deep=False)
        # optimizer = tf.keras.optimizers.Adam(learning_rate=0.008)
        # model.compile(optimizer=optimizer)
        # model.summary()
        # model.fit(X, y, epochs=epochs, verbose=True)
        # x_pred = model.predict(X)
        # prototypes = model.base_model.weights[-1].numpy()
        # # G = compute_graph(x_pred, prototypes)
        # # plt.figure()
        # # plot_confusion_matrix(x_pred, prototypes, y)
        # # plt.show()
        # plt.figure()
        # scatterplot(x_pred, prototypes, y, valid=False)
        # plt.savefig(os.path.join(results_dir, dataset + ".png"))
        # plt.show()

        inputs = Input(shape=(d, ), name='input')
        outputs = inputs
        # x = Dense(512)(inputs)
        # outputs = Dense(256)(x)
        # x = Dense(128)(x)
        # outputs = Dense(64)(x)
        model = DualModel(n_samples=n,
                          k_prototypes=k,
                          inputs=inputs,
                          outputs=outputs,
                          deep=False)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(optimizer=optimizer)
        model.summary()
        model.fit(X, y, epochs=epochs)
def main():
    results_dir = "./paper0"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    logging.basicConfig(filename=os.path.join(results_dir, 'results.log'),
                        level=logging.INFO)

    n_samples = 500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    y = np.asarray([i for i in range(1000)])
    x = np.asarray([i % 4 for i in range(1000)])
    X_gabri = np.vstack([x, y]).T
    y_gabri = x

    theta = np.radians(np.linspace(30, 360 * 4, n_samples))
    r = theta**2
    x_2 = r * np.cos(theta)
    y_2 = r * np.sin(theta)
    X_spiral = np.vstack([x_2, y_2]).T
    y_spiral = np.zeros(len(X_spiral))
    # plt.figure()
    # plt.scatter(x_2, y_2)
    # plt.show()

    Xl, yl = make_classification(n_samples=n_samples,
                                 n_features=2,
                                 class_sep=25,
                                 n_informative=2,
                                 n_redundant=0,
                                 hypercube=False,
                                 n_classes=4,
                                 n_clusters_per_class=1,
                                 random_state=42)
    Xh, yh = make_classification(n_samples=n_samples,
                                 n_features=3000,
                                 class_sep=8,
                                 n_informative=10,
                                 n_redundant=2990,
                                 hypercube=False,
                                 n_classes=4,
                                 n_clusters_per_class=1,
                                 random_state=42)
    x_digits, y_digits = load_digits(return_X_y=True)

    datasets = {
        # "digits": (x_digits, y_digits),
        # "Spiral": (X_spiral, y_spiral),
        # "Circles": noisy_circles,
        "Moons": noisy_moons,
        # "Blobs (low)": (Xl, yl),
        # "Blobs (high)": (Xh, yh),
        # "gabri": (X_gabri, y_gabri),
        # "Ellipsoids": aniso,
        # "Blobs": blobs,
        # "varied": varied,
    }

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data
        X = StandardScaler().fit_transform(X)
        # u, s, vh = np.linalg.svd(X)
        # print(f'dataset: {dataset} | max s: {np.max(s)} - min s: {np.min(s)}')
        # continue

        ns, nf = X.shape
        k = 40
        epochs = 800
        lr_dual = 0.000008
        lr_base = 0.008
        lmb_dual = 0.01
        lmb_standard = 0.01
        repetitions = 1

        kmeans_losses = []
        mlp_losses = []
        mlp_loss_Q = []
        mlp_loss_E = []
        mlp_time = []
        mlp_nodes = []
        trans_losses = []
        trans_loss_Q = []
        trans_loss_E = []
        trans_time = []
        trans_nodes = []
        steps = []
        for i in range(repetitions):
            inputs = Input(shape=(nf, ), name='input')
            vanilla = BaseModel(n_features=nf,
                                k_prototypes=k,
                                deep=False,
                                inputs=inputs,
                                outputs=inputs)
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr_base)
            vanilla.compile(optimizer=optimizer)
            # model.layers[1].summary()
            vanilla.fit(X, y, epochs=epochs, verbose=False)
            prototypes = vanilla.base_model.weights[-1].numpy()

            # plt.figure(figsize=[4, 3])
            # dynamic_decay(X, model.prototypes_[:200], dim=0, valid=True,
            #               scale='log')
            # plt.savefig(f'{dataset}_decayX_vanilla.pdf')
            # plt.savefig(f'{dataset}_decayX_vanilla.png')
            # plt.show()
            # plt.figure(figsize=[4, 3])
            # dynamic_decay(X, vanilla.prototypes_[:40], valid=True,
            #               scale='log')
            # plt.savefig(f'{dataset}_decayY_vanilla.pdf')
            # plt.savefig(f'{dataset}_decayY_vanilla.png')
            # plt.show()
            # plt.figure()
            # scatterplot_dynamic(X, model.prototypes_, y, valid=True)
            # plt.savefig(f'{dataset}_dynamic_vanilla.pdf')
            # plt.savefig(f'{dataset}_dynamic_vanilla.png')
            # plt.show()

            # Dual
            print("Dual Model")
            inputs = Input(shape=(nf, ), name='input')
            model = DualModel(n_samples=ns,
                              k_prototypes=k,
                              deep=False,
                              inputs=inputs,
                              outputs=inputs)
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr_dual)
            model.compile(optimizer=optimizer)
            model.fit(X, y, epochs=epochs, verbose=False)
            x_pred = model.predict(X)
            prototypes = model.dual_model.predict(x_pred.T)

            # plt.figure(figsize=[4, 3])
            # dynamic_decay(X, model.prototypes_[:200], dim=0, valid=True,
            #               scale='log')
            # plt.savefig(f'{dataset}_decayX_dual.pdf')
            # plt.savefig(f'{dataset}_decayX_dual.png')
            # plt.show()

            plt.figure(figsize=[4, 3])
            dynamic_decay(X, vanilla.prototypes_, is_dual=False, valid=True,
                          scale='log', c='b')
            dynamic_decay(X, model.prototypes_, valid=True, scale='log',
                          c='r')
            plt.savefig(f'{dataset}_decayY_dual.pdf')
            plt.savefig(f'{dataset}_decayY_dual.png')
            plt.show()
def main():
    results_dir = "results"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    n_samples = 500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    y = np.asarray([i for i in range(1000)])
    x = np.asarray([i % 4 for i in range(1000)])
    X_gabri = np.vstack([x, y]).T
    y_gabri = x
    # plt.figure()
    # sns.scatterplot(X_gabri[:, 0], X_gabri[:, 1], hue=y)
    # plt.show()

    (x_train, y_train), (x_test, y_test) = load_data()
    x_test = np.reshape(x_test,
                        (x_test.shape[0], x_test.shape[1] * x_test.shape[1]))
    x_digits, y_digits = load_digits(return_X_y=True)

    datasets = {
        "digits": (x_digits, y_digits),
        "mnist": (x_test, y_test),
        "gabri": (X_gabri, y_gabri),
        "noisy_circles": noisy_circles,
        "noisy_moons": noisy_moons,
        "blobs": blobs,
        "aniso": aniso,
        "varied": varied,
    }
    # for dataset, data in datasets.items():
    #     d = pd.DataFrame(data)
    #     d.to_csv(f"{dataset}.csv")
    # return

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data
        if dataset == 'mnist':
            # MobileNet expects 3-channel images of at least 32x32: rescale,
            # stack the grayscale channel three times and zero-pad the 28x28
            # digits to 32x32, then use the pooled features as X.
            X2 = X / 255
            X3 = np.expand_dims(X2, axis=3)
            X4 = np.append(X3, X3, axis=3)
            X4 = np.append(X4, X3, axis=3)
            XZ = np.zeros((X4.shape[0], 32, 32, 3))
            XZ[:, 2:30, 2:30, :] = X4
            IMG_SHAPE = (32, 32, 3)
            base_model = tf.keras.applications.MobileNet(
                input_shape=IMG_SHAPE, include_top=False, weights='imagenet')
            preds = base_model.predict(XZ)
            preds = np.reshape(preds, (preds.shape[0], preds.shape[-1]))
            X = preds

        X = StandardScaler().fit_transform(X)

        N = 500
        model = DeepTopologicalClustering(verbose=False,
                                          N=N,
                                          num_epochs=400,
                                          lr=0.0008)
        model.fit(X)
        accuracy = model.score(y)
        print(f'Accuracy: {accuracy:.4f}')
        title = f'Accuracy: {accuracy:.4f}'
        model.plot_confusion_matrix(
            y, title, os.path.join(results_dir, f"{dataset}_confmat.pdf"))