import os

import matplotlib.pyplot as plt

# 'dp' and 'pol' are assumed to be the project's own data-processing and
# polynomial-regression helper modules.


def main():
    # Data processing
    df = dp.load_data('pol_regression.csv')
    x_train, y_train, x_test, y_test = dp.split_data(df)

    # Collect the train/test errors of each polynomial degree
    train_error = []
    test_error = []

    # Plot the ground truth (test points)
    plt.figure()
    plt.ylim(0, 1.5)
    # plt.plot(x_train, y_train, 'bo')
    plt.plot(x_test, y_test, 'bo')

    colors = ['r', 'y', 'b', 'c', 'k', 'm', 'g']

    # Perform polynomial regression for degrees 0 to 10
    for i, degree in enumerate([0, 1, 2, 3, 4, 5, 10]):
        w = 1
        if degree != 0:
            # Calculate the coefficients from the training data
            w = pol.pol_regression(x_train, y_train, degree)

        # Make predictions for the training and test data
        y_train_hat = pol.prediction(x_train, w, degree)
        y_test_hat = pol.prediction(x_test, w, degree)

        # Plot the predictions, sorted by x so the curve is drawn left to right
        sorted_pairs = zip(*sorted(zip(x_test, y_test_hat)))
        plt.plot(*sorted_pairs, color=colors[i])

        # Measure the accuracy of the model
        # RMSE of the training set
        train_error.append(
            pol.eval_pol_regression(y_train_hat, w, x_train, y_train, degree))
        # RMSE of the test set
        test_error.append(
            pol.eval_pol_regression(y_test_hat, w, x_test, y_test, degree))

        print("[Degree: {0}] - Train: {1:.4f}, Test: {2:.4f}".format(
            degree, train_error[i], test_error[i]))

    plt.legend(('ground truth', '$x^0$', '$x$', '$x^2$', '$x^3$', '$x^4$',
                '$x^5$', '$x^{10}$'), loc='lower right')
    plt.savefig(os.path.join('images', 'polynomial_split.png'))

    pol.plot_error_graph(train_error, test_error)
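
# A minimal sketch of the `pol` helpers that main() above relies on, assuming a
# plain least-squares polynomial fit and an RMSE metric. The names and
# signatures follow the calls in main(); the bodies are illustrative, not the
# project's actual implementation.
import numpy as np


def get_polynomial_matrix(x, degree):
    # Vandermonde-style design matrix with columns x^0 .. x^degree
    return np.column_stack([x ** p for p in range(degree + 1)])


def pol_regression(x, y, degree):
    # Least-squares fit of the polynomial coefficients
    X = get_polynomial_matrix(np.asarray(x, dtype=float), degree)
    w, *_ = np.linalg.lstsq(X, np.asarray(y, dtype=float), rcond=None)
    return w


def prediction(x, w, degree):
    # Evaluate the fitted polynomial at the given points
    x = np.asarray(x, dtype=float)
    if degree == 0:
        return np.full(len(x), w if np.isscalar(w) else w[0], dtype=float)
    return get_polynomial_matrix(x, degree) @ w


def eval_pol_regression(y_hat, w, x, y, degree):
    # Root-mean-square error; w, x and degree are accepted for signature
    # compatibility with the calls in main() but are unused in this sketch
    return float(np.sqrt(np.mean((np.asarray(y_hat) - np.asarray(y)) ** 2)))
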
# Assumes Keras 2 imports (ImageDataGenerator, optimizers, the CSVLogger,
# EarlyStopping, ReduceLROnPlateau and ModelCheckpoint callbacks), numpy as np,
# and module-level config/helpers: base_path, data_path, patience, batch_size,
# num_epochs, validation_split, CNN, load_data, preprocess_input, split_data.
def main():
    # Data generator for on-the-fly augmentation
    data_generator = ImageDataGenerator(featurewise_center=False,
                                        featurewise_std_normalization=False,
                                        rotation_range=0,
                                        width_shift_range=0.1,
                                        height_shift_range=0.1,
                                        zoom_range=.1,
                                        horizontal_flip=True)

    # Model and optimizer
    model = CNN()
    opt = optimizers.Adam(lr=0.0001)
    # opt = optimizers.SGD(lr=0.001)
    model.compile(opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # Callbacks
    log_file_path = base_path + 'gender_classification_training.log'
    # Truncate any previous log before training starts
    open(log_file_path, 'w').close()
    csv_logger = CSVLogger(log_file_path, append=False)
    early_stop = EarlyStopping('val_loss', patience=patience)
    reduce_lr = ReduceLROnPlateau('val_loss', factor=0.1,
                                  patience=int(patience / 4), verbose=1)
    trained_models = base_path + 'CNN.{epoch:02d}-{val_loss:.3f}-{val_acc:.2f}.hdf5'
    # model_cp = ModelCheckpoint(trained_models, 'val_acc', verbose=1, save_best_only=True)
    model_cp = ModelCheckpoint(trained_models, 'val_loss', verbose=1,
                               save_best_only=True)
    callbacks = [model_cp, csv_logger, early_stop, reduce_lr]

    # Load and preprocess data
    faces, labels = load_data(data_path)
    print(len(faces))
    print(len(labels))
    faces = preprocess_input(faces)

    # Shuffle faces and labels with the same random permutation
    order = np.argsort(np.random.random(len(faces)))
    faces = faces[order]
    labels = labels[order]

    train_data, val_data = split_data(faces, labels, validation_split)
    train_faces, train_labels = train_data
    model.fit_generator(data_generator.flow(train_faces, train_labels, batch_size),
                        steps_per_epoch=len(train_faces) // batch_size,
                        epochs=num_epochs,
                        verbose=1,
                        callbacks=callbacks,
                        validation_data=val_data)
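
# One plausible shape for the split_data() helper used above, assuming it
# simply carves the last `validation_split` fraction off the already shuffled
# arrays. This is a sketch, not the project's actual implementation.
import numpy as np


def split_data(x, y, validation_split=0.2):
    # Hold out the last `validation_split` fraction as the validation set
    num_train = int((1 - validation_split) * len(x))
    train_data = (x[:num_train], y[:num_train])
    val_data = (x[num_train:], y[num_train:])
    return train_data, val_data
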
# Assumes module-level imports of os and random, plus the project's settings
# module and its split_data / save_labels_to_file helpers.
def make_filenames_list_from_subdir(src_dir, shape, ratio):
    """Use the names of the subdirectories as class ids, then map each id to a class index."""
    class_id_set = set()
    # bottleneck_data = dict()
    feature_vectors, labels, filenames = [], [], []
    image_size = (shape[0], shape[1])
    listdir = os.listdir(src_dir)

    # 1) Find out the number of classes
    for class_id in listdir:
        subdir = src_dir + '/' + class_id
        if not os.path.isdir(subdir):
            continue
        if len(os.listdir(subdir)) == 0:
            continue
        try:
            class_id_int = int(class_id)
            class_id_set.add(class_id_int)
        except ValueError:
            # Skip subdirectories whose names are not integer class ids
            continue

    # 2) Map class_id to class_index
    id_list = sorted(class_id_set)
    print('Number of classes in the sample: {0}'.format(len(id_list)))
    print('Min class id: {0}'.format(min(id_list)))
    print('Max class id: {0}'.format(max(id_list)))
    map_id_label = {class_id: index for index, class_id in enumerate(id_list)}
    map_label_id = {index: class_id for index, class_id in enumerate(id_list)}
    maps = {'id_label': map_id_label, 'label_id': map_label_id}
    num_classes = len(map_id_label)
    NUM_CLASSES = num_classes
    save_labels_to_file(settings.LABELS_FILE, map_label_id)  # create the file labels.txt

    for class_id in class_id_set:
        subdir = src_dir + '/' + str(class_id)
        print(subdir)
        files = os.listdir(subdir)
        num_files = len(files)
        for index_file, filename in enumerate(files):
            base = os.path.splitext(filename)[0]
            ext = os.path.splitext(filename)[1]
            if ext not in {'.jpg', '.png'}:
                continue  # ????
            # if base.split('_')[-1] != '0p': continue  # use only _0p.jpg files
            class_index = map_id_label[class_id]
            label = class_index
            # label = [0] * num_classes
            # label[class_index] = 1
            file_path = subdir + '/' + filename
            # im = Image.open(file_path)
            # im = im.resize(image_size, Image.ANTIALIAS)
            # arr = np.array(im, dtype=np.float32) / 256
            # feature_vector = bottleneck_tensor.eval(feed_dict={x: [arr]})
            # feature_vectors.append(feature_vector)
            feature_vectors.append(0)  # placeholder; feature vectors are not used here
            filenames.append(file_path)  # filename or file_path
            labels.append(label)
            # im.close()
            print("dir={0}, class={1}: {2}/{3}: {4}".format(
                class_id, class_index, index_file, num_files, filename))

    print('----')
    print('Number of classes: {0}'.format(num_classes))
    print('Number of feature vectors: {0}'.format(len(feature_vectors)))

    data = {
        'images': feature_vectors,
        'labels': labels,
        'filenames': filenames,
    }

    # Shuffle the data
    if settings.DO_MIX:
        print('start mix data')
        zip3 = list(zip(data['images'], data['labels'], data['filenames']))
        random.shuffle(zip3)
        print('mix ok')
        data['images'] = [x[0] for x in zip3]
        data['labels'] = [x[1] for x in zip3]
        data['filenames'] = [x[2] for x in zip3]

    print('Split data')
    # data = split_data.split_data_v3(data, ratio=ratio)
    data = split_data(data, ratio=ratio, do_balancing=settings.DO_BALANCING)
    assert type(data['train']['labels'][0]) is int
    assert type(data['train']['filenames'][0]) is str
    # print(data['train']['labels'])
    # print(data['train']['filenames'])

    print('TRAIN')
    for i in range(len(data['train']['labels'])):
        print('{0} - {1}'.format(data['train']['labels'][i],
                                 data['train']['filenames'][i]))
    print('VALID')
    for i in range(len(data['valid']['labels'])):
        print('{0} - {1}'.format(data['valid']['labels'][i],
                                 data['valid']['filenames'][i]))

    data['id_label'] = map_id_label
    data['label_id'] = map_label_id
    data['num_classes'] = num_classes
    return data
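
# A rough sketch of the split_data() call above, assuming `ratio` is the
# fraction of samples reserved for validation and ignoring the do_balancing
# option. The project's real helper may differ.
def split_data(data, ratio=0.2, do_balancing=False):
    # Take the first `ratio` fraction of the (already shuffled) samples as the
    # validation set and the rest as the training set; class balancing is not
    # attempted in this sketch.
    num_valid = int(len(data['labels']) * ratio)
    return {
        'train': {key: values[num_valid:] for key, values in data.items()},
        'valid': {key: values[:num_valid] for key, values in data.items()},
    }
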
# Assumes numpy as np, pandas as pd, matplotlib.pyplot as plt, IPython's
# display(), and the project's split_data and Classifier helpers.
def split_variation(X=None, y=None, step=0.05):
    """Vary the train/test split ratio in `step` increments, train the logistic
    regression classifier on each split, and report the resulting accuracy and
    F1-score on the training and testing data."""
    if X is None or y is None:
        print("Data not provided.")
        return

    splits = []
    accuracies = {"train": [], "test": []}
    f1_scores = {"train": [], "test": []}

    for split in np.arange(step, 1.0, step):
        splits.append(split * 100)
        X_train, X_test, y_train, y_test = split_data(X, y, split)

        classifier = Classifier(model="logistic")
        classifier.train(X_train, y_train)

        # Evaluate on the training split
        classifier.validate(X_train, y_train)
        train_accuracy = classifier.model_accuracy()
        train_score = classifier.model_score()

        # Evaluate on the testing split
        classifier.validate(X_test, y_test)
        test_accuracy = classifier.model_accuracy()
        test_score = classifier.model_score()

        accuracies["train"].append(train_accuracy)
        accuracies["test"].append(test_accuracy)
        f1_scores["train"].append(train_score)
        f1_scores["test"].append(test_score)

    splits = np.array(splits)
    training = {
        "accuracy": np.array(accuracies["train"]),
        "score": np.array(f1_scores["train"]),
    }
    testing = {
        "accuracy": np.array(accuracies["test"]),
        "score": np.array(f1_scores["test"]),
    }

    table = pd.DataFrame({
        "Train size (%)": splits,
        "Training Accuracy": training["accuracy"],
        "Training F1-Score": training["score"],
        "Testing Accuracy": testing["accuracy"],
        "Testing F1-Score": testing["score"],
    })
    # Display the Styler (not the bare DataFrame) so the caption is shown
    display(table.style.set_caption("Variation of performance with train-test split"))

    plt.figure(figsize=(15, 10))
    plt.plot(splits, training["accuracy"], c="blue", label="Training Accuracy")
    plt.plot(splits, testing["accuracy"], c="green", label="Testing Accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("Training Set Size (%)")
    plt.title("Training Set Size vs Accuracy")
    plt.legend()
    plt.show()
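
# A minimal sketch of the split_data() helper assumed above, treating `split`
# as the training fraction (the results table labels it "Train size (%)") and
# delegating to scikit-learn. Illustrative only.
from sklearn.model_selection import train_test_split


def split_data(X, y, split):
    # `split` is interpreted as the training fraction; the rest is the test set
    return train_test_split(X, y, train_size=split, random_state=42)
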
    './car.data',
    'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data')
df.columns = [
    'buying', 'maintenance', 'doors', 'people', 'lug_boot', 'safety', 'class'
]

# CONVERT STRING VALUES TO THEIR NUMERICAL COUNTERPARTS (FASTER CALCULATION)
convert_to_numerical(df,
                     columns=[
                         'buying', 'maintenance', 'doors', 'people',
                         'lug_boot', 'safety', 'class'
                     ],
                     inplace=True)

# SPLIT DATASET INTO TRAINING, VALIDATION, TESTING
training, validation, test = split_data(df, inplace=True)

# CREATE CLASSIFIERS AND FIT THEM TO THE TRAINING DATA
training_X = training.iloc[:, :-1]
training_y = training.iloc[:, -1]

my_clf = DecisionTree(metric='gini')
my_clf.fit(training_X, training_y)

sklearn_clf = DecisionTreeClassifier()
sklearn_clf.fit(training_X, training_y)

# SPLIT VALIDATION DATASET INTO X AND y
validation_X = validation.iloc[:, :-1]
validation_y = validation.iloc[:, -1]

# PRINT METRICS FOR PRUNED AND UNPRUNED DECISION TREE
my_predictions = my_clf.predict(validation_X)
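
# One way the convert_to_numerical() helper above could work, assuming the
# data lives in a pandas DataFrame and each categorical column is replaced by
# integer codes. Illustrative sketch, not the original implementation.
def convert_to_numerical(df, columns, inplace=False):
    target = df if inplace else df.copy()
    for column in columns:
        # Map each distinct string value of the column to a stable integer code
        target[column] = target[column].astype('category').cat.codes
    return None if inplace else target
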
# Assumes TensorFlow 1.x as tf, numpy as np, random, and module-level
# dataset, numOfTests and numOfBatches, plus the project's split_data,
# create_model, get_regularization_penalty, get_batches and store_results helpers.
def main(_):
    with tf.device('/gpu:0'):
        for regularization_type in ['Blackout', 'None', 'L1', 'L2']:
            dataset_sizes = np.linspace(2500, 55000, num=22)
            for size in dataset_sizes:
                # Get the dataset at the requested size
                print(int(size))
                train_x, train_y, valid_x, valid_y, test_x, test_y = split_data(
                    dataset, int(size))

                # Reset the graph in case of multiple runs on the same console
                tf.reset_default_graph()

                for i in range(numOfTests):
                    # Randomly sample a hyper-parameter configuration
                    num_layers = random.choice([5, 6, 7, 8, 9, 10])
                    num_nodes = random.choice([200, 400, 600])
                    num_inputs = int(train_x.shape[1])
                    num_steps = random.choice([50, 100, 150, 200])
                    regularization_scale = random.choice(
                        [0.01, 0.005, 0.001, 0.0005])
                    percent_connections_kept = random.choice([0.9, 0.95, 0.85])
                    num_classes = len(np.unique(train_y))

                    print('Test No. ' + str(i) + '/' + str(numOfTests))
                    print('Parameters: ' + str(size) + ',' + regularization_type +
                          ',' + str(num_layers) + ',' + str(num_nodes) + ',' +
                          str(num_steps) + ',' + str(regularization_scale) + ',' +
                          str(percent_connections_kept))

                    # Create the model
                    x = tf.placeholder(tf.float32, [None, num_inputs])
                    y = create_model(x, num_layers, num_nodes, num_classes)

                    # Placeholder for the integer class labels
                    y_ = tf.placeholder(tf.int64, [None])

                    # Retrieve weights and define the regularization penalty
                    weights = tf.trainable_variables()
                    regularization_penalty, blackout_weights = get_regularization_penalty(
                        weights, regularization_scale, percent_connections_kept,
                        regularization_type)

                    # Define loss and optimizer
                    cross = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
                    loss = cross + regularization_penalty
                    train_step = tf.train.RMSPropOptimizer(0.001).minimize(loss)

                    # Evaluate the model
                    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
                    accuracy = tf.reduce_mean(
                        tf.cast(correct_prediction, tf.float32))

                    config = tf.ConfigProto()
                    config.gpu_options.allow_growth = True

                    # Initialize the session
                    sess = tf.InteractiveSession(config=config)
                    tf.global_variables_initializer().run()

                    # Train
                    # PercentageOfConnOff = []
                    # LossFunctionRegu = []
                    # LossFunctionCrossTrain = []
                    # LossFunctionCrossValid = []
                    # numOfBatches = 50
                    all_batches_x, all_batches_y = get_batches(
                        train_x, train_y, numOfBatches)

                    for step in range(num_steps):
                        # Pick a random mini-batch; randrange avoids the
                        # off-by-one that random.randint(0, numOfBatches) allowed
                        randomPick = random.randrange(numOfBatches)
                        # print(str(len(all_batches_x)) + " getting " + str(randomPick))
                        currentBatchX = all_batches_x[randomPick]
                        currentBatchY = all_batches_y[randomPick]
                        sess.run(train_step,
                                 feed_dict={x: currentBatchX, y_: currentBatchY})

                        # Periodically test the model being trained on the validation set
                        if step % 20 == 1:
                            print('Accuracy: ' + str(
                                sess.run(accuracy,
                                         feed_dict={x: valid_x, y_: valid_y})))

                        # if regularization_type == 'Blackout':
                        #     currentWeights = sess.run(blackout_weights)
                        #     part1 = currentWeights > -0.01
                        #     part2 = currentWeights < 0.01
                        #     turnedOff = np.sum(np.logical_and(part1, part2))
                        #     TotalNumOfWeights = float(currentWeights.shape[0])
                        #     LossFunctionCrossTrain.append(sess.run(cross, feed_dict={x: train_x, y_: train_y}))
                        #     LossFunctionCrossValid.append(sess.run(cross, feed_dict={x: valid_x, y_: valid_y}))
                        #     LossFunctionRegu.append(sess.run(regularization_penalty))
                        #     PercentageOfConnOff.append((TotalNumOfWeights - turnedOff) / TotalNumOfWeights)

                    # if regularization_type == 'Blackout':
                    #     fig = plt.figure()
                    #     ax1 = fig.add_subplot(1, 2, 1)
                    #     ax2 = fig.add_subplot(1, 2, 2)
                    #     ax1.plot(PercentageOfConnOff)
                    #     ax2.plot(LossFunctionCrossTrain, label='Cross-Entropy Train')
                    #     ax2.plot(LossFunctionCrossValid, label='Cross-Entropy Validation')
                    #     ax2.plot(LossFunctionRegu, label='Regularization')
                    #     ax2.legend()
                    #     fig.show()

                    # Evaluate the trained model on the validation and test sets
                    accuracyVal = sess.run(accuracy,
                                           feed_dict={x: valid_x, y_: valid_y})
                    accuracyTest = sess.run(accuracy,
                                            feed_dict={x: test_x, y_: test_y})

                    tf.reset_default_graph()

                    # Persist this run's configuration and results
                    store_results(dataset, regularization_type, num_layers,
                                  num_nodes, num_steps, regularization_scale,
                                  percent_connections_kept, accuracyVal,
                                  accuracyTest, size)
                    print('Accuracy Val: ' + str(accuracyVal) +
                          ' , Accuracy Test: ' + str(accuracyTest))
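
# A plausible sketch of the get_batches() helper used in the training loop
# above, assuming it just splits the training arrays into numOfBatches roughly
# equal mini-batches; the real helper may shuffle or batch differently.
import numpy as np


def get_batches(train_x, train_y, num_batches):
    # Chop the training arrays into `num_batches` roughly equal mini-batches
    all_batches_x = np.array_split(train_x, num_batches)
    all_batches_y = np.array_split(train_y, num_batches)
    return all_batches_x, all_batches_y
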
tensorflow>=1.5.0
"""
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from dbn import SupervisedDBNClassification

import data_processing

# Load data
original_train_set, original_test_set = data_processing.split_data(
    npy_data_file='../data/all_train.npy',
    train_portion=0.01,
    split_mode='first',
    save='npy')
# original_train_set = np.load('../data/all_train.npy')
# original_test_set = np.load('../data/test_set.npy')

# Fill in missing values
train_set = data_processing.missing_values(original_train_set, method='median')
test_set = data_processing.missing_values(original_test_set, method='median')

# Get X and y
X_train = train_set[1:, 1:-1]
# X_scaled_train = preprocessing.scale(X_train)
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled_train = min_max_scaler.fit_transform(X_train)
y_train = train_set[1:, -1]
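
# A sketch of what data_processing.missing_values() might do for
# method='median': replace each NaN column-wise with that column's median
# (mean otherwise), assuming a purely numeric array. Illustrative only.
def missing_values(data, method='median'):
    data = np.array(data, dtype=float)  # work on a numeric copy
    if method == 'median':
        col_fill = np.nanmedian(data, axis=0)
    else:
        col_fill = np.nanmean(data, axis=0)
    # Replace each NaN entry with its column's fill value
    nan_rows, nan_cols = np.where(np.isnan(data))
    data[nan_rows, nan_cols] = col_fill[nan_cols]
    return data
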