def run(spath_train, tpath_train, spath_test, tpath_test, fn_train, fn_predict_all,
        max_sentence_length=17, replace_unknown_words=True, use_bpe=True,
        num_operations=400, vocab_threshold=5, padding=True, model_name='nn'):
    # data preprocessing
    (spath_train_pp, tpath_train_pp, spath_test_pp, tpath_test_pp) = preprocess(
        spath_train, tpath_train, spath_test, tpath_test, max_sentence_length,
        replace_unknown_words, use_bpe, num_operations, vocab_threshold)
    print(f'Data files preprocessed ...')
    print()

    # data structures for training
    (slang, tlang, index_array_pairs, s_index_arrays_test, max_bpe_length) = dp.prepare_data(
        spath_train_pp, tpath_train_pp, spath_test_pp, padding)
    print(f'{len(index_array_pairs)} inputs constructed for training ...')
    print()

    # train and return losses for plotting
    (encoder, attn_decoder, plot_losses, plot_every) = fn_train(
        index_array_pairs, slang.n_words, tlang.n_words, max_bpe_length)
    print(f'Training finished ...')
    print()

    # plot the losses
    showLosses(plot_losses, plot_every, f'../output/{model_name}_losses.png')
    print(f'Losses diagram saved in ../output/{model_name}_losses.png')
    persistence.save(plot_losses, fp.path_to_outputfile(f'{model_name}.tl', '.trainloss'))

    # save models and data
    torch.save(encoder, f'../output/{model_name}_encoder.pt')
    torch.save(attn_decoder, f'../output/{model_name}_attn_decoder.pt')
    data = (s_index_arrays_test, slang, tlang, max_bpe_length)
    persistence.save(data, f'../output/{model_name}_data_run')
    print(f'Models and data saved to disk')
    print()

    _evaluate(s_index_arrays_test, tpath_test_pp, slang, tlang, encoder, attn_decoder,
              fn_predict_all, max_bpe_length, use_bpe, model_name)

    return encoder, attn_decoder, slang, tlang, plot_losses, max_bpe_length
def randomForestBagging(fileNames):
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    testY = testY.to_numpy().astype(int)

    # Load each pickled random forest and collect its predictions on the test set
    predictions = []
    for file in fileNames:
        with open(file, 'rb') as f:
            rf = dill.load(f)
        predictions.append(rf.predict(testX))

    # Average the predictions across all models and round to the nearest class
    pred = np.zeros(len(predictions[0]))
    for i in range(len(predictions[0])):
        for model_idx in range(len(fileNames)):
            pred[i] += predictions[model_idx][i]
        pred[i] = (pred[i] / len(fileNames)).round()
    pred = pred.astype(int)

    fpr, tpr, thresholds = metrics.roc_curve(testY, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred)
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(str(best_accuracy)))
    print("Per-Class Precision: {}".format(str(precision_array)))
    print("Per-Class Recall: {}".format(str(recall_array)))
    print("Area under the ROC Curve: {}".format(auc))
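# Usage sketch for the bagging helper above. The two file names are the model
# files written by buildModels() below ("randomForestTrueNeg.obj" and
# "randomForestAccuracy.obj"); averaging exactly these two forests is an
# illustrative choice, not a prescribed one.
# randomForestBagging(["randomForestTrueNeg.obj", "randomForestAccuracy.obj"])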
def log_optimal_hmms_for_users_single_variate(data, users, cov_type):
    optimal_hmms_single_variate = {}
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        dict_activity = {}
        for subfactor, activities in subfactor_activities.items():
            for activity in activities:
                prepared_data = dp.prepare_data(data, user, [activity])
                log = optimize_number_of_clusters(prepared_data.iloc[:, 2:],
                                                  list(range(2, 11)), cov_type)
    # Note: only the log of the last user/activity combination is returned
    return log
def get_optimal_hmms_for_users_single_variate(data, users, cov_type):
    optimal_hmms_single_variate = {}
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        dict_activity = {}
        for subfactor, activities in subfactor_activities.items():
            for activity in activities:
                prepared_data = dp.prepare_data(data, user, [activity])
                best_value, best_model = optimize_number_of_clusters(
                    prepared_data.iloc[:, 2:], list(range(2, 11)), cov_type)
                dict_activity.update({activity: best_model})
        dict_user = {user: dict_activity}
        optimal_hmms_single_variate.update(dict_user)
    return optimal_hmms_single_variate
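# A minimal sketch of optimize_number_of_clusters, which the get_optimal_hmms_*
# helpers above call but which is not shown here. It is assumed to fit a
# GaussianHMM for every candidate number of states and keep the model with the
# lowest BIC; the real helper's selection criterion may differ.
import numpy as np
from hmmlearn.hmm import GaussianHMM

def optimize_number_of_clusters(prepared_data, range_of_clusters, cov_type):
    best_value, best_model = np.inf, None
    for n_states in range_of_clusters:
        model = GaussianHMM(n_components=n_states, covariance_type=cov_type,
                            n_iter=1000).fit(prepared_data)
        log_likelihood = model.score(prepared_data)
        # BIC with a diagonal-covariance parameter count (assumption)
        n_features = prepared_data.shape[1]
        n_params = (n_states - 1) + n_states * (n_states - 1) + 2 * n_states * n_features
        bic = -2 * log_likelihood + n_params * np.log(prepared_data.shape[0])
        if bic < best_value:
            best_value, best_model = bic, model
    return best_value, best_model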
def log_activity_results(data, users, range_of_clusters, cov_type, single_multi):
    '''
    :param data: prepared data (values of activities by columns)
    :param users: list of user ids to evaluate
    :param range_of_clusters: range of candidate numbers of states, e.g. range(2, 11)
    :param cov_type: covariance type passed to GaussianHMM
    :param single_multi: 'single' or 'multi'; controls model and log file naming
    :return: DataFrame with BIC/AIC per user, activity and number of states
    This is a helper method for the get_optimal_hmms methods (they work for more citizens)
    '''
    import pickle
    log_results = []
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        for subfactor, activities in subfactor_activities.items():
            for activity in activities:
                prepared_data = dp.prepare_data(data, user, [activity])
                for n_states in range_of_clusters:
                    model = GaussianHMM(n_components=n_states, covariance_type=cov_type,
                                        n_iter=1000).fit(prepared_data.iloc[:, 1:])
                    log_likelihood = model.score(prepared_data.iloc[:, 1:])
                    criteria_bic = bic_criteria(prepared_data.iloc[:, 1:], log_likelihood, model)
                    criteria_aic = aic_criteria(prepared_data.iloc[:, 1:], log_likelihood, model)
                    aic_bic_dict = {
                        'user': user,
                        'activity': activity,
                        'n_states': n_states,
                        'BIC': criteria_bic,
                        'AIC': criteria_aic
                    }
                    log_results.append(aic_bic_dict)
                    if single_multi == 'single':
                        path = 'Experimental_Evaluation/Models/user_' + str(user) + 'activity_' \
                               + activity + '_n_states_' + str(n_states) + '.pkl'
                    if single_multi == 'multi':
                        path = 'Experimental_Evaluation/Models/user_' + str(user) + 'sub_factor_' \
                               + activity + '_n_states_' + str(n_states) + '.pkl'
                    with open(path, 'wb') as model_file:
                        pickle.dump(model, model_file)
    if single_multi == 'single':
        log_path = 'Experimental_Evaluation/single_variate_log.csv'
    if single_multi == 'multi':
        log_path = 'Experimental_Evaluation/multi_variate_log.csv'
    log = pd.DataFrame(log_results)
    log.to_csv(log_path)
    return log
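# Minimal sketches of the two criteria helpers called above (the project's own
# bic_criteria is only partially shown further down). They assume a GaussianHMM
# with diagonal covariances; the parameter count (initial distribution,
# transition matrix, means, diagonal variances) is an assumption and should be
# adapted for other covariance types.
import numpy as np

def bic_criteria(data, log_likelihood, model):
    n_features = data.shape[1]
    n_states = model.n_components
    n_params = (n_states - 1) + n_states * (n_states - 1) + 2 * n_states * n_features
    return -2 * log_likelihood + n_params * np.log(data.shape[0])

def aic_criteria(data, log_likelihood, model):
    n_features = data.shape[1]
    n_states = model.n_components
    n_params = (n_states - 1) + n_states * (n_states - 1) + 2 * n_states * n_features
    return -2 * log_likelihood + 2 * n_params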
def get_optimal_hmms_for_users_multi_variate(data, users, cov_type):
    optimal_hmms_multi_variate = {}
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        dict_subfactor = {}
        for subfactor in subfactor_activities.keys():
            activities = subfactor_activities[subfactor]
            prepared_data = dp.prepare_data(data, user, activities)
            best_value, best_model = optimize_number_of_clusters(
                prepared_data.iloc[:, 2:], list(range(2, 11)), cov_type)
            dict_subfactor.update({subfactor: {'model': best_model,
                                               'activities': activities}})
        dict_user = {user: dict_subfactor}
        optimal_hmms_multi_variate.update(dict_user)
    return optimal_hmms_multi_variate
def predict_multi_variate(data, users_ges_activities):
    '''
    :param data: raw activity data to cluster
    :param users_ges_activities: dict mapping user id -> {ges sub-factor: list of activities}
    :return: 0; the clustered data is written to one CSV per user and sub-factor
    '''
    for user, ges_activities in users_ges_activities.items():
        for ges, activities in ges_activities.items():
            model = pickle_hmm.load_pickle_hmm_multi_variate(user, ges)
            prep_data = dp.prepare_data(data, user, activities)
            clusters = model.predict(prep_data.iloc[:, 2:])
            probas = model.predict_proba(prep_data.iloc[:, 2:])
            probas_np = np.array(probas)
            max_probas = np.amax(probas_np, 1)
            prep_data['cluster'] = clusters
            prep_data['max_probability'] = max_probas
            prep_data.to_csv('Data/clustered_data/multi_variate_clusters/citizen_id_'
                             + str(user) + '_' + ges + '.csv')
    return 0
def predict_single_variate(users_activities):
    '''
    :param users_activities: dict mapping user id -> list of activities
    :return: long-format DataFrame with cluster and max_probability per observation
    Note: `data` is expected to be available at module level (see the
    notebook-style exploration calls further down).
    '''
    df_predictions = pd.DataFrame()
    for user, activities in users_activities.items():
        for activity in activities:
            model = pickle_hmm.load_pickle_hmm_single_variate(user, activity)
            prep_data = dp.prepare_data(data, user, [activity])
            clusters = model.predict(prep_data.iloc[:, 2:])
            probas = model.predict_proba(prep_data.iloc[:, 2:])
            probas_np = np.array(probas)
            max_probas = np.amax(probas_np, 1)
            prep_data['cluster'] = clusters
            prep_data['max_probability'] = max_probas
            a = pd.melt(prep_data,
                        id_vars=['user_in_role_id', 'interval_end', 'cluster', 'max_probability'],
                        value_vars=activity)
            df_predictions = df_predictions.append(a)
    return df_predictions
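# Usage sketch for predict_single_variate above, using the user id and activity
# that appear in the exploration calls further down (user 66, 'sleep_deep_time');
# it assumes the module-level `data` frame and the pickled per-activity HMMs exist.
# users_activities = {66: ['sleep_deep_time']}
# df_predictions = predict_single_variate(users_activities)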
def main():
    df = data_preparation.prepare_data(["Data/players_15.csv", "Data/players_16.csv",
                                        "Data/players_17.csv", "Data/players_18.csv",
                                        "Data/players_19.csv", "Data/players_20.csv",
                                        "Data/players_21.csv"])
    seed = 10

    # Run the decision tree model on the max depths specified in the list, then show
    # the accuracy chart and the confusion matrix of the most accurate results
    max_depth_list = [5, 7, 9, 10, 12, 14]
    dt_accuracy_list, dt_confusion_matrices = decision_tree.decision_tree(df, seed, max_depth_list=max_depth_list)
    plot_bar_accuracy("Decision Tree Accuracy", "Depth", "Accuracy", dt_accuracy_list, max_depth_list)
    for confusion_matrix in dt_confusion_matrices:
        plot_cm(confusion_matrix, "Decision Tree Confusion Matrix")

    # Run random forest on the specified numbers of trees and show the accuracy chart
    # and the confusion matrix of the best random forest
    num_of_trees_list = [10, 15, 20, 25, 30, 35]
    rf_accuracy_list, rf_confusion_matrices = random_forest.random_forest(df, seed, num_of_trees_list=num_of_trees_list)
    plot_bar_accuracy("Random Forest Accuracy", "Trees", "Accuracy", rf_accuracy_list, num_of_trees_list)
    for confusion_matrix in rf_confusion_matrices:
        plot_cm(confusion_matrix, "Random Forest Confusion Matrix")

    # For naive Bayes, run it only once and show the accuracy chart (with a single value)
    # and its confusion matrix
    nb_accuracy, nb_confusion_matrix = naive_bayes.naive_bayes(df, seed)
    plot_bar_accuracy("Naive Bayes Accuracy", "Smoothing", "Accuracy", [0, 0, nb_accuracy, 0, 0], ["", "", 1.0, "", ""])
    plot_cm(nb_confusion_matrix, "Naïve Bayes Confusion Matrix")

    # Run kNN on the specified neighbor counts and show the accuracy chart and the
    # confusion matrix of the most accurate kNN
    neighbors_list = [10, 15, 25, 150, 210, 250, 300]
    kNN_accuracy_list, kNN_confusion_matrices = kNN.k_nearest_neighbors(df, seed, neighbors_list=neighbors_list)
    plot_bar_accuracy("K Nearest Neighbors Accuracy", "Neighbors", "Accuracy", kNN_accuracy_list, neighbors_list)
    for confusion_matrix in kNN_confusion_matrices:
        plot_cm(confusion_matrix, "kNN Confusion Matrix")

    # Show the chart comparing the best accuracy from each model
    best_accuracy_list = [max(dt_accuracy_list), max(rf_accuracy_list), nb_accuracy, max(kNN_accuracy_list)]
    plot_bar_accuracy("Model Accuracy Comparison", "Models", "Accuracy", best_accuracy_list,
                      ["Decision Tree", "Random Forest", "Naive Bayes", "kNN"])
    print("")
def buildModels():
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    trainY = trainY.to_numpy().astype(int)
    testY = testY.to_numpy().astype(int)

    accuracy = []
    pred = []
    highestTrueNeg = 98  # set to previous highest
    highestAcc = .707    # set to previous highest
    for estimators in range(20, 1000, 10):
        rf = RandomForestRegressor(n_estimators=estimators)
        rf.fit(trainX, trainY)
        predictions = rf.predict(testX).round().astype(int)
        accuracy.append(metrics.accuracy_score(testY, predictions))
        pred.append(predictions)
        # manually consider models with better than 69% accuracy
        if metrics.accuracy_score(testY, predictions) > .69:
            conf_matrix, class_acc, recall_array, precision_array = func_confusion_matrix(testY, predictions)
            if conf_matrix[0, 0] > highestTrueNeg:
                with open("randomForestTrueNeg.obj", "wb") as tn:
                    dill.dump(rf, tn)
                highestTrueNeg = conf_matrix[0, 0]
            elif metrics.accuracy_score(testY, predictions) > highestAcc:
                with open("randomForestAccuracy.obj", "wb") as acc:
                    dill.dump(rf, acc)
                highestAcc = class_acc

    index, value = max(enumerate(accuracy), key=operator.itemgetter(1))
    print("Best Number of Estimators: {}".format(20 + 10 * index))

    # Use the forest's predict method on the test data
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred[index])
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(str(best_accuracy)))
    print("Per-Class Precision: {}".format(str(precision_array)))
    print("Per-Class Recall: {}".format(str(recall_array)))
def get_data() -> pd.DataFrame:
    return dp.prepare_data()
def train_model(self, data, model_path, plot='plot.png', lr=1e-3, height=32, width=32,
                batch_size=32, epochs=50):
    from data_preparation import prepare_data
    import matplotlib.pyplot as plt

    model = self.build_model()
    opt = Adam(lr=lr)
    model.compile(loss="categorical_crossentropy", optimizer=opt,
                  metrics=[metrics.mae, metrics.categorical_accuracy])

    # Split training/validation 80/20
    # https://faroit.github.io/keras-docs/2.0.8/preprocessing/image/
    prepare_data(data_location=data, train_data_path=data + '/train/', test_data_path=data + '/test/')
    datagen = ImageDataGenerator(  # validation_split=0.2,
        rescale=1. / 255,
        rotation_range=20.,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )
    train_gen = datagen.flow_from_directory(
        data + '/train/',
        classes=CLASSES,
        target_size=(height, width),
        batch_size=batch_size
    )
    val_gen = datagen.flow_from_directory(
        data + '/test/',
        classes=CLASSES,
        target_size=(height, width)
    )

    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    info = model.fit_generator(train_gen,
                               steps_per_epoch=3,
                               validation_steps=2,
                               validation_data=val_gen,
                               epochs=epochs,
                               callbacks=[early_stopping]
                               )
    model.save(os.path.join(model_path, 'model.h5'))

    # plot the training loss and accuracy
    plt.style.use("ggplot")
    plt.figure()
    num = len(info.history["loss"])
    plt.plot(np.arange(0, num), info.history["loss"], label="train_loss")
    plt.plot(np.arange(0, num), info.history["val_loss"], label="val_loss")
    plt.plot(np.arange(0, num), info.history["categorical_accuracy"], label="train_acc")
    plt.plot(np.arange(0, num), info.history["val_categorical_accuracy"], label="val_acc")
    plt.plot(np.arange(0, num), info.history["mean_absolute_error"], label="train_mae")
    plt.plot(np.arange(0, num), info.history["val_mean_absolute_error"], label="val_mae")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="upper right")
    plt.savefig(plot)
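# Usage sketch for train_model above; the surrounding class is not shown, so the
# name `ImageClassifier` is only an assumption, and `data` must point at a folder
# that prepare_data() can split into train/ and test/ subfolders.
# classifier = ImageClassifier()
# classifier.train_model(data='dataset', model_path='models', plot='training.png')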
import random
from math import ceil

from data_preparation import prepare_data

prepare_data()

"""
for unicode in range(12353, 12436):
    create_image(unicode, georgia_bold)
"""
def nested_cv(params):
    # The number of validation splits
    cv_num = params['cv_num']
    # The number of test splits
    cv_num_test = params['cv_num_test']
    # Data and labels
    data, hc_ad, site_id_, split_ref_, ref = prepare_data(params['source_dir'])

    # Numpy arrays for the ML models' performance indices
    measurements_transfer = np.zeros(shape=[cv_num_test, 5])
    measurements_classifier = np.zeros(shape=[cv_num_test, 5])
    measurements_svm = np.zeros(shape=[cv_num_test, 5])
    measurements_rf = np.zeros(shape=[cv_num_test, 5])

    # Lists of accuracies for the final output
    accs_transfer = []
    accs_classifier = []
    accs_svm = []
    accs_rf = []

    skf_ = StratifiedKFold(n_splits=cv_num_test, random_state=0, shuffle=True)
    # Split keeping the ratio of acquisition sites and ASD/TC simultaneously
    for cv_iteration_, (train_index_, test_index) in enumerate(skf_.split(data, split_ref_)):
        # List of standard scalers for each validation split
        scalers = []
        # Lists of trained models
        trained_transfer = []
        trained_classifier = []
        epochs = []

        # Split data
        x_train_, x_test = data[train_index_], data[test_index]
        labels_train_, labels_test = hc_ad[train_index_], hc_ad[test_index]
        split_ref = split_ref_[train_index_]

        skf = StratifiedKFold(n_splits=cv_num, random_state=0, shuffle=True)
        # Split keeping the ratio of acquisition sites and ASD/TC simultaneously
        for cv_iteration, (train_index, valid_index) in enumerate(skf.split(x_train_, split_ref)):
            epochs_cv = []
            # Split data
            x_train, x_valid = x_train_[train_index], x_train_[valid_index]
            labels_train, labels_valid = labels_train_[train_index], labels_train_[valid_index]

            # Standardize input
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_valid = scaler.transform(x_valid)
            scalers.append(scaler)

            # Train EIIC model
            model_trained_transfer = train_transfer(x_train, labels_train, x_valid, labels_valid,
                                                    params['nn_params'])
            trained_transfer.append(model_trained_transfer)

            # Train simple MLP without contrastive learning
            model_trained_classifier = train_classifier(x_train, labels_train, x_valid, labels_valid,
                                                        params['nn_params'])
            trained_classifier.append(model_trained_classifier)

        # Calculate confusion matrix from trained models
        matrix = matrix_from_models(trained_transfer, scalers, x_test, labels_test,
                                    params['nn_params']['num_classes'], params['nn_params']['device'])
        # Calculate performance indices from confusion matrix
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_transfer.append(acc)
        measurements_transfer[cv_iteration_, :] = [acc, recall, specificity, ppv, npv]

        matrix = matrix_from_models_cl(trained_classifier, scalers, x_test, labels_test,
                                       params['nn_params']['num_classes'], params['nn_params']['device'])
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_classifier.append(acc)
        measurements_classifier[cv_iteration_, :] = [acc, recall, specificity, ppv, npv]

        # PCA dimensionality reduction for SVM and RF
        pca = PCA(n_components=params['nn_params']['prefinal_num'])
        # Fit the PCA transformation on all data except the test data
        pca.fit(x_train_)
        # Transform data
        x_train_sc = pca.transform(x_train_)
        x_test_sc = pca.transform(x_test)

        # Train SVM and RF
        svm = train_svm(x_train_sc, labels_train_, skf.split(x_train_, split_ref), params['tuning_params_svm'])
        rf = train_rf(x_train_sc, labels_train_, skf.split(x_train_, split_ref), params['tuning_params_rf'])

        # Calculate confusion matrix and performance indices
        matrix = prediction_matrix(x_test_sc, labels_test, svm, params['nn_params']['num_classes'])
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_svm.append(acc)
        measurements_svm[cv_iteration_, :] = [acc, recall, specificity, ppv, npv]

        matrix = prediction_matrix(x_test_sc, labels_test, rf, params['nn_params']['num_classes'])
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_rf.append(acc)
        measurements_rf[cv_iteration_, :] = [acc, recall, specificity, ppv, npv]

        # Save models
        if params['save_FLAG']:
            models = {
                'nn_scalers': scalers,
                'pca': pca,
                'transfer_models': trained_transfer,
                'classifier_models': trained_classifier,
                'svm_model': svm,
                'rf_model': rf
            }
            save_models(models, params['output_dir'], str(cv_iteration_))

    # Output the performance indices as xlsx
    if params['save_FLAG']:
        models = {
            'transfer_models': measurements_transfer,
            'classifier_models': measurements_classifier,
            'svm_model': measurements_svm,
            'rf_model': measurements_rf
        }
        save_result_csv(models, ref, ['acc', 'recall', 'specificity', 'ppv', 'npv'], params['output_dir'])

    # Return accuracies for output
    return accs_transfer, accs_classifier, accs_svm, accs_rf
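# A minimal sketch of model_measurements, assuming `matrix` is a 2x2 confusion
# matrix laid out as [[TN, FP], [FN, TP]]; the real helper may order classes
# differently.
def model_measurements(matrix):
    tn, fp = matrix[0, 0], matrix[0, 1]
    fn, tp = matrix[1, 0], matrix[1, 1]
    acc = (tp + tn) / (tp + tn + fp + fn)
    recall = tp / (tp + fn)          # sensitivity
    specificity = tn / (tn + fp)
    ppv = tp / (tp + fp)             # positive predictive value
    npv = tn / (tn + fn)             # negative predictive value
    return acc, recall, specificity, ppv, npv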
def main():
    df, model, X_train, y_train, sc = data_preparation.prepare_data()
    model = train.train(model, X_train, y_train)
    test.test(df, model, sc)
    bm.close()
def main(task_config, n=21, k=2, device=0, d=100, epochs=100):
    # Global parameters
    debug_mode = True
    verbose = True
    save = True
    freeze_word_embeddings = True
    over_population_threshold = 100
    relative_over_population = True
    data_augmentation = True
    if debug_mode:
        data_augmentation = False
        over_population_threshold = None

    logging.info("Task name: {}".format(task_config['name']))
    logging.info("Debug mode: {}".format(debug_mode))
    logging.info("Verbose: {}".format(verbose))
    logging.info("Freeze word embeddings: {}".format(freeze_word_embeddings))
    logging.info("Over population threshold: {}".format(over_population_threshold))
    logging.info("Relative over population: {}".format(relative_over_population))
    logging.info("Data augmentation: {}".format(data_augmentation))

    use_gpu = torch.cuda.is_available()
    # use_gpu = False
    if use_gpu:
        cuda_device = device
        torch.cuda.set_device(cuda_device)
        logging.info('Using GPU')

    # Load dataset
    dataset = task_config['dataset'](debug_mode, relative_path='./data/')
    all_sentences = dataset.get_train_sentences + dataset.get_valid_sentences + dataset.get_test_sentences

    word_embeddings = load_embeddings('./data/glove_embeddings/glove.6B.{}d.txt'.format(d))
    chars_embeddings = load_embeddings('./predicted_char_embeddings/char_mimick_glove_d100_c20')

    # Prepare vectorizer
    word_to_idx, char_to_idx = make_vocab(all_sentences)
    vectorizer = WordsInContextVectorizer(word_to_idx, char_to_idx)

    # Initialize training parameters
    model_name = '{}_n{}_k{}_d{}_e{}'.format(task_config['name'], n, k, d, epochs)
    lr = 0.001
    if debug_mode:
        model_name = 'testing_' + model_name
        save = False
        epochs = 3

    # Create the model
    net = LRComick(
        characters_vocabulary=char_to_idx,
        words_vocabulary=word_to_idx,
        characters_embedding_dimension=20,
        # characters_embeddings=chars_embeddings,
        word_embeddings_dimension=d,
        words_embeddings=word_embeddings,
        # context_dropout_p=0.5,
        # fc_dropout_p=0.5,
        freeze_word_embeddings=freeze_word_embeddings)
    model_name = "{}_{}_v{}".format(model_name, net.__class__.__name__.lower(), net.version)

    handler = logging.FileHandler('{}.log'.format(model_name))
    logger.addHandler(handler)

    model = Model(
        model=net,
        optimizer=Adam(net.parameters(), lr=lr),
        loss_function=square_distance,
        metrics=[cosine_sim],
    )
    if use_gpu:
        model.cuda()

    # Prepare examples
    train_loader, valid_loader, test_loader, oov_loader = prepare_data(
        dataset=dataset,
        embeddings=word_embeddings,
        vectorizer=vectorizer,
        n=n,
        use_gpu=use_gpu,
        k=k,
        over_population_threshold=over_population_threshold,
        relative_over_population=relative_over_population,
        data_augmentation=data_augmentation,
        debug_mode=debug_mode,
        verbose=verbose,
    )

    # Set up the callbacks and train
    train(
        model,
        model_name,
        train_loader=train_loader,
        valid_loader=valid_loader,
        epochs=epochs,
    )

    test_embeddings = evaluate(model, test_loader=test_loader, test_embeddings=word_embeddings,
                               save=save, model_name=model_name + '.txt')

    predicted_oov_embeddings = predict_mean_embeddings(model, oov_loader)

    # Override predicted embeddings with the training ones so that only
    # embeddings coming from the corpus data are kept
    logging.info("Evaluating embeddings...")
    predicted_oov_embeddings.update(word_embeddings)

    for task in task_config['tasks']:
        logging.info("Using predicted embeddings on {} task...".format(task['name']))
        task['script'](predicted_oov_embeddings, task['name'] + "_" + model_name, device, debug_mode)

    logger.removeHandler(handler)
#############
# Author: Caleb Gelnar
#############
from sklearn.linear_model import LogisticRegression
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data
from sklearn import metrics

# Prepare training and test data by splitting the training data
X_Train, X_Test, Y_Train, Y_Test = prepare_data(test_size=0.35, seed=0)

model = LogisticRegression(penalty='l1', C=8, fit_intercept=True, solver='liblinear',
                           max_iter=100, l1_ratio=None)
model.fit(X_Train, Y_Train)
predictions = model.predict(X_Test)

conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(Y_Test, predictions)
fpr, tpr, thresholds = metrics.roc_curve(Y_Test, predictions, pos_label=1)
auc = metrics.auc(fpr, tpr)

print()
print("########### MODEL PERFORMANCE ###########")
print("Confusion Matrix: ")
print(conf_matrix)
print("Average Accuracy: {}".format(accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
print("Area under the ROC Curve: {}".format(auc))
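# For reference, a minimal sketch of what the func_confusion_matrix helper
# imported from conf_matrix above is assumed to compute, given the
# (matrix, accuracy, recall_array, precision_array) return order used throughout
# these scripts; the actual implementation may differ in details.
import numpy as np

def func_confusion_matrix(y_true, y_pred):
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    n = len(labels)
    matrix = np.zeros((n, n), dtype=int)
    for t, p in zip(y_true, y_pred):
        matrix[np.where(labels == t)[0][0], np.where(labels == p)[0][0]] += 1
    accuracy = np.trace(matrix) / matrix.sum()
    # Per-class recall: correct predictions over actual occurrences of the class
    recall_array = np.diag(matrix) / matrix.sum(axis=1)
    # Per-class precision: correct predictions over predicted occurrences of the class
    precision_array = np.diag(matrix) / matrix.sum(axis=0)
    return matrix, accuracy, recall_array, precision_array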
def main():
    parser = argparse.ArgumentParser(
        description="launch a regression pipeline given a dataset")
    parser.add_argument(
        "dataset_filename",
        type=str,
        help="path to the dataset's .csv file",
    )
    parser.add_argument(
        "model_name",
        type=str,
        choices=[
            "linear",
            "lasso",
            "ridge",
            "elastic-net",
            "backward",
            "forward",
            "polynomial",
        ],
        help="type of model to use.",
    )
    parser.add_argument(
        "-f",
        type=str,
        default=None,
        choices=["correlation", "pca"],
        help="type of feature selection to apply (default=None)",
    )
    parser.add_argument(
        "-n",
        type=int,
        default=2,
        metavar="",
        help="number of splits for the cross-validation (default=2)",
    )
    args = parser.parse_args()

    # Load the dataset and clean it
    dataset_filename = args.dataset_filename
    dataset_df = load_dataset(dataset_filename)
    prepare_data(dataset_df)
    print(f"+ Dataset {dataset_filename} loaded and cleaned "
          f"({dataset_df.shape[0]} samples)")

    # Select the features
    feature_selection = args.f
    if feature_selection:
        if feature_selection == "correlation":
            dataset_df = select_correlation_features(dataset_df)
        if feature_selection == "pca":
            dataset_df = select_pca_features(dataset_df)
        print(f"+ Preprocessing {feature_selection} applied on data")

    # Choose the model to use
    model_name = args.model_name
    if model_name == "linear":
        model = linear_regression()
    if model_name == "lasso":
        model = lasso_regression()
    if model_name == "ridge":
        model = ridge_regression()
    if model_name == "elastic-net":
        model = elastic_net_regression()
    if model_name == "backward":
        dataset_df = select_backward_features(dataset_df)
        model = linear_regression()
    if model_name == "forward":
        dataset_df = select_forward_features(dataset_df)
        model = linear_regression()
    if model_name == "polynomial":
        dataset_df = select_polynomial_features(dataset_df)
        model = linear_regression()
    print(f"+ Model {model_name} initialized \n")

    # Perform a cross-validation
    n_splits = args.n
    X, y_true = get_data_arrays(dataset_df)
    A = get_predictions_cv(X, y_true, model, n_splits=n_splits)
    X_train, X_test, Y_train, Y_test, Y_pred = A
    for i in range(len(X_train)):
        print(f"[{i + 1}/{n_splits}]: Train set size: {X_train[i].shape[0]} / "
              f"Test set size: {Y_pred[i].shape[0]}")

    # Compute the cross-validation, median, mean and standard deviation of MSE and r2
    print("\n" + get_score_cv(Y_pred, Y_test))
import numpy as np
from keras.models import load_model
from metrics import my_iou_metric
from data_preparation import prepare_data
from utils import rle_encoding
from skimage.util import crop

imgs_folder = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/Train/images'
mask_folder = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/Train/masks'
test_folder = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/Test/images'
train = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/train.csv'
depth = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/depths.csv'

inst = prepare_data(imgs_folder, mask_folder, test_folder, train, depth)
test_data = inst.test_data_gen()
final_model = load_model('U-resnet_decoding', custom_objects={'my_iou_metric': my_iou_metric})


def predict_results(model, test_data, inst):
    preds = model.predict(test_data)
    final_preds = preds[1]
    # Crop the padded predictions back to the original 101x101 image size
    final_pred = np.array([
        crop(final_preds[i], ((13, 14), (13, 14), (0, 0))).reshape((101, 101))
        for i in range(18000)
    ])
    # Threshold the probabilities into a binary salt mask
    final_p = np.where(final_pred >= 0.5, 1, 0)
    return final_p
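# A minimal sketch of the rle_encoding helper imported from utils above, using the
# usual Kaggle convention (column-major order, 1-indexed pixel positions, pairs of
# "start length"); the project's own implementation may differ slightly.
def rle_encoding(mask):
    pixels = mask.T.flatten()          # column-major (Fortran) order
    runs = []
    prev = -2
    for i, value in enumerate(pixels):
        if value == 1:
            if i != prev + 1:          # a new run starts here
                runs.extend([i + 1, 0])
            runs[-1] += 1              # extend the current run
            prev = i
    return runs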
    # (tail of log_activity_results, see above)
    if single_multi == 'single':
        log_path = 'Experimental_Evaluation/single_variate_log.csv'
    if single_multi == 'multi':
        log_path = 'Experimental_Evaluation/multi_variate_log.csv'
    log = pd.DataFrame(log_results)
    log.to_csv(log_path)
    return log


# Notebook-style exploration calls
log_activity_results(data, users, list(range(2, 11)), 'diag', 'single')
data.columns
prepared_data = dp.prepare_data(data, 66, ['sleep_deep_time'])
prepared_data.columns
prepared_data.head()


def bic_criteria(data, log_likelihood, model):
    '''
    :param data: prepared data used to fit the model
    :param log_likelihood: log likelihood returned by model.score
    :param model: fitted GaussianHMM
    :return: BIC value for the model
    '''
    n_features = data.shape[1]  ### here adapt for multi-variate
# Author: Juan Candelaria Claborne
#############
import tensorflow as tf
import itertools
import sys
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data
import tensorflow.python.util.deprecation as deprecation
from sklearn import metrics

deprecation._PRINT_DEPRECATION_WARNINGS = False

#############
# Create data
trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
trainY = trainY.to_numpy().astype(int)
testY = testY.to_numpy().astype(int)
train = tf.data.Dataset.from_tensor_slices((trainX, trainY)) \
    .shuffle(len(trainY)) \
    .batch(16)

#############
# Make neural network
tf.keras.backend.set_floatx('float64')
modelCount = 0
activation = ['relu', 'sigmoid', 'tanh', 'elu']
hiddenLayers = [3, 4, 5, 6, 7]
unitCounts = [4, 8, 12, 20, 28, 32]
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from data_preparation import prepare_data

# Loading the prepared data
if os.path.exists("data.csv"):
    data = pd.read_csv("data.csv")
else:
    data = prepare_data()

# -------- First Part: Training and evaluating a RF regressor
# - 1.1: Perform a randomized search for the best parameters of the random forest regressor
# - 1.2: Train the regressor on the whole feature set with the best parameters
# - 1.3: Evaluate the regressor by calculating the outcome of the game
#        (home win, away win, draw) and comparing it with the real result from y_test
X = data.drop(['score_home', 'score_away', 'winners'], axis=1, inplace=False)
y = data.loc[:, ['score_home', 'score_away', 'winners']]

random_forest = RandomForestRegressor(n_jobs=-1)
neural_network = MLPRegressor(activation='relu', solver="adam", early_stopping=True)
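# A minimal sketch of step 1.1 above: a randomized search over a few common
# RandomForestRegressor hyperparameters, scored with (negated) MSE. The parameter
# grid, the number of iterations and fitting on the 'score_home'/'score_away'
# targets only are assumptions for illustration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

param_distributions = {
    "n_estimators": [100, 200, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5],
}
search = RandomizedSearchCV(
    random_forest,
    param_distributions=param_distributions,
    n_iter=10,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=3,
    n_jobs=-1,
    random_state=0,
)
search.fit(X_train, y_train[['score_home', 'score_away']])
print("Best RF parameters:", search.best_params_)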
def main():
    try:
        prepare_data()
        train_and_test()
    finally:
        bm.close()