Example #1
def run(spath_train,
        tpath_train,
        spath_test,
        tpath_test,
        fn_train,
        fn_predict_all,
        max_sentence_length=17,
        replace_unknown_words=True,
        use_bpe=True,
        num_operations=400,
        vocab_threshold=5,
        padding=True,
        model_name='nn'):

    # data preprocessing
    (spath_train_pp, tpath_train_pp, spath_test_pp,
     tpath_test_pp) = preprocess(spath_train, tpath_train, spath_test,
                                 tpath_test, max_sentence_length,
                                 replace_unknown_words, use_bpe,
                                 num_operations, vocab_threshold)

    print('Data files preprocessed ...')
    print()

    # data structures for training
    (slang, tlang, index_array_pairs, s_index_arrays_test,
     max_bpe_length) = dp.prepare_data(spath_train_pp, tpath_train_pp,
                                       spath_test_pp, padding)

    print(f'{len(index_array_pairs)} inputs constructed for training ...')
    print()

    # train and return losses for plotting
    (encoder, attn_decoder, plot_losses,
     plot_every) = fn_train(index_array_pairs, slang.n_words, tlang.n_words,
                            max_bpe_length)

    print('Training finished ...')
    print()

    # plot the losses
    losses_path = f'../output/{model_name}_losses.png'
    showLosses(plot_losses, plot_every, losses_path)
    print(f'Losses diagram saved in {losses_path}')

    persistence.save(plot_losses,
                     fp.path_to_outputfile(f'{model_name}.tl', '.trainloss'))

    # save models and data
    torch.save(encoder, f'../output/{model_name}_encoder.pt')
    torch.save(attn_decoder, f'../output/{model_name}_attn_decoder.pt')
    data = (s_index_arrays_test, slang, tlang, max_bpe_length)
    persistence.save(data, f'../output/{model_name}_data_run')
    print('Models and data saved to disk')
    print()

    _evaluate(s_index_arrays_test, tpath_test_pp, slang, tlang, encoder,
              attn_decoder, fn_predict_all, max_bpe_length, use_bpe,
              model_name)

    return encoder, attn_decoder, slang, tlang, plot_losses, max_bpe_length
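A minimal, hypothetical invocation of run(); the corpus paths and the two callbacks (fn_train, fn_predict_all) are placeholders for project-specific functions, not names from the source:

encoder, attn_decoder, slang, tlang, plot_losses, max_bpe_length = run(
    '../data/train.src', '../data/train.tgt',
    '../data/test.src', '../data/test.tgt',
    fn_train=my_train_fn,
    fn_predict_all=my_predict_all_fn,
    model_name='nn')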
Example #2
def randomForestBagging(fileNames):
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    testY = testY.to_numpy().astype(int)
    predictions = []
    for file in fileNames:
        # model = dill.load(open(file,"rb"))
        with open(file, 'rb') as f:
            rf = dill.load(f)
        predictions.append(rf.predict(testX))
    # Average the loaded models' predictions and round to a 0/1 vote
    pred = np.zeros(len(predictions[0]))
    for i in range(len(predictions[0])):
        for model_idx in range(len(fileNames)):
            pred[i] += predictions[model_idx][i]
        pred[i] = (pred[i] / len(fileNames)).round()
    pred = pred.astype(int)
    fpr, tpr, thresholds = metrics.roc_curve(testY, pred, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred)
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(str(best_accuracy)))
    print("Per-Class Precision: {}".format(str(precision_array)))
    print("Per-Class Recall: {}".format(str(recall_array)))
    print("Area under the ROC Curve: {}".format(auc))
def log_optimal_hmms_for_users_single_variate(data, users, cov_type):
    # Accumulate the optimization log for every (user, activity) pair
    logs = []
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        for subfactor, activities in subfactor_activities.items():
            for activity in activities:
                prepared_data = dp.prepare_data(data, user, [activity])
                log = optimize_number_of_clusters(prepared_data.iloc[:, 2:],
                                                  list(range(2, 11)), cov_type)
                logs.append(log)

    return logs
def get_optimal_hmms_for_users_single_variate(data, users, cov_type):
    optimal_hmms_single_variate = {}
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        dict_activity = {}
        for subfactor, activities in subfactor_activities.items():
            for activity in activities:
                prepared_data = dp.prepare_data(data, user, [activity])
                best_value, best_model = optimize_number_of_clusters(
                    prepared_data.iloc[:, 2:], list(range(2, 11)), cov_type)
                dict_activity.update({activity: best_model})
        dict_user = {user: dict_activity}
        optimal_hmms_single_variate.update(dict_user)
    return optimal_hmms_single_variate
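The returned structure is {user: {activity: best_model}}; a short sketch of how it might be consumed, reusing citizen 66 and the 'diag' covariance type seen later in this listing:

optimal = get_optimal_hmms_for_users_single_variate(data, [66], 'diag')
for user, activity_models in optimal.items():
    for activity, model in activity_models.items():
        prepared = dp.prepare_data(data, user, [activity])
        print(user, activity, model.score(prepared.iloc[:, 2:]))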
def log_activity_results(data, users, range_of_clusters, cov_type,
                         single_multi):
    '''
    :param data: prepared data (values of activities by columns)
    :param users: users (citizens) to process
    :param range_of_clusters: range of candidate state counts, e.g. range(2, 11)
    :param cov_type: covariance type for the GaussianHMM
    :param single_multi: 'single' or 'multi'; selects model and log paths
    :return: DataFrame with the AIC/BIC of every fitted model
     Optimizes the number of states per activity for a single citizen.
     This is a helper for the get_optimal_hmms methods (which work for more citizens).
    '''
    import pickle
    log_results = []
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        for subfactor, activities in subfactor_activities.items():
            for activity in activities:
                prepared_data = dp.prepare_data(data, user, [activity])
                #log = optimize_number_of_clusters(prepared_data.iloc[:, 2:], list(range(2, 11)), cov_type)
                # pivoted_data = prepare_data(data, user, [ac]) old version for data preparation
                for n_states in range_of_clusters:
                    train_data = prepared_data.iloc[:, 1:]
                    model = GaussianHMM(n_components=n_states,
                                        covariance_type=cov_type,
                                        n_iter=1000).fit(train_data)
                    # Score and compute criteria on the data the model was fitted on
                    log_likelihood = model.score(train_data)
                    criteria_bic = bic_criteria(train_data, log_likelihood, model)
                    criteria_aic = aic_criteria(train_data, log_likelihood, model)
                    aic_bic_dict = {
                        'user': user,
                        'activity': activity,
                        'n_states': n_states,
                        'BIC': criteria_bic,
                        'AIC': criteria_aic
                    }
                    log_results.append(aic_bic_dict)
                    if single_multi == 'single':
                        path = ('Experimental_Evaluation/Models/user_' + str(user) +
                                'activity_' + activity + '_n_states_' +
                                str(n_states) + '.pkl')
                    if single_multi == 'multi':
                        path = ('Experimental_Evaluation/Models/user_' + str(user) +
                                'sub_factor_' + activity + '_n_states_' +
                                str(n_states) + '.pkl')
                    # pickle.dump expects an open file object, not a path
                    with open(path, 'wb') as f:
                        pickle.dump(model, f)

    if single_multi == 'single':
        log_path = 'Experimental_Evaluation/single_variate_log.csv'
    if single_multi == 'multi':
        log_path = 'Experimental_Evaluation/multi_variate_log.csv'

    log = pd.DataFrame(log_results)
    log.to_csv(log_path)
    return log
def get_optimal_hmms_for_users_multi_variate(data, users, cov_type):
    optimal_hmms_multi_variate = {}
    subfactor_activities = dp.get_dict_ges_activities()
    for user in users:
        dict_subfactor = {}
        for subfactor, activities in subfactor_activities.items():
            prepared_data = dp.prepare_data(data, user, activities)
            best_value, best_model = optimize_number_of_clusters(
                prepared_data.iloc[:, 2:], list(range(2, 11)), cov_type)
            dict_subfactor.update(
                {subfactor: {
                    'model': best_model,
                    'activities': activities
                }})
        dict_user = {user: dict_subfactor}
        optimal_hmms_multi_variate.update(dict_user)
    return optimal_hmms_multi_variate
Example #7
def predict_multi_variate(data, users_ges_activities):
    '''
    :param data: raw activity data to cluster
    :param users_ges_activities: dict of {user: {ges: [activities]}}
    :return: 0 on completion; writes one CSV of clusters per (user, ges)
    '''
    for user, ges_activities in users_ges_activities.items():
        for ges, activities in ges_activities.items():
            model = pickle_hmm.load_pickle_hmm_multi_variate(user, ges)
            prep_data = dp.prepare_data(data, user, activities)
            clusters = model.predict(prep_data.iloc[:, 2:])
            probas = model.predict_proba(prep_data.iloc[:, 2:])
            max_probas = np.amax(np.array(probas), 1)
            prep_data['cluster'] = clusters
            prep_data['max_probability'] = max_probas
            #a=pd.melt(prep_data, id_vars=['user_in_role_id', 'interval_end','cluster', 'max_probability'], value_vars=activity)
            prep_data.to_csv('Data/clustered_data/multi_variate_clusters/citizen_id_'
                             + str(user) + '_' + ges + '.csv')
    return 0
Example #8
def predict_single_variate(data, users_activities):
    '''
    :param data: raw activity data to cluster
    :param users_activities: dict of {user: [activities]}
    :return: DataFrame of per-activity cluster predictions
    '''
    df_predictions = pd.DataFrame()
    for user, activities in users_activities.items():
        for activity in activities:
            model = pickle_hmm.load_pickle_hmm_single_variate(user, activity)
            prep_data = dp.prepare_data(data, user, [activity])
            clusters = model.predict(prep_data.iloc[:, 2:])
            probas = model.predict_proba(prep_data.iloc[:, 2:])
            max_probas = np.amax(np.array(probas), 1)
            prep_data['cluster'] = clusters
            prep_data['max_probability'] = max_probas
            a = pd.melt(prep_data,
                        id_vars=['user_in_role_id', 'interval_end',
                                 'cluster', 'max_probability'],
                        value_vars=activity)
            df_predictions = pd.concat([df_predictions, a])
    return df_predictions
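A hypothetical call, reusing the citizen id and activity that appear in Example #20:

users_activities = {66: ['sleep_deep_time']}
df = predict_single_variate(data, users_activities)
print(df.head())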
Example #9
def main():
    df = data_preparation.prepare_data(["Data/players_15.csv", "Data/players_16.csv", "Data/players_17.csv",
                                        "Data/players_18.csv", "Data/players_19.csv", "Data/players_20.csv",
                                        "Data/players_21.csv"])

    seed = 10

    # Run decision tree model on the max depths specified in the list and then show the accuracy chart and confusion matrix of the most accurate results
    max_depth_list = [5, 7, 9, 10, 12, 14]
    dt_accuracy_list, dt_confusion_matrices = decision_tree.decision_tree(df, seed, max_depth_list=max_depth_list)
    plot_bar_accuracy("Decision Tree Accuracy", "Depth", "Accuracy", dt_accuracy_list, max_depth_list)
    for confusion_matrix in dt_confusion_matrices:
        plot_cm(confusion_matrix, "Decision Tree Confusion Matrix")

    # Run random forest on the following list of trees specified and show accuracy chart and confusion matrix of best random forest
    num_of_trees_list = [10, 15, 20, 25, 30, 35]
    rf_accuracy_list, rf_confusion_matrices = random_forest.random_forest(df, seed, num_of_trees_list=num_of_trees_list)
    plot_bar_accuracy("Random Forest Accuracy", "Trees", "Accuracy", rf_accuracy_list, num_of_trees_list)
    for confusion_matrix in rf_confusion_matrices:
        plot_cm(confusion_matrix, "Random Forest Confusion Matrix")

    # For naive bayes only run it once and show the accuracy chart (with only 1 value) and confusion matrix
    nb_accuracy, nb_confusion_matrix = naive_bayes.naive_bayes(df, seed)
    plot_bar_accuracy("Naive Bayes Accuracy", "Smoothing", "Accuracy", [0, 0, nb_accuracy, 0, 0], ["", "", 1.0, "", ""])
    plot_cm(nb_confusion_matrix, "Naive Bayes Confusion Matrix")

    # Run kNN on specified neighbors values and show accuracy chart and confusion matrix of most accurate kNN
    neighbors_list = [10, 15, 25, 150, 210, 250, 300]
    kNN_accuracy_list, kNN_confusion_matrices = kNN.k_nearest_neighbors(df, seed, neighbors_list=neighbors_list)
    plot_bar_accuracy("K Nearest Neighbors Accuracy", "Neighbors", "Accuracy", kNN_accuracy_list, neighbors_list)
    for confusion_matrix in kNN_confusion_matrices:
        plot_cm(confusion_matrix, "kNN Confusion Matrix")

    # Show the chart comparing the best accuracy from each model
    best_accuracy_list = [max(dt_accuracy_list), max(rf_accuracy_list), nb_accuracy, max(kNN_accuracy_list)]
    plot_bar_accuracy("Model Accuracy Comparison", "Models", "Accuracy", best_accuracy_list, ["Decision Tree", "Random Forest", "Naive Bayes", "kNN"])

    print("")
Example #10
def buildModels():
    trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
    trainY = trainY.to_numpy().astype(int)
    testY = testY.to_numpy().astype(int)

    accuracy = []
    pred = []
    highestTrueNeg = 98  # set to previous highest
    highestAcc = .707  # set to previous highest
    for estimators in range(20, 1000, 10):
        rf = RandomForestRegressor(n_estimators=estimators)
        rf.fit(trainX, trainY)
        predictions = rf.predict(testX).round().astype(int)
        acc_score = metrics.accuracy_score(testY, predictions)
        accuracy.append(acc_score)
        pred.append(predictions)
        if acc_score > .69:  # manually consider models with better than 69% accuracy
            conf_matrix, class_acc, recall_array, precision_array = func_confusion_matrix(testY, predictions)
            if conf_matrix[0, 0] > highestTrueNeg:
                with open("randomForestTrueNeg.obj", "wb") as tn:
                    dill.dump(rf, tn)
                highestTrueNeg = conf_matrix[0, 0]
            elif acc_score > highestAcc:
                with open("randomForestAccuracy.obj", "wb") as acc_file:
                    dill.dump(rf, acc_file)
                highestAcc = acc_score

    index, value = max(enumerate(accuracy), key=operator.itemgetter(1))

    print("Best Number of Estimators: {}".format(20 + 10*(index)))
    # Use the forest's predict method on the test data
    conf_matrix, best_accuracy, recall_array, precision_array = func_confusion_matrix(testY, pred[index])
    print("Confusion Matrix: ")
    print(str(conf_matrix))
    print("Average Accuracy: {}".format(str(best_accuracy)))
    print("Per-Class Precision: {}".format(str(precision_array)))
    print("Per-Class Recall: {}".format(str(recall_array)))
def get_data() -> pd.DataFrame:
    return dp.prepare_data()
    def train_model(self, data, model_path, plot='plot.png',
                    lr=1e-3, height=32, width=32, batch_size=32, epochs=50):
        from data_preparation import prepare_data
        import matplotlib.pyplot as plt

        model = self.build_model()
        opt = Adam(lr=lr)
        model.compile(loss="categorical_crossentropy", optimizer=opt,
                      metrics=[metrics.mae, metrics.categorical_accuracy])

        # Split training/validation 80/20
        # https://faroit.github.io/keras-docs/2.0.8/preprocessing/image/
        prepare_data(data_location=data,
                     train_data_path=data+'/train/',
                     test_data_path=data+'/test/')

        datagen = ImageDataGenerator(#validation_split=0.2,
                                     rescale=1. / 255,
                                     rotation_range=20.,
                                     width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     shear_range=0.2,
                                     zoom_range=0.2,
                                     horizontal_flip=True
                                     )
        train_gen = datagen.flow_from_directory(
            data+'/train/',
            classes=CLASSES,
            target_size=(height, width),
            batch_size=batch_size
        )

        val_gen = datagen.flow_from_directory(
            data+'/test/',
            classes=CLASSES,
            target_size=(height, width)
        )

        early_stopping = EarlyStopping(monitor='val_loss', patience=5)
        info = model.fit_generator(train_gen,
                                   steps_per_epoch=3,
                                   validation_steps=2,
                                   validation_data=val_gen,
                                   epochs=epochs,
                                   callbacks=[early_stopping]
                                   )

        model.save(os.path.join(model_path, 'model.h5'))

        # plot the training loss and accuracy
        plt.style.use("ggplot")
        plt.figure()
        num = len(info.history["loss"])
        plt.plot(np.arange(0, num), info.history["loss"], label="train_loss")
        plt.plot(np.arange(0, num), info.history["val_loss"], label="val_loss")
        plt.plot(np.arange(0, num), info.history["categorical_accuracy"], label="train_acc")
        plt.plot(np.arange(0, num), info.history["val_categorical_accuracy"], label="val_acc")
        plt.plot(np.arange(0, num), info.history["mean_absolute_error"], label="train_mae")
        plt.plot(np.arange(0, num), info.history["val_mean_absolute_error"], label="val_mae")
        plt.title("Training Loss and Accuracy")
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend(loc="upper right")
        plt.savefig(plot)
Example #13
import random
from math import ceil

from data_preparation import prepare_data

prepare_data()
"""
for unicode in range(12353, 12436):
    create_image(unicode, georgia_bold)
"""
Example #14
def nested_cv(params):
    
    # The number of validation split
    cv_num = params['cv_num']
    # The number of test split
    cv_num_test = params['cv_num_test']
    # Data and labels
    data, hc_ad, site_id_, split_ref_, ref = prepare_data(params['source_dir'])
    
    # Numpy array for ML models' performance indices
    measurements_transfer = np.zeros(shape=[cv_num_test, 5])
    measurements_classifier = np.zeros(shape=[cv_num_test, 5])
    measurements_svm = np.zeros(shape=[cv_num_test, 5])
    measurements_rf = np.zeros(shape=[cv_num_test, 5])

    # List of accuracy for final output
    accs_transfer = []
    accs_classifier = []
    accs_svm = []
    accs_rf = []

    skf_ = StratifiedKFold(n_splits=cv_num_test, random_state=0, shuffle=True)

    # Split keeping the ratio of acquisition sites and ASD/TC simultaneously
    for cv_iteration_, (train_index_, test_index) in enumerate(skf_.split(data, split_ref_)):

        # List of standard scalers for each validation split
        scalers = []

        # List of trained models
        trained_transfer = []
        trained_classifier = []
        epochs = []

        # Split data
        x_train_, x_test = data[train_index_], data[test_index]
        labels_train_, labels_test = hc_ad[train_index_], hc_ad[test_index]
        split_ref = split_ref_[train_index_]
        
        skf = StratifiedKFold(n_splits=cv_num, random_state=0, shuffle=True)
        
        # Split keeping the ratio of acquisition sites and ASD/TC simultaneously
        for cv_iteration, (train_index, valid_index) in enumerate(skf.split(x_train_, split_ref)):
            epochs_cv = []
            # Split data
            x_train, x_valid = x_train_[train_index], x_train_[valid_index]
            labels_train, labels_valid = labels_train_[train_index], labels_train_[valid_index]
            
            # Standardize input
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_valid = scaler.transform(x_valid)
            
            scalers.append(scaler)
            
            # Train EIIC model
            model_trained_transfer = train_transfer(x_train, labels_train, x_valid, labels_valid, params['nn_params'])
            trained_transfer.append(model_trained_transfer)
            
            # Train simple MLP without contrastive learning
            model_trained_classifier = train_classifier(x_train, labels_train, x_valid, labels_valid, params['nn_params'])
            trained_classifier.append(model_trained_classifier)
        
        # Calculate confusion matrix from trained model
        matrix = matrix_from_models(trained_transfer, scalers, x_test, labels_test, params['nn_params']['num_classes'], params['nn_params']['device'])
        
        # Calculate performance indices from confusion matrix
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_transfer.append(acc)
        measurements_transfer[cv_iteration_,:] = [acc, recall, specificity, ppv, npv]
            
        matrix = matrix_from_models_cl(trained_classifier, scalers, x_test, labels_test, params['nn_params']['num_classes'], params['nn_params']['device'])
    
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_classifier.append(acc)
        measurements_classifier[cv_iteration_,:] = [acc, recall, specificity, ppv, npv]
        
        # PCA dimensionality reduction for SVM and RF
        pca = PCA(n_components=params['nn_params']['prefinal_num'])

        # Get the transformation of PCA from data excepting test data
        pca.fit(x_train_)

        # Transform data
        x_train_sc = pca.transform(x_train_)
        x_test_sc = pca.transform(x_test)
        
        # Train SVM and RF
        svm = train_svm(x_train_sc, labels_train_, skf.split(x_train_, split_ref), params['tuning_params_svm'])
        rf = train_rf(x_train_sc, labels_train_, skf.split(x_train_, split_ref), params['tuning_params_rf'])
        
        # Calculate confusion matrix and performance indices
        matrix = prediction_matrix(x_test_sc, labels_test, svm, params['nn_params']['num_classes'])
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_svm.append(acc)
        measurements_svm[cv_iteration_,:] = [acc, recall, specificity, ppv, npv]
        
        matrix = prediction_matrix(x_test_sc, labels_test, rf, params['nn_params']['num_classes'])
        acc, recall, specificity, ppv, npv = model_measurements(matrix)
        accs_rf.append(acc)
        measurements_rf[cv_iteration_,:] = [acc, recall, specificity, ppv, npv]
        
        # Save models
        if params['save_FLAG']:
            models = {
                'nn_scalers': scalers,
                'pca': pca,
                'transfer_models': trained_transfer,
                'classifier_models': trained_classifier,
                'svm_model': svm,
                'rf_model': rf
            }
            save_models(models, params['output_dir'], str(cv_iteration_))
        
    # Output the performance indices as xlsx
    if params['save_FLAG']:
        models = {
            'transfer_models': measurements_transfer,
            'classifier_models': measurements_classifier,
            'svm_model': measurements_svm,
            'rf_model': measurements_rf
        }
        save_result_csv(models, ref, ['acc', 'recall', 'specificity', 'ppv', 'npv'], params['output_dir'])

    # Return accuracies for output
    return accs_transfer, accs_classifier, accs_svm, accs_rf
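The outer/inner split logic above, reduced to a self-contained sketch on synthetic data (fold counts shrunk to 3 for brevity):

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(60, 4)
y = np.repeat([0, 1], 30)

outer = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
for train_idx, test_idx in outer.split(X, y):
    inner = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
    for fit_idx, val_idx in inner.split(X[train_idx], y[train_idx]):
        # fit models on X[train_idx][fit_idx], tune on X[train_idx][val_idx]
        pass
    # evaluate the tuned models once on the held-out X[test_idx]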
def main():
    df, model, X_train, y_train, sc = data_preparation.prepare_data()
    model = train.train(model, X_train, y_train)
    test.test(df, model, sc)
    bm.close()
def main(task_config, n=21, k=2, device=0, d=100, epochs=100):
    # Global parameters
    debug_mode = True
    verbose = True
    save = True
    freeze_word_embeddings = True
    over_population_threshold = 100
    relative_over_population = True
    data_augmentation = True
    if debug_mode:
        data_augmentation = False
        over_population_threshold = None

    logging.info("Task name: {}".format(task_config['name']))
    logging.info("Debug mode: {}".format(debug_mode))
    logging.info("Verbose: {}".format(verbose))
    logging.info("Freeze word embeddings: {}".format(freeze_word_embeddings))
    logging.info(
        "Over population threshold: {}".format(over_population_threshold))
    logging.info(
        "Relative over population: {}".format(relative_over_population))
    logging.info("Data augmentation: {}".format(data_augmentation))

    use_gpu = torch.cuda.is_available()
    # use_gpu = False
    if use_gpu:
        cuda_device = device
        torch.cuda.set_device(cuda_device)
        logging.info('Using GPU')

    # Load dataset
    dataset = task_config['dataset'](debug_mode, relative_path='./data/')

    all_sentences = dataset.get_train_sentences + dataset.get_valid_sentences + dataset.get_test_sentences

    word_embeddings = load_embeddings(
        './data/glove_embeddings/glove.6B.{}d.txt'.format(d))
    chars_embeddings = load_embeddings(
        './predicted_char_embeddings/char_mimick_glove_d100_c20')

    # Prepare vectorizer
    word_to_idx, char_to_idx = make_vocab(all_sentences)
    vectorizer = WordsInContextVectorizer(word_to_idx, char_to_idx)

    # Initialize training parameters
    model_name = '{}_n{}_k{}_d{}_e{}'.format(task_config['name'], n, k, d,
                                             epochs)
    lr = 0.001
    if debug_mode:
        model_name = 'testing_' + model_name
        save = False
        epochs = 3

    # Create the model
    net = LRComick(
        characters_vocabulary=char_to_idx,
        words_vocabulary=word_to_idx,
        characters_embedding_dimension=20,
        # characters_embeddings=chars_embeddings,
        word_embeddings_dimension=d,
        words_embeddings=word_embeddings,
        # context_dropout_p=0.5,
        # fc_dropout_p=0.5,
        freeze_word_embeddings=freeze_word_embeddings)
    model_name = "{}_{}_v{}".format(model_name, net.__class__.__name__.lower(),
                                    net.version)
    handler = logging.FileHandler('{}.log'.format(model_name))
    logger.addHandler(handler)

    model = Model(
        model=net,
        optimizer=Adam(net.parameters(), lr=lr),
        loss_function=square_distance,
        metrics=[cosine_sim],
    )
    if use_gpu:
        model.cuda()

    # Prepare examples
    train_loader, valid_loader, test_loader, oov_loader = prepare_data(
        dataset=dataset,
        embeddings=word_embeddings,
        vectorizer=vectorizer,
        n=n,
        use_gpu=use_gpu,
        k=k,
        over_population_threshold=over_population_threshold,
        relative_over_population=relative_over_population,
        data_augmentation=data_augmentation,
        debug_mode=debug_mode,
        verbose=verbose,
    )

    # Set up the callbacks and train
    train(
        model,
        model_name,
        train_loader=train_loader,
        valid_loader=valid_loader,
        epochs=epochs,
    )

    test_embeddings = evaluate(model,
                               test_loader=test_loader,
                               test_embeddings=word_embeddings,
                               save=save,
                               model_name=model_name + '.txt')

    predicted_oov_embeddings = predict_mean_embeddings(model, oov_loader)

    # Override embeddings with the training ones
    # Make sure we only have embeddings from the corpus data
    logging.info("Evaluating embeddings...")
    predicted_oov_embeddings.update(word_embeddings)

    for task in task_config['tasks']:
        logging.info("Using predicted embeddings on {} task...".format(
            task['name']))
        task['script'](predicted_oov_embeddings,
                       task['name'] + "_" + model_name, device, debug_mode)
    logger.removeHandler(handler)
#############
#  Author: Caleb Gelnar
#############

from sklearn.linear_model import LogisticRegression
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data
from sklearn import metrics

# Prepare training and test data by splitting the dataset
X_Train, X_Test, Y_Train, Y_Test = prepare_data(test_size=0.35, seed=0)

model = LogisticRegression(penalty='l1',
                           C=8,
                           fit_intercept=True,
                           solver='liblinear',
                           max_iter=100,
                           l1_ratio=None)
model.fit(X_Train, Y_Train)
predictions = model.predict(X_Test)
conf_matrix, accuracy, recall_array, precision_array = func_confusion_matrix(
    Y_Test, predictions)
fpr, tpr, thresholds = metrics.roc_curve(Y_Test, predictions, pos_label=1)
auc = metrics.auc(fpr, tpr)
print()
print("########### MODEL PERFORMANCE ###########")
print("Confusion Matrix: ")
print(conf_matrix)
print("Average Accuracy: {}".format(accuracy))
print("Per-Class Precision: {}".format(precision_array))
print("Per-Class Recall: {}".format(recall_array))
Example #18
def main():
    parser = argparse.ArgumentParser(
        description="launch a regression pipeline given a dataset")
    parser.add_argument(
        "dataset_filename",
        type=str,
        help="path to the dataset's .csv file",
    )
    parser.add_argument(
        "model_name",
        type=str,
        choices=[
            "linear",
            "lasso",
            "ridge",
            "elastic-net",
            "backward",
            "forward",
            "polynomial",
        ],
        help="type of model to use.",
    )
    parser.add_argument(
        "-f",
        type=str,
        default=None,
        choices=["correlation", "pca"],
        help="type of feature selection to apply (default=None)",
    )
    parser.add_argument(
        "-n",
        type=int,
        default=2,
        metavar="",
        help="number of split for the cross-validation (default=3)",
    )
    args = parser.parse_args()

    # Load the dataset and clean it
    dataset_filename = args.dataset_filename
    dataset_df = load_dataset(dataset_filename)
    prepare_data(dataset_df)
    print(f"+ Dataset {dataset_filename} loaded and cleaned "
          f"({dataset_df.shape[0]} samples)")

    # Select the features
    feature_selection = args.f
    if feature_selection:
        if feature_selection == "correlation":
            dataset_df = select_correlation_features(dataset_df)
        if feature_selection == "pca":
            dataset_df = select_pca_features(dataset_df)
        print(f"+ Preprocessing {feature_selection} applied on data")

    # Choose the model to use
    model_name = args.model_name
    if model_name == "linear":
        model = linear_regression()
    if model_name == "lasso":
        model = lasso_regression()
    if model_name == "ridge":
        model = ridge_regression()
    if model_name == "elastic-net":
        model = elastic_net_regression()
    if model_name == "backward":
        dataset_df = select_backward_features(dataset_df)
        model = linear_regression()
    if model_name == "forward":
        dataset_df = select_forward_features(dataset_df)
        model = linear_regression()
    if model_name == "polynomial":
        dataset_df = select_polynomial_features(dataset_df)
        model = linear_regression()
    print(f"+ Model {model_name} initialized \n")

    # Perform a cross validation
    n_splits = args.n
    X, y_true = get_data_arrays(dataset_df)
    A = get_predictions_cv(X, y_true, model, n_splits=n_splits)
    X_train, X_test, Y_train, Y_test, Y_pred = A
    for i in range(len(X_train)):
        print(f"[{i + 1}/{n_splits}]: Train set size: {X_train[i].shape[0]} / "
              f"Test set size: {Y_pred[i].shape[0]}")

    # Compute cross-validation median, mean and standard deviation of MSE and r2
    print("\n" + get_score_cv(Y_pred, Y_test))
import numpy as np
from keras.models import load_model

from metrics import my_iou_metric
from data_preparation import prepare_data
from utils import rle_encoding
from skimage.util import crop

imgs_folder = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/Train/images'
mask_folder = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/Train/masks'
test_folder = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/Test/images'

train = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/train.csv'
depth = 'D:/Information Technology/Deep Learning/Projects/TGS Project/TGS Salt Identification Challenge/Data/depths.csv'

inst = prepare_data(imgs_folder, mask_folder, test_folder, train, depth)

test_data = inst.test_data_gen()

final_model = load_model('U-resnet_decoding',
                         custom_objects={'my_iou_metric': my_iou_metric})


def predict_results(model, test_data, inst):
    preds = model.predict(test_data)
    final_preds = preds[1]
    # Crop the padded 128x128 predictions back to the original 101x101 size
    final_pred = np.array([
        crop(final_preds[i], ((13, 14), (13, 14), (0, 0))).reshape((101, 101))
        for i in range(18000)
    ])
    # Binarize the soft masks with a 0.5 threshold
    final_p = np.where(final_pred >= 0.5, 1, 0)
    return final_p
Example #20
    if single_multi == 'single':
        log_path = 'Experimental_Evaluation/single_variate_log.csv'
    if single_multi == 'multi':
        log_path = 'Experimental_Evaluation/multi_variate_log.csv'

    log = pd.DataFrame(log_results)
    log.to_csv(log_path)
    return log


log_activity_results(data, users, list(range(2, 11)), 'diag', 'single')

data.columns


prepared_data = dp.prepare_data(data, 66, ['sleep_deep_time'])
prepared_data.columns

prepared_data.head()




def bic_criteria(data, log_likelihood, model):
    '''
    :param data: observations the model was scored on
    :param log_likelihood: log-likelihood of the fitted model
    :param model: fitted GaussianHMM
    :return: BIC value (lower is better)
    '''
    n_features = data.shape[1]  ### here adapt for multi-variate
    n_states = model.n_components
    # Free parameter count for a GaussianHMM with diagonal covariances
    # (assumed): transitions, initial distribution, means and variances
    n_params = n_states * (n_states - 1) + (n_states - 1) \
        + 2 * n_states * n_features
    # BIC = p * ln(N) - 2 * ln(L)
    return n_params * np.log(data.shape[0]) - 2 * log_likelihood
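For completeness, a matching sketch of the aic_criteria helper called earlier (its original implementation is not shown in this listing), under the same diagonal-covariance parameter count:

def aic_criteria(data, log_likelihood, model):
    # AIC = 2 * p - 2 * ln(L); lower is better
    n_features = data.shape[1]
    n_states = model.n_components
    n_params = n_states * (n_states - 1) + (n_states - 1) \
        + 2 * n_states * n_features
    return 2 * n_params - 2 * log_likelihood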
Example #21
#############
#  Author: Juan Candelaria Claborne
#############

import tensorflow as tf
import itertools
import sys
from conf_matrix import func_confusion_matrix
from data_preparation import prepare_data
import tensorflow.python.util.deprecation as deprecation
from sklearn import metrics

deprecation._PRINT_DEPRECATION_WARNINGS = False

#############
# Create data
trainX, testX, trainY, testY = prepare_data(test_size=.35, seed=0)
trainY = trainY.to_numpy().astype(int)
testY = testY.to_numpy().astype(int)


train = tf.data.Dataset.from_tensor_slices((trainX, trainY)) \
    .shuffle(len(trainY)) \
    .batch(16)

#############
# Make neural network
tf.keras.backend.set_floatx('float64')
modelCount = 0
activation = ['relu', 'sigmoid', 'tanh', 'elu']
hiddenLayers = [3, 4, 5, 6, 7]
unitCounts = [4, 8, 12, 20, 28, 32]
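The grids above set up an architecture search over activations, depths, and widths; the listing is truncated before the loop, so here is a hedged sketch of how itertools.product could drive it (the layer wiring and epoch count are assumptions, not the author's code):

for act, layers, units in itertools.product(activation, hiddenLayers, unitCounts):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(units, activation=act,
                                    input_shape=(trainX.shape[1],)))
    for _ in range(layers - 1):
        model.add(tf.keras.layers.Dense(units, activation=act))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(train, epochs=10, verbose=0)
    loss, acc = model.evaluate(testX, testY, verbose=0)
    print('model', modelCount, act, layers, units, 'accuracy:', acc)
    modelCount += 1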
Example #22
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd

# prepare_data is assumed to come from the project's data_preparation
# module, as in the other examples in this listing
from data_preparation import prepare_data

# Loading the prepared data
if os.path.exists("data.csv"):
    data = pd.read_csv("data.csv")

else:
    data = prepare_data()

# -------- First Part: Training and evaluating a RF regressor

# - 1.1: Performing a grid search for the best parameters of the random forest regressor
# - 1.2: Train the regressor on the whole feature set with the best parameters
# - 1.3: Evaluate the regressor by calculating the outcome of the game (won home, away, equal result) and compare it with the real result from y_test

X = data.drop(['score_home', 'score_away', 'winners'], axis=1, inplace=False)
y = data.loc[:, ['score_home', 'score_away', 'winners']]

random_forest = RandomForestRegressor(n_jobs=-1)
neural_network = MLPRegressor(activation='relu',
                              solver="adam",
                              early_stopping=True)
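A hedged sketch of step 1.1 above: a randomized parameter search for the forest, predicting the two score columns (the grid values and the train/test split are illustrative assumptions):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

param_distributions = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 5],
}
search = RandomizedSearchCV(
    random_forest,
    param_distributions,
    n_iter=10,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=3,
    random_state=0,
)
search.fit(X_train, y_train[['score_home', 'score_away']])
print(search.best_params_)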
def main():
    try:
        prepare_data()
        train_and_test()
    finally:
        bm.close()