Example #1
def preprocess(data, pipeline_automator):
    print('\n\nPreprocessing...')
    start = time()

    # Get the cleaned data
    cleaned_data = clean_data(data, pipeline_automator)

    # Get the corpus text
    corpus_text = get_corpus_text(data)

    # Get the list of words
    words = get_words(corpus_text)

    # Get the corpus tokens
    corpus_tokens = get_tokens(corpus_text)

    # Get the bigrams, trigrams, collocations and lemmas in the data
    bigrams = get_bigrams(corpus_tokens)
    trigrams = get_trigrams(corpus_tokens)
    collocations2 = get_bigram_collocations(corpus_tokens, pipeline_automator)
    collocations3 = get_trigram_collocations(corpus_tokens, pipeline_automator)
    lemmas = get_lemmas(cleaned_data, pipeline_automator)

    if pipeline_automator.parameters['remove_sub_terms']:
        lemmas, collocations2, collocations3 = remove_redundant_terms(
            lemmas, collocations2, collocations3)

    # Get the terms that will be selected from in the feature selection step (lemmas and collocations)
    terms = lemmas + collocations2 + collocations3

    # Store all of the meta-data generated during preprocessing.
    pipeline_automator.metadata['ngrams']['words'] = words
    pipeline_automator.metadata['words_count'] = len(words)
    pipeline_automator.metadata['lemmas'] = lemmas
    pipeline_automator.metadata['lemma_count'] = len(lemmas)
    pipeline_automator.metadata['text'] = corpus_text
    pipeline_automator.metadata['ngrams']['bigrams'] = list(bigrams)
    pipeline_automator.metadata['ngrams']['trigrams'] = list(trigrams)
    pipeline_automator.metadata['bigram_collocations'] = collocations2
    pipeline_automator.metadata['bigram_collocation_count'] = len(
        collocations2)
    pipeline_automator.metadata['trigram_collocations'] = collocations3
    pipeline_automator.metadata['trigram_collocation_count'] = len(
        collocations3)
    pipeline_automator.metadata['terms'] = terms
    pipeline_automator.metadata['term_count'] = len(terms)
    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['preprocessing_time'] = time_elapsed

    return cleaned_data
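
The n-gram and collocation helpers used in preprocess() above (get_bigrams, get_bigram_collocations, etc.) are not shown in this example. Below is a minimal sketch of what a bigram collocation helper might look like, assuming NLTK's collocation finders; the parameter names 'min_collocation_frequency' and 'n_collocations' are illustrative, not the project's actual parameters.

# Hypothetical sketch of a bigram collocation helper (assumes NLTK is installed).
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def get_bigram_collocations_sketch(corpus_tokens, pipeline_automator):
    # Illustrative parameter names; the real pipeline may use different keys.
    min_freq = pipeline_automator.parameters.get('min_collocation_frequency', 3)
    n_best = pipeline_automator.parameters.get('n_collocations', 100)

    finder = BigramCollocationFinder.from_words(corpus_tokens)
    finder.apply_freq_filter(min_freq)  # drop bigrams that occur too rarely
    # Rank the remaining bigrams by pointwise mutual information and keep the top n.
    return finder.nbest(BigramAssocMeasures.pmi, n_best)
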
def tune_models(promising_models, pipeline_automator):
    print('\n\nModel Tuning...')
    start = time()
    # Take our dict of promising models and perform a randomized search over each model's hyperparameters.
    training_data = pipeline_automator.metadata['training_data']
    n_features = training_data.shape[1] - 1  # last column is the target column

    n_hyperparam_combos = pipeline_automator.parameters['n_hyperparam_combos']
    model_tuning_cv_folds_count = pipeline_automator.parameters[
        'model_tuning_cv_folds_count']
    include_ensemble_voters = pipeline_automator.parameters[
        'include_ensemble_voters']
    model_tuning_scoring = pipeline_automator.parameters[
        'model_tuning_scoring']

    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    X, y = training_data[:, :n_features], training_data[:, n_features]
    X = X.astype(np.float64)
    y = y.reshape((y.shape[0], 1))
    y = np.array(y.T)[0]

    tuned_models = {}
    ignored_models = {
        'Gaussian Process Classifier':
        "This model doesn't need hyperparameter tuning because its hyperparameters were optimized at creation.",
        'Naive Bayes Classifier':
        "We have not determined if there are hyperparameters to tune for this model.",
        'Discriminant Analysis Classifier':
        "We have not determined if there are hyperparameters to tune for this model.",
        'RBF SVM Classifier': "The model autotunes gamma.",
        'Polynomial SVM Classifier': 'Tuned already'
    }

    for model_name in tqdm(promising_models, 'Tuning Promising Models...'):
        model = promising_models[model_name]
        if model_name not in ignored_models:
            print('Beginning Tuning Process for ', model_name, '...')
            param_distribution = get_parameter_distribution(
                model_name,
                random_state=random_state,
                n_iter=n_hyperparam_combos)
            random_search = RandomizedSearchCV(model,
                                               param_distribution,
                                               n_iter=n_hyperparam_combos,
                                               cv=model_tuning_cv_folds_count,
                                               scoring=model_tuning_scoring,
                                               refit=True,
                                               n_jobs=-1,
                                               random_state=random_state)
            random_search.fit(X, y)
            clf = random_search.best_estimator_
            tuned_models[model_name] = clf
        else:
            print('Skipped tuning of', model_name, 'for the following reason:',
                  ignored_models[model_name])
            model.fit(X, y)
            clf = model
            tuned_models[model_name] = clf
        print('Best Tuning:')
        print(clf)

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['model_tuning_time'] = time_elapsed

    return tuned_models
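
get_parameter_distribution() is referenced in tune_models() but not shown. A minimal sketch of what it might return is below, assuming scipy.stats distributions that RandomizedSearchCV can sample from; the model names and parameter grids are illustrative, not the project's actual search spaces.

# Hypothetical sketch of get_parameter_distribution; grids are illustrative only.
from scipy.stats import randint, uniform

def get_parameter_distribution_sketch(model_name, random_state=None, n_iter=25):
    # random_state and n_iter are accepted to match the call site but unused here.
    distributions = {
        'Random Forest Classifier': {
            'n_estimators': randint(50, 500),
            'max_depth': randint(2, 30),
            'min_samples_split': randint(2, 10),
        },
        'Linear SVM Classifier': {
            'C': uniform(0.01, 10.0),  # uniform over [0.01, 10.01)
        },
    }
    # Unknown models fall back to an empty search space.
    return distributions.get(model_name, {})
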
def get_promising_models(feature_selected_data, pipeline_automator):
    """
    Automates the process of selecting a good classifier, hyperparameter tuning for the classifier
    and training and testing the model. This process is very slow so we start by limiting the data
    that we want the models to be evaluated on. The current version takes 15 minutes or so to 
    run on using 1500 records which is acceptable for testing purposes all things considered (training dozens of
    classifiers on 1500 records and hundreds of features)
    """    
    print('\n\nModel Selection...')
    start = time()

    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    n_records = feature_selected_data.shape[0]
    n_features = feature_selected_data.shape[1] - 1 # n_features = n_cols - 1, (last column is the target column)
    split_type = pipeline_automator.parameters['data_splitting']

    """
    1) Split our data up into development aka training set for the model selection and tuning data, and
    a validation set to see how well our models can classify never before seen examples after we tune them.
    We want to use a lot of data for developing good models, but we also want to set aside
    enough data to test how well our models can generalize to unseen examples. """

    print('Splitting Our Data into training and testing sets...')
    testing_set_size = pipeline_automator.parameters['testing_set_size']
    train_data, test_data = split_data(feature_selected_data, test_size=testing_set_size, type_=split_type, random_state=random_state)

    n_examples_in_training_set = len(train_data)

    pipeline_automator.metadata['training_data'] = train_data
    pipeline_automator.metadata['training_data_size'] = n_examples_in_training_set
    pipeline_automator.metadata['testing_data'] = test_data
    pipeline_automator.metadata['testing_data_size'] = len(test_data)
    print('total examples:', 
        len(feature_selected_data), 
        '\ttraining examples:',  
        len(train_data), 
        '\ttesting examples:', 
        len(test_data),end='\n\n')

    """
    2) Enforce the Model Selection Data Limit. Model selection is very slow. We presently have 13 classifier types.
    Each classifier is cloned cv_folds number of times and then trained on the training data. For cv=10, that's 130
    classifiers being trained and tested. If we trained all of those models on the entire data set the computational
    complexity grows in a combinatorial way, i.e., will not finish in a reasonable amount time. Therefore, we get around
    this by using a subset if we have more records than the model_selection_data_limit. If we do not have more than that many
    records in the training set, then we use the entire training set to select our models.
    """
    model_selection_data_limit = pipeline_automator.parameters['model_selection_data_limit']

    if model_selection_data_limit < n_examples_in_training_set :
        print('Splitting our data because it exceeds the model_selection_data_limit parameter...')
        # Draw the model selection subset from the training data only, so the held-out test set stays unseen.
        rest_of_the_data_set, evaluation_set = split_data(train_data, test_size=model_selection_data_limit, type_=split_type, random_state=random_state)
        print(len(rest_of_the_data_set),'saved for later +',len(evaluation_set),'evaluation')
    else:
        rest_of_the_data_set = []
        evaluation_set = train_data

    """
    3) Split our training data up into feature matrix X and label vector y:
    """
    X, y = evaluation_set[:,:n_features], evaluation_set[:,n_features]
    X = X.astype(np.float64)
    y = y.reshape( (y.shape[0], 1) )
    y = np.array(y.T)[0]

    """
    4) Place all our generated classifier functions above into a list of callable functions that we iterate
    through and call one at a time on the training data X and y. Then each classifier trains the data 
    and uses cross validation in order to assess how good that particular classifier is at classifying the data.
    A confusion matrix for each is computed and calculates the accuracy, precision, recall, etc..
    Depending on the needs of the classifier(maximize precision vs. recall, balanced, etc.), we select the appropriate
    rating indicator, and rate the classifiers by their performance.
    """
    # list of callables used to generate each classifier:
    cv_folds = pipeline_automator.parameters['model_selection_cv_folds_count']
    classifiers_getters =  [
        get_sgd_classifier,
        get_knn_classifier,
        get_linear_svm_classifier,
        get_polynomial_svm_classifier,
        get_rbf_svm_classifier,
        get_decision_tree_classifier,
        get_random_forest_classifier,
        get_extra_trees_classifier,
        get_adaboost_forest_classifier,
        get_mlp_classifier
    ]
    classifiers = [ get_classifier(X, y, random_state=random_state, cv_folds=cv_folds) for get_classifier in classifiers_getters ]
    classifier_dictionary = {name: clf for name, clf, results in classifiers}

    """
    5) Choose the most promising models to send to the next phase: model tuning.
    """
    # Rank the classifiers by their F1 score for now; we could choose a different metric or even
    # expose this as a pipeline hyperparameter later on.
    performance_metric = pipeline_automator.parameters['model_selection_performance_metric']
    ranked_classifiers = sorted( classifiers, key=lambda x: x[-1][performance_metric], reverse=True )

    for name, classifier, results in ranked_classifiers:
        print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
    print()

    # Filter the ones with low performance.
    min_performance = pipeline_automator.parameters['model_selection_min_performance']
    print('Filtering models that do not meet the model selection minimum performance: ', performance_metric, '>=', min_performance,'...')
    ranked_classifiers = list(filter(lambda x: x[-1][performance_metric] >= min_performance, classifiers))
    ranked_classifiers = sorted( ranked_classifiers, key=lambda x: x[-1][performance_metric], reverse=True )

    if len(ranked_classifiers) > 0:
        for name, classifier, results in ranked_classifiers:
            print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
        print()
    else:
        print('None of the models met the minimum performance of', performance_metric, '>=', min_performance)

    n_promising_models_to_select = min(pipeline_automator.parameters['n_promising_models_to_select'], len(ranked_classifiers))
    
    promising_models = ranked_classifiers[:n_promising_models_to_select]

    print('Selecting up to the top',len(promising_models),'models...')

    if len(ranked_classifiers) > 0:
        for name, classifier, results in promising_models:
            print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
        print()
    else:
        print('None of the models met the minimum performance of', performance_metric, '>=', min_performance)

    promising_models = {name: classifier for name, classifier, results in promising_models}

    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['model_selection_time'] = time_elapsed

    return promising_models
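
split_data() is used in get_promising_models() but not shown. A minimal sketch is below, assuming it wraps scikit-learn's train_test_split and that type_ switches between a plain random split and one stratified on the label column; the 'stratified' value is an assumption about how the pipeline names its splitting strategies.

# Hypothetical sketch of split_data (assumes the label is the last column).
from sklearn.model_selection import train_test_split

def split_data_sketch(data, test_size, type_='stratified', random_state=None):
    labels = data[:, -1]
    stratify = labels if type_ == 'stratified' else None
    train, test = train_test_split(data,
                                   test_size=test_size,
                                   stratify=stratify,
                                   random_state=random_state)
    return train, test
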
def evaluate_best_models(tuned_models, pipeline_automator):
    import numpy as np
    performance_metric = pipeline_automator.parameters[
        'final_min_performance_metric']
    min_performance = pipeline_automator.parameters['final_min_performance']

    # This is used to control reproducibility in the output.
    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    print('\n\nModel Evaluation...')
    start = time()
    # Prepare features matrix and label vector for final evaluation.
    test_data = pipeline_automator.metadata['testing_data']
    n_features = test_data.shape[1] - 1  # last column is the target column

    X, y = test_data[:, :n_features], test_data[:, n_features]
    X = X.astype(np.float64)
    y = y.reshape((y.shape[0], 1))
    y = np.array(y.T)[0]

    # For each model in tuned models, calculate the F1-score on the test data and select the model with best F1-score. These models are already trained.
    classifiers = []

    for model_name in tuned_models:
        model = tuned_models[model_name]
        y_pred = model.predict(X)
        cm = confusion_matrix(y, y_pred)

        balanced_accuracy = balanced_accuracy_score(
            y,
            y_pred,
        )
        # Note: scikit-learn ignores pos_label when average='macro'; it is kept
        # here only to document which class is considered the positive one.
        precision = precision_score(y,
                                    y_pred,
                                    pos_label='Complaint',
                                    average='macro')
        recall = recall_score(y,
                              y_pred,
                              pos_label='Complaint',
                              average='macro')
        f1 = f1_score(y, y_pred, pos_label='Complaint', average='macro')

        results = {
            'confusion matrix': cm,
            'balanced accuracy': balanced_accuracy,
            'precision': precision,
            'recall': recall,
            'f1_macro': f1
        }
        print('Final Evaluation of Tuned', model_name, ':')
        print(cm)
        print('balanced accuracy:', balanced_accuracy)
        print(classification_report(y, y_pred))
        print()
        classifiers.append((model_name, model, results))

    ranked_classifiers = sorted(classifiers,
                                key=lambda x: x[-1]['f1_macro'],
                                reverse=True)

    for name, classifier, results in ranked_classifiers:
        print(f"{name:<35s}\tF1-score:{results['f1_macro']:<10f}")
    print()

    best_model_name, best_model, best_model_results = ranked_classifiers[0]

    if best_model_results[performance_metric] >= min_performance:
        print('The best model Pipeline Automator found was the',
              best_model_name, '.')
        print('results:')
        print(best_model_results['confusion matrix'])
        print('balanced accuracy:', best_model_results['balanced accuracy'])
        print('precision:', best_model_results['precision'])
        print('recall:', best_model_results['recall'])
        print('f1-score:', best_model_results['f1_macro'])
        print()
    else:
        print('No model met the minimum performance of', performance_metric, '>=',
              min_performance,
              '. Please adjust the pipeline parameters and try again.\n')
        best_model_name, best_model, best_model_results = None, None, None
    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['model_evaluation_time'] = time_elapsed

    return best_model_name, best_model, best_model_results
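
get_time_string() is used throughout these snippets to record timing metadata but is not shown. A minimal sketch, assuming it simply formats an elapsed number of seconds as hours, minutes, and seconds:

# Hypothetical sketch of get_time_string: format elapsed seconds as H:MM:SS.s.
def get_time_string_sketch(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{int(hours)}:{int(minutes):02d}:{secs:04.1f}'
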
def feature_select(data, pipeline_automator):
    print('\n\nFeature Selection... ')
    start = time()

    # seed the random number generator for reproducibility.
    random_state = pipeline_automator.parameters['random_state']
    np.random.seed(random_state)

    # Take our cleaned data and terms and transform them into a TF-IDF matrix with L2-normalization applied to the row vectors.
    terms = pipeline_automator.metadata['terms']
    print('Getting TF-IDF Matrix...')
    tfidf_matrix, tfidf_terms = get_tfidf_matrix(data,
                                                 pipeline_automator,
                                                 print_matrix=True)

    # Add the Meta-Features to the tf-idf matrix
    print('Adding Meta-Features...')
    meta_features_matrix, meta_features_col_names = get_meta_features_matrix(
        data)
    if meta_features_matrix is not None:
        X, y = tfidf_matrix[:, :-1], tfidf_matrix[:, -1]
        X = X.astype(np.float64)
        y = y.reshape((y.shape[0], 1))
        y = np.array(y.T)[0]
        meta_features_tfidf_matrix = np.column_stack(
            [X, meta_features_matrix, y])
        features = tfidf_terms + meta_features_col_names
        print('tfidf + meta features shape:', meta_features_tfidf_matrix.shape)
    else:
        meta_features_tfidf_matrix = tfidf_matrix
        # No meta-features were generated, so the feature names are just the TF-IDF terms.
        features = tfidf_terms

    features_count = int(meta_features_tfidf_matrix.shape[1] -
                         1)  #exclude the label column

    # Selected Features Matrix
    # Use the specified feature selection metric and the number of features to keep to determine which terms (features) to select.
    print('Performing Univariate Feature Selection...')
    feature_selection_metric = pipeline_automator.parameters[
        'feature_selection_metric']
    n_features_to_keep = int(
        features_count * pipeline_automator.parameters['n_selected_features']
    )  # number / ratio of features to select

    top_features, top_features_matrix = univariate_feature_selection(
        meta_features_tfidf_matrix, n_features_to_keep,
        feature_selection_metric, features, pipeline_automator)
    print('reduced tfidf shape:', top_features_matrix.shape)

    if pipeline_automator.parameters['remove_zero_length_vectors']:
        # Some records may not contain any of the selected features and should thus be ignored.
        top_features_matrix = remove_zero_length_vectors(top_features_matrix)

    print('Selected Features Matrix shape:', top_features_matrix.shape)
    print(top_features_matrix)

    for feature in top_features:
        print(feature)

    if pipeline_automator.parameters['use_L2_row_normalization']:
        # the remaining vectors are then normalized one last time to take the meta features into account.
        top_features_matrix = normalize_matrix(top_features_matrix)

    # Cache the metadata
    zero_row_vector_count = len(data) - len(top_features_matrix)
    feature_selection_matrix_shape = top_features_matrix.shape
    pipeline_automator.metadata['features'] = features
    pipeline_automator.metadata['features_count'] = len(features)
    pipeline_automator.metadata['selected_features'] = top_features
    pipeline_automator.metadata['selected_features_count'] = n_features_to_keep
    pipeline_automator.metadata[
        'zero_row_vector_count'] = zero_row_vector_count
    pipeline_automator.metadata[
        'feature_selected_matrix_shape'] = feature_selection_matrix_shape
    pipeline_automator.metadata[
        'feature_selected_matrix'] = top_features_matrix
    stop = time()
    time_elapsed = get_time_string(stop - start)
    pipeline_automator.metadata['feature_selection_time'] = time_elapsed
    # Return the selected terms tf-idf-L2 scaled matrix representation of the data
    return top_features_matrix
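
univariate_feature_selection() is referenced above but not shown. A minimal sketch is below, assuming it wraps scikit-learn's SelectKBest and that the metric names 'chi2' and 'f_classif' are how the pipeline refers to its scoring functions (an assumption).

# Hypothetical sketch of univariate_feature_selection using SelectKBest.
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

def univariate_feature_selection_sketch(matrix, k, metric, feature_names,
                                        pipeline_automator=None):
    # Split the labelled matrix into features X and labels y (label is the last column).
    X, y = matrix[:, :-1].astype(np.float64), matrix[:, -1]
    score_fn = chi2 if metric == 'chi2' else f_classif
    selector = SelectKBest(score_fn, k=k)
    X_selected = selector.fit_transform(X, y)

    kept = selector.get_support(indices=True)
    selected_names = [feature_names[i] for i in kept]
    # Re-attach the label column so the result has the same layout as the input matrix.
    return selected_names, np.column_stack([X_selected, y])
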
Example #6
    """
    This is the main file of Pipeline Automator. 
    Run this file using the following the command in a terminal/command prompt window:

    python main.py "file_name.csv" description_column label_column
    """
    start = time()
    commandline_arguments = argv[1:]

    if len(commandline_arguments) != 3:
        raise ValueError(
            "Must specify 3 arguments: the file name, the description column name, and the target column name."
        )
    file_name, feature_column, label_column = commandline_arguments
    parameters = {
        'feature_col_name': feature_column,
        'label_col_name': label_column
    }

    # Initialize the pipeline and display the parameters used...
    pipeline = PipelineAutomator(file_name, parameters)

    # Commence the first run cycle through the pipeline...
    pipeline.display_parameters()
    record_type_classifier = pipeline.generate_model()
    pipeline.display_metadata()

    # Compute and show the run time:
    stop = time()
    time_elapsed = get_time_string(stop - start)
    print('Time elapsed:', time_elapsed)
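
generate_model() is called above but its body is not part of this example. Based on the stage functions shown in Example #1, here is a minimal sketch of how the pipeline might chain them together; the self.data attribute and the return contract are assumptions.

# Hypothetical sketch of how PipelineAutomator.generate_model could chain the stages.
class PipelineAutomatorSketch:
    def generate_model(self):
        cleaned_data = preprocess(self.data, self)                       # clean text, extract terms
        feature_selected_data = feature_select(cleaned_data, self)       # TF-IDF + univariate selection
        promising_models = get_promising_models(feature_selected_data, self)
        tuned_models = tune_models(promising_models, self)               # randomized hyperparameter search
        name, model, results = evaluate_best_models(tuned_models, self)  # final evaluation on the test set
        return model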