plt.plot(base_zero_false_positives,
             base_zero_true_positives,
             label='Base rate model (zeros)')
    plt.plot(base_random_false_positives,
             base_random_true_positives,
             label='Base rate model (random)')
    # plt.plot([0, 1], [0, 1], linestyle='--', label='Random guessing expectation')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    show_plot_and_save_figure('roc_auc_curve')


if __name__ == '__main__':
    model: LogisticRegressionModel = read_pickle(
        GENERATED_LOGISTIC_REGRESSION_MODEL)
    dev_data = read_pickle(GENERATED_LR_PREPROCESSED_DEV_DATA)
    dev_input, dev_expected = extract_input_and_expected(dev_data)

    model_prediction = model.get_predictions(dev_input, args.bias_corrected)
    baserate_prediction = get_baserate_predictions(model_prediction)

    # predictions
    print('Evaluation preditions (int 0|1)')
    print('Predictions Baserate AUC: {:.2f}'.format(
        roc_auc_score(dev_expected, baserate_prediction)))
    print('Predictions Model AUC: {:.2f}'.format(
        roc_auc_score(dev_expected, model_prediction)))
    print('*' * 40)

    # probabilities
# --- Example 2 ---
import random
import unicodedata

import pandas as pd
from termcolor import colored

from dataaccess.files_constants import get_wiki_batch_path, GENERATED_WIKI_PAGE_MAPPINGS_PATH
from dataaccess.files_io import read_pickle
from model.wiki_document import WikiDocument, WikiLine

# Loaded once at import time. Indexed by (normalized) wiki page id; each row
# holds the (batch_id, line) location of that page's JSON record on disk
# (see the unpack in retrieve_wiki_page).
wiki_page_mapping: pd.DataFrame = read_pickle(GENERATED_WIKI_PAGE_MAPPINGS_PATH)


def retrieve_wiki_page(page_id: str) -> WikiDocument:
    """Load a single wiki page by id using the precomputed batch/line mapping.

    The page id is stripped and NFC-normalized before the lookup, then the
    matching batch file is opened and only the mapped line is parsed.

    Returns the parsed WikiDocument, or None when the mapped line does not
    exist in the batch file (a mapping error, reported in red on stdout).
    """
    page_id = page_id.strip()
    # Account for some special cases, like u'Beyonce\u0301' != 'Beyoncé'
    page_id = unicodedata.normalize('NFC', page_id)

    # Find the correct batch file and read only the relevant line
    batch_id, line = wiki_page_mapping.loc[page_id].values
    wiki_batch_path = get_wiki_batch_path(batch_id)

    with open(wiki_batch_path) as fp:
        for i, json_line in enumerate(fp):
            if i == line:
                return WikiDocument(json_line)

    # If this code runs, a mapping error occurred; make the None return explicit
    print(colored('Error: Line {} not found in wiki-page {}'.format(line, batch_id), 'red'))
    return None

# --- Example 3 ---
import argparse

import pandas as pd

from _4_B_fit_LR_model import fit_and_get_model, LOSS_HISTORY_FREQUENCY
from dataaccess.files_constants import GENERATED_LOGISTIC_REGRESSION_MODEL, \
    GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, GENERATED_LR_PREPROCESSED_TRAINING_DATA
from dataaccess.files_io import read_pickle, write_pickle
from util.LR_NN_preprocessing import extract_input_and_expected
from util.plots import plot_loss_values

# CLI flags for the training script, parsed at import time.
# NOTE(review): --debug is defined but not read anywhere in this chunk —
# presumably consumed by an imported module; confirm before removing.
parser = argparse.ArgumentParser()
parser.add_argument('--debug', help='use less data and less learning iterations', action='store_true')
parser.add_argument('--num_iterations', type=int, default=100000)
parser.add_argument('--learning_rate', type=float, default=0.1)
args = parser.parse_args()


if __name__ == '__main__':
    # Fit a logistic-regression model on the preprocessed training data,
    # persist the model plus its loss history, then plot the loss curve.
    raw_training: pd.DataFrame = read_pickle(GENERATED_LR_PREPROCESSED_TRAINING_DATA)
    inputs, expected = extract_input_and_expected(raw_training)

    model, loss_history = fit_and_get_model(
        inputs, expected, args.num_iterations, args.learning_rate)

    write_pickle(GENERATED_LOGISTIC_REGRESSION_MODEL, model)
    write_pickle(GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, loss_history)  # for plotting

    plot_loss_values(args.num_iterations, args.learning_rate, loss_history, LOSS_HISTORY_FREQUENCY)
# --- Example 4 ---
 def __init__(self, preprocessed_pickle_path: str):
     """Load a preprocessed dataset pickle and split it into inputs and labels."""
     dataset: pd.DataFrame = read_pickle(preprocessed_pickle_path)
     self.inputs, self.labels = extract_input_and_expected(dataset)
def plot_multiple_loss_values(learning_rates: list,
                              multiple_loss_values: list):
    """Plot the loss curves of several trained models on one figure and save it.

    learning_rates: one label per curve, parallel to multiple_loss_values.
    multiple_loss_values: per model, the loss values that were recorded every
        LOSS_HISTORY_FREQUENCY-th training iteration.
    """
    prepare_seaborn_plots()

    plt.xlabel('Iterations')
    plt.ylabel('Cross-Entropy Loss')

    # Fixed n for the given trained models
    plt.figtext(0.68, 0.56, r'$n = {:,}$'.format(100000))

    # zip instead of enumerate+indexing: the original comprehension reused the
    # loop variable 'i', shadowing the enumerate index inside the list comp.
    for rate, values in zip(learning_rates, multiple_loss_values):
        label = r'$\alpha = {:,}$'.format(float(rate))
        # Scale the x axis back to iteration numbers
        x_axis = [step * LOSS_HISTORY_FREQUENCY for step in range(len(values))]
        plt.plot(x_axis, values, linewidth=2, label=label)

    plt.legend(loc='upper right')
    # NOTE(review): filename keeps the original spelling 'comparision' so any
    # existing references to the saved figure still resolve.
    show_plot_and_save_figure(
        'logistic_regression_loss_values_comparision.png')


if __name__ == '__main__':
    # Load the pre-computed loss histories — one pickle per learning rate —
    # and render them all on a single comparison plot.
    learning_rates = ['0.0001', '0.001', '0.01', '0.1', '1.0']
    loss_values = [
        read_pickle('./generated/logistic_regression_loss_{}.p'.format(rate))
        for rate in learning_rates
    ]

    plot_multiple_loss_values(learning_rates, loss_values)