Example #1
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match,
        then this exception is raised.
    KeyError
        If the specified scoring function is not found, then this exception
        is raised.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    predict_mode = model.specs['predict_mode']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows     : %d", X_test.shape[0])
    logger.info("Number of Testing Columns  : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data

    if X_train.shape[1] == X_test.shape[1]:
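        # Record where the training rows end so the combined frame can be
        # split back into train and test later with np.array_split.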
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match." %
            (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
            scoring = estimator.scoring
            est = estimator.estimator
        except KeyError:
            logger.info("Algorithm %s not found", algo)
            continue
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
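
The pipeline above repeatedly concatenates the train and test frames, transforms them together, and then splits them back apart at split_point. A small self-contained sketch of that round trip on toy data (the frames and the added interaction column are illustrative and independent of AlphaPy):

import numpy as np
import pandas as pd

# Toy train/test frames with identical columns (a requirement the pipeline checks).
X_train = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})
X_test = pd.DataFrame({'a': [7, 8], 'b': [9.0, 10.0]})

# Remember where the training rows end, then stack the frames.
split_point = X_train.shape[0]
X = pd.concat([X_train, X_test])

# Apply the same transformation to both partitions at once, in the spirit
# of apply_treatments / create_features above (here, a toy interaction).
X['a_times_b'] = X['a'] * X['b']

# Split back into train and test at the recorded boundary.
X_train_new, X_test_new = np.array_split(X, [split_point])
assert X_train_new.shape[0] == 3 and X_test_new.shape[0] == 2

Transforming the combined frame is what keeps the engineered train and test columns identical, which is why the pipeline raises an error up front when the raw column counts differ.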
Example #2
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle,
                         random_state=seed if shuffle else None)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # plot learning curve
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        # set up plot
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function
        train_sizes = np.linspace(0.1, 1.0, cv_folds)
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=train_sizes, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training Score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
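
The plotting loop above closely follows the scikit-learn learning_curve example referenced in the docstring. A self-contained sketch of the same pattern, stripped of the AlphaPy model object (the dataset, estimator, and output file are illustrative):

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, learning_curve

# Toy classification data and a simple estimator stand in for the
# AlphaPy partition data and algorithm list.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
sizes = np.linspace(0.1, 1.0, 3)

train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(max_iter=1000), X, y, train_sizes=sizes, cv=cv)

train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Shaded bands show one standard deviation around the mean scores.
plt.figure()
plt.title("Learning Curve [ toy ]")
plt.xlabel("Training Examples")
plt.ylabel("Score")
plt.grid()
plt.fill_between(train_sizes, train_mean - train_std,
                 train_mean + train_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_mean - test_std,
                 test_mean + test_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_mean, 'o-', color="r", label="Training Score")
plt.plot(train_sizes, test_mean, 'o-', color="g",
         label="Cross-Validation Score")
plt.legend(loc="lower right")
plt.savefig("learning_curve_toy.png")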