def training_pipeline(model): r"""AlphaPy Training Pipeline Parameters ---------- model : alphapy.Model The model object for controlling the pipeline. Returns ------- model : alphapy.Model The final results are stored in the model object. Raises ------ KeyError If the number of columns of the train and test data do not match, then this exception is raised. """ logger.info("Training Pipeline") # Unpack the model specifications calibration = model.specs['calibration'] directory = model.specs['directory'] drop = model.specs['drop'] extension = model.specs['extension'] feature_selection = model.specs['feature_selection'] grid_search = model.specs['grid_search'] model_type = model.specs['model_type'] predict_mode = model.specs['predict_mode'] rfe = model.specs['rfe'] sampling = model.specs['sampling'] scorer = model.specs['scorer'] separator = model.specs['separator'] target = model.specs['target'] # Get train and test data X_train, y_train = get_data(model, Partition.train) X_test, y_test = get_data(model, Partition.test) # Determine if there are any test labels if y_test.any(): logger.info("Test Labels Found") model.test_labels = True model = save_features(model, X_train, X_test, y_train, y_test) # Log feature statistics logger.info("Original Feature Statistics") logger.info("Number of Training Rows : %d", X_train.shape[0]) logger.info("Number of Training Columns : %d", X_train.shape[1]) if model_type == ModelType.classification: uv, uc = np.unique(y_train, return_counts=True) logger.info("Unique Training Values for %s : %s", target, uv) logger.info("Unique Training Counts for %s : %s", target, uc) logger.info("Number of Testing Rows : %d", X_test.shape[0]) logger.info("Number of Testing Columns : %d", X_test.shape[1]) if model_type == ModelType.classification and model.test_labels: uv, uc = np.unique(y_test, return_counts=True) logger.info("Unique Testing Values for %s : %s", target, uv) logger.info("Unique Testing Counts for %s : %s", target, uc) # Merge training and test data if X_train.shape[1] == X_test.shape[1]: split_point = X_train.shape[0] X = pd.concat([X_train, X_test]) else: raise IndexError( "The number of training and test columns [%d, %d] must match." 
% (X_train.shape[1], X_test.shape[1])) # Apply treatments to the feature matrix all_features = apply_treatments(model, X) # Drop features all_features = drop_features(all_features, drop) # Save the train and test files with extracted and dropped features datestamp = get_datestamp() data_dir = SSEP.join([directory, 'input']) df_train = all_features.iloc[:split_point, :] df_train = pd.concat( [df_train, pd.DataFrame(y_train, columns=[target])], axis=1) output_file = USEP.join([model.train_file, datestamp]) write_frame(df_train, data_dir, output_file, extension, separator) df_test = all_features.iloc[split_point:, :] if y_test.any(): df_test = pd.concat( [df_test, pd.DataFrame(y_test, columns=[target])], axis=1) output_file = USEP.join([model.test_file, datestamp]) write_frame(df_test, data_dir, output_file, extension, separator) # Create crosstabs for any categorical features if model_type == ModelType.classification: create_crosstabs(model) # Create initial features all_features = create_features(model, all_features) X_train, X_test = np.array_split(all_features, [split_point]) model = save_features(model, X_train, X_test) # Generate interactions all_features = create_interactions(model, all_features) X_train, X_test = np.array_split(all_features, [split_point]) model = save_features(model, X_train, X_test) # Remove low-variance features all_features = remove_lv_features(model, all_features) X_train, X_test = np.array_split(all_features, [split_point]) model = save_features(model, X_train, X_test) # Shuffle the data [if specified] model = shuffle_data(model) # Oversampling or Undersampling [if specified] if model_type == ModelType.classification: if sampling: model = sample_data(model) else: logger.info("Skipping Sampling") # Get sample weights (classification only) model = get_class_weights(model) # Perform feature selection, independent of algorithm if feature_selection: model = select_features(model) # Get the available classifiers and regressors logger.info("Getting All Estimators") estimators = get_estimators(model) # Get the available scorers if scorer not in scorers: raise KeyError("Scorer function %s not found" % scorer) # Model Selection logger.info("Selecting Models") for algo in model.algolist: logger.info("Algorithm: %s", algo) # select estimator try: estimator = estimators[algo] scoring = estimator.scoring est = estimator.estimator except KeyError: logger.info("Algorithm %s not found", algo) # initial fit model = first_fit(model, algo, est) # recursive feature elimination if rfe: if scoring: model = rfecv_search(model, algo) elif hasattr(est, "coef_"): model = rfe_search(model, algo) else: logger.info("No RFE Available for %s", algo) # grid search if grid_search: model = hyper_grid_search(model, estimator) # predictions model = make_predictions(model, algo, calibration) # Create a blended estimator if len(model.algolist) > 1: model = predict_blend(model) # Generate metrics model = generate_metrics(model, Partition.train) model = generate_metrics(model, Partition.test) # Store the best estimator model = predict_best(model) # Generate plots generate_plots(model, Partition.train) if model.test_labels: generate_plots(model, Partition.test) # Save best features and predictions save_model(model, 'BEST', Partition.test) # Return the model return model
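
# A minimal, self-contained sketch (not part of AlphaPy) of the merge/split
# pattern the pipeline relies on above: the train and test frames are
# concatenated so that feature transformations see a single matrix, and the
# result is split back at split_point afterwards. The demo function name and
# the centering transform are hypothetical and exist only for illustration.

def _merge_transform_split_demo():
    import numpy as np
    import pandas as pd

    # toy train/test frames with identical columns
    X_train = pd.DataFrame({'f1': [1.0, 2.0, 3.0], 'f2': [0.1, 0.2, 0.3]})
    X_test = pd.DataFrame({'f1': [4.0, 5.0], 'f2': [0.4, 0.5]})

    # remember where the training rows end before concatenating
    split_point = X_train.shape[0]
    X = pd.concat([X_train, X_test])

    # any transformation applied to the combined matrix (here, centering)
    all_features = X - X.mean()

    # split back at the original boundary; the pipeline uses np.array_split
    # once the features are held in an array-like structure
    X_train_new, X_test_new = np.array_split(all_features.to_numpy(), [split_point])
    assert X_train_new.shape[0] == split_point
    assert X_test_new.shape[0] == X_test.shape[0]
    return X_train_new, X_test_new
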
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle,
                         random_state=seed)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # plot learning curve
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        # set up plot
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function
        train_sizes = np.linspace(0.1, 1.0, cv_folds)
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=train_sizes, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1, color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training Score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
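
# A standalone sketch (not part of AlphaPy) of the same learning-curve
# technique on synthetic data with a plain scikit-learn estimator. The
# function name below is hypothetical and exists only for illustration; the
# real plot_learning_curve pulls the estimator, cross-validation settings,
# and output location from the model object.

def _learning_curve_sketch():
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold, learning_curve

    # synthetic binary classification problem
    X, y = make_classification(n_samples=500, n_features=20, random_state=42)
    est = LogisticRegression(max_iter=1000)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # score the estimator on increasing fractions of the training data
    sizes, train_scores, test_scores = learning_curve(
        est, X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv=cv)

    # plot mean train and cross-validation scores with +/- one std bands
    train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)
    plt.figure()
    plt.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, train_mean, 'o-', color="r", label="Training Score")
    plt.plot(sizes, test_mean, 'o-', color="g", label="Cross-Validation Score")
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    plt.legend(loc="lower right")
    plt.savefig("learning_curve_sketch.png")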