def prediction_pipeline(model):
    r"""AlphaPy Prediction Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    None : None

    Notes
    -----
    The saved model is loaded from disk, and predictions are made
    on the new testing data. The predictions (and, for classification,
    the probabilities) are stored in ``model.preds`` and ``model.probas``
    under the key ``('BEST', Partition.predict)`` before being saved.

    """
    # NOTE(review): a second ``prediction_pipeline`` is defined later in
    # this source; if both live in the same module, the later definition
    # shadows this one. Confirm which version is intended to be kept.

    logger.info("Predict Mode")

    # Unpack the model specifications
    directory = model.specs['directory']
    drop = model.specs['drop']
    feature_selection = model.specs['feature_selection']
    model_type = model.specs['model_type']
    rfe = model.specs['rfe']

    # Get the prediction data; labels are not needed here.
    partition = Partition.predict
    X_predict, _ = get_data(model, partition)

    # Load feature_map
    model = load_feature_map(model, directory)

    # Log feature statistics
    logger.info("Feature Statistics")
    logger.info("Number of Prediction Rows : %d", X_predict.shape[0])
    logger.info("Number of Prediction Columns : %d", X_predict.shape[1])

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X_predict)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Create initial features
    all_features = create_features(model, all_features)

    # Generate interactions
    all_features = create_interactions(model, all_features)

    # Remove low-variance features
    all_features = remove_lv_features(model, all_features)

    # Load the univariate support vector, if any. Only a missing key is
    # treated as "no support"; errors raised while selecting columns are
    # allowed to propagate instead of being silently swallowed.
    if feature_selection:
        logger.info("Getting Univariate Support")
        try:
            support = model.feature_map['uni_support']
        except KeyError:
            logger.info("No Univariate Support")
        else:
            all_features = all_features[:, support]
            logger.info("New Feature Count : %d", all_features.shape[1])

    # Load the RFE support vector, if any (same policy as above)
    if rfe:
        logger.info("Getting RFE Support")
        try:
            support = model.feature_map['rfe_support']
        except KeyError:
            logger.info("No RFE Support")
        else:
            all_features = all_features[:, support]
            logger.info("New Feature Count : %d", all_features.shape[1])

    # Load predictor
    predictor = load_predictor(directory)

    # Make predictions
    logger.info("Making Predictions")
    tag = 'BEST'
    model.preds[(tag, partition)] = predictor.predict(all_features)
    if model_type == ModelType.classification:
        # Keep only column 1 of predict_proba
        # (presumably the positive class of a binary problem -- TODO confirm).
        model.probas[(tag, partition)] = predictor.predict_proba(all_features)[:, 1]

    # Save predictions
    save_predictions(model, tag, partition)
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Loads the train and test partitions, builds the feature matrix
    (treatments, dropped features, created features, interactions,
    low-variance removal), then fits, tunes, and scores every algorithm
    in ``model.algolist`` before saving the best model.

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match.
    KeyError
        If the scorer named in the model specifications is not one of
        the available scorers.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications
    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data
    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels
    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics
    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows : %d", X_test.shape[0])
    logger.info("Number of Testing Columns : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data. The row count of the training frame
    # is remembered so the combined matrix can be split back later.
    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match."
            % (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features
    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features
    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features
    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions
    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features
    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]
    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm
    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors
    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers
    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection
    logger.info("Selecting Models")
    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
        except KeyError:
            # Skip unknown algorithms. Without this, the code below would
            # run with unbound names (first iteration) or with the
            # previous algorithm's estimator (later iterations).
            # NOTE(review): downstream steps may expect results for every
            # entry in model.algolist -- confirm that skipping is safe.
            logger.info("Algorithm %s not found", algo)
            continue
        scoring = estimator.scoring
        est = estimator.estimator
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator
    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics
    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots
    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
def prediction_pipeline(model):
    r"""AlphaPy Prediction Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The model object, with the new predictions stored in
        ``model.preds`` (and ``model.probas`` for classification) under
        the key ``('BEST', Partition.predict)``.

    Notes
    -----
    The saved model is loaded from disk, and predictions are made
    on the new testing data.

    """

    logger.info("Predict Mode")

    # Unpack the model specifications
    directory = model.specs['directory']
    drop = model.specs['drop']
    feature_selection = model.specs['feature_selection']
    model_type = model.specs['model_type']
    rfe = model.specs['rfe']

    # Get all data. The original training data is passed on to feature
    # creation (needed for encodings).
    X_train, y_train = get_data(model, Partition.train)
    partition = Partition.predict
    X_predict, _ = get_data(model, partition)

    # Load feature_map
    model = load_feature_map(model, directory)

    # Log feature statistics
    logger.info("Feature Statistics")
    logger.info("Number of Prediction Rows : %d", X_predict.shape[0])
    logger.info("Number of Prediction Columns : %d", X_predict.shape[1])

    # Apply transforms to the feature matrix
    X_all = apply_transforms(model, X_predict)

    # Drop features
    X_all = drop_features(X_all, drop)

    # Create initial features
    X_all = create_features(model, X_all, X_train, X_predict, y_train)

    # Generate interactions
    X_all = create_interactions(model, X_all)

    # Remove low-variance features
    X_all = remove_lv_features(model, X_all)

    # Load the univariate support vector, if any. Only a missing key is
    # treated as "no support"; errors raised while selecting columns are
    # allowed to propagate instead of being silently swallowed.
    if feature_selection:
        logger.info("Getting Univariate Support")
        try:
            support = model.feature_map['uni_support']
        except KeyError:
            logger.info("No Univariate Support")
        else:
            X_all = X_all[:, support]
            logger.info("New Feature Count : %d", X_all.shape[1])

    # Load the RFE support vector, if any (same policy as above)
    if rfe:
        logger.info("Getting RFE Support")
        try:
            support = model.feature_map['rfe_support']
        except KeyError:
            logger.info("No RFE Support")
        else:
            X_all = X_all[:, support]
            logger.info("New Feature Count : %d", X_all.shape[1])

    # Load predictor
    predictor = load_predictor(directory)

    # Make predictions
    logger.info("Making Predictions")
    tag = 'BEST'
    model.preds[(tag, partition)] = predictor.predict(X_all)
    if model_type == ModelType.classification:
        # Keep only column 1 of predict_proba
        # (presumably the positive class of a binary problem -- TODO confirm).
        model.probas[(tag, partition)] = predictor.predict_proba(X_all)[:, 1]

    # Save predictions
    save_predictions(model, tag, partition)

    # Return the model
    return model