def create_clusters(features, model):
    r"""Derive k-means cluster labels as new features.

    A separate ``MiniBatchKMeans`` model is fitted for every ``k`` in
    ``range(cluster_min, cluster_max + 1, cluster_inc)``; the label
    predicted for each row becomes one output column.

    Parameters
    ----------
    features : numpy array
        The features to cluster.
    model : alphapy.Model
        The model object with the clustering parameters
        (``cluster_inc``, ``cluster_max``, ``cluster_min``, ``seed``).

    Returns
    -------
    cfeatures : numpy array
        The calculated clusters, one column per value of ``k``.
    cnames : list
        The cluster feature names, one per column.

    References
    ----------
    You can find more information on clustering here [CLUS]_.

    .. [CLUS] http://scikit-learn.org/stable/modules/clustering.html

    """

    logger.info("Creating Clustering Features")

    # Pull the clustering settings from the model specifications

    specs = model.specs
    cluster_inc = specs['cluster_inc']
    cluster_max = specs['cluster_max']
    cluster_min = specs['cluster_min']
    seed = specs['seed']

    # Report the settings

    logger.info("Cluster Minimum : %d", cluster_min)
    logger.info("Cluster Maximum : %d", cluster_max)
    logger.info("Cluster Increment : %d", cluster_inc)

    # Fit one model per k. The leading zeros column is a placeholder that
    # fixes the row count and float dtype of the stacked result; it is
    # removed once all label columns have been gathered.

    columns = [np.zeros((features.shape[0], 1))]
    cnames = []
    for k in range(cluster_min, cluster_max + 1, cluster_inc):
        logger.info("k = %d", k)
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=seed)
        kmeans.fit(features)
        columns.append(kmeans.predict(features).reshape(-1, 1))
        cnames.append(USEP.join(['cluster', str(k)]))
    cfeatures = np.delete(np.column_stack(columns), 0, axis=1)

    # Hand back the label matrix and its column names

    logger.info("Clustering Feature Count : %d", cfeatures.shape[1])
    return cfeatures, cnames
def create_isomap_features(features, model):
    r"""Embed the features with Isomap.

    Parameters
    ----------
    features : numpy array
        The input features.
    model : alphapy.Model
        The model object with the Isomap parameters
        (``iso_components``, ``iso_neighbors``, ``n_jobs``).

    Returns
    -------
    ifeatures : numpy array
        The Isomap features.
    inames : list
        The Isomap feature names, numbered from 1.

    Notes
    -----
    Isomaps are very memory-intensive. Your process will be killed
    if you run out of memory.

    References
    ----------
    You can find more information on Isomap here [ISO]_.

    .. [ISO] http://scikit-learn.org/stable/modules/manifold.html#isomap

    """

    logger.info("Creating Isomap Features")

    # Pull the Isomap settings from the model specifications

    specs = model.specs
    iso_components = specs['iso_components']
    iso_neighbors = specs['iso_neighbors']
    n_jobs = specs['n_jobs']

    # Report the settings

    logger.info("Isomap Components : %d", iso_components)
    logger.info("Isomap Neighbors : %d", iso_neighbors)

    # Fit the embedding and project the features in one step

    embedder = Isomap(n_neighbors=iso_neighbors,
                      n_components=iso_components,
                      n_jobs=n_jobs)
    ifeatures = embedder.fit_transform(features)

    # Name the components isomap<USEP>1 .. isomap<USEP>n

    inames = []
    for component in range(1, iso_components + 1):
        inames.append(USEP.join(['isomap', str(component)]))

    logger.info("Isomap Feature Count : %d", ifeatures.shape[1])
    return ifeatures, inames
def create_pca_features(features, model):
    r"""Apply Principal Component Analysis (PCA) to the features.

    PCA is run once for every ``n`` in
    ``range(pca_min, pca_max + 1, pca_inc)`` and the components of all
    runs are placed side by side.

    Parameters
    ----------
    features : numpy array
        The input features.
    model : alphapy.Model
        The model object with the PCA parameters
        (``pca_inc``, ``pca_max``, ``pca_min``, ``pca_whiten``).

    Returns
    -------
    pfeatures : numpy array
        The PCA features.
    pnames : list
        The PCA feature names, one per column of ``pfeatures``. A column
        is named from ``'pca'``, the number of components requested in
        its run, and the 1-based component index within that run.

    References
    ----------
    You can find more information on Principal Component Analysis here [PCA]_.

    .. [PCA] http://scikit-learn.org/stable/modules/decomposition.html#pca

    """

    logger.info("Creating PCA Features")

    # Extract model parameters

    pca_inc = model.specs['pca_inc']
    pca_max = model.specs['pca_max']
    pca_min = model.specs['pca_min']
    pca_whiten = model.specs['pca_whiten']

    # Log model parameters

    logger.info("PCA Minimum : %d", pca_min)
    logger.info("PCA Maximum : %d", pca_max)
    logger.info("PCA Increment : %d", pca_inc)
    logger.info("PCA Whitening : %r", pca_whiten)

    # Generate PCA features. The leading zeros column is a placeholder
    # that fixes the row count and float dtype of the stacked result.

    blocks = [np.zeros((features.shape[0], 1))]
    pnames = []
    for n_components in range(pca_min, pca_max + 1, pca_inc):
        logger.info("n_components = %d", n_components)
        X_pca = PCA(n_components=n_components,
                    whiten=pca_whiten).fit_transform(features)
        blocks.append(X_pca)
        # Each run contributes several columns, so emit one name per
        # column; a single name per run would not line up with pfeatures.
        pnames.extend(
            USEP.join(['pca', str(n_components), str(component)])
            for component in range(1, X_pca.shape[1] + 1))
    pfeatures = np.delete(np.column_stack(blocks), 0, axis=1)

    # Return new PCA features

    logger.info("PCA Feature Count : %d", pfeatures.shape[1])
    return pfeatures, pnames
def get_numerical_features(fnum, fname, df, nvalues, dt,
                           sentinel, logt, plevel):
    r"""Transform numerical features with imputation and possibly
    log-transformation.

    Parameters
    ----------
    fnum : int
        Feature number, strictly for logging purposes
    fname : str
        Name of the numerical column in the dataframe ``df``.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    nvalues : int
        The number of unique values.
    dt : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    sentinel : float
        The number to be imputed for NaN values.
    logt : bool
        If ``True``, then log-transform numerical values.
    plevel : float
        The p-value threshold to test if a feature is normally distributed.

    Returns
    -------
    new_values : numpy array
        The set of imputed and transformed features.
    new_fnames : list
        The new feature name(s) for the numerical variable. The name
        carries a ``log`` suffix only when the values were actually
        log-transformed.

    """
    feature = df[fname]
    if len(feature) == nvalues:
        logger.info(
            "Feature %d: %s is a numerical feature of type %s with maximum number of values %d",
            fnum, fname, dt, nvalues)
    else:
        logger.info(
            "Feature %d: %s is a numerical feature of type %s with %d unique values",
            fnum, fname, dt, nvalues)
    # imputer for float, integer, or boolean data types
    new_values = impute_values(feature, dt, sentinel)
    # log-transform any values that do not fit a normal distribution;
    # the log is only defined when every imputed value is positive
    new_fname = fname
    if logt and np.all(new_values > 0):
        _, pvalue = sps.normaltest(new_values)
        if pvalue <= plevel:
            logger.info(
                "Feature %d: %s is not normally distributed [p-value: %f]",
                fnum, fname, pvalue)
            new_values = np.log(new_values)
            # rename only here, so the suffix reflects what was done
            new_fname = USEP.join([fname, 'log'])
    return new_values, [new_fname]
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    For every algorithm in ``model.algolist`` that has an entry in
    ``model.importances``, the top-ranked features (at most 10) are
    logged and drawn as a bar chart.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # NOTE(review): the partition data is not used below; the call is
    # kept only so this function fails in the same situations as before.
    get_partition_data(model, partition)

    # For each algorithm that has importances, generate the plot.

    max_top = 10
    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        try:
            importances = model.importances[algo]
        except KeyError:
            importances = None
        if importances is None or len(importances) == 0:
            logger.info("%s does not have feature importances", algo)
            continue
        # Never rank more features than exist; indexing past the end
        # used to be reported as "no feature importances".
        n_top = min(max_top, len(importances))
        try:
            # sort feature indices from most to least important
            indices = np.argsort(importances)[::-1]
            # log the feature ranking
            logger.info("Feature Ranking:")
            for rank, index in enumerate(indices[:n_top], start=1):
                logger.info("%d. Feature %d (%f)",
                            rank, index, importances[index])
            # plot the feature importances
            title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
            positions = list(range(n_top))
            plt.style.use('classic')
            plt.figure()
            plt.title(title)
            plt.bar(positions, importances[indices][:n_top],
                    color="b", align="center")
            plt.xticks(positions, indices[:n_top])
            plt.xlim([-1, n_top])
            # save the plot
            tag = USEP.join([pstring, algo])
            write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
        except Exception:
            # Plotting stays best-effort, but the real cause is logged
            # instead of being mislabelled as missing importances.
            logger.exception("Failed to plot feature importances for %s",
                             algo)
def create_tsne_features(features, model):
    r"""Create t-SNE features.

    Parameters
    ----------
    features : numpy array
        The input features.
    model : alphapy.Model
        The model object with the t-SNE parameters (``seed``,
        ``tsne_components``, ``tsne_learn_rate``, ``tsne_perplexity``).

    Returns
    -------
    tfeatures : numpy array
        The t-SNE features.
    tnames : list
        The t-SNE feature names, numbered from 1.

    References
    ----------
    You can find more information on the t-SNE technique here [TSNE]_.

    .. [TSNE] http://scikit-learn.org/stable/modules/manifold.html#t-distributed-stochastic-neighbor-embedding-t-sne

    """

    logger.info("Creating T-SNE Features")

    # Extract model parameters

    seed = model.specs['seed']
    tsne_components = model.specs['tsne_components']
    tsne_learn_rate = model.specs['tsne_learn_rate']
    tsne_perplexity = model.specs['tsne_perplexity']

    # Log model parameters. The learning rate and perplexity are logged
    # with %s: %d would truncate fractional values and cannot format a
    # non-numeric setting such as 'auto'.

    logger.info("T-SNE Components : %d", tsne_components)
    logger.info("T-SNE Learning Rate : %s", tsne_learn_rate)
    logger.info("T-SNE Perplexity : %s", tsne_perplexity)

    # Generate T-SNE features (a separate local name keeps the ``model``
    # argument intact)

    tsne = TSNE(n_components=tsne_components,
                perplexity=tsne_perplexity,
                learning_rate=tsne_learn_rate,
                random_state=seed)
    tfeatures = tsne.fit_transform(features)
    tnames = [USEP.join(['tsne', str(i + 1)]) for i in range(tsne_components)]

    # Return new T-SNE features

    logger.info("T-SNE Feature Count : %d", tfeatures.shape[1])
    return tfeatures, tnames
def analysis_name(gname, target):
    r"""Build the name of an analysis from its group and target.

    Parameters
    ----------
    gname : str
        Group name.
    target : str
        Target of the analysis.

    Returns
    -------
    name : str
        The group name and target joined by the ``USEP`` separator.

    """
    return USEP.join((gname, target))
def space_name(subject, schema, fractal):
    r"""Build the namespace string from its three parts.

    Parameters
    ----------
    subject : str
        An identifier for a group of related items.
    schema : str
        The data related to the ``subject``.
    fractal : str
        The time fractal of the data, e.g., "5m" or "1d".

    Returns
    -------
    name : str
        ``subject``, ``schema`` and ``fractal`` joined by ``USEP``.

    """
    parts = (subject, schema, fractal)
    return USEP.join(parts)
def frame_name(name, space):
    r"""Build the frame name for a group name within a space.

    Parameters
    ----------
    name : str
        Group name.
    space : alphapy.Space
        Context or namespace for the given group name; its ``subject``,
        ``schema`` and ``fractal`` attributes are appended to ``name``.

    Returns
    -------
    fname : str
        Frame name.

    Examples
    --------

    >>> fname = frame_name('tech', Space('stock', 'prices', '1d'))
    # 'tech_stock_prices_1d'

    """
    parts = [name, space.subject, space.schema, space.fractal]
    fname = USEP.join(parts)
    return fname
def save_model(model, tag, partition):
    r"""Persist the model artifacts and, optionally, a submission file.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----

    The following components are extracted from the model object
    and saved to disk:

    * Model predictor (via joblib/pickle)
    * Predictions
    * Probabilities (classification only)
    * Rankings
    * Submission File (optional)

    """

    logger.info('=' * 80)

    # Read every setting up front.

    specs = model.specs
    directory = specs['directory']
    extension = specs['extension']
    model_type = specs['model_type']
    submission_file = specs['submission_file']
    submit_probas = specs['submit_probas']

    # Date stamp (YYYYMMDD) recorded in the output file names

    timestamp = datetime.now().strftime("%Y%m%d")

    # Predictor first, then the feature map

    save_predictor(model, timestamp)
    save_feature_map(model, timestamp)

    # Input and output locations under the project directory

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Predictions (and probabilities, when available)

    preds, probas = save_predictions(model, tag, partition)

    # Nothing more to do unless a submission template was configured

    if not submission_file:
        return

    # Fill the second column of the sample submission and write it out

    sample_input = SSEP.join([input_dir,
                              PSEP.join([submission_file, extension])])
    ss = pd.read_csv(sample_input)
    use_probas = submit_probas and model_type == ModelType.classification
    ss[ss.columns[1]] = probas if use_probas else preds
    submission_spec = PSEP.join([USEP.join(['submission', timestamp]),
                                 extension])
    submission_output = SSEP.join([output_dir, submission_spec])
    logger.info("Saving Submission to %s", submission_output)
    ss.to_csv(submission_output, index=False)
def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Three frames are written to the project's ``output`` directory:
    the predictions, the probabilities (classification only), and the
    most recent input frame for the partition ranked by probability
    (classification) or by prediction (otherwise).

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector, or ``None`` when the model is not a
        classifier.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation
    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame: the most recent file for this partition,
    # reduced to its base name (no directory, no extension)

    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date. Only a missing key means
    # "no prediction date"; any other error should surface.

    try:
        predict_date = model.specs['predict_date']
        found_pdate = True
    except KeyError:
        found_pdate = False

    if found_pdate:
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        # Copy so the column assignments below write to an independent
        # frame rather than to a slice of the original.
        # NOTE(review): the index labels are used as positions here and
        # in np.take below, which presumes a default integer index.
        pf = pf.iloc[pd_indices].copy()
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities
    return preds, probas
def _features_only(result):
    """Return the feature array from a helper result, dropping any names."""
    # Some feature helpers return ``(features, names)``; others may return
    # the array alone. Accept both so the caller always gets an array.
    if isinstance(result, tuple):
        return result[0]
    return result


def create_features(model, X):
    r"""Create features for the train and test set.

    Each column of ``X`` is dispatched to the factor, numerical or text
    transformer; the resulting base features are optionally scaled and
    then extended with numpy, scipy, clustering, PCA, Isomap and t-SNE
    features as enabled in ``model.specs``.

    Parameters
    ----------
    model : alphapy.Model
        Model object with the feature specifications.
    X : pandas.DataFrame
        Combined train and test data. When count features are enabled,
        the count columns are added to this frame in place.

    Returns
    -------
    all_features : numpy array
        The new features.

    Raises
    ------
    TypeError
        Unrecognized data type.

    """

    # Extract model parameters

    clustering = model.specs['clustering']
    counts_flag = model.specs['counts']
    encoder = model.specs['encoder']
    factors = model.specs['factors']
    isomap = model.specs['isomap']
    logtransform = model.specs['logtransform']
    ngrams_max = model.specs['ngrams_max']
    numpy_flag = model.specs['numpy']
    pca = model.specs['pca']
    pvalue_level = model.specs['pvalue_level']
    rounding = model.specs['rounding']
    scaling = model.specs['scaler_option']
    scaler = model.specs['scaler_type']
    scipy_flag = model.specs['scipy']
    sentinel = model.specs['sentinel']
    tsne = model.specs['tsne']
    vectorize = model.specs['vectorize']

    # Log input parameters

    logger.info("Original Features : %s", X.columns)
    logger.info("Feature Count : %d", X.shape[1])

    # Count zero and NaN values

    if counts_flag:
        logger.info("Creating Count Features")
        logger.info("NA Counts")
        # NOTE(review): DataFrame.count tallies the non-NA cells per row,
        # so despite its name this column holds the number of present
        # values. Left unchanged to keep existing feature values stable.
        X['nan_count'] = X.count(axis=1)
        logger.info("Number Counts")
        # NOTE(review): each comparison runs over X as it is at that
        # moment, including count columns added earlier in this loop.
        for i in range(10):
            fc = USEP.join(['count', str(i)])
            X[fc] = (X == i).astype(int).sum(axis=1)
        logger.info("New Feature Count : %d", X.shape[1])

    # Iterate through columns, dispatching and transforming each feature.

    logger.info("Creating Base Features")
    n_rows = X.shape[0]
    # The leading zeros column is a placeholder that fixes the row count
    # of the stacked result; it is removed after stacking.
    blocks = [np.zeros((n_rows, 1))]
    for i, fc in enumerate(X):
        fnum = i + 1
        dtype = X[fc].dtypes
        nunique = len(X[fc].unique())
        # standard processing of numerical, categorical, and text features
        if factors and fc in factors:
            result = get_factors(model, X, fnum, fc, nunique, dtype,
                                 encoder, rounding, sentinel)
        elif dtype == 'float64' or dtype == 'int64' or dtype == 'bool':
            result = get_numerical_features(fnum, fc, X, nunique, dtype,
                                            sentinel, logtransform,
                                            pvalue_level)
        elif dtype == 'object':
            result = get_text_features(fnum, fc, X, nunique, vectorize,
                                       ngrams_max)
        else:
            raise TypeError("Base Feature Error with unrecognized type %s"
                            % dtype)
        features = _features_only(result)
        if features.shape[0] == n_rows:
            blocks.append(features)
        else:
            logger.info("Feature %s has the wrong number of rows: %d",
                        fc, features.shape[0])
    all_features = np.delete(np.column_stack(blocks), 0, axis=1)
    logger.info("New Feature Count : %d", all_features.shape[1])

    # Call standard scaler for all features

    if scaling:
        logger.info("Scaling Base Features")
        if scaler == Scalers.standard:
            all_features = StandardScaler().fit_transform(all_features)
        elif scaler == Scalers.minmax:
            all_features = MinMaxScaler().fit_transform(all_features)
        else:
            logger.info("Unrecognized scaler: %s", scaler)
    else:
        logger.info("Skipping Scaling")

    # Perform dimensionality reduction only on base feature set

    base_features = all_features

    # Calculate the total, mean, standard deviation, and variance

    if numpy_flag:
        np_features = _features_only(
            create_numpy_features(base_features, sentinel))
        all_features = np.column_stack((all_features, np_features))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Generate scipy features

    if scipy_flag:
        sp_features = _features_only(
            create_scipy_features(base_features, sentinel))
        all_features = np.column_stack((all_features, sp_features))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create clustering features

    if clustering:
        cfeatures = _features_only(create_clusters(base_features, model))
        all_features = np.column_stack((all_features, cfeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create PCA features

    if pca:
        pfeatures = _features_only(create_pca_features(base_features, model))
        all_features = np.column_stack((all_features, pfeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create Isomap features

    if isomap:
        ifeatures = _features_only(
            create_isomap_features(base_features, model))
        all_features = np.column_stack((all_features, ifeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create T-SNE features

    if tsne:
        tfeatures = _features_only(
            create_tsne_features(base_features, model))
        all_features = np.column_stack((all_features, tfeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Return all transformed training and test features
    return all_features
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match.
    KeyError
        If the configured scorer is not a known scoring function.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows : %d", X_test.shape[0])
    logger.info("Number of Testing Columns : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data; split_point marks where the test
    # rows begin so the two sets can be separated again later

    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match." %
            (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
        except KeyError:
            # Skip unknown algorithms. Falling through would either fail
            # on an undefined estimator or silently reuse the one from
            # the previous iteration under this algorithm's name.
            # NOTE(review): the algorithm stays in model.algolist, so
            # later steps that walk that list still see it.
            logger.info("Algorithm %s not found", algo)
            continue
        scoring = estimator.scoring
        est = estimator.estimator
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
def plot_validation_curve(model, partition, pname, prange):
    r"""Draw a scikit-learn validation curve for every algorithm.

    For each algorithm in ``model.algolist`` the hyperparameter ``pname``
    is swept over ``prange``; the mean training and cross-validation
    scores are plotted with a band of one standard deviation around each.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy array
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """

    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Settings taken from the model specifications ('verbosity' is read
    # but not used by this function).

    specs = model.specs
    cv_folds = specs['cv_folds']
    n_jobs = specs['n_jobs']
    scorer = specs['scorer']
    verbosity = specs['verbosity']

    # Data for the requested partition

    X, y = get_partition_data(model, partition)

    # Plot constants: x-axis padding and band transparency

    spacing = 0.5
    alpha = 0.2

    # The x-axis limits depend only on the parameter range

    x_min = min(prange) - spacing
    x_max = max(prange) + spacing

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)

        # score the fitted estimator across the parameter range
        train_scores, test_scores = validation_curve(
            model.estimators[algo], X, y,
            param_name=pname, param_range=prange,
            cv=cv_folds, scoring=scorer, n_jobs=n_jobs)

        # per-parameter mean and spread over the folds
        train_mean = train_scores.mean(axis=1)
        train_std = train_scores.std(axis=1)
        test_mean = test_scores.mean(axis=1)
        test_std = test_scores.std(axis=1)

        # new figure with title and axes
        plt.style.use('classic')
        plt.figure()
        plt.title(BSEP.join([algo, "Validation Curve [", pstring, "]"]))
        plt.xlabel(pname)
        plt.xlim(x_min, x_max)
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)

        # training curve and band
        plt.plot(prange, train_mean, label="Training Score", color="r")
        plt.fill_between(prange, train_mean - train_std,
                         train_mean + train_std, alpha=alpha, color="r")

        # cross-validation curve and band
        plt.plot(prange, test_mean, label="Cross-Validation Score",
                 color="g")
        plt.fill_between(prange, test_mean - test_std,
                         test_mean + test_std, alpha=alpha, color="g")
        plt.legend(loc="best")

        # write the figure
        write_plot('matplotlib', plt, 'validation_curve',
                   USEP.join([pstring, algo]), plot_dir)
def plot_confusion_matrix(model, partition):
    r"""Draw the confusion matrix.

    For each algorithm in ``model.algolist`` the raw confusion matrix is
    logged, then normalized by row (true label) and drawn as an
    annotated heat map.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Get the labels for the correct partition (the features are not
    # needed here).

    _, y = get_partition_data(model, partition)

    # Plot Parameters

    np.set_printoptions(precision=2)
    cmap = plt.cm.Blues
    fmt = '.2f'

    # Generate a Confusion Matrix for each algorithm

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)
        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]
        # compute confusion matrix
        cm = confusion_matrix(y, y_pred)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)
        # normalize each row by its total. A class that occurs only in
        # the predictions has an all-zero row; dividing it by 1 instead
        # of 0 keeps the row at zero rather than filling it with NaN.
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        row_sums[row_sums == 0] = 1
        cm = cm.astype('float') / row_sums
        # initialize plot
        _, ax = plt.subplots()
        # title of the confusion matrix
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])
        # only use the labels that appear in the data
        classes = unique_labels(y, y_pred)
        # show all ticks, label them, and set the title
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True Label',
               xlabel='Predicted Label')
        # rotate the tick labels and set their alignment
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")
        # annotate every cell; white text on cells above the midpoint
        # of the value range, black text otherwise
        thresh = (cm.max() + cm.min()) / 2.0
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        # show the color bar
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    For each algorithm in ``model.algolist`` the unfitted estimator is
    scored on increasing fractions of the data with stratified k-fold
    cross-validation; mean training and cross-validation scores are
    plotted with a band of one standard deviation around each.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.
    # The seed is only meaningful when shuffling; scikit-learn rejects a
    # random_state combined with shuffle=False.

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle,
                         random_state=seed if shuffle else None)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)
    # one training-set fraction per fold, from 10% to 100%
    size_fractions = np.linspace(0.1, 1.0, cv_folds)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # plot learning curve
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        # set up plot
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=size_fractions, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training Score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
def get_text_features(fnum, fname, df, nvalues, vectorize, ngrams_max):
    r"""Transform text features with count vectorization and TF-IDF,
    or alternatively factorization.

    Parameters
    ----------
    fnum : int
        Feature number, strictly for logging purposes
    fname : str
        Name of the text column in the dataframe ``df``.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    nvalues : int
        The number of unique values.
    vectorize : bool
        If ``True``, then attempt count vectorization.
    ngrams_max : int
        The maximum number of n-grams for count vectorization.

    Returns
    -------
    new_features : numpy array
        The vectorized or factorized text features.
    new_fnames : list
        The new feature name(s) for the numerical variable.

    Notes
    -----
    Null values are replaced by a placeholder on a copy of the column;
    ``df`` itself is not modified. If vectorization fails for any reason,
    the reason is logged and the feature is factorized instead.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.

    .. [TFE] http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

    """
    feature = df[fname]

    # String lengths are NaN for null entries; min/max skip them, but a
    # column without any text yields NaN, which int() cannot convert.
    lengths = feature.str.len()
    has_text = bool(lengths.notna().any())
    min_length = int(lengths.min()) if has_text else 0
    max_length = int(lengths.max()) if has_text else 0

    if len(feature) == nvalues:
        logger.info(
            "Feature %d: %s is a text feature [%d:%d] with maximum number of values %d",
            fnum, fname, min_length, max_length, nvalues)
    else:
        logger.info(
            "Feature %d: %s is a text feature [%d:%d] with %d unique values",
            fnum, fname, min_length, max_length, nvalues)

    # need a null text placeholder for vectorization
    feature = feature.fillna(value=NULLTEXT)

    factor_names = [USEP.join([fname, 'factor'])]

    # vectorization creates many columns, otherwise just factorize
    if vectorize:
        logger.info("Feature %d: %s => Attempting Vectorization", fnum, fname)
        # scikit-learn validates ngram_range as a tuple
        vectorizer = TfidfVectorizer(ngram_range=(1, ngrams_max))
        try:
            new_features = vectorizer.fit_transform(feature)
            # get_feature_names() was removed in scikit-learn 1.2 in
            # favor of get_feature_names_out(); support both.
            if hasattr(vectorizer, 'get_feature_names_out'):
                new_fnames = list(vectorizer.get_feature_names_out())
            else:
                new_fnames = vectorizer.get_feature_names()
        except Exception as exc:
            # Deliberate best-effort: any failure falls back to
            # factorization, but the cause is no longer hidden.
            logger.info("Feature %d: %s => Vectorization Failed", fnum, fname)
            logger.info("Feature %d: %s => Vectorization Error: %s",
                        fnum, fname, exc)
            new_features, _ = pd.factorize(feature)
            new_fnames = factor_names
        else:
            logger.info("Feature %d: %s => Vectorization Succeeded", fnum, fname)
    else:
        logger.info("Feature %d: %s => Factorization", fnum, fname)
        new_features, _ = pd.factorize(feature)
        new_fnames = factor_names

    return new_features, new_fnames
def main(args=None):
    r"""The main program for SportFlow.

    Parameters
    ----------
    args : list of str, optional
        The command line arguments to parse. When ``None`` (the default),
        ``argparse`` reads them from ``sys.argv``.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the game configuration.
    (4) Get the model configuration.
    (5) Generate game frames for each season.
    (6) Create statistics for each team.
    (7) Merge the team frames into the final model frame.
    (8) Run the AlphaPy pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date, or one of the
        resulting training, testing, or prediction frames has at most
        one row.
    KeyError
        A team is neither the home nor the away team in a row of its
        own team frame.
    IndexError
        A team/date key from a team frame is missing in the model frame.

    """

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="sport_flow.log", filemode='a', level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    logger = logging.getLogger(__name__)

    # Start the pipeline

    logger.info('*'*80)
    logger.info("SportFlow Start")
    logger.info('*'*80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="SportFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    # The mode flags must be added to the group itself; adding them to
    # the parser leaves them non-exclusive.
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode', action='store_true')
    mode_group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    # Honor an explicit argument list; None falls back to sys.argv.
    args = parser.parse_args(args)

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        # pd.datetime was removed from pandas; use the standard library.
        train_date = datetime.date(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    logger.info("Training Date: %s", train_date)
    logger.info("Prediction Date: %s", predict_date)

    # Read game configuration file

    sport_specs = get_sport_config()

    # Section: game

    league = sport_specs['league']
    points_max = sport_specs['points_max']
    points_min = sport_specs['points_min']
    random_scoring = sport_specs['random_scoring']
    seasons = sport_specs['seasons']
    window = sport_specs['rolling_window']

    # Read model configuration file

    specs = get_model_config()

    # Add command line arguments to model specifications
    # NOTE(review): the raw command line values are stored here, so the
    # dates are None when not given on the command line, even though
    # defaults were computed above -- confirm this is what Model expects.

    specs['predict_mode'] = args.predict_mode
    specs['predict_date'] = args.predict_date
    specs['train_date'] = args.train_date

    # Unpack model arguments

    directory = specs['directory']

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([directory, od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create the game scores space

    space = Space('game', 'scores', '1g')

    #
    # Derived Variables
    #

    series = space.schema
    team1_prefix = 'home'
    team2_prefix = 'away'
    home_team = PSEP.join([team1_prefix, 'team'])
    away_team = PSEP.join([team2_prefix, 'team'])

    #
    # Read in the game frame. This is the feature generation phase.
    #

    logger.info("Reading Game Data")

    data_dir = SSEP.join([directory, 'data'])
    file_base = USEP.join([league, space.subject, space.schema, space.fractal])
    df = read_frame(data_dir, file_base, specs['extension'], specs['separator'])
    logger.info("Total Game Records: %d", df.shape[0])

    #
    # Locate any rows with null values
    #

    null_rows = df.isnull().any(axis=1)
    # The positions are row offsets, so look the date up positionally.
    for i in np.flatnonzero(null_rows.values):
        logger.info("Null Record: %d on Date: %s", i, df['date'].iloc[i])

    #
    # Run the game pipeline on a seasonal loop
    #

    if not seasons:
        # run model on all seasons
        seasons = df['season'].unique().tolist()

    #
    # Collect one model frame per season; they are concatenated once
    # after the loop instead of growing a frame on every iteration.
    #

    season_frames = []

    #
    # Iterate through each season of the game frame
    #

    for season in seasons:

        # Generate a frame for each season

        gf = df[df['season'] == season]
        gf = gf.reset_index()

        # Generate derived variables for the game frame

        total_games = gf.shape[0]
        if random_scoring:
            gf['home.score'] = np.random.randint(points_min, points_max, total_games)
            gf['away.score'] = np.random.randint(points_min, points_max, total_games)
        gf['total_points'] = gf['home.score'] + gf['away.score']

        gf = add_features(gf, game_dict, gf.shape[0])
        # Write through gf.at[row, column]: chained gf[column].at[row]
        # assignment may update a temporary copy instead of the frame.
        # Each margin is read back after it is stored so the comparisons
        # see the value as held by the column.
        for index, row in gf.iterrows():
            gf.at[index, 'point_margin_game'] = get_point_margin(row, 'home.score', 'away.score')
            point_margin = gf.at[index, 'point_margin_game']
            gf.at[index, 'won_on_points'] = bool(point_margin > 0)
            gf.at[index, 'lost_on_points'] = bool(point_margin < 0)
            gf.at[index, 'cover_margin_game'] = point_margin + row['line']
            cover_margin = gf.at[index, 'cover_margin_game']
            gf.at[index, 'won_on_spread'] = bool(cover_margin > 0)
            gf.at[index, 'lost_on_spread'] = bool(cover_margin <= 0)
            gf.at[index, 'overunder_margin'] = gf.at[index, 'total_points'] - row['over_under']
            overunder_margin = gf.at[index, 'overunder_margin']
            gf.at[index, 'over'] = bool(overunder_margin > 0)
            gf.at[index, 'under'] = bool(overunder_margin < 0)

        # Generate each team frame
        # Group by the column name rather than a one-element list, so the
        # group keys are plain team names in every pandas version.

        team_frames = {}
        teams = gf.groupby(home_team)
        for team, _ in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Generating team frame: %s", team_frame)
            tf = get_team_frame(gf, team, home_team, away_team)
            tf = tf.reset_index()
            tf = generate_team_frame(team, tf, home_team, away_team, window)
            team_frames[team_frame] = tf

        # Create the model frame, initializing the home and away frames

        mdict = {k: v for (k, v) in sports_dict.items() if v != bool}
        team1_frame = pd.DataFrame()
        team1_frame = add_features(team1_frame, mdict, gf.shape[0], prefix=team1_prefix)
        team2_frame = pd.DataFrame()
        team2_frame = add_features(team2_frame, mdict, gf.shape[0], prefix=team2_prefix)
        frames = [gf, team1_frame, team2_frame]
        mf = pd.concat(frames, axis=1)

        # Loop through each team frame, inserting data into the model frame row
        #     get index+1 [if valid]
        #     determine if team is home or away to get prefix
        #     try: np.where((gf[home_team] == 'PHI') & (gf['date'] == '09/07/14'))[0][0]
        #     Assign team frame fields to respective model frame fields: set gf.at(pos, field)

        for team, _ in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Merging team frame %s into model frame", team_frame)
            tf = team_frames[team_frame]
            # Row `index` of the team frame is inserted into the model
            # row of the team's following game (`index + 1`).
            for index in range(0, tf.shape[0]-1):
                gindex = index + 1
                model_row = tf.iloc[gindex]
                key_date = model_row['date']
                at_home = False
                if team == model_row[home_team]:
                    at_home = True
                    key_team = model_row[home_team]
                elif team == model_row[away_team]:
                    key_team = model_row[away_team]
                else:
                    raise KeyError("Team %s not found in Team Frame" % team)
                team_column = home_team if at_home else away_team
                matches = np.where((mf[team_column] == key_team) &
                                   (mf['date'] == key_date))[0]
                try:
                    mpos = matches[0]
                except IndexError as err:
                    raise IndexError("Team/Date Key not found in Model Frame") from err
                # insert team data into model row
                mf = insert_model_data(mf, mpos, mdict, tf, index,
                                       team1_prefix if at_home else team2_prefix)

        # Compute delta data 'home' - 'away'
        mf = generate_delta_data(mf, mdict, team1_prefix, team2_prefix)

        # Keep this season's model frame for the final frame
        season_frames.append(mf)

    # Build the final frame from all seasons

    ff = pd.concat(season_frames) if season_frames else pd.DataFrame()

    # Write out dataframes

    input_dir = SSEP.join([directory, 'input'])
    if args.predict_mode:
        new_predict_frame = ff.loc[ff.date >= predict_date]
        if len(new_predict_frame) <= 1:
            raise ValueError("Prediction frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving prediction frame")
        write_frame(new_predict_frame, input_dir, datasets[Partition.predict],
                    specs['extension'], specs['separator'])
    else:
        # split data into training and test data
        new_train_frame = ff.loc[(ff.date >= train_date) & (ff.date < predict_date)]
        if len(new_train_frame) <= 1:
            raise ValueError("Training frame has length 1 or less")
        new_test_frame = ff.loc[ff.date >= predict_date]
        if len(new_test_frame) <= 1:
            raise ValueError("Testing frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving training frame")
        write_frame(new_train_frame, input_dir, datasets[Partition.train],
                    specs['extension'], specs['separator'])
        logger.info("Saving testing frame")
        write_frame(new_test_frame, input_dir, datasets[Partition.test],
                    specs['extension'], specs['separator'])

    # Create the model from specs

    logger.info("Running Model")
    model = Model(specs)

    # Run the pipeline
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*'*80)
    logger.info("SportFlow End")
    logger.info('*'*80)
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    For every algorithm in ``model.algolist`` that has an entry in
    ``model.importances``, the top-ranked features (at most 20) are
    logged and drawn as a horizontal bar chart. Algorithms without
    importances are logged and skipped.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For each algorithm that has importances, generate the plot.

    n_top = 20

    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        try:
            # get feature importances
            importances = np.array(model.importances[algo])
        except (AttributeError, KeyError, TypeError):
            # Only a missing importance entry is expected here; other
            # errors (and KeyboardInterrupt) are no longer swallowed.
            logger.info("No Feature Importances for %s", algo)
            continue
        # sort the importances by index, most important first
        indices = np.argsort(importances)[::-1]
        # get feature names
        feature_names = np.array(model.fnames_algo[algo])
        n_features = len(feature_names)
        # log the feature ranking
        logger.info("Feature Ranking:")
        n_min = min(n_top, n_features)
        for i in range(n_min):
            logger.info("%d. %s (%f)", i + 1,
                        feature_names[indices[i]],
                        importances[indices[i]])
        # plot the feature importances
        title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
        plt.figure()
        plt.title(title)
        # reverse the top slice so the most important bar is drawn on top
        plt.barh(range(n_min), importances[indices][:n_min][::-1])
        plt.yticks(range(n_min), feature_names[indices][:n_min][::-1])
        plt.ylim([-1, n_min])
        plt.xlabel('Relative Importance')
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
def plot_confusion_matrix(model, partition):
    r"""Draw the confusion matrix.

    One figure is produced per algorithm in ``model.algolist``. Cell
    colors reflect the raw counts, while the text in each cell is the
    count normalized by the number of true samples of that row's class.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Get X, Y for correct partition.

    _, y = get_partition_data(model, partition)

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)
        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]
        # The matrix covers every label seen in either the true or the
        # predicted values; use the same labels for the axis ticks so
        # they stay aligned with the matrix rows and columns.
        labels = np.unique(np.concatenate((np.ravel(y), np.ravel(y_pred))))
        # compute confusion matrix
        cm = confusion_matrix(y, y_pred, labels=labels)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)
        # initialize plot
        np.set_printoptions(precision=2)
        plt.style.use('classic')
        plt.figure()
        # plot the confusion matrix
        cmap = plt.cm.Blues
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])
        plt.title(title)
        plt.colorbar()
        # set up x and y axes
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=45)
        plt.yticks(tick_marks, labels)
        # normalize confusion matrix by row; a class with no true
        # samples has a zero row sum and is shown as all zeros
        row_sums = cm.sum(axis=1)[:, np.newaxis]
        cmn = np.divide(cm.astype('float'), row_sums,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_sums != 0)
        # place text in square of confusion matrix
        thresh = (cm.max() + cm.min()) / 2.0
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            cmr = round(cmn[i, j], 3)
            plt.text(j, i, cmr,
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        # labels
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        # fit the layout once the axis labels exist
        plt.tight_layout()
        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)