def plot_roc_curve(model, partition):
    r"""Display ROC Curves with Cross-Validation.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc

    """

    logger.info("Generating ROC Curves")
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('ROC Curves are for classification only')
        return None

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Initialize plot parameters.

    plt.style.use('classic')
    plt.figure()
    lw = 2

    # Plot a ROC Curve for each algorithm.

    for algo in model.algolist:
        logger.info("ROC Curve for Algorithm: %s", algo)
        # compute ROC curve and ROC area for each class
        probas = model.probas[(algo, partition)]
        fpr, tpr, _ = roc_curve(y, probas)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw,
                 label='%s (area = %0.2f)' % (algo, roc_auc))

    # draw the luck line

    plt.plot([0, 1], [0, 1], linestyle='--', color='k', label='Luck')

    # define plot characteristics
    # (the chart shows all algorithms, so the title does not name one)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    title = BSEP.join(["ROC Curve [", pstring, "]"])
    plt.title(title)
    plt.legend(loc="lower right")

    # save chart

    plot_dir = get_plot_directory(model)
    write_plot('matplotlib', plt, 'roc_curve', pstring, plot_dir)
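
# Illustrative usage (a sketch, not part of this module): assuming a trained
# alphapy.Model `model` with per-algorithm probabilities stored in
# model.probas, and assuming Partition is importable from alphapy.globals:
#
#     from alphapy.globals import Partition
#     plot_roc_curve(model, Partition.test)
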
def plot_validation_curve(model, partition, pname, prange):
    r"""Generate scikit-learn validation curves.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy array
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """

    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    scorer = model.specs['scorer']
    verbosity = model.specs['verbosity']

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Define plotting constants.

    spacing = 0.5
    alpha = 0.2

    # Calculate a validation curve for each algorithm.

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # get estimator
        estimator = model.estimators[algo]
        # compute the validation curve
        train_scores, test_scores = \
            validation_curve(estimator, X, y, param_name=pname,
                             param_range=prange, cv=cv_folds,
                             scoring=scorer, n_jobs=n_jobs,
                             verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        # set up figure
        plt.style.use('classic')
        plt.figure()
        title = BSEP.join([algo, "Validation Curve [", pstring, "]"])
        plt.title(title)
        # x-axis
        x_min, x_max = min(prange) - spacing, max(prange) + spacing
        plt.xlabel(pname)
        plt.xlim(x_min, x_max)
        # y-axis
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        # plot scores
        plt.plot(prange, train_scores_mean, label="Training Score", color="r")
        plt.fill_between(prange, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=alpha, color="r")
        plt.plot(prange, test_scores_mean, label="Cross-Validation Score",
                 color="g")
        plt.fill_between(prange, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=alpha, color="g")
        plt.legend(loc="best")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'validation_curve', tag, plot_dir)
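
# Illustrative usage (a sketch): `model` is assumed to be a fitted
# alphapy.Model, and 'n_estimators' is a hypothetical hyperparameter that
# applies only to estimators which expose it (e.g., tree ensembles):
#
#     import numpy as np
#     from alphapy.globals import Partition
#     plot_validation_curve(model, Partition.train, 'n_estimators',
#                           np.arange(50, 401, 50))
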
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators.

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.
    # A random_state is valid only when shuffling is enabled.

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle,
                         random_state=seed if shuffle else None)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # set up plot
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function
        train_sizes = np.linspace(0.1, 1.0, cv_folds)
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=train_sizes, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1, color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training Score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
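
# Illustrative usage (a sketch): learning curves are usually drawn on the
# training partition, where the StratifiedKFold split above is meaningful:
#
#     from alphapy.globals import Partition
#     plot_learning_curve(model, Partition.train)
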
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For each algorithm that has importances, generate the plot.

    n_top = 20
    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        try:
            # get feature importances
            importances = np.array(model.importances[algo])
            imp_flag = True
        except KeyError:
            # no importances were stored for this algorithm
            imp_flag = False
        if imp_flag:
            # sort feature indices by descending importance
            indices = np.argsort(importances)[::-1]
            # get feature names
            feature_names = np.array(model.fnames_algo[algo])
            n_features = len(feature_names)
            # log the feature ranking
            logger.info("Feature Ranking:")
            n_min = min(n_top, n_features)
            for i in range(n_min):
                logger.info("%d. %s (%f)", i + 1,
                            feature_names[indices[i]],
                            importances[indices[i]])
            # plot the feature importances
            title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
            plt.figure()
            plt.title(title)
            plt.barh(range(n_min), importances[indices][:n_min][::-1])
            plt.yticks(range(n_min), feature_names[indices][:n_min][::-1])
            plt.ylim([-1, n_min])
            plt.xlabel('Relative Importance')
            # save the plot
            tag = USEP.join([pstring, algo])
            write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
        else:
            logger.info("No Feature Importances for %s", algo)
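
# Illustrative usage (a sketch): importances are plotted only for algorithms
# that stored them in model.importances; all others are skipped with a log
# message:
#
#     from alphapy.globals import Partition
#     plot_importance(model, Partition.train)
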
def plot_candlestick(df, symbol, datecol='date', directory=None):
    r"""Plot time series data as a candlestick chart.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe containing the open/high/low/close data.
    symbol : str
        Unique identifier of the data to plot.
    datecol : str, optional
        The name of the date column.
    directory : str, optional
        The full specification of the plot location.

    Returns
    -------
    None : None.

    Notes
    -----
    The dataframe ``df`` must contain these columns:

    * ``open``
    * ``high``
    * ``low``
    * ``close``

    References
    ----------
    http://bokeh.pydata.org/en/latest/docs/gallery/candlestick.html

    """

    df[datecol] = pd.to_datetime(df[datecol])
    mids = (df.open + df.close) / 2
    spans = abs(df.close - df.open)
    inc = df.close > df.open
    dec = df.open > df.close
    w = 12 * 60 * 60 * 1000    # half day in ms

    TOOLS = "pan, wheel_zoom, box_zoom, reset, save"

    p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=1000,
               toolbar_location="left")
    p.title.text = BSEP.join([symbol.upper(), "Candlestick"])
    p.xaxis.major_label_orientation = math.pi / 4
    p.grid.grid_line_alpha = 0.3

    # use the caller-specified date column throughout

    p.segment(df[datecol], df.high, df[datecol], df.low, color="black")
    p.rect(df[datecol][inc], mids[inc], w, spans[inc],
           fill_color="#D5E1DD", line_color="black")
    p.rect(df[datecol][dec], mids[dec], w, spans[dec],
           fill_color="#F2583E", line_color="black")

    # Save the plot

    write_plot('bokeh', p, 'candlestick_chart', symbol, directory)
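
# Illustrative usage (a sketch with synthetic data; the output directory is
# an assumption):
#
#     import pandas as pd
#     quotes = pd.DataFrame({
#         'date'  : pd.date_range('2021-01-04', periods=3),
#         'open'  : [100.0, 102.0, 101.5],
#         'high'  : [103.0, 104.0, 102.5],
#         'low'   : [ 99.0, 101.0, 100.0],
#         'close' : [102.0, 101.5, 102.0]})
#     plot_candlestick(quotes, 'aapl', directory='./plots')
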
def plot_confusion_matrix(model, partition):
    r"""Draw the confusion matrix.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------
    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)
        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]
        # compute confusion matrix
        cm = confusion_matrix(y, y_pred)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)
        # initialize plot
        np.set_printoptions(precision=2)
        plt.style.use('classic')
        plt.figure()
        # plot the confusion matrix
        cmap = plt.cm.Blues
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])
        plt.title(title)
        plt.colorbar()
        # set up x and y axes
        y_values = np.unique(y)
        tick_marks = np.arange(len(y_values))
        plt.xticks(tick_marks, y_values, rotation=45)
        plt.yticks(tick_marks, y_values)
        # normalize confusion matrix
        cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        # place text in each square of the confusion matrix
        thresh = (cm.max() + cm.min()) / 2.0
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            cmr = round(cmn[i, j], 3)
            plt.text(j, i, cmr,
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        # labels
        plt.tight_layout()
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)
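
# Illustrative usage (a sketch): like plot_roc_curve, this requires stored
# predictions in model.preds for each (algorithm, partition) pair:
#
#     from alphapy.globals import Partition
#     plot_confusion_matrix(model, Partition.test)
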
def vexec(f, v, vfuncs=None):
    r"""Add a variable to the given dataframe.

    This is the core function for adding a variable to a dataframe.
    The default variable functions are already defined locally in
    ``alphapy.var``; however, you may want to define your own variable
    functions. If so, then the ``vfuncs`` parameter will contain the
    list of modules and functions to be imported and applied by the
    ``vexec`` function.

    To write your own variable function, your function must have a
    pandas *DataFrame* as an input parameter and must return a pandas
    *Series* that represents the new variable.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe to contain the new variable.
    v : str
        Variable to add to the dataframe.
    vfuncs : dict, optional
        Dictionary of external modules and functions.

    Returns
    -------
    f : pandas.DataFrame
        Dataframe with the new variable.

    Other Parameters
    ----------------
    Variable.variables : dict
        Global dictionary of variables

    """

    vxlag, root, plist, lag = vparse(v)
    logger.debug("vexec : %s", v)
    logger.debug("vxlag : %s", vxlag)
    logger.debug("root  : %s", root)
    logger.debug("plist : %s", plist)
    logger.debug("lag   : %s", lag)

    if vxlag not in f.columns:
        if root in Variable.variables:
            logger.debug("Found variable: %s", root)
            vroot = Variable.variables[root]
            expr = vroot.expr
            expr_new = vsub(vxlag, expr)
            estr = BSEP.join([vxlag, '=', str(expr_new)])
            logger.debug("Expression: %s", estr)
            # pandas eval
            f.eval(estr, inplace=True)
        else:
            logger.debug("Did not find variable: %s", root)
            # Must be a function call
            func_name = root
            # Convert each parameter to its numeric type if possible,
            # then prepend the dataframe to the argument list.
            newlist = []
            for p in plist:
                try:
                    newlist.append(int(p))
                except ValueError:
                    try:
                        newlist.append(float(p))
                    except ValueError:
                        newlist.append(p)
            newlist.insert(0, f)
            # Find the module and function.
            module = None
            if vfuncs:
                for m in vfuncs:
                    funcs = vfuncs[m]
                    if func_name in funcs:
                        module = m
                        break
            # If the module was found, import the external treatment function,
            # else search the local namespace.
            if module:
                ext_module = import_module(module)
                func = getattr(ext_module, func_name)
                # Create the variable by calling the function.
                f[v] = func(*newlist)
            else:
                modname = globals()['__name__']
                module = sys.modules[modname]
                if func_name in dir(module):
                    func = getattr(module, func_name)
                    # Create the variable.
                    f[v] = func(*newlist)
                else:
                    logger.debug("Could not find function %s", func_name)

    # If necessary, add the lagged variable.

    if lag > 0 and vxlag in f.columns:
        f[v] = f[vxlag].shift(lag)

    # Output the frame.

    return f
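
# Illustrative sketch of a user-defined variable function, following the
# contract in the docstring above (DataFrame in, Series out). The module
# name 'myvars' is hypothetical, and 'midrange_20' is assumed to be parsed
# by vparse into root 'midrange' with parameter 20:
#
#     # myvars.py
#     def midrange(f, p):
#         """Midpoint of the p-period high/low range."""
#         return (f['high'].rolling(p).max() + f['low'].rolling(p).min()) / 2
#
#     vfuncs = {'myvars' : ['midrange']}
#     f = vexec(f, 'midrange_20', vfuncs)
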