Exemple #1
0
def plot_roc_curve(model, partition):
    r"""Display the ROC Curves of all algorithms on a single chart.

    One curve is drawn per entry in ``model.algolist`` from the
    probabilities stored in ``model.probas[(algo, partition)]``,
    together with a dashed diagonal "Luck" line. The finished chart
    is handed to ``write_plot``.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----
    Nothing is plotted unless ``model.specs['model_type']`` is
    ``ModelType.classification``.

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc

    """

    logger.info("Generating ROC Curves")
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('ROC Curves are for classification only')
        return None

    # Only the labels of the partition are needed for the curves.

    _, y = get_partition_data(model, partition)

    # Initialize plot parameters.

    plt.style.use('classic')
    plt.figure()
    lw = 2

    # Plot a ROC Curve for each algorithm.

    for algo in model.algolist:
        logger.info("ROC Curve for Algorithm: %s", algo)
        # compute the ROC curve and the area under it
        probas = model.probas[(algo, partition)]
        fpr, tpr, _ = roc_curve(y, probas)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, label='%s (area = %0.2f)' % (algo, roc_auc))

    # draw the luck line
    plt.plot([0, 1], [0, 1], linestyle='--', color='k', label='Luck')
    # define plot characteristics
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # The chart holds every algorithm (each one is named in the legend),
    # so the title must not depend on the loop variable: that would name
    # only the last algorithm and fail when the algorithm list is empty.
    title = BSEP.join(["ROC Curve [", pstring, "]"])
    plt.title(title)
    plt.legend(loc="lower right")
    # save chart
    plot_dir = get_plot_directory(model)
    write_plot('matplotlib', plt, 'roc_curve', pstring, plot_dir)
Exemple #2
0
def plot_validation_curve(model, partition, pname, prange):
    r"""Generate scikit-learn validation curves.

    For every algorithm in ``model.algolist`` a separate figure is
    created showing the mean training and cross-validation scores
    (with a one-standard-deviation band) over the values in ``prange``.
    Each figure is handed to ``write_plot``.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy array
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """

    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    scorer = model.specs['scorer']
    verbosity = model.specs['verbosity']

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Define plotting constants.

    spacing = 0.5   # margin added on both sides of the x-axis range
    alpha = 0.2     # opacity of the standard-deviation bands

    # Calculate a validation curve for each algorithm.

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # get estimator
        estimator = model.estimators[algo]
        # compute the scores; the configured verbosity is forwarded,
        # as it is for the learning curves
        train_scores, test_scores = validation_curve(estimator,
                                                     X,
                                                     y,
                                                     param_name=pname,
                                                     param_range=prange,
                                                     cv=cv_folds,
                                                     scoring=scorer,
                                                     n_jobs=n_jobs,
                                                     verbose=verbosity)
        # aggregate over the cross-validation folds (axis 1)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        # set up figure
        plt.style.use('classic')
        plt.figure()
        title = BSEP.join([algo, "Validation Curve [", pstring, "]"])
        plt.title(title)
        # x-axis
        x_min, x_max = min(prange) - spacing, max(prange) + spacing
        plt.xlabel(pname)
        plt.xlim(x_min, x_max)
        # y-axis
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        # plot scores
        plt.plot(prange, train_scores_mean, label="Training Score", color="r")
        plt.fill_between(prange,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=alpha,
                         color="r")
        plt.plot(prange,
                 test_scores_mean,
                 label="Cross-Validation Score",
                 color="g")
        plt.fill_between(prange,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=alpha,
                         color="g")
        plt.legend(loc="best")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'validation_curve', tag, plot_dir)
Exemple #3
0
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    For every algorithm in ``model.algolist`` a separate figure is
    created showing the mean training and cross-validation scores
    (with a one-standard-deviation band) against the number of
    training examples. Each figure is handed to ``write_plot``.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.
    # A seed only has an effect when shuffling, and recent scikit-learn
    # releases reject a random_state combined with shuffle=False, so the
    # seed is passed on only when the folds are shuffled.

    cv = StratifiedKFold(n_splits=cv_folds,
                         shuffle=shuffle,
                         random_state=seed if shuffle else None)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)
    # relative training-set sizes: one point per fold, from 10% to 100%
    size_fractions = np.linspace(0.1, 1.0, cv_folds)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # set up plot
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=size_fractions, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        # aggregate over the cross-validation folds (axis 1)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes,
                         train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std,
                         alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1,
                         color="g")
        plt.plot(train_sizes,
                 train_scores_mean,
                 'o-',
                 color="r",
                 label="Training Score")
        plt.plot(train_sizes,
                 test_scores_mean,
                 'o-',
                 color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
Exemple #4
0
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    For every algorithm in ``model.algolist`` whose importances can be
    read from ``model.importances``, the top-ranked features are logged
    and drawn as a horizontal bar chart, which is handed to
    ``write_plot``. Algorithms without importances are logged and
    skipped.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For each algorithm that has importances, generate the plot.

    n_top = 20  # maximum number of features to log and plot

    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        try:
            # get feature importances
            importances = np.array(model.importances[algo])
        except Exception:
            # Best effort: an algorithm without importances is skipped.
            # Unlike a bare ``except``, this does not swallow
            # KeyboardInterrupt or SystemExit.
            logger.info("No Feature Importances for %s", algo)
            continue
        # rank the features from most to least important
        indices = np.argsort(importances)[::-1]
        # get feature names
        feature_names = np.array(model.fnames_algo[algo])
        n_min = min(n_top, len(feature_names))
        top = indices[:n_min]
        # log the feature ranking
        logger.info("Feature Ranking:")
        for rank, idx in enumerate(top, start=1):
            logger.info("%d. %s (%f)", rank, feature_names[idx],
                        importances[idx])
        # plot the feature importances; the order is reversed so that the
        # most important feature ends up at the top of the bar chart
        title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
        plt.figure()
        plt.title(title)
        plt.barh(range(n_min), importances[top][::-1])
        plt.yticks(range(n_min), feature_names[top][::-1])
        plt.ylim([-1, n_min])
        plt.xlabel('Relative Importance')
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
Exemple #5
0
def plot_candlestick(df, symbol, datecol='date', directory=None):
    r"""Plot a candlestick chart from open/high/low/close data.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe containing the price columns and the date column.
    symbol : str
        Unique identifier of the data to plot.
    datecol : str, optional
        The name of the date column.
    directory : str, optional
        The full specification of the plot location.

    Returns
    -------
    None : None.

    Notes
    -----
    The dataframe ``df`` must contain these columns:

    * ``open``
    * ``high``
    * ``low``
    * ``close``

    The column ``datecol`` of ``df`` is converted to datetime in place.

    References
    ----------

    http://bokeh.pydata.org/en/latest/docs/gallery/candlestick.html

    """

    df[datecol] = pd.to_datetime(df[datecol])
    # Always address the dates through ``datecol`` so that a date column
    # with a name other than 'date' works as well.
    dates = df[datecol]

    # vertical center and height of each candle body
    mids = (df.open + df.close) / 2
    spans = abs(df.close - df.open)

    inc = df.close > df.open
    dec = df.open > df.close
    w = 12 * 60 * 60 * 1000  # half day in ms

    TOOLS = "pan, wheel_zoom, box_zoom, reset, save"

    # NOTE(review): ``plot_width`` was renamed to ``width`` in newer Bokeh
    # releases; confirm against the Bokeh version this project pins.
    p = figure(x_axis_type="datetime",
               tools=TOOLS,
               plot_width=1000,
               toolbar_location="left")

    p.title = BSEP.join([symbol.upper(), "Candlestick"])
    p.xaxis.major_label_orientation = math.pi / 4
    p.grid.grid_line_alpha = 0.3

    # high-low wicks
    p.segment(dates, df.high, dates, df.low, color="black")
    # bodies of rising candles (close above open)
    p.rect(dates[inc],
           mids[inc],
           w,
           spans[inc],
           fill_color="#D5E1DD",
           line_color="black")
    # bodies of falling candles (close below open)
    p.rect(dates[dec],
           mids[dec],
           w,
           spans[dec],
           fill_color="#F2583E",
           line_color="black")

    # Save the plot
    write_plot('bokeh', p, 'candlestick_chart', symbol, directory)
Exemple #6
0
def plot_confusion_matrix(model, partition):
    r"""Draw the confusion matrix.

    For every algorithm in ``model.algolist`` the confusion matrix of
    the partition labels against ``model.preds[(algo, partition)]`` is
    logged and drawn as a heat map annotated with the row-normalized
    values. Each figure is handed to ``write_plot``.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----
    Nothing is plotted unless ``model.specs['model_type']`` is
    ``ModelType.classification``.

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Only the labels of the partition are needed for the matrix.

    _, y = get_partition_data(model, partition)

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)
        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]
        # Use one label set for both the matrix and the axis ticks. It
        # covers labels that occur only in the predictions, so the tick
        # labels always line up with the rows and columns of the matrix.
        labels = np.unique(np.concatenate((np.ravel(y), np.ravel(y_pred))))
        # compute confusion matrix
        cm = confusion_matrix(y, y_pred, labels=labels)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)
        # initialize plot (note: the print options are set process-wide)
        np.set_printoptions(precision=2)
        plt.style.use('classic')
        plt.figure()
        # plot the confusion matrix
        cmap = plt.cm.Blues
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])
        plt.title(title)
        plt.colorbar()
        # set up x and y axes
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=45)
        plt.yticks(tick_marks, labels)
        # normalize each row by its number of true samples; a row without
        # any true sample stays at zero instead of dividing by zero
        row_sums = cm.sum(axis=1, keepdims=True)
        cmn = np.divide(cm.astype('float'),
                        row_sums,
                        out=np.zeros(cm.shape, dtype=float),
                        where=row_sums != 0)
        # place text in square of confusion matrix; white text is used on
        # the darker cells, i.e. counts above the middle of the count range
        thresh = (cm.max() + cm.min()) / 2.0
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            cmr = round(cmn[i, j], 3)
            plt.text(j,
                     i,
                     cmr,
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        # labels
        plt.tight_layout()
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)
def vexec(f, v, vfuncs=None):
    r"""Add a variable to the given dataframe.

    This is the core function for adding a variable to a dataframe.
    The default variable functions are already defined locally
    in ``alphapy.var``; however, you may want to define your
    own variable functions. If so, then the ``vfuncs`` parameter
    will contain the list of modules and functions to be imported
    and applied by the ``vexec`` function.

    To write your own variable function, your function must have
    a pandas *DataFrame* as an input parameter and must return
    a pandas *Series* that represents the new variable.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe to contain the new variable.
    v : str
        Variable to add to the dataframe.
    vfuncs : dict, optional
        Dictionary of external modules and functions.

    Returns
    -------
    f : pandas.DataFrame
        Dataframe with the new variable.

    Other Parameters
    ----------------
    Variable.variables : dict
        Global dictionary of variables

    Notes
    -----
    If the unlagged variable is not yet a column of ``f``, it is created
    either by evaluating the expression registered for its root in
    ``Variable.variables`` or by calling a function named after the
    root. The function is looked up first in the modules of ``vfuncs``
    and then in this module. If no function is found, a debug message
    is logged and no column is added.

    """
    vxlag, root, plist, lag = vparse(v)
    logger.debug("vexec : %s", v)
    logger.debug("vxlag : %s", vxlag)
    logger.debug("root  : %s", root)
    logger.debug("plist : %s", plist)
    logger.debug("lag   : %s", lag)
    if vxlag not in f.columns:
        if root in Variable.variables:
            logger.debug("Found variable %s: ", root)
            vroot = Variable.variables[root]
            expr = vroot.expr
            expr_new = vsub(vxlag, expr)
            estr = "%s" % expr_new
            estr = BSEP.join([vxlag, '=', estr])
            logger.debug("Expression: %s", estr)
            # pandas eval
            # NOTE(review): the expression is evaluated as code; this
            # presumes variable definitions come from a trusted source.
            f.eval(estr, inplace=True)
        else:
            logger.debug("Did not find variable: %s", root)
            # Must be a function call
            func_name = root
            # Convert each parameter to int, else float, else keep it as
            # is, and prepend the data frame as the first argument.
            newlist = [f]
            for p in plist:
                try:
                    newlist.append(int(p))
                except (TypeError, ValueError):
                    try:
                        newlist.append(float(p))
                    except (TypeError, ValueError):
                        newlist.append(p)
            # Find the module and function
            module = None
            if vfuncs:
                for m in vfuncs:
                    if func_name in vfuncs[m]:
                        module = m
                        break
            # If the module was found, import the external treatment function,
            # else search the local namespace.
            if module:
                ext_module = import_module(module)
                # look the function up on the module that was just imported
                func = getattr(ext_module, func_name)
                # Create the variable by calling the function
                f[v] = func(*newlist)
            else:
                module = sys.modules[__name__]
                if hasattr(module, func_name):
                    func = getattr(module, func_name)
                    # Create the variable
                    f[v] = func(*newlist)
                else:
                    logger.debug("Could not find function %s", func_name)
    # if necessary, add the lagged variable
    if lag > 0 and vxlag in f.columns:
        f[v] = f[vxlag].shift(lag)
    # output frame
    return f