Example 1
def split_to_letters(f, c):
    r"""Separate text into distinct characters.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the text column in the dataframe ``f``.

    Returns
    -------
    new_feature : pandas.Series or None
        The series containing the new feature, or ``None`` if the
        column is not text with multi-character values.

    Examples
    --------
    The value 'abc' becomes 'a b c'.

    """
    fc = f[c]
    new_feature = None
    dtype = fc.dtype
    if dtype == 'object':
        fc = fc.fillna(NULLTEXT)
        maxlen = fc.astype(str).str.len().max()
        if maxlen > 1:
            new_feature = fc.apply(lambda x: BSEP.join(list(x)))
    return new_feature
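
A minimal usage sketch, assuming the function above is in scope and that the alphapy module constants BSEP and NULLTEXT are a single space and a placeholder string (their actual values may differ):

import pandas as pd

BSEP = ' '         # assumed separator constant
NULLTEXT = 'NULL'  # assumed missing-text placeholder

df = pd.DataFrame({'word': ['abc', 'de', None]})
print(split_to_letters(df, 'word').tolist())
# expected: ['a b c', 'd e', 'N U L L']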
Example 2
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Get X, Y for correct partition

    X, y = get_partition_data(model, partition)

    # For each algorithm that has importances, generate the plot.

    n_top = 10
    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        try:
            importances = model.importances[algo]
            # sort feature indices by descending importance
            indices = np.argsort(importances)[::-1]
            # log the feature ranking
            logger.info("Feature Ranking:")
            n_min = min(n_top, len(importances))
            for f in range(n_min):
                logger.info("%d. Feature %d (%f)",
                            f + 1, indices[f], importances[indices[f]])
            # plot the feature importances
            title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
            plt.style.use('classic')
            plt.figure()
            plt.title(title)
            plt.bar(range(n_min),
                    importances[indices][:n_min],
                    color="b",
                    align="center")
            plt.xticks(range(n_min), indices[:n_min])
            plt.xlim([-1, n_min])
            # save the plot
            tag = USEP.join([pstring, algo])
            write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
        except KeyError:
            logger.info("%s does not have feature importances", algo)
Example 3
def plot_candlestick(df, symbol, datecol='date', directory=None):
    r"""Plot time series data.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe containing the open, high, low, and close columns.
    symbol : str
        Unique identifier of the data to plot.
    datecol : str, optional
        The name of the date column.
    directory : str, optional
        The full specification of the plot location.

    Returns
    -------
    None : None

    Notes
    -----
    The dataframe ``df`` must contain these columns:

    * ``open``
    * ``high``
    * ``low``
    * ``close``

    References
    ----------

    http://bokeh.pydata.org/en/latest/docs/gallery/candlestick.html

    """

    df[datecol] = pd.to_datetime(df[datecol])

    mids = (df.open + df.close) / 2
    spans = abs(df.close - df.open)

    inc = df.close > df.open
    dec = df.open > df.close
    w = 12 * 60 * 60 * 1000 # half day in ms

    TOOLS = "pan, wheel_zoom, box_zoom, reset, save"

    p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=1000, toolbar_location="left")

    p.title = BSEP.join([symbol.upper(), "Candlestick"])
    p.xaxis.major_label_orientation = math.pi / 4
    p.grid.grid_line_alpha = 0.3

    p.segment(df[datecol], df.high, df[datecol], df.low, color="black")
    p.rect(df[datecol][inc], mids[inc], w, spans[inc],
           fill_color="#D5E1DD", line_color="black")
    p.rect(df[datecol][dec], mids[dec], w, spans[dec],
           fill_color="#F2583E", line_color="black")

    # Save the plot
    write_plot('bokeh', p, 'candlestick_chart', symbol, directory)
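
The candlestick geometry above reduces each bar to a midpoint and a span, colored by direction. A minimal pandas sketch of that computation on toy data:

import pandas as pd

quotes = pd.DataFrame({'open':  [10.0, 11.0, 10.5],
                       'close': [11.0, 10.5, 10.8]})
mids  = (quotes.open + quotes.close) / 2     # rectangle centers
spans = (quotes.close - quotes.open).abs()   # rectangle heights
inc   = quotes.close > quotes.open           # up bars
print(pd.DataFrame({'mid': mids, 'span': spans, 'up': inc}))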
Example 4
def plot_validation_curve(model, partition, pname, prange):
    r"""Generate scikit-learn validation curves.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy.ndarray
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """

    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    scorer = model.specs['scorer']
    verbosity = model.specs['verbosity']

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Define plotting constants.

    spacing = 0.5
    alpha = 0.2

    # Calculate a validation curve for each algorithm.

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # get estimator
        estimator = model.estimators[algo]
        # set up plot
        train_scores, test_scores = validation_curve(
            estimator, X, y, param_name=pname, param_range=prange,
            cv=cv_folds, scoring=scorer, n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        # set up figure
        plt.style.use('classic')
        plt.figure()
        # plot learning curves
        title = BSEP.join([algo, "Validation Curve [", pstring, "]"])
        plt.title(title)
        # x-axis
        x_min, x_max = min(prange) - spacing, max(prange) + spacing
        plt.xlabel(pname)
        plt.xlim(x_min, x_max)
        # y-axis
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        # plot scores
        plt.plot(prange, train_scores_mean, label="Training Score", color="r")
        plt.fill_between(prange, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=alpha, color="r")
        plt.plot(prange, test_scores_mean, label="Cross-Validation Score",
                 color="g")
        plt.fill_between(prange, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=alpha, color="g")
        plt.legend(loc="best")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'validation_curve', tag, plot_dir)
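
A self-contained validation-curve sketch without alphapy objects, sweeping the SVC gamma hyperparameter on synthetic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import validation_curve
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, random_state=0)
prange = np.logspace(-4, 1, 6)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=prange, cv=3)
print(train_scores.mean(axis=1))   # mean training score per gamma
print(test_scores.mean(axis=1))    # mean cross-validation score per gamma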
Example 5
def plot_confusion_matrix(model, partition):
    r"""Draw the confusion matrix.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Get X, Y for correct partition.
    X, y = get_partition_data(model, partition)

    # Plot Parameters
    np.set_printoptions(precision=2)
    cmap = plt.cm.Blues
    fmt = '.2f'

    # Generate a Confusion Matrix for each algorithm

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)

        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]

        # compute confusion matrix
        cm = confusion_matrix(y, y_pred)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)

        # normalize confusion matrix
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        # initialize plot and render the normalized matrix
        _, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)

        # set the title of the confusion matrix
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])

        # only use the labels that appear in the data
        classes = unique_labels(y, y_pred)

        # show all ticks
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True Label',
               xlabel='Predicted Label')

        # rotate the tick labels and set their alignment
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # loop over data dimensions and create text annotations
        thresh = (cm.max() + cm.min()) / 2.0
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")

        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)
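
A minimal sketch of the matrix computation and row normalization performed above, independent of any model object:

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 1, 2]
y_pred = [0, 1, 1, 1, 2, 2]
cm = confusion_matrix(y_true, y_pred)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # row-normalized
print(cm)
print(np.round(cmn, 2))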
Example 6
def plot_roc_curve(model, partition):
    r"""Display ROC Curves with Cross-Validation.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc

    """

    logger.info("Generating ROC Curves")
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('ROC Curves are for classification only')
        return None

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Initialize plot parameters.

    plt.style.use('classic')
    plt.figure()
    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
    lw = 2

    # Plot a ROC Curve for each algorithm.

    for algo in model.algolist:
        logger.info("ROC Curve for Algorithm: %s", algo)
        # get estimator
        estimator = model.estimators[algo]
        # compute ROC curve and ROC area for each class
        probas = model.probas[(algo, partition)]
        fpr, tpr, _ = roc_curve(y, probas)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=next(colors),
                 label='%s (area = %0.2f)' % (algo, roc_auc))

    # draw the luck line
    plt.plot([0, 1], [0, 1], linestyle='--', color='k', label='Luck')
    # define plot characteristics
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    title = BSEP.join(["ROC Curve [", pstring, "]"])
    plt.title(title)
    plt.legend(loc="lower right")
    # save chart
    plot_dir = get_plot_directory(model)
    write_plot('matplotlib', plt, 'roc_curve', pstring, plot_dir)
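
roc_curve expects the positive-class probabilities, which model.probas supplies above. A minimal self-contained sketch:

import numpy as np
from sklearn.metrics import roc_curve, auc

y_true = np.array([0, 0, 1, 1])
probas = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, _ = roc_curve(y_true, probas)
print(auc(fpr, tpr))   # 0.75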
Example 7
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle, random_state=seed)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # plot learning curve
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        # set up plot
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function
        train_sizes = np.linspace(0.1, 1.0, cv_folds)
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=train_sizes, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training Score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
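
A self-contained learning-curve sketch on synthetic data, mirroring the cross-validation setup above with a plain scikit-learn estimator:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, learning_curve

X, y = make_classification(n_samples=300, random_state=0)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(max_iter=1000), X, y,
    train_sizes=np.linspace(0.1, 1.0, 3), cv=cv)
print(sizes)                        # actual training-set sizes used
print(test_scores.mean(axis=1))     # mean cross-validation score per size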
Example 8
def vexec(f, v, vfuncs=None):
    r"""Add a variable to the given dataframe.

    This is the core function for adding a variable to a dataframe.
    The default variable functions are already defined locally
    in ``alphapy.var``; however, you may want to define your
    own variable functions. If so, then the ``vfuncs`` parameter
    will contain the list of modules and functions to be imported
    and applied by the ``vexec`` function.

    To write your own variable function, your function must have
    a pandas *DataFrame* as an input parameter and must return
    a pandas *Series* that represents the new variable.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe to contain the new variable.
    v : str
        Variable to add to the dataframe.
    vfuncs : dict, optional
        Dictionary of external modules and functions.

    Returns
    -------
    f : pandas.DataFrame
        Dataframe with the new variable.

    Other Parameters
    ----------------
    Variable.variables : dict
        Global dictionary of variables

    """
    vxlag, root, plist, lag = vparse(v)
    logger.debug("vexec : %s", v)
    logger.debug("vxlag : %s", vxlag)
    logger.debug("root  : %s", root)
    logger.debug("plist : %s", plist)
    logger.debug("lag   : %s", lag)
    if vxlag not in f.columns:
        if root in Variable.variables:
            logger.debug("Found variable %s: ", root)
            vroot = Variable.variables[root]
            expr = vroot.expr
            expr_new = vsub(vxlag, expr)
            estr = "%s" % expr_new
            estr = BSEP.join([vxlag, '=', estr])
            logger.debug("Expression: %s", estr)
            # pandas eval
            f.eval(estr, inplace=True)
        else:
            logger.debug("Did not find variable: %s", root)
            # Must be a function call
            func_name = root
            # Convert the parameter list and prepend the data frame
            newlist = []
            for p in plist:
                try:
                    newlist.append(int(p))
                except ValueError:
                    try:
                        newlist.append(float(p))
                    except ValueError:
                        newlist.append(p)
            newlist.insert(0, f)
            # Find the module and function
            module = None
            if vfuncs:
                for m in vfuncs:
                    funcs = vfuncs[m]
                    if func_name in funcs:
                        module = m
                        break
            # If the module was found, import the external treatment function,
            # else search the local namespace.
            if module:
                ext_module = import_module(module)
                func = getattr(ext_module, func_name)
                # Create the variable by calling the function
                f[v] = func(*newlist)
            else:
                modname = globals()['__name__']
                module = sys.modules[modname]
                if func_name in dir(module):
                    func = getattr(module, func_name)
                    # Create the variable
                    f[v] = func(*newlist)
                else:
                    logger.debug("Could not find function %s", func_name)
    # if necessary, add the lagged variable
    if lag > 0 and vxlag in f.columns:
        f[v] = f[vxlag].shift(lag)
    # output frame
    return f
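
A hedged sketch of a user-defined variable function following the contract in the docstring: take a DataFrame (plus optional parameters), return a Series. The function name pdiff and the vfuncs layout are illustrative, not part of alphapy:

import pandas as pd

def pdiff(f, period=1):
    """Return the close-price difference over ``period`` rows."""
    return f['close'].diff(period)

# vfuncs maps module names to the functions vexec may import from them,
# e.g. vfuncs = {'my_module': ['pdiff']}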
Example 9
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For each algorithm that has importances, generate the plot.

    n_top = 20

    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        try:
            # get feature importances
            importances = np.array(model.importances[algo])
            imp_flag = True
        except KeyError:
            imp_flag = False
        if imp_flag:
            # sort feature indices by descending importance
            indices = np.argsort(importances)[::-1]
            # get feature names
            feature_names = np.array(model.fnames_algo[algo])
            n_features = len(feature_names)
            # log the feature ranking
            logger.info("Feature Ranking:")
            n_min = min(n_top, n_features)
            for i in range(n_min):
                logger.info("%d. %s (%f)" % (i + 1, feature_names[indices[i]],
                                             importances[indices[i]]))
            # plot the feature importances
            title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
            plt.figure()
            plt.title(title)
            plt.barh(range(n_min), importances[indices][:n_min][::-1])
            plt.yticks(range(n_min), feature_names[indices][:n_min][::-1])
            plt.ylim([-1, n_min])
            plt.xlabel('Relative Importance')
            # save the plot
            tag = USEP.join([pstring, algo])
            write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
        else:
            logger.info("No Feature Importances for %s" % algo)
Example 10
def plot_confusion_matrix(model, partition):
    r"""Draw the confusion matrix.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)
        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]
        # compute confusion matrix
        cm = confusion_matrix(y, y_pred)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)
        # initialize plot
        np.set_printoptions(precision=2)
        plt.style.use('classic')
        plt.figure()
        # plot the confusion matrix
        cmap = plt.cm.Blues
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])
        plt.title(title)
        plt.colorbar()
        # set up x and y axes
        y_values = np.unique(y)
        tick_marks = np.arange(len(y_values))
        plt.xticks(tick_marks, y_values, rotation=45)
        plt.yticks(tick_marks, y_values)
        # normalize confusion matrix
        cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        # place text in square of confusion matrix
        thresh = (cm.max() + cm.min()) / 2.0
        for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
            cmr = round(cmn[i, j], 3)
            plt.text(j,
                     i,
                     cmr,
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        # labels
        plt.tight_layout()
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)
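
A minimal sketch of the annotated-heatmap pattern used above, on a fixed two-class matrix:

import numpy as np
import matplotlib.pyplot as plt
from itertools import product

cm = np.array([[5, 1], [2, 7]])
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
thresh = (cm.max() + cm.min()) / 2.0
for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, str(cm[i, j]), horizontalalignment='center',
             color='white' if cm[i, j] > thresh else 'black')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()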