예제 #1
0
파일: features.py 프로젝트: raaraa/AlphaPy
def get_text_features(fnum, fname, df, nvalues, vectorize, ngrams_max):
    r"""Transform text features with count vectorization and TF-IDF,
    or alternatively factorization.

    Parameters
    ----------
    fnum : int
        Feature number, strictly for logging purposes
    fname : str
        Name of the text column in the dataframe ``df``.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``. The dataframe is
        not modified; missing values are filled on a copy of the column.
    nvalues : int
        The number of unique values.
    vectorize : bool
        If ``True``, then attempt count vectorization.
    ngrams_max : int
        The maximum number of n-grams for count vectorization.

    Returns
    -------
    new_features : numpy array
        The vectorized or factorized text features. When vectorization
        succeeds this is whatever ``TfidfVectorizer.fit_transform``
        returns; otherwise it is the integer codes from ``pd.factorize``.
    new_fnames : list
        The new feature name(s) for the text variable.

    Notes
    -----
    If vectorization raises, the failure is logged (with traceback) and
    the feature falls back to factorization.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.

    """
    raw_feature = df[fname]
    # lengths are measured on the raw column (NaN is rendered as 'nan')
    lengths = raw_feature.astype(str).str.len()
    min_length = int(lengths.min())
    max_length = int(lengths.max())
    if len(raw_feature) == nvalues:
        logger.info("Feature %d: %s is a text feature [%d:%d] with maximum number of values %d",
                    fnum, fname, min_length, max_length, nvalues)
    else:
        logger.info("Feature %d: %s is a text feature [%d:%d] with %d unique values",
                    fnum, fname, min_length, max_length, nvalues)
    # need a null text placeholder for vectorization; fill a copy rather
    # than mutating a column selected from the caller's dataframe
    feature = raw_feature.fillna(value=NULLTEXT)
    # vectorization creates many columns, otherwise just factorize
    if vectorize:
        logger.info("Feature %d: %s => Attempting Vectorization", fnum, fname)
        # ngram_range must be a tuple for recent scikit-learn releases
        vectorizer = TfidfVectorizer(ngram_range=(1, ngrams_max))
        try:
            new_features = vectorizer.fit_transform(feature)
            # get_feature_names() was removed from newer scikit-learn
            if hasattr(vectorizer, 'get_feature_names_out'):
                new_fnames = list(vectorizer.get_feature_names_out())
            else:
                new_fnames = vectorizer.get_feature_names()
        except Exception:
            # best-effort: keep going with factorization, but record why
            logger.info("Feature %d: %s => Vectorization Failed", fnum, fname,
                        exc_info=True)
        else:
            logger.info("Feature %d: %s => Vectorization Succeeded", fnum, fname)
            return new_features, new_fnames
    else:
        logger.info("Feature %d: %s => Factorization", fnum, fname)
    new_features, _ = pd.factorize(feature)
    new_fnames = [USEP.join([fname, 'factor'])]
    return new_features, new_fnames
예제 #2
0
def diplus(f, p=14):
    r"""Calculate the Plus Directional Indicator (+DI).

    The indicator is 100 times the exponentially weighted mean (span
    ``p``) of the ``dmplus`` column, divided by the ``atr_<p>`` column.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe with columns ``high`` and ``low``.
    p : int
        The period over which to calculate the +DI.

    Returns
    -------
    new_column : pandas.Series (float)
        The array containing the new feature.

    References
    ----------
    *A component of the average directional index (ADX) that is used to
    measure the presence of an uptrend. When the +DI is sloping upward,
    it is a signal that the uptrend is getting stronger* [IP_PDI]_.

    .. [IP_PDI] http://www.investopedia.com/terms/p/positivedirectionalindicator.asp

    """
    atr_column = USEP.join(['atr', str(p)])
    dm_column = 'dmplus'
    # evaluate the prerequisite variables in the same order as before;
    # presumably vexec adds each one as a column of ``f`` (confirm)
    for variable in ('truerange', atr_column, dm_column):
        vexec(f, variable)
    smoothed_dm = f[dm_column].ewm(span=p).mean()
    return 100 * smoothed_dm / f[atr_column]
예제 #3
0
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    For every algorithm in ``model.algolist`` that has a one-dimensional
    entry in ``model.importances``, log the top-ranked features and save
    a bar chart of them. Algorithms without usable importances are
    skipped with a log message.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For each algorithm that has importances, generate the plot.

    n_top = 10
    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        # only the lookup is guarded, so genuine plotting errors are not
        # misreported as "no importances"
        try:
            importances = np.asarray(model.importances[algo])
        except (AttributeError, KeyError, TypeError, ValueError):
            importances = None
        if importances is None or importances.ndim != 1 or importances.size == 0:
            logger.info("%s does not have feature importances", algo)
            continue
        # rank features from most to least important
        indices = np.argsort(importances)[::-1]
        # never show more bars than there are features
        n_shown = min(n_top, importances.size)
        # log the feature ranking
        logger.info("Feature Ranking:")
        for rank, index in enumerate(indices[:n_shown], start=1):
            logger.info("%d. Feature %d (%f)", rank, index, importances[index])
        # plot the feature importances
        title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        plt.bar(range(n_shown),
                importances[indices][:n_shown],
                color="b",
                align="center")
        plt.xticks(range(n_shown), indices[:n_shown])
        plt.xlim([-1, n_shown])
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
예제 #4
0
파일: features.py 프로젝트: raaraa/AlphaPy
def create_clusters(features, model):
    r"""Cluster the given features.

    A ``MiniBatchKMeans`` model is fitted for every cluster count from
    ``cluster_min`` to ``cluster_max`` (inclusive) in steps of
    ``cluster_inc``; the predicted label of each row becomes one column.

    Parameters
    ----------
    features : numpy array
        The features to cluster.
    model : alphapy.Model
        The model object with the clustering parameters.

    Returns
    -------
    cfeatures : numpy array
        The calculated clusters.
    cnames : list
        The cluster feature names.

    References
    ----------
    You can find more information on clustering here [CLUS]_.

    .. [CLUS] http://scikit-learn.org/stable/modules/clustering.html

    """

    logger.info("Creating Clustering Features")

    # Pull the clustering settings from the model specifications

    specs = model.specs
    cluster_inc = specs['cluster_inc']
    cluster_max = specs['cluster_max']
    cluster_min = specs['cluster_min']
    seed = specs['seed']

    # Report the settings

    logger.info("Cluster Minimum   : %d", cluster_min)
    logger.info("Cluster Maximum   : %d", cluster_max)
    logger.info("Cluster Increment : %d", cluster_inc)

    # One label column per cluster count; the zero-width float seed keeps
    # the stacked result floating point, as the placeholder column did

    label_columns = [np.empty((features.shape[0], 0))]
    cnames = []
    for k in range(cluster_min, cluster_max + 1, cluster_inc):
        logger.info("k = %d", k)
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=seed)
        assigned = kmeans.fit(features).predict(features)
        label_columns.append(assigned.reshape(-1, 1))
        cnames.append(USEP.join(['cluster', str(k)]))
    cfeatures = np.column_stack(label_columns)

    # Hand back the clustering features and their names

    logger.info("Clustering Feature Count : %d", cfeatures.shape[1])
    return cfeatures, cnames
예제 #5
0
파일: features.py 프로젝트: rydeen7/AlphaPy
def create_isomap_features(features, model):
    r"""Create Isomap features.

    Parameters
    ----------
    features : numpy array
        The input features.
    model : alphapy.Model
        The model object with the Isomap parameters.

    Returns
    -------
    ifeatures : numpy array
        The Isomap features.
    inames : list
        The Isomap feature names.

    Notes
    -----

    Isomaps are very memory-intensive. Your process will be killed
    if you run out of memory.

    References
    ----------
    You can find more information on Isomap here [ISO]_.

    .. [ISO] http://scikit-learn.org/stable/modules/manifold.html#isomap

    """

    logger.info("Creating Isomap Features")

    # Read the Isomap settings from the model specifications

    specs = model.specs
    n_components = specs['iso_components']
    n_neighbors = specs['iso_neighbors']
    n_jobs = specs['n_jobs']

    # Report the settings

    logger.info("Isomap Components : %d", n_components)
    logger.info("Isomap Neighbors  : %d", n_neighbors)

    # Fit the embedding and project the input features

    embedder = Isomap(n_neighbors=n_neighbors,
                      n_components=n_components,
                      n_jobs=n_jobs)
    ifeatures = embedder.fit_transform(features)

    # Name the components isomap_1 .. isomap_<n_components>

    inames = []
    for position in range(1, n_components + 1):
        inames.append(USEP.join(['isomap', str(position)]))

    # Hand back the Isomap features and their names

    logger.info("Isomap Feature Count : %d", ifeatures.shape[1])
    return ifeatures, inames
예제 #6
0
파일: features.py 프로젝트: raaraa/AlphaPy
def create_pca_features(features, model):
    r"""Apply Principal Component Analysis (PCA) to the features.

    A separate PCA is fitted for every component count from ``pca_min``
    to ``pca_max`` (inclusive) in steps of ``pca_inc``, and all of the
    resulting component columns are concatenated side by side.

    Parameters
    ----------
    features : numpy array
        The input features.
    model : alphapy.Model
        The model object with the PCA parameters.

    Returns
    -------
    pfeatures : numpy array
        The PCA features.
    pnames : list
        The PCA feature names, one per column of ``pfeatures``. A column
        is named ``pca_<n>_<j>`` (joined with ``USEP``), where ``n`` is
        the number of components of the PCA that produced it and ``j``
        is the 1-based component index within that PCA.

    References
    ----------
    You can find more information on Principal Component Analysis here [PCA]_.

    .. [PCA] http://scikit-learn.org/stable/modules/decomposition.html#pca

    """

    logger.info("Creating PCA Features")

    # Extract model parameters

    pca_inc = model.specs['pca_inc']
    pca_max = model.specs['pca_max']
    pca_min = model.specs['pca_min']
    pca_whiten = model.specs['pca_whiten']

    # Log model parameters

    logger.info("PCA Minimum   : %d", pca_min)
    logger.info("PCA Maximum   : %d", pca_max)
    logger.info("PCA Increment : %d", pca_inc)
    logger.info("PCA Whitening : %r", pca_whiten)

    # Generate PCA features; the zero-width float seed gives a valid
    # (n_rows, 0) result even when the component range is empty

    blocks = [np.empty((features.shape[0], 0))]
    pnames = []
    for i in range(pca_min, pca_max+1, pca_inc):
        logger.info("n_components = %d", i)
        X_pca = PCA(n_components=i, whiten=pca_whiten).fit_transform(features)
        blocks.append(X_pca)
        # each pass contributes several columns, so name every one of them
        pnames.extend(USEP.join(['pca', str(i), str(j + 1)])
                      for j in range(X_pca.shape[1]))
    pfeatures = np.column_stack(blocks)

    # Return new PCA features

    logger.info("PCA Feature Count : %d", pfeatures.shape[1])
    return pfeatures, pnames
예제 #7
0
파일: features.py 프로젝트: rydeen7/AlphaPy
def get_numerical_features(fnum, fname, df, nvalues, dt, sentinel, logt,
                           plevel):
    r"""Transform numerical features with imputation and possibly
    log-transformation.

    Parameters
    ----------
    fnum : int
        Feature number, strictly for logging purposes
    fname : str
        Name of the numerical column in the dataframe ``df``.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    nvalues : int
        The number of unique values.
    dt : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    sentinel : float
        The number to be imputed for NaN values.
    logt : bool
        If ``True``, then log-transform numerical values.
    plevel : float
        The p-value threshold to test if a feature is normally distributed.

    Returns
    -------
    new_values : numpy array
        The set of imputed and transformed features.
    new_fnames : list
        The new feature name(s) for the numerical variable. The name
        carries a ``log`` suffix only when the values were actually
        log-transformed.

    """
    feature = df[fname]
    if len(feature) == nvalues:
        logger.info(
            "Feature %d: %s is a numerical feature of type %s with maximum number of values %d",
            fnum, fname, dt, nvalues)
    else:
        logger.info(
            "Feature %d: %s is a numerical feature of type %s with %d unique values",
            fnum, fname, dt, nvalues)
    # imputer for float, integer, or boolean data types
    new_values = impute_values(feature, dt, sentinel)
    # log-transform any values that do not fit a normal distribution;
    # the log is only defined when every value is strictly positive
    new_fname = fname
    if logt and np.all(new_values > 0):
        _, pvalue = sps.normaltest(new_values)
        if pvalue <= plevel:
            logger.info(
                "Feature %d: %s is not normally distributed [p-value: %f]",
                fnum, fname, pvalue)
            new_values = np.log(new_values)
            # rename only the features that were really transformed
            new_fname = USEP.join([new_fname, 'log'])
    return new_values, [new_fname]
예제 #8
0
파일: features.py 프로젝트: rydeen7/AlphaPy
def create_tsne_features(features, model):
    r"""Create t-SNE features.

    Parameters
    ----------
    features : numpy array
        The input features.
    model : alphapy.Model
        The model object with the t-SNE parameters.

    Returns
    -------
    tfeatures : numpy array
        The t-SNE features.
    tnames : list
        The t-SNE feature names.

    References
    ----------
    You can find more information on the t-SNE technique here [TSNE]_.

    .. [TSNE] http://scikit-learn.org/stable/modules/manifold.html#t-distributed-stochastic-neighbor-embedding-t-sne

    """

    logger.info("Creating T-SNE Features")

    # Extract model parameters

    seed = model.specs['seed']
    tsne_components = model.specs['tsne_components']
    tsne_learn_rate = model.specs['tsne_learn_rate']
    tsne_perplexity = model.specs['tsne_perplexity']

    # Log model parameters; the learning rate and perplexity may be
    # fractional, so they are logged with %s rather than truncated by %d

    logger.info("T-SNE Components    : %d", tsne_components)
    logger.info("T-SNE Learning Rate : %s", tsne_learn_rate)
    logger.info("T-SNE Perplexity    : %s", tsne_perplexity)

    # Generate T-SNE features (a separate local keeps the ``model``
    # argument from being rebound to the estimator)

    tsne = TSNE(n_components=tsne_components,
                perplexity=tsne_perplexity,
                learning_rate=tsne_learn_rate,
                random_state=seed)
    tfeatures = tsne.fit_transform(features)
    tnames = [USEP.join(['tsne', str(i + 1)]) for i in range(tsne_components)]

    # Return new T-SNE features

    logger.info("T-SNE Feature Count : %d", tfeatures.shape[1])
    return tfeatures, tnames
예제 #9
0
def analysis_name(gname, target):
    r"""Build the analysis name from a group name and a target.

    Parameters
    ----------
    gname : str
        Group name.
    target : str
        Target of the analysis.

    Returns
    -------
    name : str
        The group name and target joined with the ``USEP`` separator.

    """
    parts = (gname, target)
    return USEP.join(parts)
예제 #10
0
파일: space.py 프로젝트: ywuywu/ml_monorepo
def space_name(subject, schema, fractal):
    r"""Build the namespace string from its three components.

    Parameters
    ----------
    subject : str
        An identifier for a group of related items.
    schema : str
        The data related to the ``subject``.
    fractal : str
        The time fractal of the data, e.g., "5m" or "1d".

    Returns
    -------
    name : str
        The subject, schema, and fractal joined with the ``USEP``
        separator, in that order.

    """
    components = (subject, schema, fractal)
    return USEP.join(components)
예제 #11
0
def frame_name(name, space):
    r"""Build the frame name for the given name and space.

    Parameters
    ----------
    name : str
        Group name.
    space : alphapy.Space
        Context or namespace for the given group name.

    Returns
    -------
    fname : str
        The name followed by the subject, schema, and fractal of
        ``space``, joined with the ``USEP`` separator.

    Examples
    --------

    >>> fname = frame_name('tech', Space('stock', 'prices', '1d'))
    # 'tech_stock_prices_1d'

    """
    components = [name]
    components.extend((space.subject, space.schema, space.fractal))
    fname = USEP.join(components)
    return fname
예제 #12
0
def plot_importance(model, partition):
    r"""Display scikit-learn feature importances.

    For every algorithm in ``model.algolist`` with an entry in
    ``model.importances``, log the top-ranked features by name and save
    a horizontal bar chart of them. Algorithms without importances are
    skipped with a log message.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    """

    logger.info("Generating Feature Importance Plots")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For each algorithm that has importances, generate the plot.

    n_top = 20

    for algo in model.algolist:
        logger.info("Feature Importances for Algorithm: %s", algo)
        # get feature importances; a failed lookup means "nothing to plot",
        # but interrupts and system exits are no longer swallowed
        try:
            importances = np.array(model.importances[algo])
        except (AttributeError, KeyError, TypeError, ValueError):
            logger.info("No Feature Importances for %s", algo)
            continue
        # sort the importances by index
        indices = np.argsort(importances)[::-1]
        # get feature names
        feature_names = np.array(model.fnames_algo[algo])
        n_features = len(feature_names)
        # log the feature ranking
        logger.info("Feature Ranking:")
        n_min = min(n_top, n_features)
        for i in range(n_min):
            logger.info("%d. %s (%f)", i + 1, feature_names[indices[i]],
                        importances[indices[i]])
        # plot the feature importances; the slices are reversed so the
        # most important feature ends up at the top of the chart
        title = BSEP.join([algo, "Feature Importances [", pstring, "]"])
        plt.figure()
        plt.title(title)
        plt.barh(range(n_min), importances[indices][:n_min][::-1])
        plt.yticks(range(n_min), feature_names[indices][:n_min][::-1])
        plt.ylim([-1, n_min])
        plt.xlabel('Relative Importance')
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'feature_importance', tag, plot_dir)
예제 #13
0
def _as_feature_array(result):
    r"""Return the array part of a feature helper's result.

    Helpers may return the array alone or an ``(array, names)`` tuple;
    objects exposing ``toarray`` (e.g. sparse matrices) are densified so
    they can be stacked with NumPy.
    """
    if isinstance(result, tuple):
        result = result[0]
    if hasattr(result, 'toarray'):
        # NOTE(review): densifying a large TF-IDF matrix can be memory-hungry
        result = result.toarray()
    return result


def create_features(model, X):
    r"""Create features for the train and test set.

    Parameters
    ----------
    model : alphapy.Model
        Model object with the feature specifications.
    X : pandas.DataFrame
        Combined train and test data. When the ``counts`` specification
        is set, count columns are added to this dataframe in place.

    Returns
    -------
    all_features : numpy array
        The new features.

    Raises
    ------
    TypeError
        Unrecognized data type.

    """

    # Extract model parameters

    clustering = model.specs['clustering']
    counts_flag = model.specs['counts']
    encoder = model.specs['encoder']
    factors = model.specs['factors']
    isomap = model.specs['isomap']
    logtransform = model.specs['logtransform']
    ngrams_max = model.specs['ngrams_max']
    numpy_flag = model.specs['numpy']
    pca = model.specs['pca']
    pvalue_level = model.specs['pvalue_level']
    rounding = model.specs['rounding']
    scaling = model.specs['scaler_option']
    scaler = model.specs['scaler_type']
    scipy_flag = model.specs['scipy']
    sentinel = model.specs['sentinel']
    tsne = model.specs['tsne']
    vectorize = model.specs['vectorize']

    # Log input parameters

    logger.info("Original Features : %s", X.columns)
    logger.info("Feature Count     : %d", X.shape[1])

    # Count zero and NaN values

    if counts_flag:
        logger.info("Creating Count Features")
        # Every count is computed before any column is added, so the
        # new count columns never feed into one another.
        logger.info("NA Counts")
        # isnull().sum() counts the missing values per row
        # (DataFrame.count would count the non-missing ones)
        na_counts = X.isnull().sum(axis=1)
        logger.info("Number Counts")
        digit_counts = [(USEP.join(['count', str(i)]),
                         (X == i).astype(int).sum(axis=1))
                        for i in range(10)]
        X['nan_count'] = na_counts
        for count_name, count_values in digit_counts:
            X[count_name] = count_values
        logger.info("New Feature Count : %d", X.shape[1])

    # Iterate through columns, dispatching and transforming each feature.

    logger.info("Creating Base Features")
    n_rows = X.shape[0]
    # the zero-width float seed keeps the stacked result floating point
    # and yields an (n_rows, 0) array when no feature qualifies
    feature_blocks = [np.empty((n_rows, 0))]

    for fnum, fc in enumerate(X, start=1):
        dtype = X[fc].dtypes
        nunique = len(X[fc].unique())
        # standard processing of numerical, categorical, and text features
        if factors and fc in factors:
            features = get_factors(model, X, fnum, fc, nunique, dtype, encoder,
                                   rounding, sentinel)
        elif dtype == 'float64' or dtype == 'int64' or dtype == 'bool':
            features = get_numerical_features(fnum, fc, X, nunique, dtype,
                                              sentinel, logtransform,
                                              pvalue_level)
        elif dtype == 'object':
            features = get_text_features(fnum, fc, X, nunique, vectorize,
                                         ngrams_max)
        else:
            raise TypeError("Base Feature Error with unrecognized type %s" %
                            dtype)
        features = _as_feature_array(features)
        if features.shape[0] == n_rows:
            feature_blocks.append(features)
        else:
            logger.info("Feature %s has the wrong number of rows: %d", fc,
                        features.shape[0])
    all_features = np.column_stack(feature_blocks)

    logger.info("New Feature Count : %d", all_features.shape[1])

    # Call standard scaler for all features

    if scaling:
        logger.info("Scaling Base Features")
        if scaler == Scalers.standard:
            all_features = StandardScaler().fit_transform(all_features)
        elif scaler == Scalers.minmax:
            all_features = MinMaxScaler().fit_transform(all_features)
        else:
            logger.info("Unrecognized scaler: %s", scaler)
    else:
        logger.info("Skipping Scaling")

    # Perform dimensionality reduction only on base feature set
    base_features = all_features

    # Calculate the total, mean, standard deviation, and variance

    if numpy_flag:
        np_features = _as_feature_array(
            create_numpy_features(base_features, sentinel))
        all_features = np.column_stack((all_features, np_features))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Generate scipy features

    if scipy_flag:
        sp_features = _as_feature_array(
            create_scipy_features(base_features, sentinel))
        all_features = np.column_stack((all_features, sp_features))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create clustering features

    if clustering:
        cfeatures = _as_feature_array(create_clusters(base_features, model))
        all_features = np.column_stack((all_features, cfeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create PCA features

    if pca:
        pfeatures = _as_feature_array(
            create_pca_features(base_features, model))
        all_features = np.column_stack((all_features, pfeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create Isomap features

    if isomap:
        ifeatures = _as_feature_array(
            create_isomap_features(base_features, model))
        all_features = np.column_stack((all_features, ifeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Create T-SNE features

    if tsne:
        tfeatures = _as_feature_array(
            create_tsne_features(base_features, model))
        all_features = np.column_stack((all_features, tfeatures))
        logger.info("New Feature Count : %d", all_features.shape[1])

    # Return all transformed training and test features
    return all_features
예제 #14
0
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Loads the train and test partitions, builds and filters features,
    fits every algorithm in ``model.algolist``, and then generates
    predictions, metrics, and plots.

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match,
        then this exception is raised.
    KeyError
        If the configured scorer is not one of the available scorers.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows     : %d", X_test.shape[0])
    logger.info("Number of Testing Columns  : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data; split_point marks where the test
    # rows begin so the combined frame can be split again later

    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match." %
            (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator; an unknown algorithm is skipped so the rest of
        # the loop never runs with an undefined or stale estimator
        try:
            estimator = estimators[algo]
        except KeyError:
            logger.info("Algorithm %s not found", algo)
            continue
        scoring = estimator.scoring
        est = estimator.estimator
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
예제 #15
0
def plot_validation_curve(model, partition, pname, prange):
    r"""Generate scikit-learn validation curves.

    For each algorithm in ``model.algolist``, the training and
    cross-validation scores are computed over ``prange`` and drawn as
    mean lines with one-standard-deviation bands, then the figure is
    saved.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.
    pname : str
        Name of the hyperparameter to test.
    prange : numpy array
        The values of the hyperparameter that will be evaluated.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py

    """

    logger.info("Generating Validation Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Read the cross-validation settings.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    scorer = model.specs['scorer']
    # looked up as before, although nothing below uses it
    verbosity = model.specs['verbosity']

    # Fetch the data for the requested partition.

    X, y = get_partition_data(model, partition)

    # Plot constants: x-axis margin and opacity of the deviation bands.

    margin = 0.5
    band_alpha = 0.2

    # Draw one validation curve per algorithm.

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # score the estimator across the hyperparameter range
        train_scores, test_scores = validation_curve(
            model.estimators[algo], X, y,
            param_name=pname, param_range=prange,
            cv=cv_folds, scoring=scorer, n_jobs=n_jobs)
        # per-parameter mean and standard deviation for both curves
        curves = [
            (np.mean(scores, axis=1), np.std(scores, axis=1), label, color)
            for scores, label, color in (
                (train_scores, "Training Score", "r"),
                (test_scores, "Cross-Validation Score", "g"),
            )
        ]
        # start a fresh figure
        plt.style.use('classic')
        plt.figure()
        plt.title(BSEP.join([algo, "Validation Curve [", pstring, "]"]))
        # x-axis
        lower = min(prange) - margin
        upper = max(prange) + margin
        plt.xlabel(pname)
        plt.xlim(lower, upper)
        # y-axis
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        # mean line plus deviation band, training first
        for center, spread, label, color in curves:
            plt.plot(prange, center, label=label, color=color)
            plt.fill_between(prange, center - spread, center + spread,
                             alpha=band_alpha, color=color)
        plt.legend(loc="best")
        # save the plot
        write_plot('matplotlib', plt, 'validation_curve',
                   USEP.join([pstring, algo]), plot_dir)
예제 #16
0
def plot_confusion_matrix(model, partition):
    r"""Draw a row-normalized confusion matrix for each algorithm.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----
    Each row of the confusion matrix is divided by its row total before
    plotting. A row whose total is zero (a label that occurs only in the
    predictions) is left as all zeros instead of producing NaN values.

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # For classification only

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Get the labels for the correct partition; the features are not used.
    _, y = get_partition_data(model, partition)

    # Plot Parameters
    np.set_printoptions(precision=2)
    cmap = plt.cm.Blues
    fmt = '.2f'

    # Generate a Confusion Matrix for each algorithm

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)

        # get predictions for this partition
        y_pred = model.preds[(algo, partition)]

        # compute confusion matrix
        cm = confusion_matrix(y, y_pred)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)

        # normalize each row by its total; an empty row would otherwise
        # yield 0/0 = NaN and poison the color threshold computed below
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

        # initialize plot
        _, ax = plt.subplots()

        # the title of the confusion matrix
        title = BSEP.join([algo, "Confusion Matrix [", pstring, "]"])

        # only use the labels that appear in the data
        classes = unique_labels(y, y_pred)

        # show all ticks, label them, and set the title
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               xticklabels=classes, yticklabels=classes,
               title=title,
               ylabel='True Label',
               xlabel='Predicted Label')

        # rotate the tick labels and set their alignment
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # annotate every cell; white text on dark cells, black on light
        thresh = (cm.max() + cm.min()) / 2.0
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")

        # draw the matrix itself and show the color bar
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)

        # save the chart
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'confusion', tag, plot_dir)
예제 #17
0
def plot_learning_curve(model, partition):
    r"""Generate learning curves for a given partition.

    One figure is produced per algorithm in ``model.algolist``, showing
    the mean training and cross-validation scores (with a band of one
    standard deviation) against the number of training examples.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    """

    logger.info("Generating Learning Curves")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Extract model parameters.

    cv_folds = model.specs['cv_folds']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    shuffle = model.specs['shuffle']
    verbosity = model.specs['verbosity']

    # Get original estimators

    estimators = get_estimators(model)

    # Get X, Y for correct partition.

    X, y = get_partition_data(model, partition)

    # Set cross-validation parameters to get mean train and test curves.
    # The seed only matters when shuffling; recent scikit-learn releases
    # reject a random_state when shuffle is False, so pass None then.

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle,
                         random_state=seed if shuffle else None)

    # Plot a learning curve for each algorithm.

    ylim = (0.4, 1.01)
    # relative training set sizes, one point per fold
    size_fractions = np.linspace(0.1, 1.0, cv_folds)

    for algo in model.algolist:
        logger.info("Learning Curve for Algorithm: %s", algo)
        # get estimator
        est = estimators[algo].estimator
        # plot learning curve
        title = BSEP.join([algo, "Learning Curve [", pstring, "]"])
        # set up plot
        plt.style.use('classic')
        plt.figure()
        plt.title(title)
        plt.ylim(*ylim)
        plt.xlabel("Training Examples")
        plt.ylabel("Score")
        # call learning curve function; it returns the absolute sizes used
        train_sizes, train_scores, test_scores = \
            learning_curve(est, X, y, train_sizes=size_fractions, cv=cv,
                           n_jobs=n_jobs, verbose=verbosity)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        # plot data
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training Score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-Validation Score")
        plt.legend(loc="lower right")
        # save the plot
        tag = USEP.join([pstring, algo])
        write_plot('matplotlib', plt, 'learning_curve', tag, plot_dir)
예제 #18
0
def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Three frames are written to the ``output`` directory: the
    predictions, the probabilities (classification only), and the
    input frame ranked by probability or prediction.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector, or ``None`` if the model is not a
        classification model.

    Notes
    -----
    If ``model.specs`` has no ``predict_date`` entry, or the entry is
    ``None``, all records are kept; otherwise only records on or after
    the prediction date are saved.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation
    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame
    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    # strip the directory and the extension to get the base file name
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date

    try:
        predict_date = model.specs['predict_date']
    except KeyError:
        predict_date = None
    # a missing or unset date means there is nothing to cull
    found_pdate = predict_date is not None

    if found_pdate:
        # NOTE(review): the index labels are reused as positions below
        # (iloc, np.take), which assumes read_frame returns a default
        # 0..n-1 index - confirm.
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        # copy so that the columns added later do not target a slice
        pf = pf.iloc[pd_indices].copy()
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities
    return preds, probas
예제 #19
0
def plot_confusion_matrix(model, partition):
    r"""Plot the confusion matrix of every algorithm for a partition.

    The image shows the raw counts, while the text placed in each cell
    is the row-normalized value rounded to three decimals.

    Parameters
    ----------
    model : alphapy.Model
        The model object with plotting specifications.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    References
    ----------

    http://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix

    """

    logger.info("Generating Confusion Matrices")
    plot_dir = get_plot_directory(model)
    pstring = datasets[partition]

    # Confusion matrices only make sense for classifiers.

    if model.specs['model_type'] != ModelType.classification:
        logger.info('Confusion Matrix is for classification only')
        return None

    # Only the true labels of the partition are needed here.

    _, y = get_partition_data(model, partition)

    for algo in model.algolist:
        logger.info("Confusion Matrix for Algorithm: %s", algo)
        # raw counts of true versus predicted labels
        predicted = model.preds[(algo, partition)]
        cm = confusion_matrix(y, predicted)
        logger.info('Confusion Matrix:')
        logger.info('%s', cm)
        # start a fresh figure
        np.set_printoptions(precision=2)
        plt.style.use('classic')
        plt.figure()
        # draw the counts as an image with title and color bar
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title(BSEP.join([algo, "Confusion Matrix [", pstring, "]"]))
        plt.colorbar()
        # one tick per distinct true label on both axes
        labels, _ = np.unique(y, return_counts=True)
        ticks = np.arange(len(labels))
        plt.xticks(ticks, labels, rotation=45)
        plt.yticks(ticks, labels)
        # per-row fractions used for the cell text
        fractions = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        # cells above the midpoint of the counts get white text
        midpoint = (cm.max() + cm.min()) / 2.0
        n_rows, n_cols = cm.shape[0], cm.shape[1]
        for row in range(n_rows):
            for col in range(n_cols):
                if cm[row, col] > midpoint:
                    text_color = "white"
                else:
                    text_color = "black"
                plt.text(col,
                         row,
                         round(fractions[row, col], 3),
                         horizontalalignment="center",
                         color=text_color)
        # axis labels
        plt.tight_layout()
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        # write the chart to the plot directory
        write_plot('matplotlib', plt, 'confusion',
                   USEP.join([pstring, algo]), plot_dir)
예제 #20
0
def save_model(model, tag, partition):
    r"""Persist the model and its results to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----

    The following steps are performed, in order:

    * Save the model predictor
    * Save the feature map
    * Save predictions, probabilities (classification only), and
      rankings through ``save_predictions``
    * Write a submission file (only if ``submission_file`` is set)

    """

    logger.info('=' * 80)

    # Pull the settings needed from the model specifications.

    specs = model.specs
    directory = specs['directory']
    extension = specs['extension']
    model_type = specs['model_type']
    submission_file = specs['submission_file']
    submit_probas = specs['submit_probas']

    # Date stamp (YYYYMMDD) used in the names of the saved files

    timestamp = datetime.now().strftime("%Y%m%d")

    # Persist the predictor and the feature map.
    save_predictor(model, timestamp)
    save_feature_map(model, timestamp)

    # Input and output directories under the project directory

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Write predictions and keep them for the submission file.
    preds, probas = save_predictions(model, tag, partition)

    # Nothing more to do unless a submission file was requested.

    if not submission_file:
        return

    # Fill the second column of the sample submission and write it out.

    sample_input = SSEP.join([input_dir, PSEP.join([submission_file, extension])])
    ss = pd.read_csv(sample_input)
    use_probas = submit_probas and model_type == ModelType.classification
    ss[ss.columns[1]] = probas if use_probas else preds
    submission_spec = PSEP.join([USEP.join(['submission', timestamp]), extension])
    submission_output = SSEP.join([output_dir, submission_spec])
    logger.info("Saving Submission to %s", submission_output)
    ss.to_csv(submission_output, index=False)
예제 #21
0
def main(args=None):
    r"""The main program for SportFlow.

    Parameters
    ----------
    args : list of str, optional
        The command line arguments to parse. When ``None`` (the
        default), argparse reads them from ``sys.argv``.

    Returns
    -------
    None : None

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the game configuration.
    (4) Get the model configuration.
    (5) Generate game frames for each season.
    (6) Create statistics for each team.
    (7) Merge the team frames into the final model frame.
    (8) Run the AlphaPy pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date, or one of the
        prediction, training, or testing frames has one row or less.
    KeyError
        A team is neither the home nor the away team in a row of its
        own team frame.
    IndexError
        A team/date key of a team frame is not found in the model frame.

    """

    # Logging

    log_format = "[%(asctime)s] %(levelname)s\t%(message)s"
    date_format = '%m/%d/%y %H:%M:%S'
    logging.basicConfig(format=log_format,
                        filename="sport_flow.log", filemode='a', level=logging.DEBUG,
                        datefmt=date_format)
    formatter = logging.Formatter(log_format, datefmt=date_format)
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    logger = logging.getLogger(__name__)

    # Start the pipeline

    logger.info('*'*80)
    logger.info("SportFlow Start")
    logger.info('*'*80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="SportFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    # the two mode flags must be added to the group to be exclusive
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode', action='store_true')
    mode_group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    # honor the arguments passed by the caller; None falls back to sys.argv
    args = parser.parse_args(args)

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        train_date = datetime.date(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    logger.info("Training Date: %s", train_date)
    logger.info("Prediction Date: %s", predict_date)

    # Read game configuration file

    sport_specs = get_sport_config()

    # Section: game

    league = sport_specs['league']
    points_max = sport_specs['points_max']
    points_min = sport_specs['points_min']
    random_scoring = sport_specs['random_scoring']
    seasons = sport_specs['seasons']
    window = sport_specs['rolling_window']

    # Read model configuration file

    specs = get_model_config()

    # Add command line arguments to model specifications. The resolved
    # dates are stored, not the raw arguments, so that downstream code
    # never sees None when a date option was omitted.

    specs['predict_mode'] = args.predict_mode
    specs['predict_date'] = predict_date
    specs['train_date'] = train_date

    # Unpack model arguments

    directory = specs['directory']

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([directory, od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create the game scores space
    space = Space('game', 'scores', '1g')

    #
    # Derived Variables
    #

    series = space.schema
    team1_prefix = 'home'
    team2_prefix = 'away'
    home_team = PSEP.join([team1_prefix, 'team'])
    away_team = PSEP.join([team2_prefix, 'team'])

    #
    # Read in the game frame. This is the feature generation phase.
    #

    logger.info("Reading Game Data")

    data_dir = SSEP.join([directory, 'data'])
    file_base = USEP.join([league, space.subject, space.schema, space.fractal])
    df = read_frame(data_dir, file_base, specs['extension'], specs['separator'])
    logger.info("Total Game Records: %d", df.shape[0])

    #
    # Locate any rows with null values
    #

    null_rows = df.isnull().any(axis=1)
    for i, has_null in enumerate(null_rows.tolist()):
        if has_null:
            logger.info("Null Record: %d on Date: %s", i, df.date[i])

    #
    # Run the game pipeline on a seasonal loop
    #

    if not seasons:
        # run model on all seasons
        seasons = df['season'].unique().tolist()

    # Features carried into the model frame: everything in sports_dict
    # except the boolean ones. This does not depend on the season.

    mdict = {k: v for k, v in sports_dict.items() if v != bool}

    #
    # Collect the model frame of each season for the final frame
    #

    season_frames = []

    #
    # Iterate through each season of the game frame
    #

    for season in seasons:

        # Generate a frame for each season

        gf = df[df['season'] == season]
        gf = gf.reset_index()

        # Generate derived variables for the game frame

        total_games = gf.shape[0]
        if random_scoring:
            gf['home.score'] = np.random.randint(points_min, points_max, total_games)
            gf['away.score'] = np.random.randint(points_min, points_max, total_games)
        gf['total_points'] = gf['home.score'] + gf['away.score']

        gf = add_features(gf, game_dict, total_games)
        # Assign with gf.at[row, column] rather than gf[column].at[row]:
        # the chained form writes through an intermediate Series and is
        # not guaranteed to update the frame. Values are read back from
        # the frame so they are compared exactly as they were stored.
        for index, row in gf.iterrows():
            gf.at[index, 'point_margin_game'] = get_point_margin(row, 'home.score', 'away.score')
            point_margin = gf.at[index, 'point_margin_game']
            gf.at[index, 'won_on_points'] = bool(point_margin > 0)
            gf.at[index, 'lost_on_points'] = bool(point_margin < 0)
            gf.at[index, 'cover_margin_game'] = point_margin + row['line']
            cover_margin = gf.at[index, 'cover_margin_game']
            gf.at[index, 'won_on_spread'] = bool(cover_margin > 0)
            gf.at[index, 'lost_on_spread'] = bool(cover_margin <= 0)
            gf.at[index, 'overunder_margin'] = gf.at[index, 'total_points'] - row['over_under']
            overunder_margin = gf.at[index, 'overunder_margin']
            gf.at[index, 'over'] = bool(overunder_margin > 0)
            gf.at[index, 'under'] = bool(overunder_margin < 0)

        # Generate each team frame. Group by the column name itself, not
        # a one-element list, so that the group keys are plain team names
        # rather than 1-tuples on recent pandas versions.

        team_frames = {}
        teams = gf.groupby(home_team)
        for team, _ in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Generating team frame: %s", team_frame)
            tf = get_team_frame(gf, team, home_team, away_team)
            tf = tf.reset_index()
            tf = generate_team_frame(team, tf, home_team, away_team, window)
            team_frames[team_frame] = tf

        # Create the model frame, initializing the home and away frames

        team1_frame = pd.DataFrame()
        team1_frame = add_features(team1_frame, mdict, gf.shape[0], prefix=team1_prefix)
        team2_frame = pd.DataFrame()
        team2_frame = add_features(team2_frame, mdict, gf.shape[0], prefix=team2_prefix)
        mf = pd.concat([gf, team1_frame, team2_frame], axis=1)

        # Loop through each team frame, inserting the data of row `index`
        # into the model frame row of the team's following game
        # (`index` + 1), on the home or away side as appropriate.

        for team, _ in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Merging team frame %s into model frame", team_frame)
            tf = team_frames[team_frame]
            for index in range(0, tf.shape[0]-1):
                gindex = index + 1
                model_row = tf.iloc[gindex]
                key_date = model_row['date']
                if team == model_row[home_team]:
                    at_home = True
                elif team == model_row[away_team]:
                    at_home = False
                else:
                    raise KeyError("Team %s not found in Team Frame" % team)
                side_column = home_team if at_home else away_team
                key_team = model_row[side_column]
                try:
                    # position of the first model frame row for this team/date
                    mpos = np.where((mf[side_column] == key_team) & (mf['date'] == key_date))[0][0]
                except (IndexError, KeyError) as err:
                    raise IndexError("Team/Date Key not found in Model Frame") from err
                # insert team data into model row
                mf = insert_model_data(mf, mpos, mdict, tf, index,
                                       team1_prefix if at_home else team2_prefix)

        # Compute delta data 'home' - 'away'
        mf = generate_delta_data(mf, mdict, team1_prefix, team2_prefix)

        # Keep this season for the final frame
        season_frames.append(mf)

    # Build the final frame with a single concatenation

    ff = pd.concat(season_frames) if season_frames else pd.DataFrame()

    # Write out dataframes

    input_dir = SSEP.join([directory, 'input'])
    if args.predict_mode:
        new_predict_frame = ff.loc[ff.date >= predict_date]
        if len(new_predict_frame) <= 1:
            raise ValueError("Prediction frame has length 1 or less")
        # rewrite with all the features to the prediction file
        logger.info("Saving prediction frame")
        write_frame(new_predict_frame, input_dir, datasets[Partition.predict],
                    specs['extension'], specs['separator'])
    else:
        # split data into training and test data
        new_train_frame = ff.loc[(ff.date >= train_date) & (ff.date < predict_date)]
        if len(new_train_frame) <= 1:
            raise ValueError("Training frame has length 1 or less")
        new_test_frame = ff.loc[ff.date >= predict_date]
        if len(new_test_frame) <= 1:
            raise ValueError("Testing frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving training frame")
        write_frame(new_train_frame, input_dir, datasets[Partition.train],
                    specs['extension'], specs['separator'])
        logger.info("Saving testing frame")
        write_frame(new_test_frame, input_dir, datasets[Partition.test],
                    specs['extension'], specs['separator'])

    # Create the model from specs

    logger.info("Running Model")
    model = Model(specs)

    # Run the pipeline
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*'*80)
    logger.info("SportFlow End")
    logger.info('*'*80)