Example #1
def generate_delta_data(frame, fdict, prefix1, prefix2):
    r"""Subtract two similar columns to get the delta value.

    Parameters
    ----------
    frame : pandas.DataFrame
        The input model frame.
    fdict : dict
        A dictionary of column names (key) and data types (value).
    prefix1 : str
        The prefix of the first team.
    prefix2 : str
        The prefix of the second team.

    Returns
    -------
    frame : pandas.DataFrame
        The completed dataframe with the delta data.

    """
    for key in fdict:
        newkey = PSEP.join(['delta', key])
        key1 = PSEP.join([prefix1, key])
        key2 = PSEP.join([prefix2, key])
        frame[newkey] = frame[key1] - frame[key2]
    return frame
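A minimal usage sketch (assuming PSEP is AlphaPy's period separator '.'; the frame and dictionary below are hypothetical):

import pandas as pd

PSEP = '.'  # AlphaPy period separator (assumption)

frame = pd.DataFrame({'home.points': [24, 17], 'away.points': [21, 20]})
fdict = {'points': int}
frame = generate_delta_data(frame, fdict, 'home', 'away')
# frame now contains a 'delta.points' column with values [3, -3]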
Example #2
def runs_test(f, c, wfuncs, window):
    r"""Perform a runs test on binary series.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the column in the dataframe ``f``.
    wfuncs : list
        The set of runs test functions to apply to the column:

        ``'all'``:
            Run all of the functions below.
        ``'rtotal'``:
            The running total over the ``window`` period.
        ``'runs'``:
            Total number of runs in ``window``.
        ``'streak'``:
            The length of the latest streak.
        ``'zscore'``:
            The Z-Score over the ``window`` period.
    window : int
        The rolling period.

    Returns
    -------
    new_features : pandas.DataFrame
        The dataframe containing the runs test features.

    References
    ----------
    For more information about runs tests for detecting non-randomness,
    refer to [RUNS]_.

    .. [RUNS] http://www.itl.nist.gov/div898/handbook/eda/section3/eda35d.htm

    """

    fc = f[c]
    all_funcs = {'runs'   : runs,
                 'streak' : streak,
                 'rtotal' : rtotal,
                 'zscore' : zscore}
    # use all functions
    if 'all' in wfuncs:
        wfuncs = list(all_funcs.keys())
    # apply each of the runs functions
    new_features = pd.DataFrame()
    for w in wfuncs:
        if w in all_funcs:
            new_feature = fc.rolling(window=window).apply(all_funcs[w])
            new_feature.fillna(0, inplace=True)
            new_column_name = PSEP.join([c, w])
            new_feature = new_feature.rename(new_column_name)
            frames = [new_features, new_feature]
            new_features = pd.concat(frames, axis=1)
        else:
            logger.info("Runs Function %s not found", w)
    return new_features
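A hedged usage sketch, assuming the helper functions runs, streak, rtotal, and zscore are defined elsewhere in AlphaPy and that the column holds a binary win/loss series:

import pandas as pd

f = pd.DataFrame({'won_on_spread': [1, 1, 0, 1, 0, 0, 1, 1]})
new_features = runs_test(f, 'won_on_spread', ['streak', 'zscore'], window=4)
# new_features has columns 'won_on_spread.streak' and 'won_on_spread.zscore',
# each computed over a rolling window of 4 observations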
Example #3
def insert_model_data(mf, mpos, mdict, tf, tpos, prefix):
    r"""Insert a row from the team frame into the model frame.

    Parameters
    ----------
    mf : pandas.DataFrame
        The model frame for a single season.
    mpos : int
        The position in the model frame where to insert the row.
    mdict : dict
        A dictionary of column names (key) and data types (value).
    tf : pandas.DataFrame
        The team frame for a season.
    tpos : int
        The position of the row in the team frame.
    prefix : str
        The prefix to join with the ``mdict`` key.

    Returns
    -------
    mf : pandas.DataFrame
        The model frame with the team row inserted.

    """
    team_row = tf.iloc[tpos]
    for key in mdict:
        newkey = key
        if prefix:
            newkey = PSEP.join([prefix, newkey])
        mf.at[mpos, newkey] = team_row[key]
    return mf
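An illustrative call (the frames, positions, and dictionary here are hypothetical):

mdict = {'points': int}
# copy the 'points' value from row 0 of the team frame tf into
# row 2 of the model frame mf, under the prefixed column 'home.points'
mf = insert_model_data(mf, 2, mdict, tf, 0, 'home')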
Example #4
def write_frame(df, directory, filename, extension, separator,
                index=False, index_label=None):
    r"""Write a dataframe into a delimiter-separated file.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas dataframe to save to a file.
    directory : str
        Full directory specification.
    filename : str
        Name of the file to write, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.
    index : bool, optional
        If ``True``, write the row names (index).
    index_label : str, optional
        A column label for the ``index``.

    Returns
    -------
    None : None

    """
    file_only = PSEP.join([filename, extension])
    file_all = SSEP.join([directory, file_only])
    logger.info("Writing data frame to %s", file_all)
    try:
        df.to_csv(file_all, sep=separator, index=index, index_label=index_label)
    except Exception:
        logger.error("Could not write data frame to %s", file_all)
Example #5
def np_store_data(data, dir_name, file_name, extension, separator):
    r"""Store NumPy data in a file.

    Parameters
    ----------
    data : numpy array
        The model component to store.
    dir_name : str
        Full directory specification.
    file_name : str
        Name of the file to read, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.

    Returns
    -------
    None : None

    """
    output_file = PSEP.join([file_name, extension])
    output = SSEP.join([dir_name, output_file])
    logger.info("Storing output to %s", output)
    np.savetxt(output, data, delimiter=separator)
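A usage sketch under the same separator assumptions:

import numpy as np

data = np.array([[1.0, 2.0], [3.0, 4.0]])
# writes output/weights.csv via np.savetxt
np_store_data(data, 'output', 'weights', 'csv', ',')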
Example #6
def read_frame(directory,
               filename,
               extension,
               separator,
               index_col=None,
               squeeze=False):
    r"""Read a delimiter-separated file into a data frame.

    Parameters
    ----------
    directory : str
        Full directory specification.
    filename : str
        Name of the file to read, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.
    index_col : str, optional
        Column to use as the row labels in the dataframe.
    squeeze : bool, optional
        If the data contains only one column, then return a pandas Series.

    Returns
    -------
    df : pandas.DataFrame
        The pandas dataframe loaded from the file location. If the file
        cannot be located or read, an empty dataframe is returned.

    """
    file_only = PSEP.join([filename, extension])
    file_all = SSEP.join([directory, file_only])
    logger.info("Loading data from %s", file_all)
    try:
        df = pd.read_csv(file_all,
                         sep=separator,
                         index_col=index_col,
                         squeeze=squeeze,
                         low_memory=False)
    except Exception:
        df = pd.DataFrame()
        logger.info("Could not find or access %s", file_all)
    return df
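A usage sketch (the directory and file names are hypothetical):

# reads data/games.csv, using the 'date' column as the row labels
df = read_frame('data', 'games', 'csv', ',', index_col='date')
if df.empty:
    print("data/games.csv was missing or unreadable")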
Example #7
def add_features(frame, fdict, flen, prefix=''):
    r"""Add new features to a dataframe with the specified dictionary.

    Parameters
    ----------
    frame : pandas.DataFrame
        The dataframe to extend with new features defined by ``fdict``.
    fdict : dict
        A dictionary of column names (key) and data types (value).
    flen : int
        Length of ``frame``.
    prefix : str, optional
        Prepend all columns with a prefix.

    Returns
    -------
    frame : pandas.DataFrame
        The dataframe with the added features.

    """
    # generate sequences
    seqint = [0] * flen
    seqfloat = [0.0] * flen
    seqbool = [False] * flen
    # initialize new fields in frame
    for key, value in fdict.items():
        newkey = key
        if prefix:
            newkey = PSEP.join([prefix, newkey])
        if value == int:
            frame[newkey] = pd.Series(seqint)
        elif value == float:
            frame[newkey] = pd.Series(seqfloat)
        elif value == bool:
            frame[newkey] = pd.Series(seqbool)
        else:
            raise ValueError("Type to generate feature series not found")
    return frame
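A usage sketch showing how each supported type is initialized (assuming PSEP is '.'):

import pandas as pd

frame = pd.DataFrame({'team': ['PHI', 'DAL']})
fdict = {'wins': int, 'win_pct': float, 'at_home': bool}
frame = add_features(frame, fdict, len(frame), prefix='home')
# adds zero-initialized columns 'home.wins', 'home.win_pct', and 'home.at_home'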
Example #8
def save_model(model, tag, partition):
    r"""Save the results in the model file.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        The data partition: training, testing, or prediction.

    Returns
    -------
    None : None

    Notes
    -----

    The following components are extracted from the model object
    and saved to disk:

    * Model predictor (via joblib/pickle)
    * Predictions
    * Probabilities (classification only)
    * Rankings
    * Submission File (optional)

    """

    logger.info('=' * 80)

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    submission_file = model.specs['submission_file']
    submit_probas = model.specs['submit_probas']

    # Get date stamp to record file creation

    d = datetime.now()
    f = "%Y%m%d"
    timestamp = d.strftime(f)

    # Save the model predictor
    save_predictor(model, timestamp)

    # Save the feature map
    save_feature_map(model, timestamp)

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Save predictions
    preds, probas = save_predictions(model, tag, partition)

    # Generate submission file

    if submission_file:
        sample_spec = PSEP.join([submission_file, extension])
        sample_input = SSEP.join([input_dir, sample_spec])
        ss = pd.read_csv(sample_input)
        if submit_probas and model_type == ModelType.classification:
            ss[ss.columns[1]] = probas
        else:
            ss[ss.columns[1]] = preds
        submission_base = USEP.join(['submission', timestamp])
        submission_spec = PSEP.join([submission_base, extension])
        submission_output = SSEP.join([output_dir, submission_spec])
        logger.info("Saving Submission to %s", submission_output)
        ss.to_csv(submission_output, index=False)
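For reference, here is how the separators compose the submission path in the code above (a sketch assuming USEP is '_', PSEP is '.', and SSEP is '/'):

USEP, PSEP, SSEP = '_', '.', '/'   # AlphaPy separators (assumption)
timestamp = '20240101'
submission_base = USEP.join(['submission', timestamp])   # 'submission_20240101'
submission_spec = PSEP.join([submission_base, 'csv'])    # 'submission_20240101.csv'
submission_output = SSEP.join(['project/output', submission_spec])
# -> 'project/output/submission_20240101.csv'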
Example #9
def main(args=None):
    r"""The main program for SportFlow.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the game configuration.
    (4) Get the model configuration.
    (5) Generate game frames for each season.
    (6) Create statistics for each team.
    (7) Merge the team frames into the final model frame.
    (8) Run the AlphaPy pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="sport_flow.log", filemode='a', level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    logger = logging.getLogger(__name__)

    # Start the pipeline

    logger.info('*'*80)
    logger.info("SportFlow Start")
    logger.info('*'*80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="SportFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--predict', dest='predict_mode', action='store_true')
    group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args()

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        # pd.datetime was removed from pandas; use the datetime module directly
        train_date = datetime.date(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    else:
        logger.info("Training Date: %s", train_date)
        logger.info("Prediction Date: %s", predict_date)

    # Read game configuration file

    sport_specs = get_sport_config()

    # Section: game

    league = sport_specs['league']
    points_max = sport_specs['points_max']
    points_min = sport_specs['points_min']
    random_scoring = sport_specs['random_scoring']
    seasons = sport_specs['seasons']
    window = sport_specs['rolling_window']   

    # Read model configuration file

    specs = get_model_config()

    # Add command line arguments to model specifications

    specs['predict_mode'] = args.predict_mode
    specs['predict_date'] = args.predict_date
    specs['train_date'] = args.train_date

    # Unpack model arguments

    directory = specs['directory']
    target = specs['target']

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([directory, od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create the game scores space
    space = Space('game', 'scores', '1g')

    #
    # Derived Variables
    #

    series = space.schema
    team1_prefix = 'home'
    team2_prefix = 'away'
    home_team = PSEP.join([team1_prefix, 'team'])
    away_team = PSEP.join([team2_prefix, 'team'])

    #
    # Read in the game frame. This is the feature generation phase.
    #

    logger.info("Reading Game Data")

    data_dir = SSEP.join([directory, 'data'])
    file_base = USEP.join([league, space.subject, space.schema, space.fractal])
    df = read_frame(data_dir, file_base, specs['extension'], specs['separator'])
    logger.info("Total Game Records: %d", df.shape[0])

    #
    # Locate any rows with null values
    #

    null_rows = df.isnull().any(axis=1)
    null_indices = [i for i, val in enumerate(null_rows.tolist()) if val]
    for i in null_indices:
        logger.info("Null Record: %d on Date: %s", i, df.date[i])

    #
    # Run the game pipeline on a seasonal loop
    #

    if not seasons:
        # run model on all seasons
        seasons = df['season'].unique().tolist()

    #
    # Initialize the final frame
    #

    ff = pd.DataFrame()

    #
    # Iterate through each season of the game frame
    #

    for season in seasons:

        # Generate a frame for each season

        gf = df[df['season'] == season]
        gf = gf.reset_index()

        # Generate derived variables for the game frame

        total_games = gf.shape[0]
        if random_scoring:
            gf['home.score'] = np.random.randint(points_min, points_max, total_games)
            gf['away.score'] = np.random.randint(points_min, points_max, total_games)
        gf['total_points'] = gf['home.score'] + gf['away.score']

        gf = add_features(gf, game_dict, gf.shape[0])
        for index, row in gf.iterrows():
            # assign with gf.at[index, col] rather than chained indexing,
            # which may silently fail to write back to the frame
            gf.at[index, 'point_margin_game'] = get_point_margin(row, 'home.score', 'away.score')
            gf.at[index, 'won_on_points'] = gf.at[index, 'point_margin_game'] > 0
            gf.at[index, 'lost_on_points'] = gf.at[index, 'point_margin_game'] < 0
            gf.at[index, 'cover_margin_game'] = gf.at[index, 'point_margin_game'] + row['line']
            gf.at[index, 'won_on_spread'] = gf.at[index, 'cover_margin_game'] > 0
            gf.at[index, 'lost_on_spread'] = gf.at[index, 'cover_margin_game'] <= 0
            gf.at[index, 'overunder_margin'] = gf.at[index, 'total_points'] - row['over_under']
            gf.at[index, 'over'] = gf.at[index, 'overunder_margin'] > 0
            gf.at[index, 'under'] = gf.at[index, 'overunder_margin'] < 0

        # Generate each team frame

        team_frames = {}
        teams = gf.groupby([home_team])
        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Generating team frame: %s", team_frame)
            tf = get_team_frame(gf, team, home_team, away_team)
            tf = tf.reset_index()
            tf = generate_team_frame(team, tf, home_team, away_team, window)
            team_frames[team_frame] = tf

        # Create the model frame, initializing the home and away frames

        mdict = {k:v for (k,v) in list(sports_dict.items()) if v != bool}
        team1_frame = pd.DataFrame()
        team1_frame = add_features(team1_frame, mdict, gf.shape[0], prefix=team1_prefix)
        team2_frame = pd.DataFrame()
        team2_frame = add_features(team2_frame, mdict, gf.shape[0], prefix=team2_prefix)
        frames = [gf, team1_frame, team2_frame]
        mf = pd.concat(frames, axis=1)

        # Loop through each team frame, inserting data into the model frame row
        #     get index+1 [if valid]
        #     determine if team is home or away to get prefix
        #     try: np.where((gf[home_team] == 'PHI') & (gf['date'] == '09/07/14'))[0][0]
        #     Assign team frame fields to respective model frame fields: set gf.at(pos, field)

        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Merging team frame %s into model frame", team_frame)
            tf = team_frames[team_frame]
            for index in range(0, tf.shape[0]-1):
                gindex = index + 1
                model_row = tf.iloc[gindex]
                key_date = model_row['date']
                at_home = False
                if team == model_row[home_team]:
                    at_home = True
                    key_team = model_row[home_team]
                elif team == model_row[away_team]:
                    key_team = model_row[away_team]
                else:
                    raise KeyError("Team %s not found in Team Frame" % team)            
                try:
                    if at_home:
                        mpos = np.where((mf[home_team] == key_team) & (mf['date'] == key_date))[0][0]
                    else:
                        mpos = np.where((mf[away_team] == key_team) & (mf['date'] == key_date))[0][0]
                except IndexError:
                    raise IndexError("Team/Date Key not found in Model Frame")
                # print team, gindex, mpos
                # insert team data into model row
                mf = insert_model_data(mf, mpos, mdict, tf, index, team1_prefix if at_home else team2_prefix)

        # Compute delta data 'home' - 'away'
        mf = generate_delta_data(mf, mdict, team1_prefix, team2_prefix)

        # Append this to final frame
        frames = [ff, mf]
        ff = pd.concat(frames)

    # Write out dataframes

    input_dir = SSEP.join([directory, 'input'])
    if args.predict_mode:
        new_predict_frame = ff.loc[ff.date >= predict_date]
        if len(new_predict_frame) <= 1:
            raise ValueError("Prediction frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving prediction frame")
        write_frame(new_predict_frame, input_dir, datasets[Partition.predict],
                    specs['extension'], specs['separator'])
    else:
        # split data into training and test data
        new_train_frame = ff.loc[(ff.date >= train_date) & (ff.date < predict_date)]
        if len(new_train_frame) <= 1:
            raise ValueError("Training frame has length 1 or less")
        new_test_frame = ff.loc[ff.date >= predict_date]
        if len(new_test_frame) <= 1:
            raise ValueError("Testing frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving training frame")
        write_frame(new_train_frame, input_dir, datasets[Partition.train],
                    specs['extension'], specs['separator'])
        logger.info("Saving testing frame")
        write_frame(new_test_frame, input_dir, datasets[Partition.test],
                    specs['extension'], specs['separator'])

    # Create the model from specs

    logger.info("Running Model")
    model = Model(specs)

    # Run the pipeline
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*'*80)
    logger.info("SportFlow End")
    logger.info('*'*80)
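Typical invocations, based on the arguments defined above (the script name here is hypothetical; adjust to however the module is installed):

# train: build model frames from games between the two dates
python sport_flow.py --train --tdate 2015-09-01 --pdate 2016-09-01

# predict: score games on or after the prediction date
python sport_flow.py --predict --pdate 2016-09-01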