Ejemplo n.º 1
0
 def __getitem__(self, index):
     """
     Return the rows of `self.df` selected by `index`.

     `index` may be:

     - a slice of rounds (no step allowed); start and stop are converted
       with `self.rounds_to_ints` and the stop round is included. Every
       tournament in `nx.tournament_all(as_str=False)` is selected.
     - an int round number; every tournament is selected.
     - a list of (round, tournament) pairs.
     - a (round, tournament) tuple; tournament may be an int or a str.

     A row is kept when its 'round' is among the requested rounds AND its
     'tournament' is among the requested tournaments, so a list of pairs
     selects the cross product of its rounds and tournaments.

     Raises ValueError for a slice with a step, IndexError for any other
     unsupported index.
     """
     if isinstance(index, slice):
         if index.step is not None:
             raise ValueError("slice step size must be 1")
         first, last = self.rounds_to_ints(index.start, index.stop)
         rounds = list(range(first, last + 1))
         tournaments = nx.tournament_all(as_str=False)
     elif nx.isint(index):
         rounds = [index]
         tournaments = nx.tournament_all(as_str=False)
     elif isinstance(index, list):
         rounds, raw_tournaments = zip(*index)
         tournaments = [nx.tournament_int(raw) for raw in raw_tournaments]
     elif isinstance(index, tuple):
         if len(index) != 2:
             raise IndexError("tuple index must have length 2")
         round_number, tournament = index
         if not nx.isint(round_number):
             raise IndexError("first element of tuple index must be int")
         if not nx.isint(tournament) and not nx.isstring(tournament):
             raise IndexError(
                 "second element of tuple index must be int or str")
         rounds = [round_number]
         tournaments = [nx.tournament_int(tournament)]
     else:
         raise IndexError("indexing method not supported")

     # NOTE(review): presumably makes sure the requested rounds/tournaments
     # are present in self.df before filtering -- confirm against `gets`
     self.gets(rounds, tournaments)

     in_rounds = self.df['round'].isin(rounds)
     in_tournaments = self.df['tournament'].isin(tournaments)
     return self.df[in_rounds & in_tournaments]
Ejemplo n.º 2
0
def test_tournament_all():
    "tournament_all should return the seven tournaments as names or numbers"
    expected_names = [
        'bernie', 'elizabeth', 'jordan', 'ken', 'charles', 'frank', 'hillary'
    ]
    expected_numbers = [1, 2, 3, 4, 5, 6, 7]
    message = 'wrong tournaments'

    # names are the default and also what a first argument of True gives
    ok_(nx.tournament_all() == expected_names, message)
    ok_(nx.tournament_all(True) == expected_names, message)

    # a first argument of False switches to tournament numbers
    ok_(nx.tournament_all(False) == expected_numbers, message)
Ejemplo n.º 3
0
 def pairs_df(self):
     """
     Dataframe with names as index and tournaments as columns.

     Each cell holds the result of `(name, tournament) in self`. The
     columns come from `nx.tournament_all(active_only=False)`.
     """
     row_labels = self.names()
     col_labels = nx.tournament_all(active_only=False)
     table = pd.DataFrame(index=row_labels, columns=col_labels)
     # visit the cells row by row, filling in the membership result
     cells = ((row, col) for row in row_labels for col in col_labels)
     for row, col in cells:
         table.loc[row, col] = (row, col) in self
     return table
Ejemplo n.º 4
0
def metrics_per_era(data, prediction, tournament, join='data',
                    columns=None,
                    era_as_str=False, region_as_str=False, split_pairs=True):
    """
    Dataframe with columns era, pair, and specified metrics. And region list.

    Parameters
    ----------
    data : object with a `df` dataframe
        Must contain 'era', 'region' and one column per tournament name in
        `nx.tournament_all(as_str=True, active_only=False)`.
    prediction : object with a `df` dataframe
        Every column of `prediction.df` is treated as one pair to evaluate.
    tournament : {None, int, str}
        If None, each pair is scored against the targets of the tournament
        given by the second element of the pair. Otherwise every pair is
        scored against the targets of `tournament`.
    join : {'data', 'yhat', 'inner'}, optional
        How to merge data and predictions on their index: 'data' is a left
        join (keep the rows of `data`), 'yhat' is a right join (keep the
        rows of the predictions), 'inner' keeps the common rows.
    columns : list of str, optional
        Metrics passed on to `calc_metrics_arrays`. The default (None) means
        ['logloss', 'auc', 'acc', 'ystd'].
    era_as_str : bool, optional
        Map each era through `ERA_INT_TO_STR` in the output.
    region_as_str : bool, optional
        Map each region through `REGION_INT_TO_STR` in the returned list.
    split_pairs : bool, optional
        If True (default) the result is passed through `add_split_pairs`.

    Returns
    -------
    metrics : pd.DataFrame
        One row per (era, pair) with columns 'era', 'pair' and the metrics.
    regions : list
        Unique regions found in the merged dataframe.

    Raises
    ------
    ValueError
        If `join` is not one of the recognized methods.
    """

    df = prediction.df

    # merge prediction with data (remove features x)
    if join == 'data':
        how = 'left'
    elif join == 'yhat':
        how = 'right'
    elif join == 'inner':
        how = 'inner'
    else:
        raise ValueError("`join` method not recognized")

    # build the metric list per call instead of sharing one default list
    # object across all calls
    if columns is None:
        columns = ['logloss', 'auc', 'acc', 'ystd']
    else:
        columns = list(columns)

    yhats_df = df.copy()
    cols = ['era', 'region'] + nx.tournament_all(as_str=True,
                                                 active_only=False)
    data_df = data.df[cols]
    df = pd.merge(data_df, yhats_df, left_index=True, right_index=True,
                  how=how)

    regions = df['region'].unique().tolist()
    if region_as_str:
        regions = [REGION_INT_TO_STR[r] for r in regions]

    # calc metrics for each era
    pairs = yhats_df.columns.values
    metrics = []
    unique_eras = df.era.unique()
    for era in unique_eras:
        idx = df.era.isin([era])
        df_era = df[idx]
        if era_as_str:
            era = ERA_INT_TO_STR[era]
        for pair in pairs:
            if tournament is None:
                # evaluate with targets that model trained on
                tourni = nx.tournament_str(pair[1])
            else:
                # force evaluation targets to be from given tournament
                tourni = nx.tournament_str(tournament)
            y = df_era[tourni].values
            yhat = df_era[pair].values
            m = calc_metrics_arrays(y, yhat, columns)
            metrics.append([era, pair] + m)

    metrics = pd.DataFrame(metrics, columns=['era', 'pair'] + columns)

    if split_pairs:
        metrics = add_split_pairs(metrics)

    return metrics, regions
Ejemplo n.º 5
0
def cutoff(lb):
    """
    Independent calculation of confidence cutoff.

    Returns a dataframe with one row per round found in `lb['round']` and
    one column per tournament name from `nx.tournament_all(as_str=True)`.
    For rounds greater than 112 each cell is the first value returned by
    `calc_cutoff` for that round and tournament; earlier rounds are filled
    with NaN. A 'mean' column (mean across tournaments) and a 'mean' row
    (mean across rounds) are appended.
    """
    cols = nx.tournament_all(as_str=True)
    df = pd.DataFrame(columns=cols)
    rounds = np.sort(lb['round'].unique())
    for r in rounds:
        d = lb[lb['round'] == r]
        if r > 112:
            cut = []
            for t in nx.tournament_all(as_str=False):
                dt = d[d.tournament == t]
                # second return value of calc_cutoff is not needed here
                value, ignore = calc_cutoff(dt)
                cut.append(value)
        else:
            # one NaN per tournament column; the row length must match the
            # number of columns whatever the number of tournaments is
            cut = [np.nan] * len(cols)
        df.loc[r] = cut
    df['mean'] = df.mean(axis=1)
    df.loc['mean'] = df.mean()
    return df
Ejemplo n.º 6
0
def logloss(lb, user):
    """
    Live logloss for `user`.

    One row per round in `lb`, one column per tournament, holding the first
    'live' value found for `user` (NaN when the user has no row for that
    round and tournament). A 'mean' column and a 'mean' row are appended.
    """
    table = pd.DataFrame(columns=nx.tournament_all())
    lb = lb[['user', 'round', 'tournament', 'live']]
    for round_number in np.sort(lb['round'].unique()):
        in_round = lb[lb['round'] == round_number]
        row = []
        for number in nx.tournament_all(as_str=False):
            in_tourn = in_round[in_round.tournament == number]
            if user not in in_tourn.user.values:
                row.append(np.nan)
                continue
            row.append(in_tourn[in_tourn.user == user].live.iloc[0])
        table.loc[round_number] = row
    table['mean'] = table.mean(axis=1)
    table.loc['mean'] = table.mean()
    return table
Ejemplo n.º 7
0
def dominance(lb, user):
    """
    Fraction of users that `user` beats in terms of live logloss.

    One row per round in `lb`, one column per tournament. Only rows with a
    non-missing 'live' value are considered; a cell is the fraction of those
    rows whose 'live' is strictly greater than the user's, or NaN when the
    user has no such row. A 'mean' column and a 'mean' row are appended.
    """
    table = pd.DataFrame(columns=nx.tournament_all())
    lb = lb[['user', 'round', 'tournament', 'live']]
    for round_number in np.sort(lb['round'].unique()):
        in_round = lb[lb['round'] == round_number]
        row = []
        for number in nx.tournament_all(as_str=False):
            scored = in_round[in_round.tournament == number]
            scored = scored[scored.live.notna()]
            if user not in scored.user.values:
                row.append(np.nan)
                continue
            own_live = scored[scored.user == user].live.iloc[0]
            beaten = own_live < scored.live
            row.append(beaten.mean())
        table.loc[round_number] = row
    table['mean'] = table.mean(axis=1)
    table.loc['mean'] = table.mean()
    return table
Ejemplo n.º 8
0
def test_backtest_production():
    "Smoke test: backtest and production run for several tournament inputs"
    data = testing.micro_data()
    fifty = nx.fifty()
    with testing.HiddenPrints():
        # default tournament argument: expect one column per tournament
        prediction = nx.production(fifty, data)
        ok_(prediction.shape[1] == 5, 'wrong number of tournaments')
        ok_(prediction.tournaments() == nx.tournament_all(),
            'wrong tournaments')
        prediction = nx.backtest(fifty, data, kfold=2)
        ok_(prediction.shape[1] == 5, 'wrong number of tournaments')
        ok_(prediction.tournaments() == nx.tournament_all(),
            'wrong tournaments')

        # every verbosity level with int, str and None tournaments
        for level in range(4):
            nx.backtest(fifty, data, tournament=3, kfold=2, verbosity=level)
            for tourn in ('ken', 4, None):
                nx.production(fifty, data, tournament=tourn, verbosity=level)
            if level == 3:
                for tourn in (5, 'charles'):
                    nx.production(fifty,
                                  data,
                                  tournament=tourn,
                                  verbosity=level)
Ejemplo n.º 9
0
def test_run():
    "Smoke test: nx.run works for every model/splitter combination"
    data = testing.play_data()
    models = [nx.logistic(), nx.fifty()]
    splitters = [
        nx.TournamentSplitter(data),
        nx.ValidationSplitter(data),
        nx.CheatSplitter(data),
        nx.CVSplitter(data, kfold=2),
        nx.SplitSplitter(data, fit_fraction=0.5)
    ]
    for model in models:
        for splitter in splitters:
            # a single tournament given by number, then by name
            for tourn in (2, 'bernie'):
                nx.run(model, splitter, tournament=tourn, verbosity=0)
            # tournament=None and the default should both give everything
            for extra in ({'tournament': None}, {}):
                p = nx.run(model, splitter, verbosity=0, **extra)
                ok_(p.shape[1] == 5, 'wrong number of tournaments')
                ok_(p.tournaments() == nx.tournament_all(),
                    'wrong tournaments')

    # invalid model and invalid tournament must be rejected
    assert_raises(ValueError, nx.run, None, nx.TournamentSplitter(data))
    assert_raises(ValueError, nx.run, nx.fifty(), nx.TournamentSplitter(data),
                  {})
Ejemplo n.º 10
0
def whatif(lb, users, s, c):
    """
    Profit if `users` had staked `s` and `c` in every tournament.

    Earnings are left in NMR instead of splitting the NMR earnings into
    NMR and USD.

    Parameters
    ----------
    lb : pd.DataFrame
        Leaderboard with at least the columns 'round', 'tournament', 'user'
        and 'live', plus whatever `calc_cutoff` reads. It is not modified.
    users : str or list of str
        User name(s) to evaluate.
    s : number
        Amount staked per user per tournament.
    c : number
        Confidence; a tournament is only counted when `c` is at least the
        cutoff returned by `calc_cutoff`.

    Returns
    -------
    df : pd.DataFrame
        One row per round with columns 'nmr_staked', 'nmr_burn', 'nmr_earn'
        and 'nmr_net', plus a final 'total' row with the column sums.

    Raises
    ------
    ValueError
        If `users` is neither a str nor a list, or if `lb` contains a round
        that is not greater than 112.
    """
    if isinstance(users, list):
        pass
    elif nx.isstring(users):
        users = [users]
    else:
        raise ValueError("`users` must be str or list (of str)")
    cols = ['nmr_staked', 'nmr_burn', 'nmr_earn', 'nmr_net']
    df = pd.DataFrame(columns=cols)

    # Work on a copy: adding the helper column to the caller's frame would
    # leak a 'pass' column and make a second call on the same frame fail.
    lb = lb.copy()
    lb['pass'] = lb['live'] < LOGLOSS_BENCHMARK

    rounds = np.sort(lb['round'].unique())
    for r in rounds:
        if not r > 112:
            raise ValueError("`round1` must start at at least 113")
        d = lb[lb['round'] == r]
        staked = 0
        burn = 0
        earn = 0
        for t in nx.tournament_all(as_str=False):
            dt = d[d.tournament == t]
            if dt.shape[0] == 0:
                continue
            cutoff, ignore = calc_cutoff(dt)
            if c < cutoff:
                # stake would not have made the cutoff in this tournament
                continue
            dti = dt[dt.user.isin(users)]
            passed = dti['pass']
            nwin = passed.sum()
            # a loss needs a live score; rows without one neither win nor lose
            nlos = (~passed & (dti['live'].notna())).sum()
            p = (1.0 - cutoff) / cutoff
            burn += nlos * s
            earn += nwin * s * p
            staked += passed.size * s
        net = earn - burn
        df.loc[r] = [staked, burn, earn, net]
    df.loc['total'] = df.sum()
    return df
Ejemplo n.º 11
0
 def __getitem__(self2, index):
     """
     Return target values from `self2.df` as a numpy array.

     `index` may be a tournament name (must be one of the active names in
     `nx.tournament_all(as_str=True, active_only=True)`), a tournament
     number between 1 and `nx.tournament_count(active_only=False)`, or the
     full slice `[:]`, which returns the last `n` columns of `self2.df`.

     Raises IndexError for anything else.
     """
     count = nx.tournament_count(active_only=False)

     if isinstance(index, str):
         if index not in nx.tournament_all(as_str=True, active_only=True):
             raise IndexError('string index not recognized')
         return self2.df[index].values

     if nx.isint(index):
         if index < 1 or index > count:
             template = 'tournament number must be between 1 and {}'
             raise IndexError(template.format(count))
         return self2.df[nx.tournament_str(index)].values

     if isinstance(index, slice):
         bounds = (index.start, index.stop, index.step)
         if any(bound is not None for bound in bounds):
             raise IndexError('Start, stop, and step of slice must be None')
         # original author's note: slicing the trailing columns this way
         # means a view is returned instead of a copy
         return self2.df.iloc[:, -count:].values

     raise IndexError('indexing type not recognized')
Ejemplo n.º 12
0
def load_zip(file_path, verbose=False):
    """
    Load numerai dataset from zip archive; return Data.

    Parameters
    ----------
    file_path : str
        Path of the zip archive; it must contain the members `TRAIN_FILE`
        and `TOURNAMENT_FILE`.
    verbose : bool, optional
        If True the resulting Data object is printed. Default is False.

    Returns
    -------
    data : Data
        Training and tournament rows stacked into one dataframe with the
        columns renamed ('data_type' -> 'region', 'featureN' -> 'xN',
        'target_<name>' -> '<name>').

    Raises
    ------
    IOError
        If the number of columns is not 2 + N_FEATURES + tournament count.
    TypeError
        If the columns do not all share one dtype.
    """

    # load zip; the context managers make sure the archive and its member
    # handles are closed even if parsing fails
    with zipfile.ZipFile(file_path) as zf:
        with zf.open(TRAIN_FILE) as fid:
            train = pd.read_csv(fid, header=0, index_col=0)
        with zf.open(TOURNAMENT_FILE) as fid:
            tourn = pd.read_csv(fid, header=0, index_col=0)

    # turn into single dataframe and rename columns
    df = pd.concat([train, tourn], axis=0)
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    for _, name in nx.tournament_iter():
        rename_map['target_' + name] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float64
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    # assumes the targets are the trailing columns -- TODO confirm
    n_targets = len(nx.tournament_all())
    df.iloc[:, -n_targets:] = df.iloc[:, -n_targets:].astype('float64')

    # no way we did something wrong, right?
    n_expected = 2 + N_FEATURES + nx.tournament_count()
    if df.shape[1] != n_expected:
        raise IOError("expecting {} columns; found {}".format(n_expected,
                                                              df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)

    return data
Ejemplo n.º 13
0
def pass_rate(lb):
    """
    Fraction of users who beat benchmark in each round.

    Only rows with a non-missing 'live' value are counted. Columns are the
    pass rate of all users, of stakers (s > 0), of non-stakers (s == 0), and
    of users with confidence above / below the cutoff from `calc_cutoff`.
    The cutoff columns are NaN for rounds up to 112 and whenever no user
    falls in the group. A final 'mean' row holds the column means.
    """
    names = ['all', 'stakers', 'nonstakers', 'above_cutoff', 'below_cutoff']
    table = pd.DataFrame(columns=names)
    for r in np.sort(lb['round'].unique()):
        scored = lb[(lb['round'] == r) & (lb.live.notna())]
        scored.insert(0, 'pass', scored['live'] < LOGLOSS_BENCHMARK)
        rate_all = scored['pass'].mean()
        rate_stakers = scored[scored['s'] > 0]['pass'].mean()
        rate_nonstakers = scored[scored['s'] == 0]['pass'].mean()

        rate_above = np.nan
        rate_below = np.nan
        if r > 112:
            count_above = count_below = 0
            wins_above = wins_below = 0
            for t in nx.tournament_all(as_str=False):
                dt = scored[scored.tournament == t]
                cutoff, ignore = calc_cutoff(dt)
                # users exactly at the cutoff fall in neither group
                is_above = dt.c > cutoff
                is_below = dt.c < cutoff
                count_above += dt[is_above].shape[0]
                count_below += dt[is_below].shape[0]
                wins_above += dt[is_above & (dt['pass'])].shape[0]
                wins_below += dt[is_below & (dt['pass'])].shape[0]
            if count_above != 0:
                rate_above = 1.0 * wins_above / count_above
            if count_below != 0:
                rate_below = 1.0 * wins_below / count_below

        table.loc[r] = [rate_all, rate_stakers, rate_nonstakers,
                        rate_above, rate_below]
    table.loc['mean'] = table.mean()
    return table
Ejemplo n.º 14
0
def payout(lb):
    """
    NMR and USD payouts per round.

    Parameters
    ----------
    lb : pd.DataFrame
        Leaderboard with at least the columns 'round', 'tournament', 'live',
        'c', 's', 'nmr_burn', 'nmr_stake' and 'usd_stake'. It is not
        modified.

    Returns
    -------
    df : pd.DataFrame
        One row per round with the columns 'staked_nmr',
        'staked_above_cutoff', 'burned_nmr', 'fraction_burned',
        'nmr_payout', 'usd_payout' and 'total_payout_in_nmr', followed by a
        'mean' row. Values are rounded to 2 decimals. For rounds up to 112
        'staked_above_cutoff' and 'total_payout_in_nmr' are NaN.
    """
    cols = [
        'staked_nmr', 'staked_above_cutoff', 'burned_nmr', 'nmr_payout',
        'usd_payout', 'total_payout_in_nmr'
    ]
    df = pd.DataFrame(columns=cols)
    rounds = np.sort(lb['round'].unique())

    # Work on a copy: adding the helper column to the caller's frame would
    # leak a 'pass' column and make a second call on the same frame fail.
    lb = lb.copy()
    lb['pass'] = lb['live'] < LOGLOSS_BENCHMARK

    for r in rounds:
        d = lb[lb['round'] == r]
        if r > 112:
            nmr_cut = 0
            nmr_cut_pass = 0
            cutoff = np.nan  # stays NaN if there are no tournaments
            for t in nx.tournament_all(as_str=False):
                dt = d[d.tournament == t]
                cutoff, ignore = calc_cutoff(dt)
                nmr_cut += dt[dt.c >= cutoff].sum()['s']
                nmr_cut_pass += dt[(dt.c >= cutoff) & (dt['pass'])].sum()['s']
            # NOTE(review): only the cutoff of the last tournament is used
            # here although nmr_cut_pass is summed over all tournaments;
            # confirm whether the payout should be accumulated per
            # tournament instead.
            if cutoff == 0:
                total = np.nan
            else:
                total = nmr_cut_pass * (1.0 - cutoff) / cutoff
        else:
            # no cutoff is computed for these rounds, so the cutoff-based
            # figures are undefined (previously this path read `cutoff`
            # before it was assigned)
            nmr_cut = np.nan
            total = np.nan
        ds = d.sum()
        pay = [
            ds['s'], nmr_cut, ds['nmr_burn'], ds['nmr_stake'], ds['usd_stake'],
            total
        ]
        df.loc[r] = pay
    fraction = df['burned_nmr'] / df['staked_above_cutoff']
    df.insert(3, 'fraction_burned', fraction)
    df.loc['mean'] = df.mean()
    df = df.round(2)
    return df
Ejemplo n.º 15
0
def test_tournament_all():
    """tournament_all with and without inactive tournaments"""
    message = 'wrong tournaments'

    # active_only=False: every tournament, active or not
    every_name = [
        'bernie', 'elizabeth', 'jordan', 'ken', 'charles', 'frank', 'hillary',
        'kazutsugi'
    ]
    every_number = [1, 2, 3, 4, 5, 6, 7, 8]
    ok_(nx.tournament_all(active_only=False) == every_name, message)
    ok_(nx.tournament_all(True, active_only=False) == every_name, message)
    ok_(nx.tournament_all(False, active_only=False) == every_number, message)

    # default: only the active tournament
    active_name = ['kazutsugi']
    active_number = [8]
    ok_(nx.tournament_all() == active_name, message)
    ok_(nx.tournament_all(True) == active_name, message)
    ok_(nx.tournament_all(False) == active_number, message)
Ejemplo n.º 16
0
def run(model, splitter, tournament=None, verbosity=2):
    """
    Run one or more model/tournament pairs through a data splitter.

    Parameters
    ----------
    model : nx.Model, list, tuple
        A single prediction model, or a list/tuple of models. The `name`
        attributes of the models must be unique.
    splitter : nx.Splitter
        An iterator of fit/predict data pairs. It is reset after every
        model/tournament pair and once more at the end.
    tournament : {None, int, str, list, tuple}, optional
        The tournament(s) to run each model through. With the default
        (None) the tournaments returned by `nx.tournament_all()` are used.
        A list or tuple must not contain duplicate tournaments.
    verbosity : int, optional
        Passed on to `run_one`. Zero is silent.

    Returns
    -------
    p : nx.Prediction
        The accumulated predictions of all requested model/tournament
        pairs.

    Raises
    ------
    ValueError
        If `model` or `tournament` has an unsupported type, or if either
        contains duplicates.

    """

    # normalize `model` to a sequence of models
    if isinstance(model, nx.Model):
        models = [model]
    elif isinstance(model, (list, tuple)):
        models = model
    else:
        raise ValueError('`model` must be a model, list, or tuple of models')
    model_names = [mod.name for mod in models]
    if len(set(model_names)) != len(model_names):
        raise ValueError('`model` cannot contain duplicate names')

    # normalize `tournament` to a list of tournament names
    if tournament is None:
        requested = nx.tournament_all()
    elif nx.isint(tournament) or nx.isstring(tournament):
        requested = [tournament]
    elif isinstance(tournament, (list, tuple)):
        requested = tournament
    else:
        raise ValueError(
            '`tournament` must be an integer, string, list, tuple, or None.')
    tournament_names = [nx.tournament_str(item) for item in requested]
    if len(set(tournament_names)) != len(tournament_names):
        raise ValueError('`tournament` cannot contain duplicates')

    # accumulate the prediction of every model/tournament pair
    prediction = nx.Prediction()
    for mod in models:
        for name in tournament_names:
            prediction += run_one(mod, splitter, name, verbosity=verbosity)
            splitter.reset()
    splitter.reset()

    return prediction