Beispiel #1
0
 def y_to_nan(self):
     """
     Copy of data with y values set to NaN.

     Returns
     -------
     A copy of this object (as produced by ``self.copy()``) in which the
     dataframe column of every active tournament is replaced by NaN.
     The original object is left untouched.
     """
     data = self.copy()
     # tournament_iter yields (number, name) pairs; only the name is a
     # column label, so the pair must be unpacked before it is used as a
     # keyword for DataFrame.assign.
     nan_columns = {
         name: np.nan
         for _, name in nx.tournament_iter(active_only=True)
     }
     data.df = data.df.assign(**nan_columns)
     return data
Beispiel #2
0
def get_user_activities(user):
    """
    Activity of `user` across all rounds and tournaments as dataframe.

    The activity records returned by NumerAPI for each tournament are
    collected, flattened with `flatten_dict`, and placed in one dataframe.
    """
    api = NumerAPI()
    records = []
    for number, _ in nx.tournament_iter():
        records.extend(api.get_user_activities(user, number))
    flattened = [flatten_dict(record) for record in records]
    return pd.DataFrame.from_dict(flattened)
Beispiel #3
0
 def xnew(self, x_array):
     """
     Copy of data but with data.x=`x_array`; must have same number of rows.

     Parameters
     ----------
     x_array : 2d array
         Replacement features; ``x_array.shape[0]`` must equal ``len(self)``.

     Returns
     -------
     Data
         New object whose dataframe has the columns 'era', 'region',
         'x0' ... 'x{k-1}' (k = ``x_array.shape[1]``) followed by one column
         per tournament, all stored as float64 and sharing a copy of the
         original index.

     Raises
     ------
     ValueError
         If `x_array` does not have the same number of rows as the data.
     """
     if x_array.shape[0] != len(self):
         msg = "`x_array` must have the same number of rows as data"
         raise ValueError(msg)
     n_rows, n_x = x_array.shape[0], x_array.shape[1]

     # Derive the number of tournament columns from the iterator instead of
     # hard-coding it, so the layout follows whatever tournaments exist.
     names = [name for number, name in nx.tournament_iter()]
     x_cols = ['x' + str(i) for i in range(n_x)]
     cols = ['era', 'region'] + x_cols + names

     # Fill one plain array first and wrap it afterwards; writing through
     # `df.values` is only correct when pandas happens to return a view.
     values = np.empty((n_rows, len(cols)), dtype=np.float64)
     values[:, 0] = self.df['era'].values
     values[:, 1] = self.df['region'].values
     values[:, 2:2 + n_x] = x_array
     for position, name in enumerate(names, start=2 + n_x):
         values[:, position] = self.df[name].values

     df = pd.DataFrame(data=values,
                       index=self.df.index.copy(deep=True),
                       columns=cols)
     return Data(df)
Beispiel #4
0
 def y_df(self):
     """
     Copy of targets, y, as a dataframe.

     One column per active tournament (named after the tournament),
     indexed by `self.ids`.
     """
     tournaments = list(nx.tournament_iter(active_only=True))
     names = [name for _, name in tournaments]
     # each target is reshaped into a single column before stacking
     stacked = np.hstack(
         [self.y[number].reshape(-1, 1) for number, _ in tournaments]
     )
     return pd.DataFrame(data=stacked, columns=names, index=self.ids)
Beispiel #5
0
def test_tournament_int():
    """tournament_int maps ints and names to the int and rejects bad input"""
    for number, name in nx.tournament_iter():
        # both the integer and the string form must resolve to the integer
        for given in (number, name):
            resolved = nx.tournament_int(given)
            ok_(resolved == number, "tournament int do not agree")
    for invalid in (0, 'burn', None):
        assert_raises(ValueError, nx.tournament_int, invalid)
Beispiel #6
0
def test_tournament_str():
    """tournament_str maps ints and names to the name and rejects bad input"""
    for number, name in nx.tournament_iter():
        # both the integer and the string form must resolve to the name
        for given in (number, name):
            resolved = nx.tournament_str(given)
            ok_(resolved == name, "tournament str do not agree")
    for invalid in (0, 9, 'burn', None):
        assert_raises(ValueError, nx.tournament_str, invalid)
Beispiel #7
0
def test_prediction_regression():
    """
    regression test of prediction performance evaluation

    The per-tournament logloss taken from one production run over all
    tournaments must match (within 1e-6) the mean logloss of a production
    run restricted to that single tournament.
    """
    d = nx.play_data()
    p = nx.production(nx.logistic(), d, tournament=None, verbosity=0)
    # Depends only on `p` and `d`, so compute it once instead of once per
    # tournament.
    df = p.performance_mean(d['validation'], mean_of='tournament')
    for number, name in nx.tournament_iter():
        p2 = nx.production(nx.logistic(), d, tournament=name, verbosity=0)
        logloss1 = df.loc[name]['logloss']
        logloss2 = p2.summary(d['validation']).loc['mean']['logloss']
        diff = np.abs(logloss1 - logloss2)
        msg = 'failed on {}'.format(name)
        ok_(diff < 1e-6, msg)
Beispiel #8
0
def get_stakes_cutoff(round_number=None):
    """
    Staking confidence cutoff for all tournaments in given round.

    Use this function for `round_number` greater than 112.

    Returns a dataframe indexed by 'tourney' with a single 'cutoff' column.
    """
    rows = []
    for number, name in nx.tournament_iter():
        # get_stakes returns a pair; only the second item (cutoff) is kept
        _, cutoff = get_stakes(round_number, tournament=number)
        rows.append([name, cutoff])
    table = pd.DataFrame(data=rows, columns=['tourney', 'cutoff'])
    return table.set_index('tourney')
Beispiel #9
0
def test_prediction_regression():
    """
    regression test of prediction performance evaluation

    The per-tournament mse taken from one production run over all
    tournaments must match the mean mse of a production run restricted to
    that single tournament; both values are rounded to three decimals
    before they are compared.
    """
    d = nx.play_data()
    p = nx.production(nx.linear(), d, tournament=None, verbosity=0)
    # Depends only on `p` and `d`, so compute it once instead of once per
    # tournament.
    df = p.performance_mean(d['validation'], mean_of='tournament')
    for number, name in nx.tournament_iter():
        p2 = nx.production(nx.linear(), d, tournament=name, verbosity=0)
        mse1 = float('%.3f' % (df.loc[name]['mse']))
        mse2 = float('%.3f' %
                     (p2.summary(d['validation']).loc['mean']['mse']))
        diff = np.abs(mse1 - mse2)
        msg = f"failed on {name}"
        ok_(diff < 1e-6, msg)
Beispiel #10
0
 def xnew(self, x_array):
     """
     Copy of data but with data.x=`x_array`; must have same number of rows.

     Parameters
     ----------
     x_array : 2d array
         Replacement features; ``x_array.shape[0]`` must equal ``len(self)``.

     Returns
     -------
     Data
         New object whose dataframe has the columns 'era', 'region',
         'x1' ... 'xk' (k = ``x_array.shape[1]``) followed by one column per
         active tournament, all stored as float64 and sharing a copy of the
         original index.

     Raises
     ------
     ValueError
         If `x_array` does not have the same number of rows as the data.
     """
     if x_array.shape[0] != len(self):
         msg = "`x_array` must have the same number of rows as data"
         raise ValueError(msg)
     n_rows, n_x = x_array.shape[0], x_array.shape[1]

     names = [name for _, name in nx.tournament_iter(active_only=True)]
     x_cols = ['x' + str(col) for col in range(1, n_x + 1)]
     cols = ['era', 'region'] + x_cols + names

     # Fill one plain array first and wrap it afterwards; writing through
     # `df.values` is only correct when pandas happens to return a view.
     values = np.empty((n_rows, len(cols)), dtype=np.float64)
     values[:, 0] = self.df['era'].values
     values[:, 1] = self.df['region'].values
     values[:, 2:2 + n_x] = x_array
     for position, name in enumerate(names, start=2 + n_x):
         values[:, position] = self.df[name].values

     df = pd.DataFrame(data=values,
                       index=self.df.index.copy(deep=True),
                       columns=cols)
     return Data(df)
Beispiel #11
0
 def metric_per_tournament(self, data, metric='corr'):
     """
     DataFrame containing given metric versus tournament.

     One column per tournament (active or not) plus a trailing 'mean'
     column holding the row-wise mean; rows are sorted by that mean.
     """
     per_tournament = []
     for number, label in nx.tournament_iter(active_only=False):
         frame, _ = metrics_per_name(data,
                                     self,
                                     number,
                                     columns=[metric],
                                     split_pairs=False)
         # the single metric column is relabelled with the tournament name
         frame.columns = [label]
         per_tournament.append(frame)
     combined = pd.concat(per_tournament, axis=1)
     row_mean = combined.mean(axis=1)
     combined.insert(combined.shape[1], 'mean', row_mean)
     return combined.sort_values('mean')
Beispiel #12
0
def get_stakes_users(users, round_number=None):
    """
    Stakes for given users for all tournaments.

    Use this function for `round_number` greater than 112.

    The stakes of each tournament are filtered down to the rows whose index
    is in `users`, tagged with a leading 'tourney' column, and stacked
    row-wise into one dataframe.
    """
    frames = []
    for number, name in nx.tournament_iter():
        all_stakes, _ = get_stakes(round_number, tournament=number)
        is_wanted = all_stakes.index.isin(users)
        wanted = all_stakes[is_wanted]
        wanted.insert(0, 'tourney', name)
        frames.append(wanted)
    return pd.concat(frames, axis=0)
Beispiel #13
0
def test_data_y_for_tournament():
    """data.y gives the same, expected targets by number and by name"""
    d = nx.load_zip(TINY_DATASET_CSV)

    # expected targets: 14 rows, the last four of which are NaN
    expected = np.zeros(14)
    expected[[0, 4, 5, 9]] = 0.75000
    expected[[1, 6]] = 0.25000
    expected[10:] = np.nan

    for number, name in nx.tournament_iter(active_only=True):
        by_number = d.y[number]
        by_name = d.y[name]

        assert_array_equal(by_number, by_name,
                           f"y{number} indexing corrupted")
        assert_array_equal(by_number, expected,
                           f"y{number} targets corrupted")
Beispiel #14
0
def load_zip(file_path, verbose=False):
    """
    Load numerai dataset from zip archive; return Data.

    Parameters
    ----------
    file_path : path of the zip archive containing TRAIN_FILE and
        TOURNAMENT_FILE.
    verbose : bool, optional
        If True the loaded Data object is printed before it is returned.

    Returns
    -------
    Data
        Train rows followed by tournament rows in a single dataframe.

    Raises
    ------
    IOError
        If the number of columns is not 2 + N_FEATURES + tournament count.
    TypeError
        If the columns do not all share one dtype.
    """
    read_kwargs = dict(header=0, index_col=0)

    # load zip; context managers close the archive and member handles even
    # if parsing fails
    with zipfile.ZipFile(file_path) as zf:
        with zf.open(TRAIN_FILE) as fh:
            train = pd.read_csv(fh, **read_kwargs)
        with zf.open(TOURNAMENT_FILE) as fh:
            tourn = pd.read_csv(fh, **read_kwargs)

    # turn into single dataframe and rename columns
    df = pd.concat([train, tourn], axis=0)
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    for number, name in nx.tournament_iter():
        rename_map['target_' + name] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float64
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n = len(nx.tournament_all())
    # Replace the label columns instead of assigning through `iloc`: a
    # positional slice assignment may be applied in place and keep the old
    # dtype, which would defeat the cast.
    label_cols = list(df.columns[-n:])
    df[label_cols] = df[label_cols].astype('float64')

    # no way we did something wrong, right?
    n = 2 + N_FEATURES + nx.tournament_count()
    if df.shape[1] != n:
        raise IOError("expecting {} columns; found {}".format(n, df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)

    return data
Beispiel #15
0
def cutoff_impact(round_number=None,
                  nmrs=(5, 10, 100, 200),
                  is_cutoff=True,
                  is_relative=False):
    """
    Impact of adding stakes of various sizes (nmr) above the cutoff.

    If `is_cutoff` is True (default) then the cutoff is returned; otherwise
    the payout ratio, ``(1 - cutoff) / cutoff``, is returned.

    If `is_relative` is False (default) then the absolute cutoff or payout
    ratio is given; otherwise a relative value (compared to adding no
    additional stake) is given. The baseline column itself is always
    absolute.

    Parameters
    ----------
    round_number : passed through to `get_stakes`.
    nmrs : iterable of stake sizes to probe; each becomes one column.
    is_cutoff : bool, see above.
    is_relative : bool, see above.

    Returns
    -------
    DataFrame indexed by tournament name ('tourney') with a baseline column
    named 'cutoff' or 'payout' followed by one column per entry of `nmrs`.
    """
    nmrs = list(nmrs)
    # Keep the column label separate from the loop's tournament name; reusing
    # one variable for both would put the label, not the tournament, in the
    # 'tourney' index.
    label = 'cutoff' if is_cutoff else 'payout'

    data = []
    for number, name in nx.tournament_iter():
        s, cutoff = get_stakes(round_number, tournament=number)
        x0 = cutoff if is_cutoff else (1 - cutoff) / cutoff
        row = [name, x0]
        for nmr in nmrs:
            c, _ = calc_cutoff(s, impact_probe_nmr=nmr)
            x = c if is_cutoff else (1 - c) / c
            if is_relative:
                x = x - x0
            row.append(x)
        data.append(row)

    columns = ['tourney', label] + nmrs
    df = pd.DataFrame(data=data, columns=columns)
    df = df.set_index('tourney')
    return df
Beispiel #16
0
def load_zip(file_path,
             verbose=False,
             include_train=True,
             single_precision=True):
    """
    Load numerai dataset from zip archive; return Data

    It includes train data by default. To work with tournament data only,
    set `include_train` to False.

    Set `single_precision` to True in order to have data in float32
    (saves memory).

    Parameters
    ----------
    file_path : path of the zip archive containing TOURNAMENT_FILE and,
        when `include_train` is True, TRAIN_FILE.
    verbose : bool, optional
        If True the loaded Data object is printed before it is returned.
    include_train : bool, optional
        If True (default) train rows are placed before the tournament rows.
    single_precision : bool, optional
        If True (default) float columns are parsed and stored as float32;
        otherwise pandas' default parsing is used and labels are float64.

    Raises
    ------
    IOError
        If the number of columns is not 2 + N_FEATURES + the number of
        active tournaments.
    TypeError
        If the columns do not all share one dtype.
    """
    read_kwargs = dict(header=0, index_col=0)

    # load zip; context managers close the archive and member handles even
    # if parsing fails
    with zipfile.ZipFile(file_path) as zf:
        if single_precision:
            # read first 100 rows to scan types
            # then replace all float64 types with float32
            with zf.open(TOURNAMENT_FILE) as fh:
                df_test = pd.read_csv(fh, nrows=100, **read_kwargs)
            float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
            read_kwargs['engine'] = 'c'
            read_kwargs['dtype'] = {c: np.float32 for c in float_cols}

        with zf.open(TOURNAMENT_FILE) as fh:
            tourn = pd.read_csv(fh, **read_kwargs)

        if include_train:
            with zf.open(TRAIN_FILE) as fh:
                train = pd.read_csv(fh, **read_kwargs)
            # merge train and tournament data to single dataframe
            df = pd.concat([train, tourn], axis=0)
        else:
            df = tourn

    # rename columns
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    # Every active tournament maps the same 'target' column, so with more
    # than one active tournament the last name wins.
    for number, name in nx.tournament_iter(active_only=True):
        rename_map['target'] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float32 or
    # np.float64 depending on the mode
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n = nx.tournament_count(active_only=True)
    # Replace whole columns instead of assigning through `iloc`: a
    # positional slice assignment may be applied in place and keep the old
    # dtype, which would defeat the cast.
    label_cols = list(df.columns[-n:])
    if single_precision:
        df[label_cols] = df[label_cols].astype('float32')
        # era and region are expected to be the first two columns
        head_cols = list(df.columns[0:2])
        df[head_cols] = df[head_cols].astype('float32')
    else:
        df[label_cols] = df[label_cols].astype('float64')

    # no way we did something wrong, right?
    n = 2 + N_FEATURES + nx.tournament_count(active_only=True)
    if df.shape[1] != n:
        raise IOError("expecting {} columns; found {}".format(n, df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)

    return data