def y_sum_hist(self):
    """
    Histogram of the per-row sum of y targets, returned as a dataframe.

    The targets in ``self.y[:]`` are summed across columns; rows whose sum
    is not finite are dropped.  For every integer from 0 through
    ``nx.tournament_count()`` the fraction of remaining rows whose sum
    equals that integer is reported.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'ysum' with a single 'fraction' column.
    """
    totals = np.array(self.y[:].sum(axis=1), dtype='float64')
    totals = totals[np.isfinite(totals)]
    n_bins = nx.tournament_count() + 1
    rows = [(value, (totals == value).mean()) for value in range(n_bins)]
    hist = pd.DataFrame(data=rows, columns=['ysum', 'fraction'])
    return hist.set_index('ysum')
def load_zip(file_path, verbose=False):
    """
    Load numerai dataset from zip archive; return Data.

    Parameters
    ----------
    file_path : str
        Path of the zip archive containing TRAIN_FILE and TOURNAMENT_FILE.
    verbose : bool, optional
        If True, print the resulting Data object before returning it.

    Returns
    -------
    Data
        Train and tournament rows stacked into one dataframe.

    Raises
    ------
    IOError
        If the number of columns is not 2 + N_FEATURES + active tournaments.
    TypeError
        If the columns do not all share one dtype.
    """
    def read_member(archive, member):
        # close the member handle as soon as pandas has consumed it
        with archive.open(member) as handle:
            return pd.read_csv(handle, header=0, index_col=0)

    # the context manager guarantees the archive is closed even on error
    with zipfile.ZipFile(file_path) as zf:
        train = read_member(zf, TRAIN_FILE)
        tourn = read_member(zf, TOURNAMENT_FILE)

    # turn into single dataframe and rename columns
    df = pd.concat([train, tourn], axis=0)
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    for number, name in nx.tournament_iter(active_only=True):
        rename_map['target_' + name] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float64
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n = nx.tournament_count(active_only=True)
    # Reassign column by column: `df.iloc[:, -n:] = ...` is attempted in
    # place by recent pandas and may leave the original dtype untouched.
    for col in list(df.columns[-n:]):
        df[col] = df[col].astype('float64')

    # no way we did something wrong, right?
    n = 2 + N_FEATURES + nx.tournament_count(active_only=True)
    if df.shape[1] != n:
        raise IOError("expecting {} columns; found {}".format(n, df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)
    return data
def y_similarity(self):
    """
    Pairwise similarity of y targets as a square dataframe.

    Similarity of two tournaments is the fraction of rows on which their
    targets are equal, computed only over rows where the sum of the two
    targets is finite.  The diagonal is 1 and the matrix is symmetric.
    Rows and columns are labelled with ``nx.tournament_str`` names.
    """
    count = nx.tournament_count()
    sim = np.ones((count, count))
    labels = [nx.tournament_str(num) for num in range(1, count + 1)]
    for a in range(1, count + 1):
        for b in range(a + 1, count + 1):
            ya, yb = self.y[a], self.y[b]
            # keep only rows where both targets are finite
            both = np.isfinite(ya + yb)
            frac = (ya[both] == yb[both]).mean()
            sim[a - 1, b - 1] = frac
            sim[b - 1, a - 1] = frac
    return pd.DataFrame(data=sim, columns=labels, index=labels)
def xnew(self, x_array):
    """
    Copy of data but with data.x=`x_array`; must have same number of rows.

    Parameters
    ----------
    x_array : 2d array
        Replacement features; ``x_array.shape[0]`` must equal ``len(self)``.
        The number of columns may differ from the current feature count.

    Returns
    -------
    Data
        New object whose era, region and target columns are copies of the
        current ones and whose feature columns hold `x_array`.

    Raises
    ------
    ValueError
        If `x_array` does not have the same number of rows as the data.
    """
    if x_array.shape[0] != len(self):
        msg = "`x_array` must have the same number of rows as data"
        raise ValueError(msg)

    n_rows, n_x = x_array.shape[0], x_array.shape[1]
    y_names = [name for number, name in nx.tournament_iter()]
    # NOTE(review): feature columns are named from x0 here (kept as is).
    x_names = ['x' + str(i) for i in range(n_x)]
    cols = ['era', 'region'] + x_names + y_names

    # Fill a plain numpy buffer first and wrap it afterwards.  Writing
    # through `df.values[...]` is only effective when pandas hands back a
    # writable view of a single block, which is not guaranteed once
    # individual columns have been assigned.
    values = np.empty((n_rows, len(cols)), dtype=np.float64)
    values[:, 0] = self.df['era'].values
    values[:, 1] = self.df['region'].values
    # explicit stop index so zero target columns does not give an empty slice
    values[:, 2:2 + n_x] = x_array
    for pos, name in enumerate(y_names, 2 + n_x):
        values[:, pos] = self.df[name].values

    df = pd.DataFrame(data=values,
                      index=self.df.index.copy(deep=True),
                      columns=cols)
    return Data(df)
def __getitem__(self2, index):
    """
    Return target values selected by tournament name, number, or `[:]`.

    - str: must be one of the active tournament names; returns that column.
    - int: tournament number between 1 and the tournament count
      (``active_only=False``); returns that tournament's column.
    - slice: only the full slice `[:]` is accepted; returns the trailing
      target columns as one array.

    Raises IndexError for anything else.

    NOTE(review): names are validated against active tournaments only,
    while the numeric bound and the slice width count all tournaments.
    """
    n_all = nx.tournament_count(active_only=False)

    if isinstance(index, str):
        if index not in nx.tournament_all(as_str=True, active_only=True):
            raise IndexError('string index not recognized')
        return self2.df[index].values

    if nx.isint(index):
        if index < 1 or index > n_all:
            txt = 'tournament number must be between 1 and {}'
            raise IndexError(txt.format(n_all))
        return self2.df[nx.tournament_str(index)].values

    if isinstance(index, slice):
        parts = (index.start, index.stop, index.step)
        if any(part is not None for part in parts):
            raise IndexError('Start, stop, and step of slice must be None')
        # slicing below means a view is returned instead of a copy
        return self2.df.iloc[:, -n_all:].values

    raise IndexError('indexing type not recognized')
def load_zip(file_path, verbose=False, include_train=True,
             single_precision=True):
    """
    Load numerai dataset from zip archive; return Data.

    Parameters
    ----------
    file_path : str
        Path of the zip archive containing the tournament (and train) csv.
    verbose : bool, optional
        If True, print the resulting Data object before returning it.
    include_train : bool, optional
        Train data is included by default. To work with tournament data
        only, set `include_train` to False.
    single_precision : bool, optional
        If True (default) the data is stored as float32 (saves memory);
        otherwise float64 is used.

    Returns
    -------
    Data

    Raises
    ------
    IOError
        If the number of columns is not 2 + N_FEATURES + active tournaments.
    TypeError
        If the columns do not all share one dtype.
    """
    def read_member(archive, member, **kwargs):
        # close the member handle as soon as pandas has consumed it
        with archive.open(member) as handle:
            return pd.read_csv(handle, header=0, index_col=0, **kwargs)

    # the context manager guarantees the archive is closed even on error
    with zipfile.ZipFile(file_path) as zf:
        read_kwargs = {}
        if single_precision:
            # read first 100 rows to scan types
            # then replace all float64 types with float32
            df_test = read_member(zf, TOURNAMENT_FILE, nrows=100)
            float32_cols = {c: np.float32 for c in df_test
                            if df_test[c].dtype == "float64"}
            read_kwargs = {'engine': 'c', 'dtype': float32_cols}
        # otherwise: regular parsing, float64 will be used

        tourn = read_member(zf, TOURNAMENT_FILE, **read_kwargs)
        if include_train:
            train = read_member(zf, TRAIN_FILE, **read_kwargs)
            # merge train and tournament data to single dataframe
            df = pd.concat([train, tourn], axis=0)
        else:
            df = tourn

    # rename columns
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    # NOTE(review): every active tournament writes the same 'target' key, so
    # the last one iterated wins -- confirm a single active tournament is
    # expected here.
    for number, name in nx.tournament_iter(active_only=True):
        rename_map['target'] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float32 or
    # np.float64 depending on the mode
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n = nx.tournament_count(active_only=True)
    dtype = 'float32' if single_precision else 'float64'
    cast_cols = list(df.columns[-n:])
    if single_precision:
        # the first two columns come out of `map` and must be narrowed too
        cast_cols += list(df.columns[0:2])
    # Reassign column by column: `df.iloc[:, a:b] = ...` is attempted in
    # place by recent pandas and may leave the original dtype untouched,
    # which would trip the same-dtype check below.
    for col in cast_cols:
        df[col] = df[col].astype(dtype)

    # no way we did something wrong, right?
    n = 2 + N_FEATURES + nx.tournament_count(active_only=True)
    if df.shape[1] != n:
        raise IOError("expecting {} columns; found {}".format(n, df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)
    return data
def x(self):
    """
    View of features, x, as a numpy float array.

    Selects the columns of ``self.df`` after the first two and before the
    trailing columns reserved for the active tournaments, and returns their
    underlying values (an array, not a DataFrame).
    """
    n = nx.tournament_count(active_only=True)
    # `.values` so callers get the array promised by the docstring
    return self.df.iloc[:, 2:-n].values
def x(self):
    """
    Features, x, as a numpy float array.

    Takes the columns of ``self.df`` after the first two and before the
    last ``nx.tournament_count()`` columns and returns their values.
    Intended to be a view rather than a copy; whether it is one depends on
    how the frame is laid out in memory.
    """
    n_targets = nx.tournament_count()
    feature_frame = self.df.iloc[:, 2:-n_targets]
    return feature_frame.values