コード例 #1
0
ファイル: data.py プロジェクト: CSP197/numerox
 def y_sum_hist(self):
     """Histogram data of sum of y targets across tournaments as dataframe

     Each row of ``self.y[:]`` is summed; non-finite sums are discarded.
     For every integer ``k`` in ``0 .. nx.tournament_count()`` the returned
     dataframe, indexed by ``ysum``, stores in column ``fraction`` the
     fraction of the remaining sums that equal ``k``.
     """
     totals = np.array(self.y[:].sum(axis=1), dtype='float64')
     # drop NaN/inf sums so they do not count towards any bucket
     finite_totals = totals[np.isfinite(totals)]
     rows = [(k, (finite_totals == k).mean())
             for k in range(nx.tournament_count() + 1)]
     hist = pd.DataFrame(data=rows, columns=['ysum', 'fraction'])
     return hist.set_index('ysum')
コード例 #2
0
def load_zip(file_path, verbose=False):
    """Load numerai dataset from zip archive; return Data

    Parameters
    ----------
    file_path : path of the zip archive; it must contain the members named
        by ``TRAIN_FILE`` and ``TOURNAMENT_FILE``.
    verbose : if True, print the resulting ``Data`` object before returning.

    Returns
    -------
    ``Data`` wrapping one dataframe with the train rows followed by the
    tournament rows.

    Raises
    ------
    IOError : the number of columns is not
        ``2 + N_FEATURES + (number of active tournaments)``.
    TypeError : the columns do not all share one dtype.
    """

    # load zip; the context managers close the archive and its members even
    # when parsing fails (the handles used to be left open)
    with zipfile.ZipFile(file_path) as zf:
        with zf.open(TRAIN_FILE) as fid:
            train = pd.read_csv(fid, header=0, index_col=0)
        with zf.open(TOURNAMENT_FILE) as fid:
            tourn = pd.read_csv(fid, header=0, index_col=0)

    # turn into single dataframe and rename columns
    df = pd.concat([train, tourn], axis=0)
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    for _, name in nx.tournament_iter(active_only=True):
        rename_map['target_' + name] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float64
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n_tourn = nx.tournament_count(active_only=True)
    # the tournament targets are the trailing `n_tourn` columns
    df.iloc[:, -n_tourn:] = df.iloc[:, -n_tourn:].astype('float64')

    # no way we did something wrong, right?
    n_expected = 2 + N_FEATURES + n_tourn
    if df.shape[1] != n_expected:
        raise IOError("expecting {} columns; found {}".format(n_expected,
                                                              df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)

    return data
コード例 #3
0
ファイル: data.py プロジェクト: CSP197/numerox
 def y_similarity(self):
     """Similarity (fraction of y's equal) matrix as dataframe

     The result is a symmetric square dataframe whose rows and columns are
     labelled with ``nx.tournament_str(i)`` for each tournament number.
     The diagonal is 1; entry (i, j) is the fraction of rows, among those
     where both targets are finite, in which ``self.y[i] == self.y[j]``.
     """
     count = nx.tournament_count()
     names = [nx.tournament_str(k) for k in range(1, count + 1)]
     sim = np.ones((count, count))
     # only the upper triangle is computed; each value is mirrored
     for a in range(1, count + 1):
         for b in range(a + 1, count + 1):
             ya, yb = self.y[a], self.y[b]
             # a NaN/inf in either target makes the sum non-finite
             both_finite = np.isfinite(ya + yb)
             frac = (ya[both_finite] == yb[both_finite]).mean()
             sim[a - 1, b - 1] = frac
             sim[b - 1, a - 1] = frac
     return pd.DataFrame(data=sim, columns=names, index=names)
コード例 #4
0
ファイル: data.py プロジェクト: stjordanis/numerox
 def xnew(self, x_array):
     """Copy of data but with data.x=`x_array`; must have same number of rows

     Parameters
     ----------
     x_array : 2d array of new features with ``len(self)`` rows; it may have
         any number of columns. The new feature columns are named
         ``'x0', 'x1', ...``.

     Returns
     -------
     A new ``Data`` whose dataframe holds copies of this object's ``era``,
     ``region`` and tournament columns, with `x_array` in between.

     Raises
     ------
     ValueError : `x_array` does not have the same number of rows as data.
     """
     if x_array.shape[0] != len(self):
         msg = "`x_array` must have the same number of rows as data"
         raise ValueError(msg)
     n_x = x_array.shape[1]
     names = [name for _, name in nx.tournament_iter()]
     cols = ['era', 'region'] + ['x' + str(i) for i in range(n_x)] + names

     # Fill a plain numpy buffer first and build the dataframe from it.
     # Writing through `df.values[...]` is only effective when pandas hands
     # back a view of the frame's data, which it does not guarantee; explicit
     # column offsets also stay correct when there are no tournament columns
     # (a `2:-0` slice would be empty).
     values = np.empty((x_array.shape[0], len(cols)), dtype=np.float64)
     values[:, 0] = self.df['era'].values
     values[:, 1] = self.df['region'].values
     values[:, 2:2 + n_x] = x_array
     for offset, name in enumerate(names):
         values[:, 2 + n_x + offset] = self.df[name].values

     df = pd.DataFrame(data=values,
                       index=self.df.index.copy(deep=True),
                       columns=cols)
     return Data(df)
コード例 #5
0
ファイル: data.py プロジェクト: quantverse/numerox
 def __getitem__(self2, index):
     """Return target values selected by `index`

     `index` may be:

     - a str listed in ``nx.tournament_all(as_str=True, active_only=True)``:
       the values of that dataframe column are returned;
     - an integer between 1 and ``nx.tournament_count(active_only=False)``:
       the values of column ``nx.tournament_str(index)`` are returned;
     - the full slice ``[:]``: the values of the trailing columns are
       returned.

     Anything else raises IndexError.
     """
     n = nx.tournament_count(active_only=False)

     if isinstance(index, str):
         if index not in nx.tournament_all(as_str=True, active_only=True):
             raise IndexError('string index not recognized')
         return self2.df[index].values

     if nx.isint(index):
         if not 1 <= index <= n:
             txt = 'tournament number must be between 1 and {}'
             raise IndexError(txt.format(n))
         return self2.df[nx.tournament_str(index)].values

     if isinstance(index, slice):
         bounds = (index.start, index.stop, index.step)
         if any(bound is not None for bound in bounds):
             raise IndexError('Start, stop, and step of slice must be None')
         # slicing below means a view is returned instead of a copy
         return self2.df.iloc[:, -n:].values

     raise IndexError('indexing type not recognized')
コード例 #6
0
ファイル: data.py プロジェクト: CSP197/numerox
def load_zip(file_path,
             verbose=False,
             include_train=True,
             single_precision=True):
    """
    Load numerai dataset from zip archive; return Data

    It includes train data by default. To work with tournament data only,
    set `include_train` to False.

    Set `single_precision` to True in order to have data in float32
    (saves memory); with False the default pandas parsing (float64) is used.

    Parameters
    ----------
    file_path : path of the zip archive; it must contain the member named
        by ``TOURNAMENT_FILE`` and, when `include_train` is True, the one
        named by ``TRAIN_FILE``.
    verbose : if True, print the resulting ``Data`` object before returning.
    include_train : if True, train rows are placed before tournament rows.
    single_precision : if True, columns detected as float64 in the first 100
        tournament rows are parsed as float32.

    Raises
    ------
    IOError : the number of columns is not
        ``2 + N_FEATURES + (number of active tournaments)``.
    TypeError : the columns do not all share one dtype.
    """

    # options shared by every full read of an archive member
    read_kwargs = {'header': 0, 'index_col': 0}

    # load zip; the context managers close the archive and its members even
    # when parsing fails (the handles used to be left open)
    with zipfile.ZipFile(file_path) as zf:

        if single_precision:
            # read first 100 rows to scan types
            # then replace all float64 types with float32
            with zf.open(TOURNAMENT_FILE) as fid:
                df_test = pd.read_csv(fid, nrows=100, header=0, index_col=0)
            float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
            read_kwargs['engine'] = 'c'
            read_kwargs['dtype'] = {c: np.float32 for c in float_cols}

        with zf.open(TOURNAMENT_FILE) as fid:
            df = pd.read_csv(fid, **read_kwargs)

        if include_train:
            with zf.open(TRAIN_FILE) as fid:
                train = pd.read_csv(fid, **read_kwargs)
            # merge train and tournament data to single dataframe
            df = pd.concat([train, df], axis=0)

    # rename columns
    rename_map = {'data_type': 'region'}
    for i in range(1, N_FEATURES + 1):
        rename_map['feature' + str(i)] = 'x' + str(i)
    # NOTE(review): every iteration writes the same 'target' key, so the
    # name of the last active tournament wins -- confirm this is intended
    for _, name in nx.tournament_iter(active_only=True):
        rename_map['target'] = name
    df.rename(columns=rename_map, inplace=True)

    # convert era, region, and labels to np.float32 or
    # np.float64 depending on the mode
    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)
    n_tourn = nx.tournament_count(active_only=True)
    if single_precision:
        df.iloc[:, -n_tourn:] = df.iloc[:, -n_tourn:].astype('float32')
        # the first two columns are converted as well in float32 mode
        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype('float32')
    else:
        df.iloc[:, -n_tourn:] = df.iloc[:, -n_tourn:].astype('float64')

    # no way we did something wrong, right?
    n_expected = 2 + N_FEATURES + n_tourn
    if df.shape[1] != n_expected:
        raise IOError("expecting {} columns; found {}".format(n_expected,
                                                              df.shape[1]))

    # make sure memory is contiguous so that, e.g., data.x is a view
    df = df.copy()

    # to avoid copies we need the dtype of each column to be the same
    if df.dtypes.unique().size != 1:
        raise TypeError("dtype of each column should be the same")

    data = Data(df)
    if verbose:
        print(data)

    return data
コード例 #7
0
ファイル: data.py プロジェクト: CSP197/numerox
 def x(self):
     """View of features, x, as a numpy float array

     The features are every column of ``self.df`` after the first two and
     before the trailing active-tournament columns.
     """
     n = nx.tournament_count(active_only=True)
     # `.values` makes the return value the numpy array the docstring
     # promises; without it a DataFrame slice was returned
     return self.df.iloc[:, 2:-n].values
コード例 #8
0
ファイル: data.py プロジェクト: stjordanis/numerox
 def x(self):
     """Features, x, as a numpy array

     Returns the ``.values`` of every column of ``self.df`` after the first
     two and before the trailing ``nx.tournament_count()`` columns.
     """
     n_tourn = nx.tournament_count()
     features = self.df.iloc[:, 2:-n_tourn]
     return features.values