def _xform_data(self, df):
    columns = []
    tables_ = data.get_tables(self.hypers.arbitrage)
    percent = self.hypers.pct_change
    for table in tables_:
        name, cols, ohlcv = table['name'], table['cols'], table.get('ohlcv', {})
        columns += [self._diff(df[f'{name}_{k}'], percent) for k in cols]

        # Add extra indicator columns
        if ohlcv and self.hypers.indicators:
            ind = pd.DataFrame()
            # TA-Lib requires specifically-named columns (OHLCV)
            for k, v in ohlcv.items():
                ind[k] = df[f"{name}_{v}"]
            columns += [
                ## Original indicators from some boilerplate repo I started with
                self._diff(SMA(ind, timeperiod=15), percent),
                self._diff(SMA(ind, timeperiod=60), percent),
                self._diff(RSI(ind, timeperiod=14), percent),
                self._diff(ATR(ind, timeperiod=14), percent),

                ## Indicators from the book "How to Day Trade For a Living". Not sure which are more solid...
                ## Price, Volume, 9-EMA, 20-EMA, 50-SMA, 200-SMA, VWAP, prior-day-close
                # self._diff(EMA(ind, timeperiod=9)),
                # self._diff(EMA(ind, timeperiod=20)),
                # self._diff(SMA(ind, timeperiod=50)),
                # self._diff(SMA(ind, timeperiod=200)),
            ]

    states = np.nan_to_num(np.column_stack(columns))
    prices = df[data.target].values
    # Note: don't scale/normalize here, since we'll normalize w/ self.price/step_acc.cash after each action
    return states, prices

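# The _diff/diff helpers aren't shown in this section. The sketch below is a hypothetical stand-in
# (assuming pandas imported as pd, numpy as np) illustrating the behavior the calls above rely on:
# turn a series into its change from the previous value, as a percent or an absolute delta.
def _diff_sketch(self, arr, percent=False):
    series = pd.Series(arr)
    diff = series.pct_change() if percent else series.diff()
    # The first row has nothing to compare against, so pct_change()/diff() leave it NaN; pct_change can
    # also emit inf when the prior value is 0. Zero-fill both so np.column_stack and the scaler
    # downstream see clean numbers.
    diff = diff.replace([np.inf, -np.inf], np.nan).fillna(0)
    return diff.values
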
def xform_data(self, df):
    """
    Some special handling of the price data. First, we don't want prices to be absolute, since we want the
    agent to learn actions _relative_ to states; that is, states need to be made "relative" somehow. The goal
    is a "stationary time series": one that fluctuates around y=0, like visualizing audio rather than a line
    graph. Next, we don't want absolute price changes, since that's still not relative enough (prices change
    in larger amounts when the BTC price is already large - we want to learn the pattern, not the numbers).
    So the solution is percent changes.

    Now - making everything a percent change from its past lets you track that field's history, but you lose
    how it relates to the other fields in its cross-section. So here's what we do: anchor all the price
    fields to the target (close price), so they're relative within the cross-section; then set the target to
    its percent change over time; leave the volume fields alone, since we _do_ want those absolute. Then
    scale everything. Crazy, I know; but IMO it makes sense. Hit me up if you have a better idea.
    """
    columns = []
    ind_ct = self.hypers.indicators_count
    tables_ = data.get_tables(self.hypers.arbitrage)
    for table in tables_:
        name = table['name']
        for col in table['cols']:
            name_col = f'{name}_{col}'
            if name_col == data.target:
                columns.append(self.diff(df[name_col], True))
            elif col in table['price_cols']:
                columns.append(df[name_col] / df[data.target])
            else:
                columns.append(df[name_col])

        # Add extra indicator columns
        ohlcv = table.get('ohlcv', {})
        if ohlcv and ind_ct:
            ind = pd.DataFrame()
            # TA-Lib requires specifically-named columns (OHLCV)
            for k, v in ohlcv.items():
                ind[k] = df[f"{name}_{v}"]
            # Sorted by effectiveness. I'm no expert, so if this seems off please submit a PR! Later, after
            # you've optimized the other hypers, come back here and create a hyper for every indicator you
            # want to try (zoom in on indicators).
            best_indicators = [
                tlib.MOM,
                tlib.SMA,
                # tlib.BBANDS,  # TODO: different signature; needs special handling
                tlib.RSI,
                tlib.EMA,
                tlib.ATR,
            ]
            for i in range(ind_ct):
                columns.append(
                    best_indicators[i](ind, timeperiod=self.hypers.indicators_window) / df[data.target])

    states = np.column_stack(columns)
    prices = df[data.target].values

    # Remove padding at the start of all data. Indicators are aggregate fns, so they don't count until we
    # have that much historical data.
    if ind_ct:
        states = states[self.hypers.indicators_window:]
        prices = prices[self.hypers.indicators_window:]

    # Pre-scale all price columns up-front, since they don't change. We'll scale changing values in real
    # time elsewhere.
    states = preprocessing.robust_scale(states, quantile_range=(1., 99.))

    # Reduce the dimensionality of our states (OHLCV + indicators + arbitrage => 5 or 6 dims), because
    # TensorForce's memory branch changed Policy Gradient models' batching from timesteps to episodes. That
    # takes way too much GPU RAM for us, so we had to cut back in quite a few areas (num steps to train per
    # episode, episode batch_size, and especially state size).
    if self.cli_args.autoencode:
        ae = AutoEncoder()
        states = ae.fit_transform_tied(states)

    return states, prices
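
# Hedged mini-demo (not part of the pipeline): the transform described in xform_data's docstring, applied
# to made-up toy data. All column names and numbers here are hypothetical; in the real code, data.target
# names the target close column.
def _xform_demo():
    import numpy as np
    import pandas as pd
    from sklearn import preprocessing

    toy = pd.DataFrame({
        'exch_open':   [100., 102., 101., 104.],
        'exch_close':  [101., 103., 102., 105.],
        'exch_volume': [5., 7., 6., 8.],
    })
    target = 'exch_close'
    open_rel = toy['exch_open'] / toy[target]       # price column anchored to target within its cross-section
    close_pct = toy[target].pct_change().fillna(0)  # target becomes a stationary percent-change series
    volume = toy['exch_volume']                     # volume stays absolute
    # Then scale everything, as xform_data does, robust to 1st/99th-percentile outliers
    return preprocessing.robust_scale(
        np.column_stack([open_rel, close_pct, volume]), quantile_range=(1., 99.))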