Example #1
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)

    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)
        ohlcv = _df[entry['features']['ohlcv']]
        ta = _df[entry['features']['ta']]

        # Build the dataframe with base features
        ohlc = ohlcv[['open', 'high', 'low', 'close']]
        lagged_ohlc = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')
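        # A minimal sketch of what builder.make_lagged is assumed to do
        # (hypothetical, not the project's actual implementation): shift the
        # frame by `lag` rows and tag the column names with the lag, e.g.
        #   def make_lagged(df, lag):
        #       lagged = df.shift(lag)
        #       lagged.columns = ['{}_lag-{}'.format(c, lag) for c in df.columns]
        #       return lagged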
        # Add lagged features to the dataframe
        atsa_df = pd.concat([lagged_ohlc, ta],
                            axis='columns',
                            verify_integrity=True,
                            sort=True,
                            join='inner')

        # Drop the first 30 rows
        #atsa_df = atsa_df[30:]

        # decompose_dataframe_features('all_merged', _sym+'_improved', unlagged_df)
        # Add symbol to index
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index, _sym, atsa_df, target=_target)
        logger.info('Saved {}'.format(_sym))

Example #2
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)
    sessionFactory = connect('test_features')
    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)
        ohlcv = _df[entry['features']['ohlcv']]

        ohlcv_d = {
            d: _df[entry['features']['ohlcv_{}d'.format(d)]]
            for d in [3, 7, 30]
        }
        ta_d = {
            d: _df[entry['features']['ta_{}d'.format(d)]]
            for d in [3, 7, 30]
        }

        ta = _df[entry['features']['ta']]
        cm = _df[entry['features']['cm']]

        cm_picked = pd.DataFrame(index=ohlcv.index)
        if 'adractcnt' in cm.columns:
            cm_picked['adractcnt_pct'] = cm.adractcnt.pct_change()
            # cm_picked['adractcnt_mean3_pct'] = cm.adractcnt.rolling(3).mean().pct_change()
            # cm_picked['adractcnt_mean7_pct'] = cm.adractcnt.rolling(7).mean().pct_change()
        # if 'splycur' in cm.columns: ## Correlated with volume and close
        #     cm_picked['vol_supply'] = ohlcv.volume / cm.splycur # Ratio between transacted volume and total supply (mined)
        if 'txtfrvaladjntv' in cm.columns and 'isstotntv' in cm.columns and 'feetotntv' in cm.columns:
            # I want to represent miners earnings (fees + issued coins) vs amount transacted in that interval
            cm_picked['earned_vs_transacted'] = (
                cm.isstotntv + cm.feetotntv) / cm.txtfrvaladjntv
        if 'isstotntv' in cm.columns:
            # isstotntv is total number of coins mined in the time interval
            # splycur is total number of coins mined (all time)
            total_mined = cm.isstotntv.rolling(365, min_periods=7).sum()  # total mined in a year
            cm_picked['isstot365_isstot1_pct'] = (cm.isstotntv / total_mined).pct_change()
        if 'splycur' in cm.columns and 'isstotntv' in cm.columns:
            cm_picked['splycur_isstot1_pct'] = (cm.isstotntv / cm.splycur).pct_change()
        if 'hashrate' in cm.columns:
            #cm_picked['hashrate_mean3_pct'] = cm.hashrate.rolling(3).mean().pct_change()
            #cm_picked['hashrate_mean7_pct'] = cm.hashrate.rolling(7).mean().pct_change()
            cm_picked['hashrate_pct'] = cm.hashrate.pct_change()
        if 'roi30d' in cm.columns:
            cm_picked['roi30d'] = cm.roi30d
        if 'isstotntv' in cm.columns:
            cm_picked['isstotntv_pct'] = cm.isstotntv.pct_change()
        if 'feetotntv' in cm.columns:
            cm_picked['feetotntv_pct'] = cm.feetotntv.pct_change()
        if 'txtfrcount' in cm.columns:
            cm_picked['txtfrcount_pct'] = cm.txtfrcount.pct_change()
            #cm_picked['txtfrcount_volume'] = cm.txtfrcount.pct_change()
        if 'vtydayret30d' in cm.columns:
            cm_picked['vtydayret30d'] = cm.vtydayret30d
        if 'isscontpctann' in cm.columns:
            cm_picked['isscontpctann'] = cm.isscontpctann

        ta_picked = pd.DataFrame(index=ta.index)
        # REMA / RSMA are already used and well-established in ATSA.
        # I'm taking the pct change since I want to encode the relative movement of the EMAs, not their positions.
        # ta_picked['rema_5_20_pct'] = ta.rema_5_20.pct_change()
        ta_picked['rema_8_15_pct'] = ta.rema_8_15.pct_change()
        # ta_picked['rema_20_50_pct'] = ta.rema_20_50.pct_change()
        # ta_picked['rsma_5_20_pct'] = ta.rsma_5_20.pct_change()
        ta_picked['rsma_8_15_pct'] = ta.rsma_8_15.pct_change()
        # ta_picked['rsma_20_50_pct'] = ta.rsma_20_50.pct_change()

        # Stoch is a momentum indicator comparing a particular closing price of a security to a range of its prices
        # over a certain period of time.
        # The sensitivity of the oscillator to market movements is reducible by adjusting that time period or
        # by taking a moving average of the result.
        # It is used to generate overbought and oversold trading signals, utilizing a 0-100 bounded range of values.
        # IDEA => decrease sensitivity with a 3-day mean and divide by 100 to get floating-point values
        ta_picked['stoch_14_mean3_div100'] = ta.stoch_14.rolling(3).mean() / 100
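        # For reference, the standard raw %K behind stoch_14 is
        #   %K = 100 * (close - min(low, 14)) / (max(high, 14) - min(low, 14))
        # so the 3-day mean divided by 100 lands in [0, 1].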

        #Moving Average Convergence Divergence (MACD) is a trend-following momentum indicator that shows
        # the relationship between two moving averages of a security’s price.
        # The MACD is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA.
        #  A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line,
        #  which can function as a trigger for buy and sell signals.
        #  Traders may buy the security when the MACD crosses above its signal line and sell - or short - the security
        #  when the MACD crosses below the signal line.
        #  Moving Average Convergence Divergence (MACD) indicators can be interpreted in several ways,
        #  but the more common methods are crossovers, divergences, and rapid rises/falls.
        signal_line = builder.exponential_moving_average(ta.macd_12_26, 9)
        ta_picked['macd_12_26_signal'] = signal_line  # Relationship with signal line
        ta_picked['macd_12_26_diff_signal'] = (ta.macd_12_26 - signal_line).pct_change()  # Relationship with signal line
        ta_picked['macd_12_26_pct'] = ta.macd_12_26.pct_change()  # Information about slope
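        # Assuming builder.exponential_moving_average wraps pandas' ewm, the
        # signal line above would be equivalent to (sketch):
        #   signal_line = ta.macd_12_26.ewm(span=9, adjust=False).mean()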

        # PPO is identical to the moving average convergence divergence (MACD) indicator,
        # except the PPO measures percentage difference between two EMAs, while the MACD measures absolute (dollar) difference.
        signal_line = builder.exponential_moving_average(ta.ppo_12_26, 9)
        ta_picked['ppo_12_26_signal'] = signal_line  # Relationship with signal line
        ta_picked['ppo_12_26_diff_signal'] = (ta.ppo_12_26 - signal_line).pct_change()  # Relationship with signal line
        ta_picked['ppo_12_26_pct'] = ta.ppo_12_26.pct_change()  # Information about slope

        # ADI Accumulation/distribution is a cumulative indicator that uses volume and price to assess whether
        # a stock is being accumulated or distributed.
        # The accumulation/distribution measure seeks to identify divergences between the stock price and volume flow.
        # This provides insight into how strong a trend is. If the price is rising but the indicator is falling
        # this indicates that buying or accumulation volume may not be enough to support
        # the price rise and a price decline could be forthcoming.
        # ==> IDEA: if we can fit a line to the price y1 = m1X+q1 and a line to ADI y2=m2X+q2 then we can identify
        #           divergences by simply looking at the sign of M.
        #           Another insight would be given by the slope (ie pct_change)
        ta_picked['adi_pct'] = ta.adi.pct_change()
        ta_picked['adi_close_convergence'] = convergence_between_series(
            ta.adi, ohlcv.close, 3)
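        # convergence_between_series(a, b, n) is assumed to implement the idea
        # above by comparing the signs of rolling n-point slopes (hypothetical sketch):
        #   slope = lambda y: np.polyfit(np.arange(len(y)), y, 1)[0]
        #   m1 = ta.adi.rolling(3).apply(slope, raw=True)
        #   m2 = ohlcv.close.rolling(3).apply(slope, raw=True)
        #   convergence = (np.sign(m1) == np.sign(m2)).astype(int)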

        # RSI goes from 0 to 100: values <= 20 suggest BUY, while values >= 80 suggest SELL.
        # Dividing by 100 gives a floating-point feature; it makes no sense to pct_change it.
        ta_picked['rsi_14_div100'] = ta.rsi_14 / 100

        # The Money Flow Index (MFI) is a technical indicator that generates overbought or oversold
        #   signals using both prices and volume data. The oscillator moves between 0 and 100.
        # An MFI reading above 80 is considered overbought and an MFI reading below 20 is considered oversold,
        #   although levels of 90 and 10 are also used as thresholds.
        # A divergence between the indicator and price is noteworthy. For example, if the indicator is rising while
        #   the price is falling or flat, the price could start rising.
        ta_picked['mfi_14_div100'] = ta.mfi_14 / 100

        # The Chande momentum oscillator is a technical momentum indicator similar to other momentum indicators
        #   such as Wilder’s Relative Strength Index (Wilder’s RSI) and the Stochastic Oscillator.
        #   It measures momentum on both up and down days and does not smooth results, triggering more frequent
        #   oversold and overbought penetrations. The indicator oscillates between +100 and -100.
        # Many technical traders add a 10-period moving average to this oscillator to act as a signal line.
        #   The oscillator generates a bullish signal when it crosses above the moving average and a
        #   bearish signal when it drops below the moving average.
        ta_picked['cmo_14_div100'] = ta.cmo_14 / 100
        signal_line = builder.simple_moving_average(ta.cmo_14, 10)
        ta_picked['cmo_14_signal'] = signal_line
        ta_picked['cmo_14_diff_signal'] = (ta.cmo_14 - signal_line) / 100
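        # For reference, CMO = 100 * (sum_up - sum_down) / (sum_up + sum_down)
        # over the lookback window, hence the natural [-1, 1] range after /100.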

        # On-balance volume (OBV) is a technical momentum indicator that uses volume flow to predict changes in stock price.
        # The theory is that volume precedes price: eventually, volume drives the price upward; at that point,
        # larger investors begin to sell, and smaller investors begin buying.
        # Despite being plotted on a price chart and measured numerically,
        # the actual individual quantitative value of OBV is not relevant.
        # The indicator itself is cumulative and anchored to an arbitrary starting point,
        # meaning the absolute value of OBV depends on the start date.
        # Instead, traders and analysts look to the nature of OBV movements over time;
        # the slope of the OBV line carries all of the weight of analysis. => We want percent change
        ta_picked['obv_pct'] = ta.obv.pct_change()
        ta_picked['obv_mean3_pct'] = ta.obv.rolling(3).mean().pct_change()
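        # For reference, OBV is a running total of signed volume:
        #   obv_t = obv_{t-1} + volume_t * sign(close_t - close_{t-1})
        # so only its slope (the pct_change above) is meaningful.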

        # Strong rallies in price should see the force index rise.
        # During pullbacks and sideways movements, the force index will often fall because the volume
        # and/or the size of the price moves gets smaller.
        # => Encoding the percent variation could be a good idea
        ta_picked['fi_13_pct'] = ta.fi_13.pct_change()
        ta_picked['fi_50_pct'] = ta.fi_50.pct_change()

        # The Aroon Oscillator is a trend-following indicator that uses aspects of the
        # Aroon Indicator (Aroon Up and Aroon Down) to gauge the strength of a current trend
        # and the likelihood that it will continue.
        # It moves between -100 and 100. A high oscillator value is an indication of an uptrend
        # while a low oscillator value is an indication of a downtrend.
        ta_picked['ao_14'] = ta.ao_14 / 100

        # The average true range (ATR) is a technical analysis indicator that measures market volatility
        #   by decomposing the entire range of an asset price for that period.
        # ATRP expresses the ATR as a percentage of the closing price, so
        # volatility is comparable across different price levels.
        ta_picked['atrp_14'] = ta.atrp_14

        # Percentage Volume Oscillator (PVO) is momentum volume oscillator used in technical analysis
        #   to evaluate and measure volume surges and to compare trading volume to the average longer-term volume.
        # PVO does not analyze price and it is based solely on volume.
        #  It compares fast and slow volume moving averages by showing how short-term volume differs from
        #  the average volume over longer-term.
        #  Since it does not take price into account in its calculation (only volume data are used),
        #  this technical indicator cannot be used alone to predict changes in a trend.
        ta_picked['pvo_12_26'] = ta.pvo_12_26
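        # For reference, PVO = 100 * (EMA_12(volume) - EMA_26(volume)) / EMA_26(volume).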

        # IGNORED: tsi, wd, adx,

        #lagged_stats = pd.concat([ohlcv_stats] + [builder.make_lagged(ohlcv_stats, i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner')

        # Build the dataframe with base features
        # lagged_close = pd.concat([ohlcv.close.pct_change()] + [builder.make_lagged(ohlcv.close.pct_change(), i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner')
        # lagged_close.columns = ['close_pct'] + ['close_pct_lag-{}'.format(i) for i in range(1, W +1)]

        ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']].pct_change()
        ohlc.columns = ['{}_pct'.format(c) for c in ohlc.columns]
        lagged_ohlc_pct = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')

        _time = pd.DataFrame(index=ohlcv.index)
        _time['day_of_year'] = ohlcv.index.dayofyear
        _time['day_of_week'] = ohlcv.index.dayofweek

        ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']]
        x_space = np.linspace(0, ohlc.index.size, ohlc.index.size)
        _splines = pd.DataFrame(index=ohlcv.index)

        # Highly correlated between themselves, no use
        # _splines['open_spl'] = get_spline(ohlc.open, 0)
        # _splines['high_spl'] = get_spline(ohlc.high, 0)
        # _splines['low_spl'] = get_spline(ohlc.low, 0)
        # _splines['close_spl'] = get_spline(ohlc.close, 0)

        _splines['open_spl_d1'] = builder.get_spline(ohlc.open, 1)
        _splines['high_spl_d1'] = builder.get_spline(ohlc.high, 1)
        _splines['low_spl_d1'] = builder.get_spline(ohlc.low, 1)
        _splines['close_spl_d1'] = builder.get_spline(ohlc.close, 1)

        _splines['open_spl_d2'] = builder.get_spline(ohlc.open, 2)
        _splines['high_spl_d2'] = builder.get_spline(ohlc.high, 2)
        _splines['low_spl_d2'] = builder.get_spline(ohlc.low, 2)
        _splines['close_spl_d2'] = builder.get_spline(ohlc.close, 2)
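        # builder.get_spline is assumed to fit a smoothing spline over the series
        # and evaluate its d-th derivative on x_space, e.g. with scipy
        # (hypothetical sketch):
        #   from scipy.interpolate import UnivariateSpline
        #   spl = UnivariateSpline(x_space, ohlc.close.values, k=3)
        #   close_spl_d2 = spl.derivative(2)(x_space)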

        _patterns = builder.get_talib_patterns(ohlcv)
        _new_features = pd.DataFrame(index=ohlcv.index)
        _new_features['candlestick_patterns_mean'] = _patterns.mean(axis=1)
        _new_features['candlestick_patterns_sum'] = _patterns.sum(axis=1)
        # WE LIKE THESE TWO!!!!
        _new_features['close_volatility_7d'] = ohlcv.close.pct_change().rolling(7).std(ddof=0)
        _new_features['close_volatility_30d'] = ohlcv.close.pct_change().rolling(30).std(ddof=0)
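        # Note: these are rolling population standard deviations (ddof=0) of
        # daily returns, i.e. realized volatility over 7- and 30-day windows.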
        # Candle body size variation, for example:
        _new_features['close_open_pct'] = (ohlcv.close - ohlcv.open).pct_change()  # Change in body of the candle (> 0 if candle is green)
        _new_features['high_close_dist_pct'] = (ohlcv.high - ohlcv.close).pct_change()  # Change in wick size of the candle; a shorter wick should be bullish
        _new_features['low_close_dist_pct'] = (ohlcv.close - ohlcv.low).pct_change()  # Change in shadow size of the candle; an increase would indicate support (maybe a bounce)
        _new_features['high_low_dist_pct'] = (ohlcv.high - ohlcv.low).pct_change()  # Change in total candle size; smaller candles stand for lower volatility

        for d in [3, 7, 30]:
            ohlcv_d[d].columns = ['close', 'high', 'low', 'open', 'volume']
            _new_features['close_open_pct_d{}'.format(d)] = (
                ohlcv_d[d].close - ohlcv_d[d].open).pct_change()
            _new_features['high_close_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].high - ohlcv_d[d].close).pct_change()
            _new_features['low_close_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].close - ohlcv_d[d].low).pct_change()
            _new_features['high_low_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].high - ohlcv_d[d].low).pct_change()

        _ta_windowed_features = pd.concat(
            [v.rename(columns={c: '{}_ta{}d'.format(c, d) for c in v.columns})
             for d, v in ta_d.items()],
            axis='columns')
        # Add lagged features to the dataframe
        ta.columns = ['{}_ta1d'.format(c) for c in ta.columns]
        feature_groups = [
            _new_features, _splines, lagged_ohlc_pct, cm_picked, ta_picked,
            _ta_windowed_features, ta
        ]

        improved_df = pd.concat(feature_groups,
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')

        # Drop the first 30 rows
        improved_df = improved_df[30:]
        # Drop columns whose values are all nan or inf
        with pd.option_context('mode.use_inf_as_na', True):  # Set option temporarily
            improved_df = improved_df.dropna(axis='columns', how='all')
        logger.info('Saving {}'.format(_sym))
        for c in improved_df.columns:
            # session, group, symbol, name, series
            s = sessionFactory()
            add_feature(s, 'dbfeaturetest', _sym, c, improved_df[c])
            s.commit()
        #save_symbol_dataset(dest_index, _sym, improved_df, target=_target)
        logger.info('Saved {}'.format(_sym))

Example #3
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)
    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)

        ta = _df[entry['features']['ta']]
        cm = _df[entry['features']['cm']]

        # Price history facet (Daily variation of ohlc in last W trading days)
        ohlc = _df.loc[:, ['open', 'high', 'low', 'close']]
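        # STL (presumably statsmodels.tsa.seasonal.STL) decomposes each series
        # into trend + seasonal + residual; keeping only the residual strips
        # trend and seasonality from the price-history facet.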
        ohlc['open'] = STL(ohlc.open).fit().resid
        ohlc['high'] = STL(ohlc.high).fit().resid
        ohlc['low'] = STL(ohlc.low).fit().resid
        ohlc['close'] = STL(ohlc.close).fit().resid
        ohlc.columns = ['open_resid', 'high_resid', 'low_resid', 'close_resid']
        history_facet = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')
        # Price trend facet (REMA/RSMA, MACD, AO, ADX, WD+ - WD-)
        trend_facet = ta[[
            "rsma_5_20", "rsma_8_15", "rsma_20_50", "rema_5_20", "rema_8_15",
            "rema_20_50", "macd_12_26", "ao_14", "adx_14", "wd_14"
        ]]
        # Volatility facet (CMO, ATRp)
        volatility_facet = ta[["cmo_14", "atrp_14"]]
        # Volume facet (Volume pct, PVO, ADI, OBV)
        volume_facet = pd.concat(
            [_df.volume.pct_change().replace([np.inf, -np.inf], 0),
             ta[["pvo_12_26", "adi", "obv"]]],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')
        # On-chain facet
        cm_1 = cm.reindex(columns=[
            'adractcnt', 'txtfrvaladjntv', 'isstotntv', 'feetotntv', 'splycur',
            'hashrate', 'difficulty', 'txtfrcount'
        ]).pct_change()
        cm_2 = cm.reindex(columns=['isscontpctann'])
        chain_facet = pd.concat([cm_1, cm_2],
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')

        # Drop columns whose values are all nan or inf from each facet
        with pd.option_context('mode.use_inf_as_na', True):  # Set option temporarily
            history_facet = history_facet.dropna(axis='columns', how='all')
            trend_facet = trend_facet.dropna(axis='columns', how='all')
            volatility_facet = volatility_facet.dropna(axis='columns', how='all')
            volume_facet = volume_facet.dropna(axis='columns', how='all')
            chain_facet = chain_facet.dropna(axis='columns', how='all')

        improved_df = pd.concat([
            history_facet, trend_facet, volatility_facet, volume_facet,
            chain_facet
        ],
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')
        # Drop the first 30 rows
        #improved_df = improved_df[30:]

        # Add symbol to index
        feature_groups = {
            'price_history': list(history_facet.columns),
            'trend': list(trend_facet.columns),
            'volatility': list(volatility_facet.columns),
            'volume': list(volume_facet.columns),
            'chain': list(chain_facet.columns),
        }
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index,
                            _sym,
                            improved_df,
                            feature_groups=feature_groups,
                            target=_target)
        logger.info('Saved {}'.format(_sym))