def data_loader_hook(epoch):
    # Reload the data loader for this epoch when per-epoch preprocessed data is used
    if args.preprocessed_epoch_data:
        print('| epoch %d, Loading data:' % epoch)
        for key in ['train', 'valid']:
            # Load validation data only once
            if key == 'valid' and 'valid' in loaders:
                continue
            loaders[key] = get_data_loader(
                load_dataset(args, key, defense, epoch=epoch),
                batchsize=args.batchsize,
                device=args.device,
                shuffle=True,
            )
    # if data needs to be loaded only once and is not yet loaded
    elif len(loaders) == 0:
        print('| epoch %d, Loading data:' % epoch)
        for key in ['train', 'valid']:
            loaders[key] = get_data_loader(
                load_dataset(args, key, defense),
                batchsize=args.batchsize,
                device=args.device,
                shuffle=True,
            )
    return loaders['train']
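# Minimal usage sketch (hypothetical trainer loop, not part of the original code):
# the hook is invoked once per epoch and returns that epoch's training loader,
# while the shared `loaders` dict caches the validation loader across epochs.
# `start_epoch` and `num_epochs` are assumed names.
loaders = {}
for epoch in range(start_epoch, num_epochs):
    train_loader = data_loader_hook(epoch)
    for imgs, targets in train_loader:
        pass  # forward / backward pass goes here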
def create_faiss_patches(args):

    # load image dataset:
    print('| set up image loader...')
    image_dataset = load_dataset(args, 'train', None, with_transformation=False)
    image_dataset.imgs = image_dataset.imgs[:20000]  # we don't need all images

    # gather image patches:
    print('| gather image patches...')
    patches = gather_patches(
        image_dataset, args.num_patches,
        args.patch_size, patch_transform=None,
    )

    # build faiss index:
    print('| training faiss index...')
    faiss_index, sub_index = index_patches(patches, pca_dims=args.pca_dims)
    # NOTE: Keep reference to sub_index to prevent it from being GC'ed

    # save faiss index and patches:
    print('| writing faiss index to %s' % args.index_file)
    faiss.write_index(faiss_index, args.index_file)
    with open(args.patches_file, 'wb') as fwrite:
        print('| writing patches to %s' % args.patches_file)
        pickle.dump(patches, fwrite, pickle.HIGHEST_PROTOCOL)
    print('| done.')
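# A minimal sketch of what index_patches() might look like, assuming `patches`
# is an (N, ...) array/tensor of float patches: PCA-reduce the flattened patches
# and build an exact L2 faiss index in the reduced space. Returning the
# sub-index alongside the wrapper mirrors the GC note above: the Python-side
# IndexPreTransform does not keep the wrapped index alive on its own.
import faiss
import numpy as np

def index_patches_sketch(patches, pca_dims=64):
    data = np.ascontiguousarray(
        np.asarray(patches, dtype=np.float32).reshape(len(patches), -1))
    pca = faiss.PCAMatrix(data.shape[1], pca_dims)   # dimensionality reduction
    sub_index = faiss.IndexFlatL2(pca_dims)          # exact L2 search in PCA space
    faiss_index = faiss.IndexPreTransform(pca, sub_index)
    faiss_index.train(data)
    faiss_index.add(data)
    return faiss_index, sub_index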
def __init__(self, summary_writer, settings, data_dir=None, manual=False):
    """ Setup the values and variables for Training.
        Args:
          summary_writer: tf.summary.SummaryWriter object to write summaries for Tensorboard.
          settings: settings object for fetching data from config files.
          data_dir (default: None): path where the downloaded data should be stored / accessed.
          manual (default: False): boolean to represent if data_dir is a manual dir.
    """
    self.settings = settings
    self.summary_writer = summary_writer
    self.iterations = self.settings["iterations"]
    dataset_args = self.settings["dataset"]
    if not manual:
        self.dataset = dataset.load_dataset(
            dataset_args["name"],
            dataset.scale_down(
                method=dataset_args["scale_method"],
                dimension=dataset_args["hr_dimension"]),
            batch_size=settings["batch_size"],
            data_dir=data_dir)
    else:
        self.dataset = dataset.load_dataset_directory(
            dataset_args["name"],
            data_dir,
            dataset.scale_down(
                method=dataset_args["scale_method"],
                dimension=dataset_args["hr_dimension"]),
            batch_size=settings["batch_size"])
def generate_tf_record(
        data_dir,
        raw_data=False,
        tfrecord_path="serialized_dataset",
        num_shards=8):
    teacher_sett = settings.Settings(use_student_settings=False)
    student_sett = settings.Settings(use_student_settings=True)
    dataset_args = teacher_sett["dataset"]
    if dataset_args["name"].lower().strip() == "div2k":
        assert len(data_dir) == 2
        ds = dataset.load_div2k_dataset(
            data_dir[0],
            data_dir[1],
            student_sett["hr_size"],
            shuffle=True)
    elif raw_data:
        ds = dataset.load_dataset_directory(
            dataset_args["name"],
            data_dir,
            dataset.scale_down(
                method=dataset_args["scale_method"],
                size=student_sett["hr_size"]))
    else:
        ds = dataset.load_dataset(
            dataset_args["name"],
            dataset.scale_down(
                method=dataset_args["scale_method"],
                size=student_sett["hr_size"]),
            data_dir=data_dir)
    to_tfrecord(ds, tfrecord_path, num_shards)
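# A sketch of a to_tfrecord() serializer under the assumption that each dataset
# element is an (lr, hr) image pair; the real helper may store different keys
# or extra metadata. The filename pattern below is illustrative only.
import tensorflow as tf

def to_tfrecord_sketch(ds, tfrecord_path, num_shards):
    def _serialize(lr, hr):
        feature = {
            "lr": tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[tf.io.serialize_tensor(lr).numpy()])),
            "hr": tf.train.Feature(bytes_list=tf.train.BytesList(
                value=[tf.io.serialize_tensor(hr).numpy()])),
        }
        return tf.train.Example(
            features=tf.train.Features(feature=feature)).SerializeToString()

    # write one TFRecord file per shard
    for shard in range(num_shards):
        path = "%s_%05d-of-%05d.tfrecord" % (tfrecord_path, shard, num_shards)
        with tf.io.TFRecordWriter(path) as writer:
            for lr, hr in ds.shard(num_shards, shard):
                writer.write(_serialize(lr, hr))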
def __init__(
        self,
        summary_writer,
        summary_writer_2,
        settings,
        model_dir="",
        data_dir=None,
        manual=False,
        strategy=None):
    """ Setup the values and variables for Training.
        Args:
          summary_writer: tf.summary.SummaryWriter object to write summaries for Tensorboard.
          summary_writer_2: second tf.summary.SummaryWriter object for a separate set of summaries.
          settings: settings object for fetching data from config files.
          model_dir (default: ""): directory where model checkpoints are stored.
          data_dir (default: None): path where the downloaded data should be stored / accessed.
          manual (default: False): boolean to represent if data_dir is a manual dir.
          strategy (default: None): tf.distribute.Strategy object; if given, data is read
              from TFRecords and distributed across replicas.
    """
    self.settings = settings
    self.model_dir = model_dir
    self.summary_writer = summary_writer
    self.summary_writer_2 = summary_writer_2
    self.strategy = strategy
    dataset_args = self.settings["dataset"]
    augment_dataset = dataset.augment_image(saturation=None)
    self.batch_size = self.settings["batch_size"]
    hr_size = tf.convert_to_tensor(
        [dataset_args["hr_dimension"], dataset_args["hr_dimension"], 3])
    lr_size = tf.cast(hr_size, tf.float32) * \
        tf.convert_to_tensor([1 / 4, 1 / 4, 1], tf.float32)
    lr_size = tf.cast(lr_size, tf.int32)
    if isinstance(strategy, tf.distribute.Strategy):
        self.dataset = (
            dataset.load_tfrecord_dataset(
                tfrecord_path=data_dir,
                lr_size=lr_size,
                hr_size=hr_size)
            .repeat()
            .map(augment_dataset)
            .batch(self.batch_size, drop_remainder=True))
        self.dataset = iter(
            strategy.experimental_distribute_dataset(self.dataset))
    else:
        if not manual:
            self.dataset = iter(
                dataset.load_dataset(
                    dataset_args["name"],
                    dataset.scale_down(
                        method=dataset_args["scale_method"],
                        dimension=dataset_args["hr_dimension"]),
                    batch_size=settings["batch_size"],
                    data_dir=data_dir,
                    augment=True,
                    shuffle=True))
        else:
            self.dataset = iter(
                dataset.load_dataset_directory(
                    dataset_args["name"],
                    data_dir,
                    dataset.scale_down(
                        method=dataset_args["scale_method"],
                        dimension=dataset_args["hr_dimension"]),
                    batch_size=settings["batch_size"],
                    augment=True,
                    shuffle=True))
def _load_partial_dataset(args, data_type, defense, adv_params):
    start_idx, end_idx = _get_start_end_index(args)
    data_indices = {'start_idx': start_idx, 'end_idx': end_idx}
    dataset = load_dataset(args, data_type, defense, adv_params,
                           data_indices=data_indices)
    return dataset
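# Hypothetical sketch of the _get_start_end_index() helper: it is assumed to
# slice the dataset into contiguous chunks from args.partition and
# args.partition_size (those argument names appear elsewhere in this file,
# but their use here is an assumption, not confirmed by the original module).
def _get_start_end_index_sketch(args):
    start_idx = args.partition * args.partition_size
    end_idx = (args.partition + 1) * args.partition_size
    return start_idx, end_idx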
def generate_transformed_images(args):
    # Only runs one method at a time
    assert args.operation is not None, \
        "operation to run can't be None"
    assert OperationType.has_value(args.operation), \
        "\"{}\" operation not defined".format(args.operation)
    assert args.defenses is not None, "Defenses can't be None"
    assert not args.preprocessed_data, \
        "Trying to apply transformations on already transformed images"

    if args.operation == str(OperationType.TRANSFORM_ADVERSARIAL):
        for idx, defense_name in enumerate(args.defenses):
            defense = get_defense(defense_name, args)
            adv_params = constants.get_adv_params(args, idx)
            print("| adv_params: ", adv_params)
            dataset = _load_partial_dataset(args, 'valid', defense, adv_params)
            if args.data_batches is None:
                transformation_on_adv(args, dataset, defense_name, adv_params)
            else:
                for i in range(args.data_batches):
                    transformation_on_adv(args, dataset, defense_name,
                                          adv_params, data_batch_idx=i)

    elif args.operation == str(OperationType.CAT_DATA):
        for idx, defense_name in enumerate(args.defenses):
            adv_params = constants.get_adv_params(args, idx)
            print("| adv_params: ", adv_params)
            if args.data_batches is None:
                concatenate_data(args, defense_name, adv_params)
            else:
                for i in range(args.data_batches):
                    concatenate_data(args, defense_name, adv_params,
                                     data_batch_idx=i)

    elif args.operation == str(OperationType.TRANSFORM_RAW):
        start_class_idx = args.partition * args.partition_size
        end_class_idx = (args.partition + 1) * args.partition_size
        class_indices = range(start_class_idx, end_class_idx)
        for defense_name in args.defenses:
            defense = get_defense(defense_name, args)
            data_type = args.data_type if args.data_type == "train" else "valid"
            dataset = load_dataset(args, data_type, defense,
                                   class_indices=class_indices)
            transformation_on_raw(args, dataset, defense_name)
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)
    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'], sep=',', encoding='utf-8',
                          index_col='Date', parse_dates=True)
        _target = pd.read_csv(entry['target_csv'], sep=',', encoding='utf-8',
                              index_col='Date', parse_dates=True)
        ohlcv = _df[entry['features']['ohlcv']]
        ta = _df[entry['features']['ta']]

        # Build the dataframe with base features
        ohlc = ohlcv[['open', 'high', 'low', 'close']]
        lagged_ohlc = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns', verify_integrity=True, sort=True, join='inner')

        # Add lagged features to the dataframe
        atsa_df = pd.concat([lagged_ohlc, ta], axis='columns',
                            verify_integrity=True, sort=True, join='inner')

        # Drop the first 30 rows
        # atsa_df = atsa_df[30:]
        # decompose_dataframe_features('all_merged', _sym + '_improved', unlagged_df)

        # Add symbol to index
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index, _sym, atsa_df, target=_target)
        logger.info('Saved {}'.format(_sym))
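# A minimal sketch of what builder.make_lagged() is assumed to do: shift the
# frame by `periods` rows and suffix the column names so the lagged copies can
# sit next to the originals. The "_lag-{n}" naming follows the commented
# column names used later in this file; the exact implementation may differ.
import pandas as pd

def make_lagged_sketch(df: pd.DataFrame, periods: int) -> pd.DataFrame:
    lagged = df.shift(periods)
    lagged.columns = ['{}_lag-{}'.format(c, periods) for c in df.columns]
    return lagged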
def create_faiss_patches(args):

    # load image dataset:
    print('| set up image loader...')
    image_dataset = load_dataset(args, 'train', None, with_transformation=True)
    image_dataset.imgs = image_dataset.imgs[:20000]  # we don't need all images

    # gather image patches:
    print('| gather image patches...')
    patches = gather_patches(
        image_dataset, args.num_patches,
        args.quilting_patch_size, patch_transform=None,
    )

    # build faiss index:
    print('| training faiss index...')
    index_patches(patches, args.index_file, pca_dims=args.pca_dims)

    # save patches:
    with open(args.patches_file, 'wb') as fwrite:
        print('| writing patches to %s' % args.patches_file)
        pickle.dump(patches, fwrite, pickle.HIGHEST_PROTOCOL)
def build(source_index, dest_index, W=10): _dataset = load_dataset(source_index, return_index=True) sessionFactory = connect('test_features') for _sym, entry in _dataset.items(): _df = pd.read_csv(entry['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True) _target = pd.read_csv(entry['target_csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True) ohlcv = _df[entry['features']['ohlcv']] ohlcv_d = { d: _df[entry['features']['ohlcv_{}d'.format(d)]] for d in [3, 7, 30] } ta_d = { d: _df[entry['features']['ta_{}d'.format(d)]] for d in [3, 7, 30] } ta = _df[entry['features']['ta']] cm = _df[entry['features']['cm']] cm_picked = pd.DataFrame(index=ohlcv.index) if 'adractcnt' in cm.columns: cm_picked['adractcnt_pct'] = cm.adractcnt.pct_change() # cm_picked['adractcnt_mean3_pct'] = cm.adractcnt.rolling(3).mean().pct_change() # cm_picked['adractcnt_mean7_pct'] = cm.adractcnt.rolling(7).mean().pct_change() # if 'splycur' in cm.columns: ## Correlated with volume and close # cm_picked['vol_supply'] = ohlcv.volume / cm.splycur # Ratio between transacted volume and total supply (mined) if 'txtfrvaladjntv' in cm.columns and 'isstotntv' in cm.columns and 'feetotntv' in cm.columns: # I want to represent miners earnings (fees + issued coins) vs amount transacted in that interval cm_picked['earned_vs_transacted'] = ( cm.isstotntv + cm.feetotntv) / cm.txtfrvaladjntv if 'isstotntv' in cm.columns: # isstotntv is total number of coins mined in the time interval # splycur is total number of coins mined (all time) total_mined = cm.isstotntv.rolling( 365, min_periods=7).sum() # total mined in a year cm_picked['isstot365_isstot1_pct'] = (cm.isstotntv / total_mined).pct_change() if 'splycur' in cm.columns and 'isstotntv' in cm.columns: cm_picked['splycur_isstot1_pct'] = (cm.isstotntv / cm.splycur).pct_change() if 'hashrate' in cm.columns: #cm_picked['hashrate_mean3_pct'] = cm.hashrate.rolling(3).mean().pct_change() #cm_picked['hashrate_mean7_pct'] = cm.hashrate.rolling(7).mean().pct_change() cm_picked['hashrate_pct'] = cm.hashrate.pct_change() if 'roi30d' in cm.columns: cm_picked['roi30d'] = cm.roi30d if 'isstotntv' in cm.columns: cm_picked['isstotntv_pct'] = cm.isstotntv.pct_change() if 'feetotntv' in cm.columns: cm_picked['feetotntv_pct'] = cm.feetotntv.pct_change() if 'txtfrcount' in cm.columns: cm_picked['txtfrcount_pct'] = cm.txtfrcount.pct_change() #cm_picked['txtfrcount_volume'] = cm.txtfrcount.pct_change() if 'vtydayret30d' in cm.columns: cm_picked['vtydayret30d'] = cm.vtydayret30d if 'isscontpctann' in cm.columns: cm_picked['isscontpctann'] = cm.isscontpctann ta_picked = pd.DataFrame(index=ta.index) # REMA / RSMA are already used and well-estabilished in ATSA, # I'm taking the pct change since i want to encode the relative movement of the ema's not their positions # ta_picked['rema_5_20_pct'] = ta.rema_5_20.pct_change() ta_picked['rema_8_15_pct'] = ta.rema_8_15.pct_change() # ta_picked['rema_20_50_pct'] = ta.rema_20_50.pct_change() # ta_picked['rsma_5_20_pct'] = ta.rema_5_20.pct_change() ta_picked['rsma_8_15_pct'] = ta.rema_8_15.pct_change() # ta_picked['rsma_20_50_pct'] = ta.rema_20_50.pct_change() # Stoch is a momentum indicator comparing a particular closing price of a security to a range of its prices # over a certain period of time. # The sensitivity of the oscillator to market movements is reducible by adjusting that time period or # by taking a moving average of the result. 
# It is used to generate overbought and oversold trading signals, utilizing a 0-100 bounded range of values. # IDEA => decrease sensitivity by 3-mean and divide by 100 to get fp values ta_picked['stoch_14_mean3_div100'] = ta.stoch_14.rolling( 3).mean() / 100 #Moving Average Convergence Divergence (MACD) is a trend-following momentum indicator that shows # the relationship between two moving averages of a security’s price. # The MACD is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA. # A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line, # which can function as a trigger for buy and sell signals. # Traders may buy the security when the MACD crosses above its signal line and sell - or short - the security # when the MACD crosses below the signal line. # Moving Average Convergence Divergence (MACD) indicators can be interpreted in several ways, # but the more common methods are crossovers, divergences, and rapid rises/falls. signal_line = builder.exponential_moving_average(ta.macd_12_26, 9) ta_picked[ 'macd_12_26_signal'] = signal_line # Relationship with signal line ta_picked['macd_12_26_diff_signal'] = ( ta.macd_12_26 - signal_line).pct_change() # Relationship with signal line ta_picked['macd_12_26_pct'] = ta.macd_12_26.pct_change( ) # Information about slope # PPO is identical to the moving average convergence divergence (MACD) indicator, # except the PPO measures percentage difference between two EMAs, while the MACD measures absolute (dollar) difference. signal_line = builder.exponential_moving_average(ta.ppo_12_26, 9) ta_picked[ 'ppo_12_26_signal'] = signal_line # Relationship with signal line ta_picked['ppo_12_26_diff_signal'] = ( ta.ppo_12_26 - signal_line).pct_change() # Relationship with signal line ta_picked['ppo_12_26_pct'] = ta.ppo_12_26.pct_change( ) # Information about slope # ADI Accumulation/distribution is a cumulative indicator that uses volume and price to assess whether # a stock is being accumulated or distributed. # The accumulation/distribution measure seeks to identify divergences between the stock price and volume flow. # This provides insight into how strong a trend is. If the price is rising but the indicator is falling # this indicates that buying or accumulation volume may not be enough to support # the price rise and a price decline could be forthcoming. # ==> IDEA: if we can fit a line to the price y1 = m1X+q1 and a line to ADI y2=m2X+q2 then we can identify # divergences by simply looking at the sign of M. # Another insight would be given by the slope (ie pct_change) ta_picked['adi_pct'] = ta.adi.pct_change() ta_picked['adi_close_convergence'] = convergence_between_series( ta.adi, ohlcv.close, 3) # RSI goes from 0 to 100, values <= 20 mean BUY, while values >= 80 mean SELL. # Dividing it by 100 to get a floating point feature, makes no sense to pct_change it ta_picked['rsi_14_div100'] = ta.rsi_14 / 100 # The Money Flow Index (MFI) is a technical indicator that generates overbought or oversold # signals using both prices and volume data. The oscillator moves between 0 and 100. # An MFI reading above 80 is considered overbought and an MFI reading below 20 is considered oversold, # although levels of 90 and 10 are also used as thresholds. # A divergence between the indicator and price is noteworthy. For example, if the indicator is rising while # the price is falling or flat, the price could start rising. 
ta_picked['mfi_14_div100'] = ta.mfi_14 / 100 # The Chande momentum oscillator is a technical momentum indicator similar to other momentum indicators # such as Wilder’s Relative Strength Index (Wilder’s RSI) and the Stochastic Oscillator. # It measures momentum on both up and down days and does not smooth results, triggering more frequent # oversold and overbought penetrations. The indicator oscillates between +100 and -100. # Many technical traders add a 10-period moving average to this oscillator to act as a signal line. # The oscillator generates a bullish signal when it crosses above the moving average and a # bearish signal when it drops below the moving average. ta_picked['cmo_14_div100'] = ta.cmo_14 / 100 signal_line = builder.simple_moving_average(ta.cmo_14, 10) ta_picked['cmo_14_signal'] = signal_line ta_picked['cmo_14_diff_signal'] = (ta.cmo_14 - signal_line) / 100 # On-balance volume (OBV) is a technical trading momentum indicator that uses volume flow to predict changes in stock price. # Eventually, volume drives the price upward. At that point, larger investors begin to sell, and smaller investors begin buying. # Despite being plotted on a price chart and measured numerically, # the actual individual quantitative value of OBV is not relevant. # The indicator itself is cumulative, while the time interval remains fixed by a dedicated starting point, # meaning the real number value of OBV arbitrarily depends on the start date. # Instead, traders and analysts look to the nature of OBV movements over time; # the slope of the OBV line carries all of the weight of analysis. => We want percent change ta_picked['obv_pct'] = ta.obv.pct_change() ta_picked['obv_mean3_pct'] = ta.obv.rolling(3).mean().pct_change() # Strong rallies in price should see the force index rise. # During pullbacks and sideways movements, the force index will often fall because the volume # and/or the size of the price moves gets smaller. # => Encoding the percent variation could be a good idea ta_picked['fi_13_pct'] = ta.fi_13.pct_change() ta_picked['fi_50_pct'] = ta.fi_50.pct_change() # The Aroon Oscillator is a trend-following indicator that uses aspects of the # Aroon Indicator (Aroon Up and Aroon Down) to gauge the strength of a current trend # and the likelihood that it will continue. # It moves between -100 and 100. A high oscillator value is an indication of an uptrend # while a low oscillator value is an indication of a downtrend. ta_picked['ao_14'] = ta.ao_14 / 100 # The average true range (ATR) is a technical analysis indicator that measures market volatility # by decomposing the entire range of an asset price for that period. # ATRP is pct_change of volatility ta_picked['atrp_14'] = ta.atrp_14 # Percentage Volume Oscillator (PVO) is momentum volume oscillator used in technical analysis # to evaluate and measure volume surges and to compare trading volume to the average longer-term volume. # PVO does not analyze price and it is based solely on volume. # It compares fast and slow volume moving averages by showing how short-term volume differs from # the average volume over longer-term. # Since it does not care a trend's factor in its calculation (only volume data are used) # this technical indicator cannot be used alone to predict changes in a trend. 
ta_picked['pvo_12_26'] = ta.pvo_12_26 # IGNORED: tsi, wd, adx, #lagged_stats = pd.concat([ohlcv_stats] + [builder.make_lagged(ohlcv_stats, i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner') # Build the dataframe with base features # lagged_close = pd.concat([ohlcv.close.pct_change()] + [builder.make_lagged(ohlcv.close.pct_change(), i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner') # lagged_close.columns = ['close_pct'] + ['close_pct_lag-{}'.format(i) for i in range(1, W +1)] ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']].pct_change() ohlc.columns = ['{}_pct'.format(c) for c in ohlcv.columns] lagged_ohlc_pct = pd.concat( [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)], axis='columns', verify_integrity=True, sort=True, join='inner') _time = pd.DataFrame(index=ohlcv.index) _time['day_of_year'] = ohlcv.index.dayofyear _time['day_of_week'] = ohlcv.index.dayofweek ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']] x_space = np.linspace(0, ohlc.index.size, ohlc.index.size) _splines = pd.DataFrame(index=ohlcv.index) # Highly correlated between themselves, no use # _splines['open_spl'] = get_spline(ohlc.open, 0) # _splines['high_spl'] = get_spline(ohlc.high, 0) # _splines['low_spl'] = get_spline(ohlc.low, 0) # _splines['close_spl'] = get_spline(ohlc.close, 0) _splines['open_spl_d1'] = builder.get_spline(ohlc.open, 1) _splines['high_spl_d1'] = builder.get_spline(ohlc.high, 1) _splines['low_spl_d1'] = builder.get_spline(ohlc.low, 1) _splines['close_spl_d1'] = builder.get_spline(ohlc.close, 1) _splines['open_spl_d2'] = builder.get_spline(ohlc.open, 2) _splines['high_spl_d2'] = builder.get_spline(ohlc.high, 2) _splines['low_spl_d2'] = builder.get_spline(ohlc.low, 2) _splines['close_spl_d2'] = builder.get_spline(ohlc.close, 2) _patterns = builder.get_talib_patterns(ohlcv) _new_features = pd.DataFrame(index=ohlcv.index) _new_features['candlestick_patterns_mean'] = _patterns.mean(axis=1) _new_features['candlestick_patterns_sum'] = _patterns.sum(axis=1) # WE LIKE THESE TWO!!!! 
_new_features['close_volatility_7d'] = ohlcv.close.pct_change( ).rolling(7).std(ddof=0) _new_features['close_volatility_30d'] = ohlcv.close.pct_change( ).rolling(30).std(ddof=0) # # Candle body size variation, for example _new_features['close_open_pct'] = ( ohlcv.close - ohlcv.open ).pct_change() # Change in body of the candle (> 0 if candle is green) _new_features['high_close_dist_pct'] = ( ohlcv.high - ohlcv.close ).pct_change( ) # Change in wick size of the candle, shorter wick should be bullish _new_features['low_close_dist_pct'] = ( ohlcv.close - ohlcv.low ).pct_change( ) # Change in shadow size of the candle, this increasing would indicate support (maybe a bounce) _new_features['high_low_dist_pct'] = ( ohlcv.high - ohlcv.low ).pct_change( ) # Change in total candle size, smaller candles stands for low volatility for d in [3, 7, 30]: ohlcv_d[d].columns = ['close', 'high', 'low', 'open', 'volume'] _new_features['close_open_pct_d{}'.format(d)] = ( ohlcv_d[d].close - ohlcv_d[d].open).pct_change() _new_features['high_close_dist_pct_d{}'.format(d)] = ( ohlcv_d[d].high - ohlcv_d[d].close).pct_change() _new_features['low_close_dist_pct_d{}'.format(d)] = ( ohlcv_d[d].close - ohlcv_d[d].low).pct_change() _new_features['high_low_dist_pct_d{}'.format(d)] = ( ohlcv_d[d].high - ohlcv_d[d].low).pct_change() _ta_windowed_features = pd.concat([ v.rename(columns={c: '{}_ta{}d'.format(c, d) for c in v.columns}) for d, v in ta_d.items() ], axis=1) # Add lagged features to the dataframe ta.columns = ['{}_ta1d'.format(c) for c in ta.columns] feature_groups = [ _new_features, _splines, lagged_ohlc_pct, cm_picked, ta_picked, _ta_windowed_features, ta ] improved_df = pd.concat(feature_groups, axis='columns', verify_integrity=True, sort=True, join='inner') # Drop the first 30 rows improved_df = improved_df[30:] # Drop columns whose values are all nan or inf with pd.option_context('mode.use_inf_as_na', True): # Set option temporarily improved_df = improved_df.dropna(axis='columns', how='all') logger.info('Saving {}'.format(_sym)) for c in improved_df.columns: # session, group, symbol, name, series s = sessionFactory() add_feature(s, 'dbfeaturetest', _sym, c, improved_df[c]) s.commit() #save_symbol_dataset(dest_index, _sym, improved_df, target=_target) logger.info('Saved {}'.format(_sym))
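# A sketch of the convergence_between_series() helper used for the ADI feature
# above, following the idea stated in that comment: fit a line to each series
# over a rolling window and compare the signs of the two slopes. Function and
# parameter names are assumptions, not taken from the original builder module.
import numpy as np
import pandas as pd

def convergence_between_series_sketch(s1: pd.Series, s2: pd.Series, window: int) -> pd.Series:
    def slope(values):
        x = np.arange(len(values))
        m, _ = np.polyfit(x, values, 1)
        return m

    m1 = s1.rolling(window).apply(slope, raw=True)
    m2 = s2.rolling(window).apply(slope, raw=True)
    # 1 where both series trend in the same direction, 0 where they diverge
    return (np.sign(m1) == np.sign(m2)).astype(int)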
def generate_adversarial_images(args): # assertions assert args.adversary_to_generate is not None, \ "adversary_to_generate can't be None" assert AdversaryType.has_value(args.adversary_to_generate), \ "\"{}\" adversary_to_generate not defined".format(args.adversary_to_generate) defense_name = None if not args.defenses else args.defenses[0] # defense = get_defense(defense_name, args) data_indices = _get_data_indices(args) data_type = args.data_type if args.data_type == "train" else "valid" dataset = load_dataset(args, data_type, None, data_indices=data_indices) data_loader = get_data_loader( dataset, batchsize=args.batchsize, device=args.device, shuffle=False) model, _, _ = get_model(args, load_checkpoint=True, defense_name=defense_name) adv_params = constants.get_adv_params(args) print('| adv_params:', adv_params) status = None all_inputs = None all_outputs = None all_targets = None bar = progressbar.ProgressBar(len(data_loader)) bar.start() for batch_num, (imgs, targets) in enumerate(data_loader): if args.adversary_to_generate == str(AdversaryType.DEEPFOOL): assert adv_params['learning_rate'] is not None s, r = adversary.deepfool( model, imgs, targets, args.data_params['NUM_CLASSES'], train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter, step_size=adv_params['learning_rate'], batch_size=args.batchsize, labels=dataset.get_classes()) elif args.adversary_to_generate == str(AdversaryType.FGS): s, r = adversary.fgs( model, imgs, targets, train_mode=(args.data_type == 'train'), mode=args.fgs_mode) elif args.adversary_to_generate == str(AdversaryType.IFGS): assert adv_params['learning_rate'] is not None s, r = adversary.ifgs( model, imgs, targets, train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter, step_size=adv_params['learning_rate'], mode=args.fgs_mode) elif args.adversary_to_generate == str(AdversaryType.CWL2): assert args.adv_strength is not None and len(args.adv_strength) == 1 if len(args.crop_frac) == 1: crop_frac = args.crop_frac[0] else: crop_frac = 1.0 s, r = adversary.cw( model, imgs, targets, args.adv_strength[0], 'l2', tv_weight=args.tvm_weight, train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter, drop_rate=args.pixel_drop_rate, crop_frac=crop_frac, kappa=args.margin) elif args.adversary_to_generate == str(AdversaryType.CWLINF): assert args.adv_strength is not None and len(args.adv_strength) == 1 s, r = adversary.cw( model, imgs, targets, args.adv_strength[0], 'linf', bound=args.adv_bound, tv_weight=args.tvm_weight, train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter, drop_rate=args.pixel_drop_rate, crop_frac=args.crop_frac, kappa=args.margin) if status is None: status = s.clone() all_inputs = imgs.clone() all_outputs = imgs + r all_targets = targets.clone() else: status = torch.cat((status, s), 0) all_inputs = torch.cat((all_inputs, imgs), 0) all_outputs = torch.cat((all_outputs, imgs + r), 0) all_targets = torch.cat((all_targets, targets), 0) bar.update(batch_num) print("| computing adversarial stats...") if args.compute_stats: rb, ssim, sc = adversary.compute_stats(all_inputs, all_outputs, status) print('| average robustness = ' + str(rb)) print('| average SSIM = ' + str(ssim)) print('| success rate = ' + str(sc)) # Unnormalize before saving unnormalize = Unnormalize(args.data_params['MEAN_STD']['MEAN'], args.data_params['MEAN_STD']['STD']) all_inputs = unnormalize(all_inputs) all_outputs = unnormalize(all_outputs) # save output output_file = get_adversarial_file_path( args, args.adversarial_root, defense_name, 
adv_params, data_indices['end_idx'], start_idx=data_indices['start_idx'], with_defense=False) print("| Saving adversarial data at " + output_file) if not os.path.isdir(args.adversarial_root): os.makedirs(args.adversarial_root) torch.save({'status': status, 'all_inputs': all_inputs, 'all_outputs': all_outputs, 'all_targets': all_targets}, output_file)
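# Usage sketch (not part of the original code): the artifact written above can
# be read back with torch.load; the key names follow the dict saved in
# generate_adversarial_images, and `output_file` is the path produced above.
import torch

adv = torch.load(output_file)
adv_images = adv['all_outputs']    # adversarial images (unnormalized)
clean_images = adv['all_inputs']   # original images (unnormalized)
labels = adv['all_targets']
attack_status = adv['status']      # per-image status returned by the adversary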
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5,
                scoring='accuracy', n_jobs='auto', test_size=0.3,
                use_target=None, expanding_window=False):
    # Define log file path for this run
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)
    os.makedirs('./results/{}_{}_{}'.format(dataset, pipeline, experiment),
                exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    # Parameter grid argument:
    # - If None, use the pipeline-defined grid
    # - If string, treat it as a path to a JSON file and parse it
    # - If dict, use it as-is (do nothing)
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)
    # Target argument
    # Determines the target feature name (the system supports different classification targets).
    # If not supplied, use the pipeline-defined target.
    current_target = p.TARGET if not use_target else use_target
    logger.info('Start processing: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in dataset_index.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            # ToDo: use lib.dataset.features.load_symbol instead of manually reading csv's
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                                  index_col='Date', parse_dates=True)
            # Drop columns whose values are all NaN, then replace infinity values
            # with NaN so that they can later be imputed to a finite value in the
            # pipeline's "Imputing" stage
            features = features.dropna(axis='columns', how='all') \
                               .replace([np.inf, -np.inf], np.nan)
            target = targets.loc[features.index][current_target]

            # Split available data in train and test set.
            # Perform grid search with cross-validation on the training set.
            # Then ToDo: instantiate a MLStrategy and test the model on the test set,
            # in sliding window fashion.
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False,
                test_size=test_size)
            # Log before and after grid search to track execution time.
            # Grid search logic moved to its own method for cleanliness.
            logger.info("Start Grid search")
            gscv = grid_search(p.estimator, param_grid, X_train, y_train,
                               cv=cv, n_jobs=n_jobs,
                               expanding_window=expanding_window,
                               scoring=scoring)
            logger.info("End Grid search")
            labels, predictions = test_model(p.estimator, gscv.best_params_, 30,
                                             X_train, y_train, X_test, y_test)
            report = classification_report(labels, predictions, output_dict=True)
            # Create a Report instance from the grid search results, and add it
            # to this experiment's collection
            _report = Report(_sym, current_target, cv)
            _report.set_close(targets.loc[features.index].close)
            _report.set_dataset_columns(features.columns)
            _report.set_train_dataset(X_train, y_train)
            _report.set_test_dataset(X_test, y_test)
            _report.set_model(p.estimator)
            _report.set_params(gscv.best_params_)
            _report.set_cv(gscv.best_estimator_, gscv.best_score_, gscv.cv_results_)
            reports.add_report(_report)
            reports.save()
            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
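# A sketch of the grid_search() helper used above, assuming expanding_window
# selects a time-ordered (expanding) cross-validation split, as done elsewhere
# in this file. It relies only on scikit-learn's GridSearchCV and
# TimeSeriesSplit; the repo's own helper may differ in details.
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

def grid_search_sketch(estimator, param_grid, X_train, y_train, cv=5,
                       n_jobs=-1, expanding_window=False, scoring='accuracy'):
    if expanding_window:
        cv = TimeSeriesSplit(n_splits=expanding_window)
    gscv = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring,
                        n_jobs=n_jobs, refit=True)
    gscv.fit(X_train, y_train)
    return gscv  # exposes best_params_, best_estimator_, best_score_, cv_results_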
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5,
                scoring='accuracy', n_jobs='auto', test_size=0.3,
                use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    if ',' in scoring:
        scoring = scoring.split(',')
    # if scoring is precision, make scorer manually to suppress zero_division warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}
    if n_jobs == 'auto':
        n_jobs = os.cpu_count()
    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)
    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                              index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target
        # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
        # replace infinity values with nan so that they can later be imputed to a finite value
        features = features.dropna(axis='columns', how='all') \
                           .dropna() \
                           .replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]
        features = features.replace([np.inf, -np.inf], np.nan)
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index,
                                columns=features.columns)
        X_train, X_test, y_train, y_test = train_test_split(
            features.values, target.values, shuffle=False, test_size=test_size)
        # Summarize distribution
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
            # cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                preprocessing=any_preprocessing('my_pre'),
                                algo=tpe.suggest,
                                max_evals=100,
                                trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")
        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}
        # Plot learning curve for the classifier
        # est = p.estimator
        # est.set_params(**best_params)
        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        # plt.tight_layout()
        _train_ax = [axes[0][0], axes[0][1], axes[0][2]]
        # plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym), X_train, y_train,
        #                     axes=_train_ax, cv=cv)
        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])
        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])
        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()
        # Test ensemble's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(
            classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set\n{}".format(
            classification_report(y_test, predictions2)))
        report = {
            'training_set': {
                'features': X_train.shape[1],
                'records': X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true': [y for y in y_train],
                'y_pred': [y for y in predictions1]
            },
            'test_set': {
                'features': X_test.shape[1],
                'records': X_test.shape[0],
                'class_distribution': get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }
        # If the classifier has a feature_importances attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') \
                and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v) for k, v in
                       sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i] for i, s in enumerate(clf.support_) if s]
        train_dist = ['\t\tClass {}:\t{}\t({}%%)'.format(k, d['count'], d['pct'])
                      for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%%)'.format(k, d['count'], d['pct'])
                     for k, d in get_class_distribution(y_test).items()]
        logger.info('Model evaluation: \n'
                    '== Training set ==\n'
                    '\t # Features: {} | # Records: {}\n '
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    '== Test set ==\n'
                    '\t # Features: {} | # Records: {}\n '
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                            report['training_set']['accuracy'],
                            report['training_set']['precision'],
                            report['training_set']['mse'],
                            report['training_set']['recall'],
                            report['training_set']['f1'],
                            X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                            report['test_set']['accuracy'],
                            report['test_set']['precision'],
                            report['test_set']['mse'],
                            report['test_set']['recall'],
                            report['test_set']['f1']))
        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model': model_path,
            'params': params_path,
            'report': report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)
    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'], sep=',', encoding='utf-8',
                          index_col='Date', parse_dates=True)
        _target = pd.read_csv(entry['target_csv'], sep=',', encoding='utf-8',
                              index_col='Date', parse_dates=True)
        ta = _df[entry['features']['ta']]
        cm = _df[entry['features']['cm']]

        # Price history facet (Daily variation of ohlc in last W trading days)
        ohlc = _df.loc[:, ['open', 'high', 'low', 'close']]
        ohlc['open'] = STL(ohlc.open).fit().resid
        ohlc['high'] = STL(ohlc.high).fit().resid
        ohlc['low'] = STL(ohlc.low).fit().resid
        ohlc['close'] = STL(ohlc.close).fit().resid
        ohlc.columns = ['open_resid', 'high_resid', 'low_resid', 'close_resid']
        history_facet = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns', verify_integrity=True, sort=True, join='inner')

        # Price trend facet (REMA/RSMA, MACD, AO, ADX, WD+ - WD-)
        trend_facet = ta[[
            "rsma_5_20", "rsma_8_15", "rsma_20_50",
            "rema_5_20", "rema_8_15", "rema_20_50",
            "macd_12_26", "ao_14", "adx_14", "wd_14"
        ]]
        # Volatility facet (CMO, ATRp)
        volatility_facet = ta[["cmo_14", "atrp_14"]]
        # Volume facet (Volume pct, PVO, ADI, OBV)
        volume_facet = pd.concat([
            _df.volume.pct_change().replace([np.inf, -np.inf], 0),
            ta[["pvo_12_26", "adi", "obv"]]
        ], axis='columns', verify_integrity=True, sort=True, join='inner')
        # On-chain facet
        cm_1 = cm.reindex(columns=[
            'adractcnt', 'txtfrvaladjntv', 'isstotntv', 'feetotntv',
            'splycur', 'hashrate', 'difficulty', 'txtfrcount'
        ]).pct_change()
        cm_2 = cm.reindex(columns=['isscontpctann'])
        chain_facet = pd.concat([cm_1, cm_2], axis='columns',
                                verify_integrity=True, sort=True, join='inner')

        # Drop columns whose values are all nan or inf from each facet
        with pd.option_context('mode.use_inf_as_na', True):  # Set option temporarily
            history_facet = history_facet.dropna(axis='columns', how='all')
            trend_facet = trend_facet.dropna(axis='columns', how='all')
            volatility_facet = volatility_facet.dropna(axis='columns', how='all')
            volume_facet = volume_facet.dropna(axis='columns', how='all')
            chain_facet = chain_facet.dropna(axis='columns', how='all')

        improved_df = pd.concat([
            history_facet, trend_facet, volatility_facet, volume_facet, chain_facet
        ], axis='columns', verify_integrity=True, sort=True, join='inner')
        # Drop the first 30 rows
        # improved_df = improved_df[30:]

        # Add symbol to index
        feature_groups = {
            'price_history': [c for c in history_facet.columns],
            'trend': [c for c in trend_facet.columns],
            'volatility': [c for c in volatility_facet.columns],
            'volume': [c for c in volume_facet.columns],
            'chain': [c for c in chain_facet.columns],
        }
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index, _sym, improved_df,
                            feature_groups=feature_groups, target=_target)
        logger.info('Saved {}'.format(_sym))
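# Note on the STL residuals used above: statsmodels' STL needs a seasonal
# period; when the Date index carries no inferable frequency it must be passed
# explicitly. The snippet below is only illustrative (period=7 for daily data
# with weekly seasonality is an assumption, and `price_series` is a placeholder
# for any of the open/high/low/close columns).
from statsmodels.tsa.seasonal import STL

resid = STL(price_series, period=7).fit().resid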
def classify_images(args): # assertions assert args.ensemble is None or args.ensemble in ENSEMBLE_TYPE, \ "{} not a supported type. Only supported ensembling are {}".format( args.ensemble, ENSEMBLE_TYPE) if not args.ensemble: assert args.ncrops is None or (len(args.ncrops) == 1 and args.ncrops[0] == 1) if args.defenses is not None: for d in args.defenses: assert DefenseType.has_value(d), \ "\"{}\" defense not defined".format(d) # crops expected for each defense assert (args.ncrops is None or len(args.ncrops) == len( args.defenses)), ("Number of crops for each defense is expected") assert (args.crop_type is None or len(args.crop_type) == len( args.defenses)), ("crop_type for each defense is expected") # assert (len(args.crop_frac) == len(args.defenses)), ( # "crop_frac for each defense is expected") elif args.ncrops is not None: # no crop ensembling when defense is None assert len(args.ncrops) == 1 assert args.crop_frac is not None and len(args.crop_frac) == 1, \ "Only one crop_frac is expected as there is no defense" assert args.crop_type is not None and len(args.crop_type) == 1, \ "Only one crop_type is expected as there is no defense" if args.defenses is None or len(args.defenses) == 0: defenses = [None] else: defenses = args.defenses all_defense_probs = None for idx, defense_name in enumerate(defenses): # initialize dataset defense = get_defense(defense_name, args) # Read preset params for adversary based on args adv_params = constants.get_adv_params(args, idx) print("| adv_params: ", adv_params) # setup crop ncrops = 1 crop_type = None crop_frac = 1.0 if args.ncrops: crop_type = args.crop_type[idx] crop_frac = args.crop_frac[idx] if crop_type == 'sliding': ncrops = 9 else: ncrops = args.ncrops[idx] # Init custom crop function crop = transforms.Crop(crop_type, crop_frac) # initialize dataset dataset = load_dataset(args, 'valid', defense, adv_params, crop) # load model model, _, _ = get_model(args, load_checkpoint=True, defense_name=defense_name) # get crop probabilities for crops for current defense probs, targets = _eval_crops(args, dataset, model, defense, crop, ncrops, crop_type) if all_defense_probs is None: all_defense_probs = torch.zeros(len(defenses), len(dataset), probs.size(2)) # Ensemble crop probabilities if args.ensemble == 'max': probs = torch.max(probs, dim=0)[0] elif args.ensemble == 'avg': # for average ensembling probs = torch.mean(probs, dim=0) else: # for no ensembling assert all_defense_probs.size(0) == 1 probs = probs[0] all_defense_probs[idx, :, :] = probs # free memory dataset = None model = None # Ensemble defense probabilities if args.ensemble == 'max': all_defense_probs = torch.max(all_defense_probs, dim=0)[0] elif args.ensemble == 'avg': # for average ensembling all_defense_probs = torch.mean(all_defense_probs, dim=0) else: # for no ensembling assert all_defense_probs.size(0) == 1 all_defense_probs = all_defense_probs[0] # Calculate top1 and top5 accuracy prec1, prec5 = accuracy(all_defense_probs, targets, topk=(1, 5)) print('=' * 50) print('Results for model={}, attack={}, ensemble_type={} '.format( args.model, args.adversary, args.ensemble)) prec1 = prec1[0] prec5 = prec5[0] print('| classification accuracy @1: %2.5f' % (prec1)) print('| classification accuracy @5: %2.5f' % (prec5)) print('| classification error @1: %2.5f' % (100. - prec1)) print('| classification error @5: %2.5f' % (100. - prec5)) print('| done.')
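# A sketch of the accuracy() helper assumed by classify_images(): standard
# top-k precision over a batch of class scores, mirroring the common
# torchvision/ImageNet example utility. The repo's own implementation may
# differ in details, but it must return percentages for topk=(1, 5).
import torch

def accuracy_sketch(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res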
def build_model(dataset, pipeline, experiment, current_target='class', test_size=0.3):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(
        dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)
    scoring = make_scorer(precision_score, zero_division=1, average='micro')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    logger.info('Start experiment: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in {'BTC': dataset_index['BTC']}.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                                  index_col='Date', parse_dates=True)
            # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
            # replace infinity values with nan so that they can later be imputed to a finite value
            features = features.dropna(axis='columns', how='all') \
                               .dropna() \
                               .replace([np.inf, -np.inf], np.nan)
            target = targets.loc[features.index][current_target]

            # X_train, X_test, y_train, y_test = train_test_split(features, target, shuffle=False, test_size=test_size)
            all_size = features.shape[0]
            train_size = int(all_size * (1 - test_size))
            features = detabularise(
                features[[c for c in features.columns if 'close' in c]])
            X_train = features.iloc[0:train_size]
            y_train = target.iloc[0:train_size]
            X_test = features.iloc[train_size:all_size]
            y_test = target.iloc[train_size:all_size]
            # Summarize distribution
            logger.info("Start Grid search")
            clf = ShapeletTransformClassifier(time_contract_in_mins=5)
            clf.fit(X_train, y_train)
            print('{} Score: {}'.format(_sym, clf.score(X_test, y_test)))
            pred = clf.predict(X_test)
            print(classification_report(y_test, pred))
            logger.info("End Grid search")
            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports