def data_loader_hook(epoch):
        # Reload data loader for epoch
        if args.preprocessed_epoch_data:
            print('| epoch %d, Loading data:' % epoch)
            for key in ('train', 'valid'):
                # Load validation data only once; keep going so 'train' is still reloaded
                if key == 'valid' and 'valid' in loaders:
                    continue
                loaders[key] = get_data_loader(
                    load_dataset(args, key, defense, epoch=epoch),
                    batchsize=args.batchsize,
                    device=args.device,
                    shuffle=True,
                )
        # if data needs to be loaded only once and is not yet loaded
        elif len(loaders) == 0:
            print('| epoch %d, Loading data:' % epoch)
            for key in ('train', 'valid'):
                loaders[key] = get_data_loader(
                    load_dataset(args, key, defense),
                    batchsize=args.batchsize,
                    device=args.device,
                    shuffle=True,
                )

        return loaders['train']
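A minimal usage sketch of the hook above (the epoch loop, model, and helper names below are hypothetical placeholders, not part of the original snippet):

# Hypothetical driver loop: call the hook once per epoch so pre-processed
# epoch data can be swapped in before training continues.
for epoch in range(args.start_epoch, args.epochs):
    train_loader = data_loader_hook(epoch)  # reloads per-epoch data if needed
    train_one_epoch(model, train_loader)
    validate(model, loaders['valid'])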
def create_faiss_patches(args):

    # load image dataset:
    print('| set up image loader...')
    image_dataset = load_dataset(args,
                                 'train',
                                 None,
                                 with_transformation=False)
    image_dataset.imgs = image_dataset.imgs[:20000]  # we don't need all images

    # gather image patches:
    print('| gather image patches...')
    patches = gather_patches(
        image_dataset,
        args.num_patches,
        args.patch_size,
        patch_transform=None,
    )

    # build faiss index:
    print('| training faiss index...')
    faiss_index, sub_index = index_patches(patches, pca_dims=args.pca_dims)
    # NOTE: Keep reference to sub_index to prevent it from being GC'ed

    # save faiss index and patches:
    print('| writing faiss index to %s' % args.index_file)
    faiss.write_index(faiss_index, args.index_file)
    with open(args.patches_file, 'wb') as fwrite:
        print('| writing patches to %s' % args.patches_file)
        pickle.dump(patches, fwrite, pickle.HIGHEST_PROTOCOL)
    print('| done.')
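The saved index and patch set can later be reloaded for nearest-neighbour patch lookups; a minimal sketch, assuming the same args and that the gathered patches can be flattened into float32 rows:

import pickle

import faiss
import numpy as np

faiss_index = faiss.read_index(args.index_file)
with open(args.patches_file, 'rb') as fread:
    patches = pickle.load(fread)

# Query the index with a few flattened patches (float32, shape [n, d]).
query = np.asarray(patches[:5], dtype=np.float32).reshape(5, -1)
distances, neighbor_ids = faiss_index.search(query, 10)  # 10 nearest patches per query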
Example #3
 def __init__(self, summary_writer, settings, data_dir=None, manual=False):
     """ Setup the values and variables for Training.
     Args:
       summary_writer: tf.summary.SummaryWriter object to write summaries for Tensorboard.
       settings: settings object for fetching data from config files.
       data_dir (default: None): path where the data downloaded should be stored / accessed.
       manual (default: False): boolean to represent if data_dir is a manual dir.
 """
     self.settings = settings
     self.summary_writer = summary_writer
     self.iterations = self.settings["iterations"]
     dataset_args = self.settings["dataset"]
     if not manual:
         self.dataset = dataset.load_dataset(
             dataset_args["name"],
             dataset.scale_down(method=dataset_args["scale_method"],
                                dimension=dataset_args["hr_dimension"]),
             batch_size=settings["batch_size"],
             data_dir=data_dir)
     else:
         self.dataset = dataset.load_dataset_directory(
             dataset_args["name"],
             data_dir,
             dataset.scale_down(method=dataset_args["scale_method"],
                                dimension=dataset_args["hr_dimension"]),
             batch_size=settings["batch_size"])
Example #4
def generate_tf_record(data_dir,
                       raw_data=False,
                       tfrecord_path="serialized_dataset",
                       num_shards=8):

    teacher_sett = settings.Settings(use_student_settings=False)
    student_sett = settings.Settings(use_student_settings=True)
    dataset_args = teacher_sett["dataset"]
    if dataset_args["name"].lower().strip() == "div2k":
        assert len(data_dir) == 2
        ds = dataset.load_div2k_dataset(data_dir[0],
                                        data_dir[1],
                                        student_sett["hr_size"],
                                        shuffle=True)
    elif raw_data:
        ds = dataset.load_dataset_directory(
            dataset_args["name"], data_dir,
            dataset.scale_down(method=dataset_args["scale_method"],
                               size=student_sett["hr_size"]))
    else:
        ds = dataset.load_dataset(dataset_args["name"],
                                  dataset.scale_down(
                                      method=dataset_args["scale_method"],
                                      size=student_sett["hr_size"]),
                                  data_dir=data_dir)
    to_tfrecord(ds, tfrecord_path, num_shards)
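A hedged invocation sketch: for the DIV2K branch above, data_dir must hold exactly two paths (per the assert); the directories shown here are placeholders.

# Hypothetical paths; DIV2K expects [hr_images_dir, lr_images_dir].
generate_tf_record(["/data/DIV2K/HR", "/data/DIV2K/LR"],
                   tfrecord_path="serialized_dataset",
                   num_shards=8)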
Example #5
 def __init__(self,
              summary_writer,
              summary_writer_2,
              settings,
              model_dir="",
              data_dir=None,
              manual=False,
              strategy=None):
     """ Setup the values and variables for Training.
     Args:
       summary_writer: tf.summary.SummaryWriter object to write summaries for Tensorboard.
       settings: settings object for fetching data from config files.
       data_dir (default: None): path where the data downloaded should be stored / accessed.
       manual (default: False): boolean to represent if data_dir is a manual dir.
 """
     self.settings = settings
     self.model_dir = model_dir
     self.summary_writer = summary_writer
     self.summary_writer_2 = summary_writer_2
     self.strategy = strategy
     dataset_args = self.settings["dataset"]
     augment_dataset = dataset.augment_image(saturation=None)
     self.batch_size = self.settings["batch_size"]
     hr_size = tf.convert_to_tensor(
         [dataset_args["hr_dimension"], dataset_args["hr_dimension"], 3])
     lr_size = tf.cast(hr_size, tf.float32) * \
         tf.convert_to_tensor([1 / 4, 1 / 4, 1], tf.float32)
     lr_size = tf.cast(lr_size, tf.int32)
     if isinstance(strategy, tf.distribute.Strategy):
         self.dataset = (dataset.load_tfrecord_dataset(
             tfrecord_path=data_dir, lr_size=lr_size,
             hr_size=hr_size).repeat().map(augment_dataset).batch(
                 self.batch_size, drop_remainder=True))
         self.dataset = iter(
             strategy.experimental_distribute_dataset(self.dataset))
     else:
         if not manual:
             self.dataset = iter(
                 dataset.load_dataset(
                     dataset_args["name"],
                     dataset.scale_down(
                         method=dataset_args["scale_method"],
                         dimension=dataset_args["hr_dimension"]),
                     batch_size=settings["batch_size"],
                     data_dir=data_dir,
                     augment=True,
                     shuffle=True))
         else:
             self.dataset = iter(
                 dataset.load_dataset_directory(
                     dataset_args["name"],
                     data_dir,
                     dataset.scale_down(
                         method=dataset_args["scale_method"],
                         dimension=dataset_args["hr_dimension"]),
                     batch_size=settings["batch_size"],
                     augment=True,
                     shuffle=True))
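The LR shape arithmetic above is a fixed 4x downscale of the HR shape; as a worked example with an assumed hr_dimension of 128:

import tensorflow as tf

hr_size = tf.convert_to_tensor([128, 128, 3])
lr_size = tf.cast(hr_size, tf.float32) * tf.convert_to_tensor([1 / 4, 1 / 4, 1], tf.float32)
lr_size = tf.cast(lr_size, tf.int32)  # -> [32, 32, 3]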
Example #6
def _load_partial_dataset(args, data_type, defense, adv_params):
    start_idx, end_idx = _get_start_end_index(args)
    data_indices = {'start_idx': start_idx, 'end_idx': end_idx}
    dataset = load_dataset(args,
                           data_type,
                           defense,
                           adv_params,
                           data_indices=data_indices)
    return dataset
Example #7
def generate_transformed_images(args):

    # Only runs one method at a time
    assert args.operation is not None, \
        "operation to run can't be None"
    assert OperationType.has_value(args.operation), \
        "\"{}\" operation not defined".format(args.operation)

    assert args.defenses is not None, "Defenses can't be None"
    assert not args.preprocessed_data, \
        "Trying to apply transformations on already transformed images"

    if args.operation == str(OperationType.TRANSFORM_ADVERSARIAL):
        for idx, defense_name in enumerate(args.defenses):
            defense = get_defense(defense_name, args)
            adv_params = constants.get_adv_params(args, idx)
            print("| adv_params: ", adv_params)
            dataset = _load_partial_dataset(args, 'valid', defense, adv_params)

            if args.data_batches is None:
                transformation_on_adv(args, dataset, defense_name, adv_params)
            else:
                for i in range(args.data_batches):
                    transformation_on_adv(args,
                                          dataset,
                                          defense_name,
                                          adv_params,
                                          data_batch_idx=i)

    elif args.operation == str(OperationType.CAT_DATA):
        for idx, defense_name in enumerate(args.defenses):
            adv_params = constants.get_adv_params(args, idx)
            print("| adv_params: ", adv_params)
            if args.data_batches is None:
                concatenate_data(args, defense_name, adv_params)
            else:
                for i in range(args.data_batches):
                    concatenate_data(args,
                                     defense_name,
                                     adv_params,
                                     data_batch_idx=i)

    elif args.operation == str(OperationType.TRANSFORM_RAW):
        start_class_idx = args.partition * args.partition_size
        end_class_idx = (args.partition + 1) * args.partition_size
        class_indices = range(start_class_idx, end_class_idx)
        for defense_name in args.defenses:
            defense = get_defense(defense_name, args)
            data_type = args.data_type if args.data_type == "train" else "valid"
            dataset = load_dataset(args,
                                   data_type,
                                   defense,
                                   class_indices=class_indices)
            transformation_on_raw(args, dataset, defense_name)
Example #8
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)

    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)
        ohlcv = _df[entry['features']['ohlcv']]
        ta = _df[entry['features']['ta']]

        # Build the dataframe with base features
        ohlc = ohlcv[['open', 'high', 'low', 'close']]
        lagged_ohlc = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')
        # Add lagged features to the dataframe
        atsa_df = pd.concat([lagged_ohlc, ta],
                            axis='columns',
                            verify_integrity=True,
                            sort=True,
                            join='inner')

        # Drop the first 30 rows
        #atsa_df = atsa_df[30:]

        # decompose_dataframe_features('all_merged', _sym+'_improved', unlagged_df)
        # Add symbol to index
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index, _sym, atsa_df, target=_target)
        logger.info('Saved {}'.format(_sym))
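builder.make_lagged is project code that is not shown in these snippets; a plausible stand-in (an assumption, not the project's implementation) built on DataFrame.shift, using a '_lag-N' column suffix like the one that appears in the commented-out lagged_close columns of a later snippet:

import pandas as pd

def make_lagged_sketch(df: pd.DataFrame, lag: int) -> pd.DataFrame:
    """Return a copy of df shifted down by `lag` rows, with columns renamed by a lag suffix."""
    lagged = df.shift(lag)
    lagged.columns = ['{}_lag-{}'.format(c, lag) for c in df.columns]
    return lagged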
def create_faiss_patches(args):

    # load image dataset:
    print('| set up image loader...')
    image_dataset = load_dataset(args, 'train', None, with_transformation=True)
    image_dataset.imgs = image_dataset.imgs[:20000]  # we don't need all images

    # gather image patches:
    print('| gather image patches...')
    patches = gather_patches(
        image_dataset,
        args.num_patches,
        args.quilting_patch_size,
        patch_transform=None,
    )

    # build faiss index:
    print('| training faiss index...')
    index_patches(patches, args.index_file, pca_dims=args.pca_dims)

    # save patches:
    with open(args.patches_file, 'wb') as fwrite:
        print('| writing patches to %s' % args.patches_file)
        pickle.dump(patches, fwrite, pickle.HIGHEST_PROTOCOL)
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)
    sessionFactory = connect('test_features')
    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)
        ohlcv = _df[entry['features']['ohlcv']]

        ohlcv_d = {
            d: _df[entry['features']['ohlcv_{}d'.format(d)]]
            for d in [3, 7, 30]
        }
        ta_d = {
            d: _df[entry['features']['ta_{}d'.format(d)]]
            for d in [3, 7, 30]
        }

        ta = _df[entry['features']['ta']]
        cm = _df[entry['features']['cm']]

        cm_picked = pd.DataFrame(index=ohlcv.index)
        if 'adractcnt' in cm.columns:
            cm_picked['adractcnt_pct'] = cm.adractcnt.pct_change()
            # cm_picked['adractcnt_mean3_pct'] = cm.adractcnt.rolling(3).mean().pct_change()
            # cm_picked['adractcnt_mean7_pct'] = cm.adractcnt.rolling(7).mean().pct_change()
        # if 'splycur' in cm.columns: ## Correlated with volume and close
        #     cm_picked['vol_supply'] = ohlcv.volume / cm.splycur # Ratio between transacted volume and total supply (mined)
        if 'txtfrvaladjntv' in cm.columns and 'isstotntv' in cm.columns and 'feetotntv' in cm.columns:
            # I want to represent miners earnings (fees + issued coins) vs amount transacted in that interval
            cm_picked['earned_vs_transacted'] = (
                cm.isstotntv + cm.feetotntv) / cm.txtfrvaladjntv
        if 'isstotntv' in cm.columns:
            # isstotntv is total number of coins mined in the time interval
            # splycur is total number of coins mined (all time)
            total_mined = cm.isstotntv.rolling(
                365, min_periods=7).sum()  # total mined in a year
            cm_picked['isstot365_isstot1_pct'] = (cm.isstotntv /
                                                  total_mined).pct_change()
        if 'splycur' in cm.columns and 'isstotntv' in cm.columns:
            cm_picked['splycur_isstot1_pct'] = (cm.isstotntv /
                                                cm.splycur).pct_change()
        if 'hashrate' in cm.columns:
            #cm_picked['hashrate_mean3_pct'] = cm.hashrate.rolling(3).mean().pct_change()
            #cm_picked['hashrate_mean7_pct'] = cm.hashrate.rolling(7).mean().pct_change()
            cm_picked['hashrate_pct'] = cm.hashrate.pct_change()
        if 'roi30d' in cm.columns:
            cm_picked['roi30d'] = cm.roi30d
        if 'isstotntv' in cm.columns:
            cm_picked['isstotntv_pct'] = cm.isstotntv.pct_change()
        if 'feetotntv' in cm.columns:
            cm_picked['feetotntv_pct'] = cm.feetotntv.pct_change()
        if 'txtfrcount' in cm.columns:
            cm_picked['txtfrcount_pct'] = cm.txtfrcount.pct_change()
            #cm_picked['txtfrcount_volume'] = cm.txtfrcount.pct_change()
        if 'vtydayret30d' in cm.columns:
            cm_picked['vtydayret30d'] = cm.vtydayret30d
        if 'isscontpctann' in cm.columns:
            cm_picked['isscontpctann'] = cm.isscontpctann

        ta_picked = pd.DataFrame(index=ta.index)
        # REMA / RSMA are already used and well-established in ATSA.
        # I'm taking the pct change since I want to encode the relative movement of the EMAs, not their positions
        # ta_picked['rema_5_20_pct'] = ta.rema_5_20.pct_change()
        ta_picked['rema_8_15_pct'] = ta.rema_8_15.pct_change()
        # ta_picked['rema_20_50_pct'] = ta.rema_20_50.pct_change()
        # ta_picked['rsma_5_20_pct'] = ta.rema_5_20.pct_change()
        ta_picked['rsma_8_15_pct'] = ta.rsma_8_15.pct_change()
        # ta_picked['rsma_20_50_pct'] = ta.rema_20_50.pct_change()

        # Stoch is a momentum indicator comparing a particular closing price of a security to a range of its prices
        # over a certain period of time.
        # The sensitivity of the oscillator to market movements is reducible by adjusting that time period or
        # by taking a moving average of the result.
        # It is used to generate overbought and oversold trading signals, utilizing a 0-100 bounded range of values.
        # IDEA => decrease sensitivity by 3-mean and divide by 100 to get fp values
        ta_picked['stoch_14_mean3_div100'] = ta.stoch_14.rolling(3).mean() / 100

        #Moving Average Convergence Divergence (MACD) is a trend-following momentum indicator that shows
        # the relationship between two moving averages of a security’s price.
        # The MACD is calculated by subtracting the 26-period Exponential Moving Average (EMA) from the 12-period EMA.
        #  A nine-day EMA of the MACD called the "signal line," is then plotted on top of the MACD line,
        #  which can function as a trigger for buy and sell signals.
        #  Traders may buy the security when the MACD crosses above its signal line and sell - or short - the security
        #  when the MACD crosses below the signal line.
        #  Moving Average Convergence Divergence (MACD) indicators can be interpreted in several ways,
        #  but the more common methods are crossovers, divergences, and rapid rises/falls.
        signal_line = builder.exponential_moving_average(ta.macd_12_26, 9)
        ta_picked['macd_12_26_signal'] = signal_line  # Relationship with signal line
        ta_picked['macd_12_26_diff_signal'] = (
            ta.macd_12_26 - signal_line).pct_change()  # Relationship with signal line
        ta_picked['macd_12_26_pct'] = ta.macd_12_26.pct_change()  # Information about slope

        # PPO is identical to the moving average convergence divergence (MACD) indicator,
        # except the PPO measures percentage difference between two EMAs, while the MACD measures absolute (dollar) difference.
        signal_line = builder.exponential_moving_average(ta.ppo_12_26, 9)
        ta_picked['ppo_12_26_signal'] = signal_line  # Relationship with signal line
        ta_picked['ppo_12_26_diff_signal'] = (
            ta.ppo_12_26 - signal_line).pct_change()  # Relationship with signal line
        ta_picked['ppo_12_26_pct'] = ta.ppo_12_26.pct_change()  # Information about slope

        # ADI Accumulation/distribution is a cumulative indicator that uses volume and price to assess whether
        # a stock is being accumulated or distributed.
        # The accumulation/distribution measure seeks to identify divergences between the stock price and volume flow.
        # This provides insight into how strong a trend is. If the price is rising but the indicator is falling
        # this indicates that buying or accumulation volume may not be enough to support
        # the price rise and a price decline could be forthcoming.
        # ==> IDEA: if we can fit a line to the price y1 = m1X+q1 and a line to ADI y2=m2X+q2 then we can identify
        #           divergences by simply looking at the sign of M.
        #           Another insight would be given by the slope (ie pct_change)
        ta_picked['adi_pct'] = ta.adi.pct_change()
        ta_picked['adi_close_convergence'] = convergence_between_series(
            ta.adi, ohlcv.close, 3)

        # RSI goes from 0 to 100, values <= 20 mean BUY, while values >= 80 mean SELL.
        # Dividing it by 100 to get a floating point feature, makes no sense to pct_change it
        ta_picked['rsi_14_div100'] = ta.rsi_14 / 100

        # The Money Flow Index (MFI) is a technical indicator that generates overbought or oversold
        #   signals using both prices and volume data. The oscillator moves between 0 and 100.
        # An MFI reading above 80 is considered overbought and an MFI reading below 20 is considered oversold,
        #   although levels of 90 and 10 are also used as thresholds.
        # A divergence between the indicator and price is noteworthy. For example, if the indicator is rising while
        #   the price is falling or flat, the price could start rising.
        ta_picked['mfi_14_div100'] = ta.mfi_14 / 100

        # The Chande momentum oscillator is a technical momentum indicator similar to other momentum indicators
        #   such as Wilder’s Relative Strength Index (Wilder’s RSI) and the Stochastic Oscillator.
        #   It measures momentum on both up and down days and does not smooth results, triggering more frequent
        #   oversold and overbought penetrations. The indicator oscillates between +100 and -100.
        # Many technical traders add a 10-period moving average to this oscillator to act as a signal line.
        #   The oscillator generates a bullish signal when it crosses above the moving average and a
        #   bearish signal when it drops below the moving average.
        ta_picked['cmo_14_div100'] = ta.cmo_14 / 100
        signal_line = builder.simple_moving_average(ta.cmo_14, 10)
        ta_picked['cmo_14_signal'] = signal_line
        ta_picked['cmo_14_diff_signal'] = (ta.cmo_14 - signal_line) / 100

        # On-balance volume (OBV) is a technical trading momentum indicator that uses volume flow to predict changes in stock price.
        # Eventually, volume drives the price upward. At that point, larger investors begin to sell, and smaller investors begin buying.
        # Despite being plotted on a price chart and measured numerically,
        # the actual individual quantitative value of OBV is not relevant.
        # The indicator itself is cumulative, while the time interval remains fixed by a dedicated starting point,
        # meaning the real number value of OBV arbitrarily depends on the start date.
        # Instead, traders and analysts look to the nature of OBV movements over time;
        # the slope of the OBV line carries all of the weight of analysis. => We want percent change
        ta_picked['obv_pct'] = ta.obv.pct_change()
        ta_picked['obv_mean3_pct'] = ta.obv.rolling(3).mean().pct_change()

        # Strong rallies in price should see the force index rise.
        # During pullbacks and sideways movements, the force index will often fall because the volume
        # and/or the size of the price moves gets smaller.
        # => Encoding the percent variation could be a good idea
        ta_picked['fi_13_pct'] = ta.fi_13.pct_change()
        ta_picked['fi_50_pct'] = ta.fi_50.pct_change()

        # The Aroon Oscillator is a trend-following indicator that uses aspects of the
        # Aroon Indicator (Aroon Up and Aroon Down) to gauge the strength of a current trend
        # and the likelihood that it will continue.
        # It moves between -100 and 100. A high oscillator value is an indication of an uptrend
        # while a low oscillator value is an indication of a downtrend.
        ta_picked['ao_14'] = ta.ao_14 / 100

        # The average true range (ATR) is a technical analysis indicator that measures market volatility
        #   by decomposing the entire range of an asset price for that period.
        # ATRP is pct_change of volatility
        ta_picked['atrp_14'] = ta.atrp_14

        # Percentage Volume Oscillator (PVO) is momentum volume oscillator used in technical analysis
        #   to evaluate and measure volume surges and to compare trading volume to the average longer-term volume.
        # PVO does not analyze price and it is based solely on volume.
        #  It compares fast and slow volume moving averages by showing how short-term volume differs from
        #  the average volume over longer-term.
        #  Since it does not care a trend's factor in its calculation (only volume data are used)
        #  this technical indicator cannot be used alone to predict changes in a trend.
        ta_picked['pvo_12_26'] = ta.pvo_12_26

        # IGNORED: tsi, wd, adx,

        #lagged_stats = pd.concat([ohlcv_stats] + [builder.make_lagged(ohlcv_stats, i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner')

        # Build the dataframe with base features
        # lagged_close = pd.concat([ohlcv.close.pct_change()] + [builder.make_lagged(ohlcv.close.pct_change(), i) for i in range(1,10+1)], axis='columns', verify_integrity=True, sort=True, join='inner')
        # lagged_close.columns = ['close_pct'] + ['close_pct_lag-{}'.format(i) for i in range(1, W +1)]

        ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']].pct_change()
        ohlc.columns = ['{}_pct'.format(c) for c in ohlcv.columns]
        lagged_ohlc_pct = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')

        _time = pd.DataFrame(index=ohlcv.index)
        _time['day_of_year'] = ohlcv.index.dayofyear
        _time['day_of_week'] = ohlcv.index.dayofweek

        ohlc = ohlcv[['open', 'high', 'low', 'close', 'volume']]
        x_space = np.linspace(0, ohlc.index.size, ohlc.index.size)
        _splines = pd.DataFrame(index=ohlcv.index)

        # Highly correlated between themselves, no use
        # _splines['open_spl'] = get_spline(ohlc.open, 0)
        # _splines['high_spl'] = get_spline(ohlc.high, 0)
        # _splines['low_spl'] = get_spline(ohlc.low, 0)
        # _splines['close_spl'] = get_spline(ohlc.close, 0)

        _splines['open_spl_d1'] = builder.get_spline(ohlc.open, 1)
        _splines['high_spl_d1'] = builder.get_spline(ohlc.high, 1)
        _splines['low_spl_d1'] = builder.get_spline(ohlc.low, 1)
        _splines['close_spl_d1'] = builder.get_spline(ohlc.close, 1)

        _splines['open_spl_d2'] = builder.get_spline(ohlc.open, 2)
        _splines['high_spl_d2'] = builder.get_spline(ohlc.high, 2)
        _splines['low_spl_d2'] = builder.get_spline(ohlc.low, 2)
        _splines['close_spl_d2'] = builder.get_spline(ohlc.close, 2)

        _patterns = builder.get_talib_patterns(ohlcv)
        _new_features = pd.DataFrame(index=ohlcv.index)
        _new_features['candlestick_patterns_mean'] = _patterns.mean(axis=1)
        _new_features['candlestick_patterns_sum'] = _patterns.sum(axis=1)
        # WE LIKE THESE TWO!!!!
        _new_features['close_volatility_7d'] = ohlcv.close.pct_change().rolling(7).std(ddof=0)
        _new_features['close_volatility_30d'] = ohlcv.close.pct_change().rolling(30).std(ddof=0)
        #
        # Candle body size variation, for example
        # Change in body of the candle (> 0 if candle is green)
        _new_features['close_open_pct'] = (ohlcv.close - ohlcv.open).pct_change()
        # Change in wick size of the candle, a shorter wick should be bullish
        _new_features['high_close_dist_pct'] = (ohlcv.high - ohlcv.close).pct_change()
        # Change in shadow size of the candle; this increasing would indicate support (maybe a bounce)
        _new_features['low_close_dist_pct'] = (ohlcv.close - ohlcv.low).pct_change()
        # Change in total candle size; smaller candles stand for low volatility
        _new_features['high_low_dist_pct'] = (ohlcv.high - ohlcv.low).pct_change()

        for d in [3, 7, 30]:
            ohlcv_d[d].columns = ['close', 'high', 'low', 'open', 'volume']
            _new_features['close_open_pct_d{}'.format(d)] = (
                ohlcv_d[d].close - ohlcv_d[d].open).pct_change()
            _new_features['high_close_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].high - ohlcv_d[d].close).pct_change()
            _new_features['low_close_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].close - ohlcv_d[d].low).pct_change()
            _new_features['high_low_dist_pct_d{}'.format(d)] = (
                ohlcv_d[d].high - ohlcv_d[d].low).pct_change()

        _ta_windowed_features = pd.concat(
            [v.rename(columns={c: '{}_ta{}d'.format(c, d) for c in v.columns})
             for d, v in ta_d.items()],
            axis=1)
        # Rename the 1-day TA features with a _ta1d suffix before merging
        ta.columns = ['{}_ta1d'.format(c) for c in ta.columns]
        feature_groups = [
            _new_features, _splines, lagged_ohlc_pct, cm_picked, ta_picked,
            _ta_windowed_features, ta
        ]

        improved_df = pd.concat(feature_groups,
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')

        # Drop the first 30 rows
        improved_df = improved_df[30:]
        # Drop columns whose values are all nan or inf
        with pd.option_context('mode.use_inf_as_na',
                               True):  # Set option temporarily
            improved_df = improved_df.dropna(axis='columns', how='all')
        logger.info('Saving {}'.format(_sym))
        for c in improved_df.columns:
            # session, group, symbol, name, series
            s = sessionFactory()
            add_feature(s, 'dbfeaturetest', _sym, c, improved_df[c])
            s.commit()
        #save_symbol_dataset(dest_index, _sym, improved_df, target=_target)
        logger.info('Saved {}'.format(_sym))
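The signal-line steps above call builder.exponential_moving_average; under the assumption that this helper is a standard exponential moving average, the same quantity can be sketched with plain pandas:

import pandas as pd

def ema_signal_line(series: pd.Series, span: int = 9) -> pd.Series:
    # Assumed equivalent of builder.exponential_moving_average(series, 9).
    return series.ewm(span=span, adjust=False).mean()

# e.g. signal_line = ema_signal_line(ta.macd_12_26, span=9)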
def generate_adversarial_images(args):
    # assertions
    assert args.adversary_to_generate is not None, \
        "adversary_to_generate can't be None"
    assert AdversaryType.has_value(args.adversary_to_generate), \
        "\"{}\" adversary_to_generate not defined".format(args.adversary_to_generate)

    defense_name = None if not args.defenses else args.defenses[0]
    # defense = get_defense(defense_name, args)
    data_indices = _get_data_indices(args)
    data_type = args.data_type if args.data_type == "train" else "valid"
    dataset = load_dataset(args, data_type, None, data_indices=data_indices)
    data_loader = get_data_loader(
        dataset,
        batchsize=args.batchsize,
        device=args.device,
        shuffle=False)

    model, _, _ = get_model(args, load_checkpoint=True, defense_name=defense_name)

    adv_params = constants.get_adv_params(args)
    print('| adv_params:', adv_params)
    status = None
    all_inputs = None
    all_outputs = None
    all_targets = None
    bar = progressbar.ProgressBar(len(data_loader))
    bar.start()
    for batch_num, (imgs, targets) in enumerate(data_loader):
        if args.adversary_to_generate == str(AdversaryType.DEEPFOOL):
            assert adv_params['learning_rate'] is not None
            s, r = adversary.deepfool(
                model, imgs, targets, args.data_params['NUM_CLASSES'],
                train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter,
                step_size=adv_params['learning_rate'], batch_size=args.batchsize,
                labels=dataset.get_classes())
        elif args.adversary_to_generate == str(AdversaryType.FGS):
            s, r = adversary.fgs(
                model, imgs, targets, train_mode=(args.data_type == 'train'),
                mode=args.fgs_mode)
        elif args.adversary_to_generate == str(AdversaryType.IFGS):
            assert adv_params['learning_rate'] is not None
            s, r = adversary.ifgs(
                model, imgs, targets,
                train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter,
                step_size=adv_params['learning_rate'], mode=args.fgs_mode)
        elif args.adversary_to_generate == str(AdversaryType.CWL2):
            assert args.adv_strength is not None and len(args.adv_strength) == 1
            if len(args.crop_frac) == 1:
                crop_frac = args.crop_frac[0]
            else:
                crop_frac = 1.0
            s, r = adversary.cw(
                model, imgs, targets, args.adv_strength[0], 'l2',
                tv_weight=args.tvm_weight,
                train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter,
                drop_rate=args.pixel_drop_rate, crop_frac=crop_frac,
                kappa=args.margin)
        elif args.adversary_to_generate == str(AdversaryType.CWLINF):
            assert args.adv_strength is not None and len(args.adv_strength) == 1
            s, r = adversary.cw(
                model, imgs, targets, args.adv_strength[0], 'linf',
                bound=args.adv_bound,
                tv_weight=args.tvm_weight,
                train_mode=(args.data_type == 'train'), max_iter=args.max_adv_iter,
                drop_rate=args.pixel_drop_rate, crop_frac=args.crop_frac,
                kappa=args.margin)

        if status is None:
            status = s.clone()
            all_inputs = imgs.clone()
            all_outputs = imgs + r
            all_targets = targets.clone()
        else:
            status = torch.cat((status, s), 0)
            all_inputs = torch.cat((all_inputs, imgs), 0)
            all_outputs = torch.cat((all_outputs, imgs + r), 0)
            all_targets = torch.cat((all_targets, targets), 0)
        bar.update(batch_num)

    print("| computing adversarial stats...")
    if args.compute_stats:
        rb, ssim, sc = adversary.compute_stats(all_inputs, all_outputs, status)
        print('| average robustness = ' + str(rb))
        print('| average SSIM = ' + str(ssim))
        print('| success rate = ' + str(sc))

    # Unnormalize before saving
    unnormalize = Unnormalize(args.data_params['MEAN_STD']['MEAN'],
                              args.data_params['MEAN_STD']['STD'])
    all_inputs = unnormalize(all_inputs)
    all_outputs = unnormalize(all_outputs)
    # save output
    output_file = get_adversarial_file_path(
        args, args.adversarial_root, defense_name, adv_params,
        data_indices['end_idx'], start_idx=data_indices['start_idx'],
        with_defense=False)
    print("| Saving adversarial data at " + output_file)
    if not os.path.isdir(args.adversarial_root):
        os.makedirs(args.adversarial_root)
    torch.save({'status': status, 'all_inputs': all_inputs,
                'all_outputs': all_outputs, 'all_targets': all_targets},
                output_file)
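The saved bundle can be reloaded for later evaluation; a minimal sketch, assuming the output_file path produced above:

import torch

bundle = torch.load(output_file)
status = bundle['status']            # per-image attack status returned by the adversary
clean_images = bundle['all_inputs']  # unnormalized originals
adv_images = bundle['all_outputs']   # unnormalized adversarial images
targets = bundle['all_targets']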
def build_model(dataset,
                pipeline,
                experiment,
                param_grid=None,
                cv=5,
                scoring='accuracy',
                n_jobs='auto',
                test_size=0.3,
                use_target=None,
                expanding_window=False):
    # Define log file path for this run
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)
    os.makedirs('./results/{}_{}_{}'.format(dataset, pipeline, experiment),
                exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')

    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)

    # Parameter grid argument:
    # - If None, use the pipeline-defined grid
    # - If string, parse it as JSON
    # - If dict, use it as-is (do nothing)
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)
    # Target argument
    # Determines the target feature name (the system supports different classification targets)
    # If not supplied, use the pipeline-defined target.
    current_target = p.TARGET if not use_target else use_target

    logger.info('Start processing: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in dataset_index.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            # ToDo: use lib.dataset.features.load_symbol instead of manually reading csv's
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            targets = pd.read_csv(data['target_csv'],
                                  sep=',',
                                  encoding='utf-8',
                                  index_col='Date',
                                  parse_dates=True)

            # Drop columns whose values are all NaN, then replace infinity values with NaN
            # so that they can later be imputed to a finite value in the pipeline's
            # "Imputing" stage
            features = features.dropna(axis='columns',
                                       how='all').replace([np.inf, -np.inf],
                                                          np.nan)
            target = targets.loc[features.index][current_target]

            # Split available data in train and test set.
            # Perform grid search with cross-validation on the training set,
            # Then ToDo: instantiate a MLStrategy and test the model on the test set, in sliding window fashion
            X_train, X_test, y_train, y_test = train_test_split(
                features.values,
                target.values,
                shuffle=False,
                test_size=test_size)

            # Log before and after grid search to track execution time
            # Grid search logic moved to its own method for cleanliness
            logger.info("Start Grid search")
            gscv = grid_search(p.estimator,
                               param_grid,
                               X_train,
                               y_train,
                               cv=cv,
                               n_jobs=n_jobs,
                               expanding_window=expanding_window,
                               scoring=scoring)
            logger.info("End Grid search")
            labels, predictions = test_model(p.estimator, gscv.best_params_,
                                             30, X_train, y_train, X_test,
                                             y_test)
            report = classification_report(labels,
                                           predictions,
                                           output_dict=True)

            # Create a Report instance from the grid search results, and add it to this experiment's collection
            _report = Report(_sym, current_target, cv)
            _report.set_close(targets.loc[features.index].close)
            _report.set_dataset_columns(features.columns)
            _report.set_train_dataset(X_train, y_train)
            _report.set_test_dataset(X_test, y_test)
            _report.set_model(p.estimator)
            _report.set_params(gscv.best_params_)
            _report.set_cv(gscv.best_estimator_, gscv.best_score_,
                           gscv.cv_results_)
            reports.add_report(_report)
            reports.save()

            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
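Since param_grid may be passed as a dict and used as-is, a hedged call sketch (the dataset, pipeline, and parameter names below are illustrative placeholders, not values from any specific pipeline):

# Hypothetical grid; real keys depend on the chosen pipeline's estimator.
grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, None],
}
reports = build_model('my_dataset', 'my_pipeline', 'exp01', param_grid=grid, cv=5)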
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5, scoring='accuracy', n_jobs='auto', test_size=0.3, use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    if ',' in scoring:
        scoring = scoring.split(',')
    # if scoring is precision, make scorer manually to suppress zero_division warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}

    if n_jobs == 'auto':
        n_jobs = os.cpu_count()
    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)

    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target

        # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
        # replace infinity values with nan so that they can later be imputed to a finite value
        features = features.dropna(axis='columns', how='all').dropna().replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]

        features = features.replace([np.inf, -np.inf], np.nan)
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index, columns=features.columns)
        X_train, X_test, y_train, y_test = train_test_split(features.values, target.values, shuffle=False, test_size=test_size)
        # Summarize distribution
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
        #cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                preprocessing=any_preprocessing('my_pre'),
                                algo=tpe.suggest,
                                max_evals=100,
                                trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}

        # Plot learning curve for the classifier
        #est = p.estimator
        #est.set_params(**best_params)

        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        #plt.tight_layout()
        _train_ax = [ axes[0][0], axes[0][1], axes[0][2] ]
        #plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym), X_train, y_train, axes=_train_ax, cv=cv)

        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])

        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])

        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()

        # Test ensemble's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set\n{}".format(classification_report(y_test, predictions2)))

        report = {
            'training_set': {
                'features':X_train.shape[1],
                'records':X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true':[y for y in y_train],
                'y_pred':[y for y in predictions1]
            },
            'test_set': {
                'features':X_test.shape[1],
                'records':X_test.shape[0],
                'class_distribution':get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }
        # If the classifier has a feature_importances attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v) for k, v in sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i] for i, s in enumerate(clf.support_) if s]
        train_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct']) for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct']) for k, d in get_class_distribution(y_test).items()]

        logger.info('Model evaluation: \n'
              '== Training set ==\n'
              '\t # Features: {} | # Records: {}\n '
              '\tClass distribution:\n{}\n'
              '\tAccuracy: {}\n'
              '\tPrecision: {}\n'
              '\tMSE: {}\n' \
              '\tRecall: {}\n' \
              '\tF1: {}\n' \
              '== Test set ==\n'
              '\t # Features: {} | # Records: {}\n '
              '\tClass distribution:\n{}\n'
              '\tAccuracy: {}\n'
              '\tPrecision: {}\n'
              '\tMSE: {}\n' \
              '\tRecall: {}\n' \
              '\tF1: {}\n' \
              .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                      report['training_set']['accuracy'], report['training_set']['precision'], report['training_set']['mse'],
                      report['training_set']['recall'], report['training_set']['f1'],
                      X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                      report['test_set']['accuracy'], report['test_set']['precision'], report['test_set']['mse'],
                      report['test_set']['recall'], report['test_set']['f1']
                      )
        )

        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model':model_path,
            'params':params_path,
            'report':report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
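get_class_distribution is not defined in these snippets; judging from how it is consumed above (a per-class dict with 'count' and 'pct' entries), a compatible stand-in might look like the following (an assumption, not the project's implementation):

import numpy as np

def get_class_distribution_sketch(y):
    values, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    return {
        int(v): {'count': int(c), 'pct': round(100.0 * c / total, 2)}
        for v, c in zip(values, counts)
    }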
Example #14
def build(source_index, dest_index, W=10):
    _dataset = load_dataset(source_index, return_index=True)
    for _sym, entry in _dataset.items():
        _df = pd.read_csv(entry['csv'],
                          sep=',',
                          encoding='utf-8',
                          index_col='Date',
                          parse_dates=True)
        _target = pd.read_csv(entry['target_csv'],
                              sep=',',
                              encoding='utf-8',
                              index_col='Date',
                              parse_dates=True)

        ta = _df[entry['features']['ta']]
        cm = _df[entry['features']['cm']]

        # Price history facet (Daily variation of ohlc in last W trading days)
        ohlc = _df.loc[:, ['open', 'high', 'low', 'close']]
        ohlc['open'] = STL(ohlc.open).fit().resid
        ohlc['high'] = STL(ohlc.high).fit().resid
        ohlc['low'] = STL(ohlc.low).fit().resid
        ohlc['close'] = STL(ohlc.close).fit().resid
        ohlc.columns = ['open_resid', 'high_resid', 'low_resid', 'close_resid']
        history_facet = pd.concat(
            [ohlc] + [builder.make_lagged(ohlc, i) for i in range(1, W + 1)],
            axis='columns',
            verify_integrity=True,
            sort=True,
            join='inner')
        # Price trend facet (REMA/RSMA, MACD, AO, ADX, WD+ - WD-)
        trend_facet = ta[[
            "rsma_5_20", "rsma_8_15", "rsma_20_50", "rema_5_20", "rema_8_15",
            "rema_20_50", "macd_12_26", "ao_14", "adx_14", "wd_14"
        ]]
        # Volatility facet (CMO, ATRp)
        volatility_facet = ta[["cmo_14", "atrp_14"]]
        # Volume facet (Volume pct, PVO, ADI, OBV)
        volume_facet = pd.concat([
            _df.volume.pct_change().replace([np.inf, -np.inf], 0),
            ta[["pvo_12_26", "adi", "obv"]]
        ],
                                 axis='columns',
                                 verify_integrity=True,
                                 sort=True,
                                 join='inner')
        # On-chain facet
        cm_1 = cm.reindex(columns=[
            'adractcnt', 'txtfrvaladjntv', 'isstotntv', 'feetotntv', 'splycur',
            'hashrate', 'difficulty', 'txtfrcount'
        ]).pct_change()
        cm_2 = cm.reindex(columns=['isscontpctann'])
        chain_facet = pd.concat([cm_1, cm_2],
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')

        # Drop columns whose values are all nan or inf from each facet
        with pd.option_context('mode.use_inf_as_na',
                               True):  # Set option temporarily
            history_facet = history_facet.dropna(axis='columns', how='all')
            trend_facet = trend_facet.dropna(axis='columns', how='all')
            volatility_facet = volatility_facet.dropna(axis='columns',
                                                       how='all')
            volume_facet = volume_facet.dropna(axis='columns', how='all')
            chain_facet = chain_facet.dropna(axis='columns', how='all')

        improved_df = pd.concat([
            history_facet, trend_facet, volatility_facet, volume_facet,
            chain_facet
        ],
                                axis='columns',
                                verify_integrity=True,
                                sort=True,
                                join='inner')
        # Drop the first 30 rows
        #improved_df = improved_df[30:]

        # Add symbol to index
        feature_groups = {
            'price_history': [c for c in history_facet.columns],
            'trend': [c for c in trend_facet.columns],
            'volatility': [c for c in volatility_facet.columns],
            'volume': [c for c in volume_facet.columns],
            'chain': [c for c in chain_facet.columns],
        }
        logger.info('Saving {}'.format(_sym))
        save_symbol_dataset(dest_index,
                            _sym,
                            improved_df,
                            feature_groups=feature_groups,
                            target=_target)
        logger.info('Saved {}'.format(_sym))
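The price-history facet above keeps only the STL residual of each OHLC series; a standalone sketch of that step, assuming statsmodels' STL on daily data with an explicit weekly period (the snippet above relies on the index frequency instead):

import pandas as pd
from statsmodels.tsa.seasonal import STL

def stl_residual(series: pd.Series, period: int = 7) -> pd.Series:
    # Residual left after removing the STL trend and seasonal components.
    return STL(series, period=period).fit().resid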
Example #15
def classify_images(args):

    # assertions
    assert args.ensemble is None or args.ensemble in ENSEMBLE_TYPE, \
        "{} not a supported type. Only supported ensembling are {}".format(
            args.ensemble, ENSEMBLE_TYPE)
    if not args.ensemble:
        assert args.ncrops is None or (len(args.ncrops) == 1
                                       and args.ncrops[0] == 1)
    if args.defenses is not None:
        for d in args.defenses:
            assert DefenseType.has_value(d), \
                "\"{}\" defense not defined".format(d)
        # crops expected for each defense
        assert (args.ncrops is None or len(args.ncrops) == len(
            args.defenses)), ("Number of crops for each defense is expected")
        assert (args.crop_type is None or len(args.crop_type) == len(
            args.defenses)), ("crop_type for each defense is expected")
        # assert (len(args.crop_frac) == len(args.defenses)), (
        #     "crop_frac for each defense is expected")
    elif args.ncrops is not None:
        # no crop ensembling when defense is None
        assert len(args.ncrops) == 1
        assert args.crop_frac is not None and len(args.crop_frac) == 1, \
            "Only one crop_frac is expected as there is no defense"
        assert args.crop_type is not None and len(args.crop_type) == 1, \
            "Only one crop_type is expected as there is no defense"

    if args.defenses is None or len(args.defenses) == 0:
        defenses = [None]
    else:
        defenses = args.defenses

    all_defense_probs = None
    for idx, defense_name in enumerate(defenses):
        # initialize dataset
        defense = get_defense(defense_name, args)
        # Read preset params for adversary based on args
        adv_params = constants.get_adv_params(args, idx)
        print("| adv_params: ", adv_params)
        # setup crop
        ncrops = 1
        crop_type = None
        crop_frac = 1.0
        if args.ncrops:
            crop_type = args.crop_type[idx]
            crop_frac = args.crop_frac[idx]
            if crop_type == 'sliding':
                ncrops = 9
            else:
                ncrops = args.ncrops[idx]
        # Init custom crop function
        crop = transforms.Crop(crop_type, crop_frac)
        # initialize dataset
        dataset = load_dataset(args, 'valid', defense, adv_params, crop)
        # load model
        model, _, _ = get_model(args,
                                load_checkpoint=True,
                                defense_name=defense_name)

        # get crop probabilities for crops for current defense
        probs, targets = _eval_crops(args, dataset, model, defense, crop,
                                     ncrops, crop_type)

        if all_defense_probs is None:
            all_defense_probs = torch.zeros(len(defenses), len(dataset),
                                            probs.size(2))
        # Ensemble crop probabilities
        if args.ensemble == 'max':
            probs = torch.max(probs, dim=0)[0]
        elif args.ensemble == 'avg':  # for average ensembling
            probs = torch.mean(probs, dim=0)
        else:  # for no ensembling
            assert all_defense_probs.size(0) == 1
            probs = probs[0]
        all_defense_probs[idx, :, :] = probs

        # free memory
        dataset = None
        model = None

    # Ensemble defense probabilities
    if args.ensemble == 'max':
        all_defense_probs = torch.max(all_defense_probs, dim=0)[0]
    elif args.ensemble == 'avg':  # for average ensembling
        all_defense_probs = torch.mean(all_defense_probs, dim=0)
    else:  # for no ensembling
        assert all_defense_probs.size(0) == 1
        all_defense_probs = all_defense_probs[0]
    # Calculate top1 and top5 accuracy
    prec1, prec5 = accuracy(all_defense_probs, targets, topk=(1, 5))
    print('=' * 50)
    print('Results for model={}, attack={}, ensemble_type={} '.format(
        args.model, args.adversary, args.ensemble))
    prec1 = prec1[0]
    prec5 = prec5[0]
    print('| classification accuracy @1: %2.5f' % (prec1))
    print('| classification accuracy @5: %2.5f' % (prec5))
    print('| classification error @1: %2.5f' % (100. - prec1))
    print('| classification error @5: %2.5f' % (100. - prec5))
    print('| done.')
Example #16
def build_model(dataset,
                pipeline,
                experiment,
                current_target='class',
                test_size=0.3):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline,
                                                     experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline,
                                                       experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(
        dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)

    scoring = make_scorer(precision_score, zero_division=1, average='micro')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset,
                                 return_index=True,
                                 index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    logger.info('Start experiment: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in {'BTC': dataset_index['BTC']}.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            targets = pd.read_csv(data['target_csv'],
                                  sep=',',
                                  encoding='utf-8',
                                  index_col='Date',
                                  parse_dates=True)

            # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
            # replace infinity values with nan so that they can later be imputed to a finite value
            features = features.dropna(
                axis='columns', how='all').dropna().replace([np.inf, -np.inf],
                                                            np.nan)
            target = targets.loc[features.index][current_target]

            #X_train, X_test, y_train, y_test = train_test_split(features, target, shuffle=False, test_size=test_size)

            all_size = features.shape[0]
            train_size = int(all_size * (1 - test_size))
            features = detabularise(
                features[[c for c in features.columns if 'close' in c]])
            X_train = features.iloc[0:train_size]
            y_train = target.iloc[0:train_size]
            X_test = features.iloc[train_size:all_size]
            y_test = target.iloc[train_size:all_size]
            # Summarize distribution
            logger.info("Start Grid search")
            clf = ShapeletTransformClassifier(time_contract_in_mins=5)
            clf.fit(X_train, y_train)
            print('{} Score: {}'.format(_sym, clf.score(X_test, y_test)))
            pred = clf.predict(X_test)
            print(classification_report(y_test, pred))
            logger.info("End Grid search")

            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports