def test_adaptive_random_forests_nb():
    """Regression test for AdaptiveRandomForest with ``leaf_prediction='nb'``.

    A three-tree forest is trained on a seeded two-class RandomTreeGenerator
    stream. One prediction is recorded every ``wait_samples`` samples (always
    before the learner is fitted on that sample) and the recorded sequence is
    compared with the sequence stored from the last known version. The type
    returned by ``predict`` and the text returned by ``get_info`` are checked
    as well.
    """
    stream = RandomTreeGenerator(tree_random_state=112,
                                 sample_random_state=112,
                                 n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3,
                                   random_state=112,
                                   leaf_prediction='nb')

    # Warm-up batch before the evaluation loop starts.
    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples; the very first sample is never tested.
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
        1
    ]

    # The performance implied by these predictions does not need to be
    # guaranteed. The check exists so that any change to the predictions is
    # caught by the unit test, which helps prevent accidental changes.
    assert isinstance(learner.predict(X), np.ndarray)
    # Both sides are plain lists, so direct equality is the whole check (and
    # avoids np.alltrue, which no longer exists in NumPy 2.0).
    assert predictions == last_version_predictions

    expected_info = (
        "AdaptiveRandomForest(binary_split=False, disable_weighted_vote=False,\n"
        "                     drift_detection_method=ADWIN(delta=0.001), grace_period=50,\n"
        "                     lambda_value=6, leaf_prediction='nb',\n"
        "                     max_byte_size=33554432, max_features=5,\n"
        "                     memory_estimate_period=2000000, n_estimators=3,\n"
        "                     nb_threshold=0, no_preprune=False, nominal_attributes=None,\n"
        "                     performance_metric='acc', random_state=112,\n"
        "                     remove_poor_atts=False, split_confidence=0.01,\n"
        "                     split_criterion='info_gain', stop_mem_management=False,\n"
        "                     tie_threshold=0.05,\n"
        "                     warning_detection_method=ADWIN(delta=0.01))"
    )
    assert learner.get_info() == expected_info
# Example #2
# 0
def test_adaptive_random_forests():
    """Regression test for AdaptiveRandomForest with its default leaves.

    A three-tree forest is trained on a seeded RandomTreeGenerator stream.
    One prediction is recorded every ``wait_samples`` samples (always before
    the learner is fitted on that sample) and the recorded sequence is
    compared with the sequence stored from the last known version. The type
    returned by ``predict`` is checked as well.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3,
                                   random_state=112)

    # Warm-up batch before the evaluation loop starts.
    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples; the very first sample is never tested.
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    # Defined once, after the loop: the expected values never change between
    # iterations, so there is no reason to rebuild the list per sample.
    last_version_predictions = [1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
                                1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
                                1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0]

    # The performance implied by these predictions does not need to be
    # guaranteed. The check exists so that any change to the predictions is
    # caught by the unit test, which helps prevent accidental changes.
    # If this test fails, make sure that what is being worked on *should*
    # change the predictions of ARF.
    if sys.version_info >= (3, 6):
        # Temporarily limited to 3.6+: earlier interpreters give different
        # predictions.
        # Both sides are plain lists, so direct equality is the whole check
        # (and avoids np.alltrue, which no longer exists in NumPy 2.0).
        assert predictions == last_version_predictions

    assert isinstance(learner.predict(X), np.ndarray)
def _find_pending_datastream(experiment_directory, name):
    """Return ``(arff_path, pickle_path)`` for the first dataset left to run.

    Scans ``experiment_directory`` for ``*.ARFF`` files. A file qualifies when
    a ``<stem>_concept_chain.pickle`` file sits next to it and there is no
    ``<name>.csv`` results file larger than 2000 bytes in the same directory.
    Returns ``(None, None)`` when nothing qualifies.
    """
    fns = glob.glob(os.sep.join([experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        # glob matching can be case-insensitive depending on the platform, so
        # the exact upper-case extension is re-checked here.
        if fn.split('.')[-1] != 'ARFF':
            continue
        fn_path, _, actual_fn = fn.rpartition(os.sep)
        print(actual_fn)
        print(fn_path)
        pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
        pickle_full_fn = os.sep.join([fn_path, pickle_fn])
        csv_full_fn = os.sep.join([fn_path, f"{name}.csv"])
        print(csv_full_fn)
        if not os.path.exists(pickle_full_fn):
            continue
        # A results file above 2000 bytes is treated as an already finished run.
        if os.path.exists(csv_full_fn) and os.path.getsize(csv_full_fn) > 2000:
            print('csv exists')
            continue
        return fn, pickle_full_fn
    return None, None


def _run_arf_prequential(datastream_filename, options, name):
    """Evaluate AdaptiveRandomForest on an ARFF file, predicting then fitting.

    Every sample is first predicted and then used for training. The running
    accuracy is logged every 200 samples and the log is written to
    ``<experiment_directory>/<name>.csv``.
    """
    data = arff.loadarff(datastream_filename)
    df = pd.DataFrame(data[0], dtype='float64')
    df['y0'] = df['y0'].astype('int64')
    print(df.info())
    datastream = DataStream(df)
    datastream.prepare_for_use()

    print(datastream.target_values)
    learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
    right = 0
    seen = 0
    overall_log = []
    while datastream.has_more_samples():
        X, y = datastream.next_sample()
        prediction = learner.predict(X)
        if prediction[0] == y[0]:
            right += 1
        seen += 1
        learner.partial_fit(X, y)
        if seen % 200 == 0:
            accuracy = right / seen
            overall_log.append((seen, accuracy))
            print(f'ex: {seen}, Acc: {accuracy}\r', end="")
    overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
    overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
    print("")
    # An empty stream has no defined accuracy; report NaN instead of raising
    # ZeroDivisionError.
    final_accuracy = right / seen if seen else float('nan')
    print(f'Accuracy: {final_accuracy}')


def start_run(options):
    """Run one experiment on the first unprocessed dataset in a directory.

    Reads ``experiment_directory``, ``moa_learner``, ``concept_limit``,
    ``using_linux`` and ``moa_location`` from ``options``. The MOA launcher
    script (``.sh`` when ``using_linux`` is set, ``.bat`` otherwise) is
    written on every call. When ``moa_learner`` is ``'arf'`` the dataset is
    evaluated in-process with AdaptiveRandomForest; any other learner is run
    by executing the launcher script.

    The concept-chain pickle must contain at least two keys: the stream
    length is extrapolated from the last two of them.

    Returns None. Returns early, after printing a message, when the
    directory does not exist or no dataset is left to process.
    """
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)

    datastream_filename, datastream_pickle_filename = _find_pending_datastream(
        options.experiment_directory, name)
    if datastream_filename is None:
        print('Not datastream file')
        return
    print(datastream_filename)

    script_ext = 'sh' if options.using_linux else 'bat'
    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{script_ext}"

    # The launcher script is regenerated unconditionally, even if one exists.
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # experiment directories whose contents are trusted.
    with open(datastream_pickle_filename, 'rb') as f:
        concept_chain = pickle.load(f)
    print(concept_chain)
    concepts = sorted(concept_chain.keys())
    # Last key plus the gap between the last two keys.
    num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
    stream_dir, _, stream_file = datastream_filename.rpartition(os.sep)
    stream_string = moaLink.get_moa_stream_from_filename(stream_dir,
                                                         stream_file)
    moa_string = moaLink.make_moa_command(stream_string,
                                          options.moa_learner,
                                          options.concept_limit,
                                          'int',
                                          num_examples,
                                          config.report_window_length,
                                          options.experiment_directory,
                                          is_bat=not options.using_linux)
    moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)

    # NOTE(review): process_time() counts CPU time of this process only, so
    # time spent inside the MOA subprocess is presumably not reflected in the
    # figure printed below - confirm this is the intended measurement.
    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        _run_arf_prequential(datastream_filename, options, name)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
class ARFPredictor(AbstractPredictor):
    """Predictor backed by an incrementally trained AdaptiveRandomForest.

    The wrapped classifier is fitted only on samples it has not seen yet;
    ``_trained_samples`` counts how many leading rows of the training data
    have already been passed to ``partial_fit``.
    """

    def __init__(self, n_estimators=10, max_features='auto', grace_period=50):
        super().__init__()
        self._n_estimators = n_estimators
        self._max_features = max_features
        self._classifier = self._new_classifier(grace_period)
        self._trained_samples = 0

    def _new_classifier(self, grace_period):
        """Build a forest from the current settings with a fixed seed of 42."""
        return AdaptiveRandomForest(
            n_estimators=self._n_estimators,
            max_features=self._max_features,
            grace_period=grace_period,
            random_state=42)

    def predict(self, training_data, training_labels, todays_features,
                training_years, trading_days_per_year) -> int:
        """Fit on the not-yet-seen training rows, then predict one sample.

        Only the rows from index ``_trained_samples`` onward are passed to
        ``partial_fit``; the class list is taken from all of
        ``training_labels``. ``training_years`` and ``trading_days_per_year``
        are accepted but not used by this method.

        Returns the last element of the classifier's prediction for
        ``todays_features``.
        """
        new_data = training_data[self._trained_samples:]
        new_labels = training_labels[self._trained_samples:]
        self._classifier.partial_fit(new_data, new_labels,
                                     np.unique(training_labels))
        self._trained_samples += len(new_labels)

        prediction = self._classifier.predict([todays_features])
        return prediction[-1]

    def _walk_forward_scores(self, frame, num_features, trading_frequency,
                             training_years, trading_days_per_year):
        """Return ``(accuracy, f1)`` of day-by-day predictions for one stock.

        Features are read from columns ``I1`` .. ``I<num_features>`` and
        labels from column ``target``. For every trading day the classifier
        is given the rows before the previously predicted day as training
        data (for the first day, the rows before
        ``first_trading_day - trading_frequency``).
        """
        X = frame[['I{}'.format(i) for i in range(1, num_features + 1)]]
        y = frame['target']
        first_trading_day = (training_years * trading_days_per_year
                             + trading_frequency)
        last_trading_day = first_trading_day - trading_frequency
        total_days = len(y)

        preds = []
        for trading_day in range(first_trading_day, total_days):
            sys.stdout.write('\r%i/%i' % (trading_day, total_days))
            sys.stdout.flush()
            preds.append(
                self.predict(X.iloc[:last_trading_day].values,
                             y.iloc[:last_trading_day].values,
                             X.iloc[trading_day],
                             training_years,
                             trading_days_per_year))
            last_trading_day = trading_day
        print('')
        preds = pd.Series(preds, name='Predictions')

        actual = y.iloc[first_trading_day:]
        return (metrics.accuracy_score(actual, preds),
                metrics.f1_score(actual, preds))

    def tune(self,
             stock_data,
             symbols,
             num_features,
             measure='f1',
             trading_frequency=10,
             training_years=3,
             trading_days_per_year=246):
        """Grid-search grace period and forest size over the given stocks.

        For every parameter combination a fresh classifier is trained and
        scored on each symbol in ``symbols``; the per-stock scores are
        averaged. ``measure`` selects which mean score picks the winner
        (``'accuracy'`` or ``'f1'``).

        Note that the instance is left configured with the last combination
        evaluated, not with the winning one.

        Returns ``[n_estimators, max_features]`` of the best combination.

        Raises NotImplementedError for any other ``measure``. The check runs
        before the search starts so an unsupported value does not cost a full
        grid search first.
        """
        from itertools import product

        if measure not in ('accuracy', 'f1'):
            raise NotImplementedError(
                'unsupported measure: {!r}'.format(measure))

        gp_space = [10, 15, 20]
        n_est_space = [50, 100]
        max_f_space = [round(sqrt(num_features))]
        accuracies = []
        f_scores = []
        params = []
        for gp, n_estimators, max_features in product(gp_space, n_est_space,
                                                      max_f_space):
            accs = []
            fs = []
            for stock in symbols:
                print(stock)
                # Start every stock from an untrained forest.
                self._n_estimators = n_estimators
                self._max_features = max_features
                self._trained_samples = 0
                self._classifier = self._new_classifier(gp)

                accuracy, f_score = self._walk_forward_scores(
                    stock_data[stock], num_features, trading_frequency,
                    training_years, trading_days_per_year)
                accs.append(accuracy)
                fs.append(f_score)

            mean_acc = np.array(accs).mean()
            accuracies.append(mean_acc)

            mean_f_score = np.array(fs).mean()
            f_scores.append(mean_f_score)

            params.append((n_estimators, max_features, gp))
            print('\nARF  n_estimators=%i max_features=%i gp=%i' %
                  (n_estimators, max_features, gp))
            print('ARF    Accuracy: %.3f' % mean_acc)
            print('ARF    F1 Score: %.3f' % mean_f_score)

        print('\nARF FINAL RESULTS')
        if measure == 'accuracy':
            scores, label = np.array(accuracies), 'accuracy'
        else:
            scores, label = np.array(f_scores), 'f score'
        (n_estimators, max_features, gp) = params[scores.argmax()]
        print(
            'ARF  Best result: %s %.3f (n_estimators=%i max_features=%i gp=%i)'
            % (label, scores.max(), n_estimators, max_features, gp))
        return [n_estimators, max_features]