Example #1
def main():
    args = parse_args()

    data_path = Path(args.data_folder)
    output_path = Path(args.output)
    output_path.mkdir(parents=True, exist_ok=True)

    interval_size_scorer = IntervalScorer(mean_interval_size,
                                          {"confidence": args.confidence})
    error_rate_scorer = IntervalScorer(mean_error_rate,
                                       {"confidence": args.confidence})

    scorers = {
        "mean_interval_size": interval_size_scorer,
        "mean_error_rate": error_rate_scorer
    }

    for filepath in data_path.glob("*.arff"):
        X, y = load_arff_data(filepath)
        print(X.shape)
        for i in range(args.repeats):
            mfr = MondrianForestRegressor(n_estimators=args.n_estimators)
            results = prequential_evaluation(mfr, X, y, scorers,
                                             args.window_size)
            out_file = output_path / (filepath.stem + "_{}.json".format(i))
            results["arguments"] = args
            results["learner_params"] = mfr.get_params()
            out_file.write_text(json.dumps(results))
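Example #1 assumes a parse_args helper that is not shown. Below is a minimal sketch consistent with the attributes the snippet reads (data_folder, output, confidence, repeats, n_estimators, window_size); the defaults are illustrative, not the original values.

import argparse

def parse_args():
    # Hypothetical reconstruction of the missing argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("data_folder", help="directory containing *.arff files")
    parser.add_argument("output", help="directory for the JSON result files")
    parser.add_argument("--confidence", type=float, default=0.9)
    parser.add_argument("--repeats", type=int, default=1)
    parser.add_argument("--n-estimators", type=int, default=10)
    parser.add_argument("--window-size", type=int, default=100)
    return parser.parse_args()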
Example #2
    def __init__(self, batch_size):
        """

        :param batch_size: Integer value, defined by the competition and available at competition page
        :param server_port: Connection string ('IP:port')
        :param user_email: String, e-mail used for registering to competition
        :param token: String, received after subscription to a competition
        :param competition_code: String, received after subscription to a competition
        :param first_prediction: Prediction, class generated from the .proto file. Used to initiate communication
        with the server; it does not influence the results. Should contain the appropriate fields from the .proto file.
        """

        # mondrian
        self.mfr = MondrianForestRegressor(random_state=1,
                                           n_estimators=100,
                                           bootstrap=True)
        self.previous_target_3 = pd.Series()
        self.features_for_rowID = Queue()
        self.previous_train_batch = np.array([-1, -1, -1, -1, -1])
        # rrcf
        self.num_trees = 40
        self.tree_size = 256
        self.forest = []
        self.avg_codisp = {}
        self.curr_sum = 0
        self.curr_num = 0
        self.idx = 0

        self._init_modeling()

        # Busy-wait until the scheduled start time (21:00 local time).
        while True:
            print("wait")
            now = datetime.datetime.now()
            starttime = now.replace(hour=21, minute=0, second=0, microsecond=0)
            if now >= starttime:
                print(now)
                print("Start!")
                break

        self.batch_size = batch_size
        self.stop_thread = False
        self.predictions_to_send = Queue()
        self.channel = grpc.insecure_channel(
            'app.streaming-challenge.com:50051')
        self.stub = file_pb2_grpc.DataStreamerStub(self.channel)
        self.user_email = '*****@*****.**'
        self.competition_code = 'jR'  #oj
        self.token = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoieXU5OTA1MjRAZ21haWwuY29tIiwiY29tcGV0aXRpb25faWQiOiIxIn0.B7CAjAsEbTjp4l1K4GR1Y0IJZj6_mKEbKBXsXXJmGBg'
        self.predictions_to_send.put(
            file_pb2.Prediction(rowID=1000, target=333))
        self.metadata = self.create_metadata(user_id=self.user_email,
                                             code=self.competition_code,
                                             token=self.token)
Example #3
def test_partial_fit_equivalence():
    X, y = make_regression(random_state=0, n_samples=100)
    mfr = MondrianForestRegressor(random_state=0)
    mfr.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mfr, 0, X, y)

    X, y = make_classification(random_state=0, n_samples=100)
    mtc = MondrianForestClassifier(random_state=0)
    mtc.partial_fit(X, y)
    for batch_size in [10, 20, 25, 50, 90]:
        check_partial_fit_equivalence(batch_size, mtc, 0, X, y, is_clf=True)
Example #4
def test_fit_after_partial_fit():
    rng = np.random.RandomState(0)
    X = rng.randn(10, 5)
    y = np.floor(rng.randn(10))
    mfr = MondrianForestRegressor(random_state=0)
    check_fit_after_partial_fit(mfr, X, y)

    mfc = MondrianForestClassifier(random_state=0)
    check_fit_after_partial_fit(mfc, X, y)
Example #5
def test_min_samples_split():
    X_c, y_c = load_digits(return_X_y=True)
    X_r, y_r = make_regression(n_samples=10000, random_state=0)

    for mss in [2, 4, 10, 20]:
        mfr = MondrianForestRegressor(random_state=0, min_samples_split=mss)
        mfr.partial_fit(X_r[:X_r.shape[0] // 2], y_r[:X_r.shape[0] // 2])
        mfr.partial_fit(X_r[X_r.shape[0] // 2:], y_r[X_r.shape[0] // 2:])
        for est in mfr.estimators_:
            n_node_samples = est.tree_.n_node_samples[
                est.tree_.children_left != -1]
            assert_greater(np.min(n_node_samples) + 1, mss)

        mfc = MondrianForestClassifier(random_state=0, min_samples_split=mss)
        mfc.partial_fit(X_c[:X_c.shape[0] // 2], y_c[:X_c.shape[0] // 2])
        mfc.partial_fit(X_c[X_c.shape[0] // 2:], y_c[X_c.shape[0] // 2:])
        for est in mfc.estimators_:
            n_node_samples = est.tree_.n_node_samples[
                est.tree_.children_left != -1]
            assert_greater(np.min(n_node_samples) + 1, mss)
Example #6
def test_forest_attributes():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_false(hasattr(mr, "classes_"))
    assert_false(hasattr(mr, "n_classes_"))

    mr = MondrianForestClassifier(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_true(hasattr(mr, "classes_"))
    assert_true(hasattr(mr, "n_classes_"))
Example #7
def test_quantile_toy_data():
    rng = np.random.RandomState(1)
    x1 = rng.randn(1, 10)
    X1 = np.tile(x1, (10000, 1))
    x2 = 20.0 * rng.randn(1, 10)
    X2 = np.tile(x2, (10000, 1))
    X = np.concatenate((X1, X2))

    y1 = rng.randn(10000)
    y2 = 5.0 + rng.randn(10000)
    y = np.concatenate((y1, y2))

    est = MondrianForestRegressor(random_state=1)

    # est.set_params(max_depth=1)
    est.fit(X, y)
    for quantile in range(10, 90, 10):
        tree_quantile = 0.01 * quantile

        assert_array_almost_equal(
            est.predict_quantile(x1, quantile=tree_quantile),
            [np.percentile(y1, quantile)], 2)
        assert_array_almost_equal(
            est.predict_quantile(x2, quantile=tree_quantile),
            [np.percentile(y2, quantile)], 2)
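Note the two quantile conventions in play in Example #7: predict_quantile is called with a fraction in [0, 1], while np.percentile expects a percentage in [0, 100], hence the 0.01 * quantile conversion. A minimal NumPy-only sketch of that equivalence:

import numpy as np

rng = np.random.RandomState(1)
y = rng.randn(10000)

percent = 30                # np.percentile convention: [0, 100]
fraction = 0.01 * percent   # fractional convention: [0, 1]
# Both describe the same point of the empirical distribution.
print(np.percentile(y, percent))
print(np.quantile(y, fraction))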
Example #8
def check_partial_fit_equivalence(size_batch,
                                  f,
                                  random_state,
                                  X,
                                  y,
                                  is_clf=False):
    start_ptr = list(range(0, 100, size_batch))
    end_ptr = start_ptr[1:] + [100]
    if not is_clf:
        p_f = MondrianForestRegressor(random_state=random_state)
    else:
        p_f = MondrianForestClassifier(random_state=random_state)
    for start, end in zip(start_ptr, end_ptr):
        p_f.partial_fit(X[start:end], y[start:end])
    for est, p_est in zip(f.estimators_, p_f.estimators_):
        assert_array_equal(p_est.tree_.n_node_samples,
                           est.tree_.n_node_samples)
        assert_array_equal(p_est.tree_.threshold, est.tree_.threshold)
        assert_array_equal(p_est.tree_.feature, est.tree_.feature)
        assert_equal(p_est.tree_.root, est.tree_.root)
        assert_array_equal(p_est.tree_.value, est.tree_.value)
        assert_equal(est.tree_.n_node_samples[est.tree_.root], 100)
        assert_equal(p_est.tree_.n_node_samples[est.tree_.root], 100)
Example #9
def test_interval_scorer():
    # Fit a simple linear model
    n_samples = 200
    n_features = 10
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    # simple linear function without noise
    y = np.dot(X, w)

    mfr = MondrianForestRegressor()
    mfr.fit(X, y)
    # Create a scorer that measures the mean interval size
    interval_size_scorer = IntervalScorer(mean_interval_size,
                                          sign=-1,
                                          kwargs={'confidence': 0.9})
    # Get prediction intervals
    intervals = mfr.predict_interval(X, 0.9)

    interval_size = intervals[:, 1] - intervals[:, 0]
    calc_mean = np.mean(interval_size)
    # Ensure the scorer performs the correct calculation
    assert_almost_equal(interval_size_scorer(mfr, X, y), -1 * calc_mean)
Example #10
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)

    # For points contained in the training data, with max_depth set to None,
    # the mean should converge to the actual target value and the
    # variance should converge to 0.0.
    mean, std = mfr.predict(X, return_std=True)
    assert_array_almost_equal(mean, y, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points completely far away from the training data, this
    # should converge to the empirical mean and variance.
    # X is scaled to between -1.0 and 1.0
    X_inf = np.vstack(
        (30.0 * np.ones(X.shape[1]), -30.0 * np.ones(X.shape[1])))
    inf_mean, inf_std = mfr.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y.mean(), 1)
    assert_array_almost_equal(inf_std, y.std(), 2)
Example #11
from skgarden import MondrianForestClassifier
from skgarden import MondrianForestRegressor

train_test_split.__test__ = False

boston = load_boston()
# The time of split and feature chosen for splitting are highly
# scale-sensitive.
scaler = MinMaxScaler()
X, y = boston.data, boston.target

y = np.round(y)
X = scaler.fit_transform(X)

ensembles = [
    MondrianForestRegressor(random_state=0),
    MondrianForestClassifier(random_state=0)]


def check_boston(est):
    score = est.score(X, y)
    assert_greater(score, 0.94, "Failed with score = %f" % score)


def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    check_boston(mr)
    mr.partial_fit(X, y)
    check_boston(mr)
Example #12
def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    score = mr.score(X, y)
    assert_greater(score, 0.94, "Failed with score = %f" % score)
Example #13
class Client:
    """ gRPC Client class for streaming competition platform"""
    channel = None
    stub = None

    def __init__(self, batch_size):
        """

        :param batch_size: Integer value, defined by the competition and available at competition page
        :param server_port: Connection string ('IP:port')
        :param user_email: String, e-mail used for registering to competition
        :param token: String, received after subscription to a competition
        :param competition_code: String, received after subscription to a competition
        :param first_prediction: Prediction, class generated from the .proto file. Used to initiate communication
        with the server; it does not influence the results. Should contain the appropriate fields from the .proto file.
        """

        # mondrian
        self.mfr = MondrianForestRegressor(random_state=1,
                                           n_estimators=100,
                                           bootstrap=True)
        self.previous_target_3 = pd.Series()
        self.features_for_rowID = Queue()
        self.previous_train_batch = np.array([-1, -1, -1, -1, -1])
        # rrcf
        self.num_trees = 40
        self.tree_size = 256
        self.forest = []
        self.avg_codisp = {}
        self.curr_sum = 0
        self.curr_num = 0
        self.idx = 0

        self._init_modeling()

        # Busy-wait until the scheduled start time (21:00 local time).
        while True:
            print("wait")
            now = datetime.datetime.now()
            starttime = now.replace(hour=21, minute=0, second=0, microsecond=0)
            if now >= starttime:
                print(now)
                print("Start!")
                break

        self.batch_size = batch_size
        self.stop_thread = False
        self.predictions_to_send = Queue()
        self.channel = grpc.insecure_channel(
            'app.streaming-challenge.com:50051')
        self.stub = file_pb2_grpc.DataStreamerStub(self.channel)
        self.user_email = '*****@*****.**'
        self.competition_code = 'jR'  #oj
        self.token = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VyX2lkIjoieXU5OTA1MjRAZ21haWwuY29tIiwiY29tcGV0aXRpb25faWQiOiIxIn0.B7CAjAsEbTjp4l1K4GR1Y0IJZj6_mKEbKBXsXXJmGBg'
        self.predictions_to_send.put(
            file_pb2.Prediction(rowID=1000, target=333))
        self.metadata = self.create_metadata(user_id=self.user_email,
                                             code=self.competition_code,
                                             token=self.token)

    @staticmethod
    def create_metadata(user_id, code, token):
        """
        :param user_id:
        :param code:
        :param token:
        :return:
        """
        metadata = [(b'authorization', bytes(token, 'utf-8')),
                    (b'user_id', bytes(user_id, 'utf-8')),
                    (b'competition_id', bytes(code, 'utf-8'))]
        return metadata

    @staticmethod
    def create_forest(num_trees):

        forest = []
        for _ in range(num_trees):
            tree = rrcf.RCTree()
            forest.append(tree)

        return forest

    def partial_train(self, X_test, y_test):
        y_pred, y_std = self.mfr.predict(X_test, return_std=True)
        self.mfr.partial_fit(X_test, y_test)
        #print('pred : %f, std: %f, y: %f'%(y_pred, y_std, y_test))
        return y_pred, y_std

    def _init_modeling(self):
        network = pd.read_csv('initial_training_data.csv',
                              index_col='date',
                              parse_dates=['date'])

        self.forest = []
        for _ in range(self.num_trees):
            tree = rrcf.RCTree()
            self.forest.append(tree)

        train_len = len(network)
        #train_len = 1000
        train_start = 80000
        self.idx = 0

        print("start!")

        for index in range(train_start, train_len):
            point = float(network[index:index + 1].values)  # get one by one

            for tree in self.forest:
                if len(tree.leaves) > self.tree_size:
                    tree.forget_point(self.idx - self.tree_size)

                tree.insert_point(point, index=self.idx)

                if self.idx not in self.avg_codisp:
                    self.avg_codisp[self.idx] = 0
                self.avg_codisp[self.idx] += tree.codisp(
                    self.idx) / self.num_trees

            # avg_codisp is the average, over all trees, of how anomalous each tree considers this point
            mean = np.array(list(self.avg_codisp.values())).mean()
            std = np.array(list(self.avg_codisp.values())).std()

            z = (self.avg_codisp[self.idx] - mean) / std
            self.idx += 1

            if z > 3.0 or z < -3.0:
                # if abs(z-score) is over 3.0,
                # replace the value with the mean of the previous 5 days
                network.iloc[index] = network[index - 5:index].mean()

        print("init_modeling에서 anomaly detection 완료")

        print("init_modeling에서 trainign 시작")
        for i in range(7 + train_start, train_len):
            X_train = pd.Series()
            X_train['prev1'] = float(network[i - 7:i - 6]['target'].values)
            X_train['prev2'] = float(network[i - 6:i - 5]['target'].values)
            X_train['prev3'] = float(network[i - 5:i - 4]['target'].values)
            y_train = (network[i:i + 1]['target'].values)
            self.mfr.partial_fit(X_train.values.reshape(1, -1), y_train)
        print("train 완료")

        self.previous_target_3['prev3'] = float(
            network[train_len - 8:train_len - 7]['target'].values)
        self.previous_target_3['prev2'] = float(
            network[train_len - 7:train_len - 6]['target'].values)
        self.previous_target_3['prev1'] = float(
            network[train_len - 6:train_len - 5]['target'].values)
        self.previous_train_batch = network[train_len -
                                            5:train_len]['target'].values

        print('ended')

    def generate_predictions(self):
        """
        Sending predictions

        :return: Prediction
        """
        while True:
            try:
                prediction = self.predictions_to_send.get(block=True,
                                                          timeout=60)
                print("Prediction: ", prediction)
                yield prediction
            except queue.Empty:
                self.stop_thread = True
                break

    #check anomaly with RRCF
    def anomaly_detection(self, data):
        for tree in self.forest:
            if len(tree.leaves) > self.tree_size:
                tree.forget_point(self.idx - self.tree_size)

            tree.insert_point(data, index=self.idx)

            if self.idx not in self.avg_codisp:
                self.avg_codisp[self.idx] = 0
            self.avg_codisp[self.idx] += tree.codisp(self.idx) / self.num_trees
        # avg_codisp is the average, over all trees, of how anomalous each tree considers this point
        mean = np.array(list(self.avg_codisp.values())).mean()
        std = np.array(list(self.avg_codisp.values())).std()

        z = (self.avg_codisp[self.idx] - mean) / std
        self.idx += 1
        if z > 3.0 or z < -3.0:
            # if abs(z-score) is over 3.0,
            # replace the value with the mean of the previous training batch
            return self.previous_train_batch.mean()
        else:
            # if not over 3.0, there is no need to replace the value
            return data

    def loop_messages(self):
        """
        Getting messages (data instances) from the stream.

        :return:
        """

        # generate_predictions yields queued predictions one by one and sends them to the server

        messages = self.stub.sendData(self.generate_predictions(),
                                      metadata=self.metadata)
        test_idx = 0
        test_feature = self.previous_target_3

        try:
            for message in messages:

                message = json.loads(json_format.MessageToJson(message))
                print("message:", message)
                if message['tag'] == 'TEST':
                    print('test')
                    test_feature['prev3'] = test_feature['prev2']
                    test_feature['prev2'] = test_feature['prev1']
                    test_feature['prev1'] = float(
                        self.previous_train_batch[test_idx])

                    pred = self.mfr.predict(test_feature.values.reshape(1, -1))
                    # predict returns an array; the proto field expects a scalar
                    prediction = file_pb2.Prediction(rowID=message['rowID'],
                                                     target=float(pred[0]))
                    self.predictions_to_send.put(prediction)

                    test_idx = (test_idx + 1) % 5
                    print(test_idx)

                    print('test end')

                if message['tag'] == 'TRAIN':
                    print('train')
                    #training data to train my model.

                    target = message['target']
                    target = self.anomaly_detection(target)

                    print(self.previous_target_3)

                    # train using the values from i-5, i-6, i-7
                    if self.previous_target_3['prev3'] < 0:
                        self.previous_target_3['prev3'] = target
                    elif self.previous_target_3['prev2'] < 0:
                        self.previous_target_3['prev2'] = target
                    elif self.previous_target_3['prev1'] < 0:
                        self.previous_target_3['prev1'] = target
                    else:
                        print('else')
                        #replace the oldest value
                        self.previous_target_3[
                            'prev3'] = self.previous_target_3['prev2']  #-7
                        self.previous_target_3[
                            'prev2'] = self.previous_target_3['prev1']  #-6
                        self.previous_target_3['prev1'] = float(
                            self.previous_train_batch[0])  #-5

                        # partial fit with 3 previous values as feature
                        self.mfr.partial_fit(
                            self.previous_target_3.values.reshape(1, -1),
                            [target])

                        # store the target value of the current training batch
                        self.previous_train_batch = np.roll(
                            self.previous_train_batch, -1)
                        self.previous_train_batch[4] = target

                        print('else end')

                    print('train end')

                if self.stop_thread: break

        except Exception as e:
            print(str(e))

    def run(self):
        """
        Start thread.
        """
        print("Start")
        t1 = Thread(target=self.loop_messages)
        t1.start()
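A minimal usage sketch for this class, assuming the generated file_pb2/file_pb2_grpc modules and initial_training_data.csv are available; note that the constructor blocks in the busy-wait loop until 21:00 local time before connecting:

if __name__ == '__main__':
    client = Client(batch_size=5)  # batch size is defined by the competition
    client.run()                   # starts loop_messages on a background thread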
Example #14
import numpy as np
from sklearn.datasets import load_boston
X_train, y_train = load_boston(return_X_y=True)
#print(X_train)
print(X_train.shape)
print(np.amax(X_train))
print(np.amin(X_train))

### Use MondrianForests for variance estimation
from skgarden import MondrianForestRegressor
mfr = MondrianForestRegressor()
mfr.fit(X_train, y_train)
y_mean, y_std = mfr.predict(X_train, return_std=True)
print(y_mean)
#print(y_std)

### Use QuantileForests for quantile estimation
#from skgarden import RandomForestQuantileRegressor
#rfqr = RandomForestQuantileRegressor(random_state=0)
#rfqr.fit(X, y)
#y_mean = rfqr.predict(X)
#y_median = rfqr.predict(X, 50)
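For reference, a runnable version of the commented-out quantile section, assuming skgarden's RandomForestQuantileRegressor, whose predict accepts a quantile given as a percentage:

from skgarden import RandomForestQuantileRegressor

rfqr = RandomForestQuantileRegressor(random_state=0)
rfqr.fit(X_train, y_train)
y_mean = rfqr.predict(X_train)                 # mean prediction
y_median = rfqr.predict(X_train, quantile=50)  # 50th percentile
print(y_median[:5])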
Example #15
            scaler_X = preprocessing.MinMaxScaler()
            features = scaler_X.fit_transform(features)

            features = pd.DataFrame(features)

            scaler_y = preprocessing.MinMaxScaler()
            labels = scaler_y.fit_transform(labels)
            labels = pd.DataFrame(labels)

            #REGRESSORS
            PAR = PassiveAggressiveRegressor()
            SGDR = SGDRegressor()
            MLPR = MLPRegressor()
            RHT = RegressionHoeffdingTree()
            RHAT = RegressionHAT()
            MFR = MondrianForestRegressor()
            MTR = MondrianTreeRegressor()

            regressors = [PAR, SGDR, MLPR, RHT, RHAT, MFR, MTR]  # 7 regressors

            regressors_names = []
            for r in range(len(regressors)):
                reg_name = regressors[r].__class__.__name__

                if reg_name == 'PassiveAggressiveRegressor':
                    regressors_names.append('PAR')
                elif reg_name == 'SGDRegressor':
                    regressors_names.append('SGDR')
                elif reg_name == 'MLPRegressor':
                    regressors_names.append('MLPR')
                elif reg_name == 'RegressionHoeffdingTree':
                    regressors_names.append('RHT')
                elif reg_name == 'RegressionHAT':
                    regressors_names.append('RHAT')
                elif reg_name == 'MondrianForestRegressor':
                    regressors_names.append('MFR')
                elif reg_name == 'MondrianTreeRegressor':
                    regressors_names.append('MTR')
Example #16
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)
    check_mean_std_forest_regressor(mfr)
    mfr.partial_fit(X, y)
    check_mean_std_forest_regressor(mfr)
Example #17
def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    check_boston(mr)
    mr.partial_fit(X, y)
    check_boston(mr)
Example #18
def RF_regressor(X_data,Y_data,options=None):
    from sklearn.ensemble import RandomForestRegressor

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch   = False
    GS_settings  = None
    randomsearch = False
    RS_settings  = None
    feature_selection = False
    accuracy = False
    cv_type = 'logo'
    scoring = 'neg_mean_absolute_error'
    mondrian = False
    search_std = False

    if options is not None:

        if "RF_parameters" in options:
            params = options['RF_parameters']

        if "grid_search" in options:
            from sklearn.model_selection import GridSearchCV
            import time
            gridsearch = True
            GS_params = options['grid_search']['parameter_grid']
            if "settings" in options['grid_search']: GS_settings = options['grid_search']['settings']
            if "search std" in options['grid_search']:
                search_std = options['grid_search']['search std']

        if "random_search" in options:
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            import time
            randomsearch = True
            RS_params, RS_Nmax = convert_param_dist(options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if "settings" in options['random_search']: RS_settings = options['random_search']['settings']

        if randomsearch and gridsearch: quit('********** Stopping! grid_search and random_search both set *********')

        if "feature_selection" in options:
            from cfd2ml.utilities import RFE_perm
            feature_selection = True
            feats = options['feature_selection']['feats']
#            if "step"         in options['feature_selection']: step         = options['feature_selection']['step']
#            if "min_features" in options['feature_selection']: min_features = options['feature_selection']['min_features']
            if randomsearch or gridsearch: quit('******** Stopping! grid/random_search and feature selection both set ********')

        if "accuracy" in options:
            accuracy = options['accuracy']
            if accuracy:
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        if "scoring" in options:
            scoring = options['scoring']

        if "cv_type" in options:
            cv_type = options['cv_type']

        if "mondrian" in options:
            mondrian = options['mondrian']
            if mondrian: from skgarden import MondrianForestRegressor

    ##############
    # Prepare data
    ##############
    if(cv_type=='logo'): groups = X_data['group']
    X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header  = Y_data.name

    nX = X_headers.size
    print('\nFeatures:')
    for i in range(0,nX):
        print('%d/%d: %s' %(i+1,nX,X_headers[i]) )
    print('\nTarget: ', Y_header)
  
    ########################
    # Prepare other settings
    ########################
    # Setting cross-validation type (either leave-one-group-out or 10-fold)
    if(cv_type=='logo'):
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif(cv_type=='kfold'):
        from sklearn.model_selection import StratifiedKFold
        print('\nUsing 10-fold cross validation')
        k_fold = StratifiedKFold(n_splits=10, random_state=42,shuffle=True)
        cv = k_fold.split(X_data,Y_data)

    #########################
    # Training the regressor
    #########################
    if gridsearch:
        # Finding optimal hyperparameters with GridSearchCV
        if mondrian:
            print('\n Performing GridSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params,random_state=42,bootstrap=False)
            if search_std: # MESSY HACK! Ignore "best model etc" if using this
                def my_scorer(model, X, y_true):
                    y_pred, y_sd = model.predict(X,return_std=True)
                    return np.mean(y_sd)
                scoring=my_scorer
        else:            
            print('\n Performing GridSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params,random_state=42)
        if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
        GS_regr = GridSearchCV(estimator=regr,param_grid=GS_params, cv=cv, scoring=scoring, iid=False, verbose=2, **GS_settings)
        GS_regr.fit(X_data,Y_data)

        # Write out results to file
        scores_df = pd.DataFrame(GS_regr.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')

        # Pick out best results
        best_params = GS_regr.best_params_
        best_score  = GS_regr.best_score_
        regr = GS_regr.best_estimator_  # (this regr has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    elif randomsearch:
        # Finding optimal hyperparameters with RandomSearchCV
        if mondrian:
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params,random_state=42,bootstrap=False)
        else:            
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params,random_state=42)
        if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
        RS_regr = RandomizedSearchCV(estimator=regr,param_distributions=RS_params, cv=cv, scoring=scoring,iid=False, verbose=2, error_score=np.nan, **RS_settings)
        RS_regr.fit(X_data,Y_data)
        
        # Write out results to file
        scores_df = pd.DataFrame(RS_regr.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')

        # Pick out best results
        best_params = RS_regr.best_params_
        best_score  = RS_regr.best_score_
        regr = RS_regr.best_estimator_  # (this regr has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)


    else:
        # Train RF regressor with hyperparameters given by user
        if mondrian:
            print('\nTraining mondrian forest regressor with given hyperparameters')
            regr = MondrianForestRegressor(**params,bootstrap=False)
        else:            
            print('\nTraining random forest regressor with given hyperparameters')
            regr = RandomForestRegressor(**params)

        # Feature selection before final fit
        if (feature_selection):
            if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
#            [nfeats,scores,traintimes,predtimes], bestscore, bestfeat, featsets = RFE_perm(regr,X_data,Y_data,cv=cv,scoring=scoring,step=step,min_features=min_features,timing=True)
            [nfeats,scores,traintimes,predtimes], bestscore, bestfeat, featsets = RFE_perm(regr,X_data,Y_data,feats,cv=cv,scoring=scoring,timing=True)

            if (scoring=='neg_mean_absolute_error'):
                scores = -scores
                bestscore  = -bestscore
            elif(scoring=='neg_mean_squared_error'):
                scores = np.sqrt(-scores)
                bestscore  = np.sqrt(-bestscore)
            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(nfeats,100*scores,lw=2)
            plt.xlabel('$N_{features}$')
            plt.ylabel('Score (%)')
            plt.figure()
            plt.plot(nfeats,traintimes,label='Training',lw=2)
            plt.plot(nfeats, predtimes,label='Prediction',lw=2)
            plt.xlabel('$N_{features}$')
            plt.ylabel('Time (s)')
            plt.legend()
            plt.show()

            print('Best score: %.2f' %(100*bestscore))
            print('Feature set:')
            print(X_headers[bestfeat])

            # Save results in CSV file
            featselect_df = pd.DataFrame(featsets,columns=X_headers)
            featselect_df['score'] = scores
            featselect_df['traintimes'] = traintimes
            featselect_df['predtimes'] = predtimes
            featselect_df['nfeats'] = nfeats
            featselect_df.to_csv('FeatSelect_results.csv')

            # cut down to optimal feature set
            X_data = X_data.iloc[:,bestfeat]

        # Fit model to data
        regr.fit(X_data,Y_data)

    # Cross validation accuracy metrics
    if accuracy:
        print('\nPerforming cross validation to determine train and test accuracy/error')

        # Get generator object depending on cv strategy
        if (cv_type=='logo'): 
            cv = logo.split(X_data,Y_data,groups)
        elif(cv_type=='kfold'):
            cv = k_fold.split(X_data,Y_data)  # Need to regen "Generator" object

        from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        # Init lists
        train_r2  = []
        test_r2   = []
        train_MAE = []
        test_MAE  = []
        train_MSE = []
        test_MSE  = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train regressor on this fold (clone so the fitted regr returned below is not overwritten)
            from sklearn.base import clone
            regr_cv = clone(regr)
            regr_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = regr_cv.predict(X_train)
            Y_pred_test  = regr_cv.predict(X_test )

            # r2 scores
            r2score = r2_score(Y_test , Y_pred_test)
            train_r2.append(r2_score(Y_train, Y_pred_train) )
            test_r2.append(r2score)
            # Mean absolute error scores
            MAEscore = mean_absolute_error(Y_test , Y_pred_test)
            train_MAE.append(mean_absolute_error(Y_train, Y_pred_train) )
            test_MAE.append(MAEscore)
            # Mean squared error scores
            MSEscore = mean_squared_error(Y_test , Y_pred_test)
            train_MSE.append(mean_squared_error(Y_train, Y_pred_train) )
            test_MSE.append(MSEscore)

            # Print validation scores (training scores are stored to print mean later, but not printed for each fold)
            if(cv_type=='logo'):
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif(cv_type=='kfold'):
                print('\nFold = ', i)
            print('-------------------')
            print('r2 score = %.2f %%' %(r2score*100) )
            print('Mean absolute error = %.2f %%' %(MAEscore*100) )
            print('Mean squared error = %.2f %%' %(MSEscore*100) )

            i += 1

        # Print performance scores
        print('\nMean training scores:')
        print('r2 score = %.2f %%' %(np.mean(train_r2)*100) )
        print('Mean absolute error = %.2f %%' %(np.mean(train_MAE)*100) )
        print('Mean squared error = %.2f %%' %(np.mean(train_MSE)*100) )
    
        print('\nMean validation scores:')
        print('r2 score = %.2f %%' %(np.mean(test_r2)*100) )
        print('Mean absolute error = %.2f %%' %(np.mean(test_MAE)*100) )
        print('Mean squared error = %.2f %%' %(np.mean(test_MSE)*100) )
        

    return regr
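For reference, a minimal invocation sketch of RF_regressor under stated assumptions: X_data is a DataFrame with a 'group' column (required by the default 'logo' cross-validation), Y_data is a named Series, and the hypothetical options dict only exercises code paths shown above:

import pandas as pd

# Hypothetical toy data: two groups, three features, one target.
X = pd.DataFrame({'group': [0, 0, 0, 1, 1, 1],
                  'f1': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  'f2': [1.0, 0.9, 0.8, 0.7, 0.6, 0.5],
                  'f3': [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]})
Y = pd.Series([0.5, 0.6, 0.7, 0.8, 0.9, 1.0], name='target')

options = {'RF_parameters': {'n_estimators': 10},
           'mondrian': True}  # fit a MondrianForestRegressor instead of a RandomForestRegressor
regr = RF_regressor(X, Y, options=options)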