コード例 #1
0
def test_hoeffding_tree_nb(test_path):
    """Regression test for HoeffdingTreeClassifier with ``leaf_prediction='nb'``.

    A seeded RandomTreeGenerator stream is consumed test-then-train for 5000
    samples; every 100th sample (except the very first) is predicted before
    being learned. The collected class predictions and the learner's
    ``get_info()`` string are compared against stored expectations.

    Parameters
    ----------
    test_path
        Not used by this test; kept so the signature stays unchanged.
    """
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    # Every feature from index 5 up to the last one is declared nominal.
    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='nb')

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    # Probabilities are collected but not asserted on; the call still
    # exercises predict_proba on the partially trained tree.
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (skipping sample 0, before any training)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2, 0, 2, 1, 1, 2, 1, 3, 0, 1, 1, 1, 1, 0, 3,
        1, 2, 1, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 0, 0, 0, 0, 1, 3,
        2
    ])

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    # Collapse every run of whitespace (including line breaks) to one space.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
コード例 #2
0
ファイル: learner.py プロジェクト: Vini7x/meta_act
def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    """Run a Hoeffding tree over ``data`` test-then-train and record its hits.

    The last column of ``data`` is taken as the integer target, all other
    columns as features. The first ``pre_train_size`` samples are used only
    for an initial fit; each remaining sample is predicted first and then
    learned.

    Parameters
    ----------
    data
        2-D array indexed as ``data[:, :-1]`` / ``data[:, -1]``.
    pre_train_size
        Number of samples drawn from the stream for the initial fit.
    **hf_kwargs
        Forwarded unchanged to ``HoeffdingTreeClassifier``.

    Returns
    -------
    list of int
        One entry per evaluated sample: 1 when the prediction matched the
        true label, 0 otherwise (despite the function name, these are hits,
        not errors).
    """
    features, labels = data[:, :-1], data[:, -1].astype(int)
    stream = DataStream(features, labels)
    tree = HoeffdingTreeClassifier(**hf_kwargs)

    # Warm-up fit on the leading chunk of the stream.
    warmup_X, warmup_y = stream.next_sample(pre_train_size)
    tree.partial_fit(warmup_X, warmup_y, classes=stream.target_values)

    hits = []
    while stream.has_more_samples():
        sample_X, sample_y = stream.next_sample()

        # Score the sample before the model has seen it ...
        predicted = tree.predict(sample_X)
        hits.append(int(predicted[0] == sample_y[0]))

        # ... then learn from it.
        tree.partial_fit(sample_X, sample_y, classes=stream.target_values)

    return hits
コード例 #3
0
# Script fragment: streams samples from `elec_stream`, predicts each with `ht`
# and feeds the outcome to a DDM drift detector.
# NOTE(review): n_global, grace_end, detect_end, grace, elec_stream, ht, RT_ddm
# and TP_ddm are used below but defined outside this fragment; the loop body
# may also continue past what is visible here.

# Bookkeeping containers. pi, mine_x_mean, mine_sum and mine_threshold are not
# touched anywhere in the visible part of the loop.
pi = []
mine_x_mean = []
mine_sum = []
mine_threshold = []
# Read through np.mean() further down; nothing visible here appends to them.
pred_grace_ht = []
pred_grace_ht_p = []
# Stays None in the visible code, so the `ht_p is not None` branch below is
# only reachable if it is assigned elsewhere — presumably a second model;
# TODO confirm.
ht_p = None
# Running count of samples where the prediction equalled the label.
ML_accuracy = 0

ddm = DDM()
# NOTE(review): presumably a heap profiler handle (guppy's hpy) — confirm; it
# is not used in the visible code.
h = hpy()
while elec_stream.has_more_samples():
    n_global += 1

    X_test, y_test = elec_stream.next_sample()
    y_predict = ht.predict(X_test)

    # Time the detector update together with the accuracy bookkeeping.
    ddm_start_time = time.time()
    # The detector receives the "prediction was wrong" comparison result.
    ddm.add_element(y_test != y_predict)
    ML_accuracy += 1 if y_test == y_predict else 0
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)
    # Only report once the sample counter has passed both grace_end and
    # detect_end.
    if (n_global > grace_end):
        if (n_global > detect_end):
            if ht_p is not None:
                # The reported drift position lies two `grace` lengths before
                # detect_end.
                drift_point = detect_end - 2 * grace
                print("Accuracy of ht: " + str(np.mean(pred_grace_ht)))
                print("Accuracy of ht_p: " + str(np.mean(pred_grace_ht_p)))
                # Recorded in TP_ddm only when ht_p's mean score beats ht's.
                if (np.mean(pred_grace_ht_p) > np.mean(pred_grace_ht)):
                    print("TP detected at: " + str(drift_point))
                    TP_ddm.append(drift_point)
コード例 #4
0
def test_hoeffding_tree_nba(test_path):
    """Regression test for HoeffdingTreeClassifier with its default leaf prediction.

    A seeded RandomTreeGenerator stream is consumed test-then-train for 5000
    samples; every 100th sample (except the very first) is predicted before
    being learned. The test then checks, in order: the class predictions, the
    class probabilities (against ``test_hoeffding_tree.npy``), the
    ``get_info()`` string, the model description, the return types of
    ``predict``/``predict_proba``, and finally the rule description after
    20000 further samples learned with ``split_criterion='hellinger'``.

    Parameters
    ----------
    test_path
        Directory that contains ``test_hoeffding_tree.npy``.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
                                 n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6, min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    # Every feature from index 5 up to the last one is declared nominal.
    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (skipping sample 0, before any training)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
                                       0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
                                       1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
                                       1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
                                       0, 2, 0, 0, 0, 0, 1, 3, 2])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')
    data = np.load(test_file)

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nba', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    # Collapse every run of whitespace (including line breaks) to one space.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'

    assert learner.get_model_description() == expected_model_1
    # X is the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Keep learning on a large batch with a different split criterion and
    # check the rules extracted from the resulting tree.
    X, y = stream.next_sample(20000)
    learner.split_criterion = 'hellinger'
    learner.partial_fit(X, y)

    expected_rules = (
        'Att (5) == 0.000 and Att (12) == 0.000 | class: 1\n'
        'Att (5) == 0.000 and Att (12) == 1.000 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) <= 0.730 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) > 0.730 | class: 2\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) <= 0.800 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 0.000'
        ' | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 1.000'
        ' | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) <= 0.730 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) > 0.730 | class: 0\n'
    )
    assert expected_rules == learner.get_rules_description()
コード例 #5
0
y shape: (len(X) // 3, 1)
For the targets, it has to be a numpy array of single-element integer numpy arrays.
Example y sample: np.array([1])
"""

import os
import numpy as np

from skmultiflow.trees import HoeffdingTreeClassifier

# Demo: feed uniformly random 3-feature rows with random binary labels to a
# Hoeffding tree, predicting each row before learning from it.
num_samples = 30000

# num_samples random values become num_samples // 3 single-row (1, 3) batches.
samples = np.random.rand(num_samples).reshape(num_samples // 3, 1, 3)
# One 0/1 label per batch, stored as a column of single-element arrays.
targets = np.random.randint(2, size=len(samples)).reshape(-1, 1)

ht = HoeffdingTreeClassifier()

max_samples = num_samples // 3
correct = 0
for i, (X, y) in enumerate(zip(samples[:max_samples], targets[:max_samples])):
    # Predict first so the score only reflects samples the tree has not seen.
    y_pred = ht.predict(X)
    correct += int(y[0] == y_pred[0])
    ht = ht.partial_fit(X, y)

    # This is just here so I knew it was working
    # print(f"processed sample: {i}")

print(f"Hoeffding Tree accuracy: {correct / len(samples)}")
コード例 #6
0
class TransitionModel:
    """
    Online next-state classifier backed by a HoeffdingTreeClassifier.

    !!! It assumes fully imputed and evenly timed data. !!!

    Data received must be DataFrame with raw measurements and column 'label'. Timestamp must be index of DataFrame and
    it must be in same format as for stream story. Label is an integer denoting state of the measurement. Function fit
    from StateGraph returns data in right format.

    For each feature it calculates average and delta (slope of the least squares linear fit) on last window_size
    measurements. Every time before model is fitted, it is tested on the input.

    Attributes:
        window_size (int): Size of window for rolling average and delta
        model (HoeffdingTreeClassifier): The incrementally trained classifier
        history (DataFrame of shape (window_size, n_features)): Last window_size rows of data, None until
            prepare_data has been called with use_history=True
        accuracy (float): Accuracy of the model, None until at least one sample has been evaluated
        predictions (int): Number of samples evaluated so far in partial_fit
        correct_predictions (int): Number of those samples that were predicted correctly
    """
    def __init__(self, window_size):
        self.window_size = window_size
        self.model = HoeffdingTreeClassifier()
        self.history = None
        self.accuracy = None
        # Number of all predictions and correct predictions for calculating accuracy
        self.predictions = 0
        self.correct_predictions = 0

    def delta(self, y: pd.Series) -> np.float64:
        """ Return slope of least squares linear fit of y against 0..len(y)-1.

        prepare_data calls this through rolling(...).apply(..., raw=True), so in that path y is one window of
        raw values rather than a Series. """
        x = np.arange(len(y))
        return np.polyfit(x, y, 1)[0]

    def check_data(self):
        """ Placeholder, not implemented. """
        pass

    def prepare_data(self,
                     data: pd.DataFrame,
                     drop_last_row: bool = True,
                     use_history: bool = True) -> pd.DataFrame:
        """ Take raw data and return data stream with running average and running delta. For the last row there is no
            next state, so drop_last_row should be True for learning, but False for predicting. If use_history is set
            to true function will add history to the data and update history.

            The result has columns '<feature>_mean' and '<feature>_delta' for every non-label column, followed by
            'current_state' and 'next_state'. The first window_size - 1 rows, whose rolling window is incomplete,
            are removed."""
        if use_history:
            data = pd.concat([self.history, data])
            # Update self.history to have last self.window_size measurements
            self.history = data.tail(self.window_size)

        sensor_values = data.drop(columns='label')
        labels = data['label']

        prepared_data = pd.DataFrame()
        for col in sensor_values.columns:
            windows = sensor_values[col].rolling(window=self.window_size)
            prepared_data[col + '_mean'] = windows.mean()
            prepared_data[col + '_delta'] = windows.apply(self.delta, raw=True)
        prepared_data['current_state'] = labels
        # The last row has no successor, so its next_state is filled with -1.
        prepared_data['next_state'] = labels.shift(periods=-1, fill_value=-1)

        # Drop last row, because we don't know next state yet
        if drop_last_row:
            prepared_data.drop(prepared_data.tail(1).index, inplace=True)
        # Drop the leading rows that do not have a full window behind them
        prepared_data.drop(prepared_data.head(self.window_size - 1).index,
                           inplace=True)
        return prepared_data

    def partial_fit(self, data: pd.DataFrame) -> None:
        """ Test-then-train on data: every prepared sample is first predicted (to update the accuracy counters)
        and then used to update the model.

        If no sample has been evaluated yet (e.g. data was too short to yield a full window), accuracy is left
        untouched instead of dividing by zero.
        TODO: improve, maybe add possibility to learn in batches? """
        stream = DataStream(self.prepare_data(data))
        n = stream.n_remaining_samples()
        for _ in range(n):
            x, y = stream.next_sample()
            if self.model.predict(x)[0] == y[0]:
                self.correct_predictions += 1
            self.model.partial_fit(x, y)
        self.predictions += n
        # Guard against ZeroDivisionError when nothing has been evaluated so far.
        if self.predictions:
            self.accuracy = self.correct_predictions / self.predictions

    def predict(self,
                data: pd.DataFrame = None,
                use_history: bool = True) -> List[int]:
        """ Argument data is a DataFrame with shape (n_samples, n_features); when omitted, an empty DataFrame is
        used, i.e. the prediction is based on history alone.
        use_history tells whether or not history will be included in data before making prediction. If use_history is
        set to true, function will make n_samples+1 predictions, otherwise n_samples-window_size+1 predictions.

        Returns whatever self.model.predict returns for the prepared rows.
        Raises RuntimeError if use_history is False and data has fewer than window_size rows."""
        # A fresh frame per call instead of a shared mutable default argument.
        if data is None:
            data = pd.DataFrame()
        if not use_history and data.shape[0] < self.window_size:
            raise RuntimeError("Not enough measurements to make a prediction.")
        prepared_data = self.prepare_data(data,
                                          drop_last_row=False,
                                          use_history=use_history)
        # next_state is the training target, not an input feature.
        prepared_data.drop(columns="next_state", inplace=True)
        return self.model.predict(prepared_data.values)

    def save_model(self):
        """ Placeholder, not implemented. """
        pass
コード例 #7
0
class OnlineTrainingServer(
        sea_generator_rpcdesign_pb2_grpc.SEAOnlineTrainingServicer):
    """gRPC servicer that holds one HoeffdingTreeClassifier and exposes
    incremental training, prediction and pickle-based persistence for it.

    Besides the model it keeps two counters: the number of events received by
    ``learn_one`` and how many of them were predicted correctly before being
    learned.
    """

    def __init__(self):
        self._model = HoeffdingTreeClassifier()
        self._num_event_accumulation = 0
        self._correct_pred = 0

    def test_connection(self, request, context):
        """Print the client's test string and answer with a fixed message."""
        inputTestingRequestString = request.testString
        print('[gRPC Server] connection testing, request message: ',
              inputTestingRequestString)
        return sea_generator_rpcdesign_pb2.testConnectString(
            testString="Message send back to Client")

    def learn_one(self, request, context):
        """
        Training rpc: evaluate the model on one event, then learn from it.

        ``request.ndarray`` is decoded with ``np.frombuffer``; every value
        except the last one is a feature, the last value is the label.

        :param request: message carrying the raw bytes in ``ndarray``
        :param context: gRPC context (unused)
        :return: learnOneReply with isLearnOneSuccess=True
        :raises ValueError: if the decoded buffer is empty
        """
        self._num_event_accumulation += 1

        decode_data = np.frombuffer(request.ndarray)
        if decode_data.size == 0:
            raise ValueError("Incoming event carries no data.")

        # One row holding the features, and the trailing value as int64 label.
        dummy_data = decode_data[np.newaxis, :-1].copy()
        dummy_label = np.array([decode_data[-1]]).astype(np.int64)

        print("check out type")
        print(type(dummy_data[0]))

        # Test-then-train: score the event before the model has seen it.
        prior_y_pred = self._model.predict(dummy_data)
        if prior_y_pred[0] == dummy_label[0]:
            self._correct_pred += 1

        # The event is learned exactly once; fitting it a second time after
        # the log line would give every sample double weight.
        self._model.partial_fit(dummy_data, dummy_label)
        post_y_pred = self._model.predict(dummy_data)
        print(
            "Update model with event: ", str(dummy_data), "Ground Truth:",
            str(dummy_label), "  prior pred: ", str(prior_y_pred),
            "  post pred: ", str(post_y_pred), "  Accuracy: ",
            '{:.2f}'.format(self._correct_pred / self._num_event_accumulation))

        return sea_generator_rpcdesign_pb2.learnOneReply(
            isLearnOneSuccess=True)

    def predict_one(self, request, context):
        """
        prediction rpc, the API provided for client to invoke the model which hold by model sever

        :param request: message carrying the raw feature bytes in ``ndarray``
        :param context: gRPC context (unused)
        :return: y_pred: int
        :raises ValueError: if the model cannot predict on the decoded data
        """
        decode_data = np.frombuffer(request.ndarray)
        dummy_data = decode_data[np.newaxis, :].copy()

        try:
            y_pred = self._model.predict(dummy_data)
        except Exception as err:
            # Report the offending input (not the still-unset prediction) and
            # keep the original failure attached as the cause.
            raise ValueError(
                "Incoming data {} is not fit with model. ".format(
                    dummy_data)) from err

        # NOTE(review): y_pred is passed on exactly as returned by the model;
        # the docstring promises an int — confirm the reply field accepts it.
        return sea_generator_rpcdesign_pb2.predictOneReply(
            isPredictSuccess=True, y_pred=y_pred)

    def flush_model(self, request, context):
        """
        Persist the current model with pickle.

        The target path is ``request.modelPersistDir`` when non-empty,
        otherwise a default file under ``../modelPersist``.

        :return: flushModelReply; isSuccess is False when the file could not
                 be written or the model could not be pickled
        """
        flush_dir = "../modelPersist/online_hoeffding_tree_persist.pkl"

        request_model_persist_dir = request.modelPersistDir
        if len(request_model_persist_dir) > 0:
            flush_dir = request_model_persist_dir

        is_successful = False
        print("check out the flush directory: ", flush_dir)
        try:
            with open(flush_dir, 'wb') as flush_model_processor:
                pickle.dump(self._model, flush_model_processor,
                            pickle.HIGHEST_PROTOCOL)
            is_successful = True
        except (OSError, pickle.PicklingError) as err:
            # Report the failure through the reply flag instead of aborting
            # the rpc.
            print("failed to flush model: ", err)

        return sea_generator_rpcdesign_pb2.flushModelReply(
            isSuccess=is_successful)

    def extract_model(self, request, context):
        """
        Replace the current model with one unpickled from disk.

        The source path is ``request.modelExtractionDir`` when non-empty,
        otherwise a default file under ``../modelPersist``. On failure the
        current model is kept.

        :return: extractModelReply; isSuccess is False when the file could
                 not be read or unpickled
        """
        extract_dir = "../modelPersist/online_hoeffding_tree_persist.pkl"

        request_model_extract_dir = request.modelExtractionDir
        if len(request_model_extract_dir) > 0:
            extract_dir = request_model_extract_dir

        is_successful = False
        print("check out the reading directory: ", extract_dir)
        try:
            with open(extract_dir, 'rb') as read_model_processor:
                # SECURITY: unpickling can execute arbitrary code and the path
                # comes from the client; only use with trusted callers/files.
                self._model = pickle.load(read_model_processor)
            is_successful = True
        except (OSError, pickle.PickleError, EOFError) as err:
            # Report the failure through the reply flag instead of aborting
            # the rpc.
            print("failed to extract model: ", err)

        return sea_generator_rpcdesign_pb2.extractModelReply(
            isSuccess=is_successful)