Example #1
import os

import matplotlib.pyplot as plt
from skmultiflow.drift_detection.adwin import ADWIN


def test_sim_crop(input_stream, file_name, crop_size=0):
    adwin = ADWIN()
    change_point = []
    for i in range(len(input_stream)):
        adwin.add_element(input_stream[i])
        if adwin.detected_change():
            # plt.axvline(i, color='r', linestyle='dashed')
            change_point.append(i)

    if not change_point:  # no drift detected: nothing to crop or plot
        return change_point

    end_point_crop = change_point[0] + crop_size
    start_point_crop = max(change_point[0] - 100, 0)  # clamp to the stream start
    for i in change_point:
        if start_point_crop <= i <= end_point_crop:  # only mark changes inside the crop
            plt.axvline(i, color='r', linestyle='dashed')
    crop_stream = input_stream[start_point_crop:end_point_crop]
    zoom_xi = list(range(start_point_crop, end_point_crop))
    plt.plot(zoom_xi, crop_stream)
    plt.ylabel('value')
    plt.xlabel('Time')
    fig = plt.gcf()
    fig.set_size_inches(10, 5.5)
    plt.savefig(os.path.join('image', file_name + "_result_zoom.png"),
                bbox_inches='tight',
                dpi=200)
    plt.show()
    return change_point
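A minimal usage sketch for the helper above, on a synthetic stream with an abrupt mean shift (the stream, file name, and crop size are illustrative; an 'image/' directory is assumed to exist for the saved figure):

import numpy as np

stream = np.concatenate([np.random.normal(0, 1, 500),
                         np.random.normal(4, 1, 500)])  # mean shift at index 500
change_points = test_sim_crop(stream, 'synthetic_shift', crop_size=200)
print('Detected change points:', change_points)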
Example #2
import json

import pandas as pd
from skmultiflow.drift_detection import ADWIN, HDDM_W, PageHinkley

# log, get_token_user, submit_web_notification, and NpEncoder are
# project-specific helpers assumed to be defined elsewhere.


def perform_drift_detection(predict_dataframe,
                            dataframe,
                            feature_names,
                            detector,
                            drift_notification,
                            token="") -> str:
    log("[INFO] Calling perform_drift_detection", token)
    log("[INFO] Selected data drift detection method: " + detector)
    baseline_data = dataframe.values.tolist()
    predict_data = predict_dataframe.values.tolist()
    overall_data = baseline_data + predict_data
    overall_dataframe = pd.DataFrame(overall_data, columns=feature_names)
    drifts = dict()
    window = len(baseline_data)
    detectors = {"HDDM": HDDM_W, "Page Hinkley": PageHinkley, "ADWIN": ADWIN}
    for feature in feature_names:
        detected_drifts_indices = list()
        if detector in detectors:
            drift_detector = detectors[detector]()
            for i in range(len(overall_dataframe[feature])):
                drift_detector.add_element(float(overall_dataframe[feature][i]))
                # Only report drifts that fall inside the prediction data
                if drift_detector.detected_change() and i >= window:
                    detected_drifts_indices.append(i - window)
        # Check for detected drifts
        if len(detected_drifts_indices) != 0:
            log("[INFO] Data drift detected in feature: " + feature)
            log("[INFO] The drifted rows are: " + str(detected_drifts_indices))
            drifts[feature] = detected_drifts_indices
            if drift_notification:
                log("[INFO] Sending a web notification", token)
                message = "MaaS data drift detected from " + get_token_user(
                    token) + " (" + token + ")"
                if submit_web_notification(message, token):
                    log("[INFO] Web notification sent!")
                else:
                    log("[ERROR] Error occurred while sending a web notification"
                        )
    return json.dumps(drifts, cls=NpEncoder)
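A hedged usage sketch for the function above. It assumes the project-specific helpers it calls (log, get_token_user, submit_web_notification, NpEncoder) are importable in the same module; the frames and feature names below are illustrative:

import numpy as np
import pandas as pd

features = ['f1', 'f2']
baseline = pd.DataFrame(np.random.normal(0, 1, (200, 2)), columns=features)
incoming = pd.DataFrame(np.random.normal(3, 1, (200, 2)), columns=features)  # shifted data

report = perform_drift_detection(incoming, baseline, features,
                                 detector="ADWIN",
                                 drift_notification=False)
print(report)  # JSON string mapping each drifted feature to its row indices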
Example #3
from skmultiflow.drift_detection.adwin import ADWIN


def sim_adwin(input_stream, start_point=0):
    # delta=.3 makes ADWIN more sensitive than its default of 0.002
    adwin = ADWIN(delta=.3)
    change_point = []
    for i in range(len(input_stream)):
        adwin.add_element(input_stream[i])
        if adwin.detected_change():
            # plt.axvline(i, color='r', linestyle='dashed')
            change_point.append(i + start_point)
            # print('Change detected in data: ' + str(input_stream[i]) + ' - at index: ' + str(i)+'\n\n')

    return change_point
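A quick check of the sensitivity note above, comparing detection counts for ADWIN's default delta and the value used in sim_adwin on the same abrupt-shift stream (illustrative):

import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN

stream = np.concatenate([np.zeros(500), np.ones(500)])
for delta in (0.002, 0.3):  # default vs. the value used above
    adwin = ADWIN(delta=delta)
    n_detections = 0
    for x in stream:
        adwin.add_element(x)
        if adwin.detected_change():
            n_detections += 1
    print('delta={}: {} detection(s)'.format(delta, n_detections))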
Example #4
def cp_detection_ADWIN(points):
    import matplotlib.pyplot as plt
    import ruptures as rpt
    from skmultiflow.drift_detection.adwin import ADWIN
    adwin = ADWIN()
    detections = []
    # Adding stream elements to ADWIN and verifying if drift occurred
    for i in range(len(points)):
        adwin.add_element(points[i])
        if adwin.detected_change():
            detections.append(i)
            print('Change detected in data: ' + str(points[i]) +
                  ' - at index: ' + str(i))
    rpt.show.display(points, detections, figsize=(10, 6))
    plt.title('Change Point Detection: ADWIN')
    plt.show()
Example #5
import os

import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN


def test_adwin(test_path):
    """
    ADWIN drift detection test.

    The first half of the stream contains random integers in [0, 1];
    from index 1000 to 1999 the stream contains random integers in [0, 7].
    """
    adwin = ADWIN()
    test_file = os.path.join(test_path, 'drift_stream.npy')
    data_stream = np.load(test_file)
    expected_indices = [1023, 1055, 1087, 1151]
    detected_indices = []

    for i in range(data_stream.size):
        adwin.add_element(data_stream[i])
        if adwin.detected_change():
            detected_indices.append(i)

    assert detected_indices == expected_indices
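The drift_stream.npy fixture is not included here; a sketch of a stream matching the docstring's description (the seed and the exact expected indices are fixture-dependent, so this is illustrative only):

import numpy as np

rng = np.random.RandomState(1)
data_stream = np.concatenate((rng.randint(2, size=1000),    # values in {0, 1}
                              rng.randint(8, size=1000)))   # values in {0, ..., 7}
# np.save('drift_stream.npy', data_stream)  # detected indices depend on the seed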
Example #6
import os

import matplotlib.pyplot as plt
from skmultiflow.drift_detection.adwin import ADWIN


def test_sim(input_stream, file_name):
    adwin = ADWIN()
    change_point = []
    plt.plot(input_stream)

    f = open(os.path.join('results', file_name + ".txt"), "w+")
    for i in range(len(input_stream)):
        adwin.add_element(input_stream[i])
        if adwin.detected_change():
            plt.axvline(i, color='r', linestyle='dashed')
            change_point.append(i)
            # print('Change detected in data: ' + str(input_stream[i]) + ' - at index: ' + str(i)+'\n\n')
            f.write('Change detected in data: ' + str(input_stream[i]) +
                    ' - at index: ' + str(i) + '\n\n')
    f.close()
    plt.ylabel('value')
    plt.xlabel('Time')
    plt.savefig(os.path.join('image', file_name + "_result.png"),
                bbox_inches='tight',
                dpi=200)
    plt.show()
Example #7
class AdaActiveLearningNodeRegressor(ActiveLearningNodePerceptron, AdaNode):
    """ Learning Node of the Hoeffding Adaptive Tree regressor.

    Always uses a linear perceptron model to provide predictions.

    Parameters
    ----------
    initial_stats: dict
        In regression tasks this dictionary carries the sufficient statistics
        to perform online variance calculation: the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    parent_node: AdaLearningNodeForRegression (default=None)
        A node containing statistics about observed data.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """
    def __init__(self,
                 initial_stats=None,
                 parent_node=None,
                 random_state=None):
        super().__init__(initial_stats, parent_node, random_state)
        self._adwin = ADWIN()
        self._error_change = False

        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._n = 0

    @property
    def n_leaves(self):
        return 1

    @property
    def error_estimation(self):
        return self._adwin.estimation

    @property
    def error_width(self):
        return self._adwin.width

    def error_is_null(self):
        return self._adwin is None

    def kill_tree_children(self, hat):
        pass

    def learn_one(self, X, y, weight, tree, parent, parent_branch):
        y_pred = self.predict_one(X, tree=tree)
        normalized_error = get_normalized_error(y, y_pred, self)

        if tree.bootstrap_sampling:
            # Perform bootstrap-sampling
            k = self._random_state.poisson(1.0)
            if k > 0:
                weight = weight * k

        if self._adwin is None:
            self._adwin = ADWIN()

        old_error = self.error_estimation

        # Add element to Adwin
        self._adwin.add_element(normalized_error)
        # Detect change with Adwin
        self._error_change = self._adwin.detected_change()

        if self._error_change and old_error > self.error_estimation:
            self._error_change = False

        # Update statistics
        super().learn_one(X, y, weight=weight, tree=tree)

        weight_seen = self.total_weight

        if weight_seen - self.last_split_attempt_at >= tree.grace_period:
            tree._attempt_to_split(self, parent, parent_branch)
            self.last_split_attempt_at = weight_seen

    def predict_one(self, X, *, tree=None):
        prediction_option = tree.leaf_prediction
        if prediction_option == tree._TARGET_MEAN:
            return self._stats[1] / self._stats[0] if len(self._stats) > 0 and self._stats[0] > 0 \
                else 0.0
        else:
            return super().predict_one(X, tree=tree)

    # New for option votes
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts,
                                  found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        found_nodes.append(FoundNode(self, parent, parent_branch))
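The initial_stats dictionary documented above holds exactly the sufficient statistics for online variance. A small worked example of what those three keys allow (values illustrative):

stats = {0: 5.0, 1: 20.0, 2: 90.0}  # n, sum(y), sum(y^2)
n, s1, s2 = stats[0], stats[1], stats[2]
mean = s1 / n                      # 4.0
variance = (s2 - s1 * s1 / n) / n  # (90 - 80) / 5 = 2.0 (population variance)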
Example #8
class AdaSplitNodeForRegression(SplitNode, AdaNode):
    """ Node that splits the data in a Regression Hoeffding Adaptive Tree.

    Parameters
    ----------
    split_test: skmultiflow.split_test.InstanceConditionalTest
        Split test.
    class_observations: dict
        In regression tasks this dictionary carries the sufficient statistics
        to perform online variance calculation: the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    """
    def __init__(self, split_test, class_observations):
        super().__init__(split_test, class_observations)
        self._estimation_error_weight = ADWIN()
        self._alternate_tree = None
        self.error_change = False
        self._random_seed = 1
        self._classifier_random = check_random_state(self._random_seed)

    # Override AdaNode
    def number_leaves(self):
        num_of_leaves = 0
        for child in self._children:
            if child is not None:
                num_of_leaves += child.number_leaves()

        return num_of_leaves

    # Override AdaNode
    def get_error_estimation(self):
        return self._estimation_error_weight.estimation

    # Override AdaNode
    def get_error_width(self):
        w = 0.0
        if self.is_null_error() is False:
            w = self._estimation_error_weight.width

        return w

    # Override AdaNode
    def is_null_error(self):
        return self._estimation_error_weight is None

    # Override AdaNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):

        true_target = y

        normalized_error = 0.0

        if self.filter_instance_to_leaf(X, parent,
                                        parent_branch).node is not None:
            target_prediction = rhat.predict([X])[0]
            normalized_error = rhat.get_normalized_error(
                target_prediction, true_target)
        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to Change detector
        self._estimation_error_weight.add_element(normalized_error)

        # Detect change
        self.error_change = self._estimation_error_weight.detected_change()

        if self.error_change and old_error > self.get_error_estimation():
            self.error_change = False

        # Check condition to build a new alternate tree
        if self.error_change is True:
            self._alternate_tree = rhat._new_learning_node()
            rhat.alternate_trees_cnt += 1

        # Condition to replace alternate tree
        elif self._alternate_tree is not None \
                and not self._alternate_tree.is_null_error():
            if self.get_error_width() > ERROR_WIDTH_THRESHOLD \
                    and self._alternate_tree.get_error_width() > ERROR_WIDTH_THRESHOLD:
                old_error_rate = self.get_error_estimation()
                alt_error_rate = self._alternate_tree.get_error_estimation()
                fDelta = .05
                fN = 1.0 / self._alternate_tree.get_error_width() \
                    + 1.0 / self.get_error_width()

                bound = math.sqrt(2.0 * old_error_rate *
                                  (1.0 - old_error_rate) *
                                  math.log(2.0 / fDelta) * fN)
                # Promote the alternate tree when the error improvement
                # exceeds the bound
                if bound < (old_error_rate - alt_error_rate):
                    rhat._active_leaf_node_cnt -= self.number_leaves()
                    rhat._active_leaf_node_cnt += \
                        self._alternate_tree.number_leaves()
                    self.kill_tree_children(rhat)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        rhat._tree_root = rhat._tree_root._alternate_tree
                    rhat.switch_alternate_trees_cnt += 1
                elif bound < alt_error_rate - old_error_rate:
                    # The alternate tree is significantly worse: discard it
                    if isinstance(self._alternate_tree, ActiveLearningNode):
                        self._alternate_tree = None
                    elif isinstance(self._alternate_tree, InactiveLearningNode):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_children(rhat)
                    rhat.pruned_alternate_trees_cnt += 1

        # Learn_From_Instance alternate Tree and Child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_from_instance(X, y, weight, rhat,
                                                     parent, parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            child.learn_from_instance(X, y, weight, rhat, self, child_branch)
        # Instance contains a categorical value previously unseen by the split
        # node
        elif isinstance(self.get_split_test(), NominalAttributeMultiwayTest) and \
                self.get_split_test().branch_for_instance(X) < 0:
            # Creates a new learning node to encompass the new observed feature
            # value
            leaf_node = rhat._new_learning_node()
            branch_id = self.get_split_test().add_new_branch(
                X[self.get_split_test().get_atts_test_depends_on()[0]])
            self.set_child(branch_id, leaf_node)
            rhat._active_leaf_node_cnt += 1
            leaf_node.learn_from_instance(X, y, weight, rhat, parent,
                                          parent_branch)

    # Override AdaNode
    def kill_tree_children(self, rhat):
        for child in self._children:
            if child is not None:
                # Delete the alternate tree if it exists
                if isinstance(child, AdaSplitNodeForRegression) \
                        and child._alternate_tree is not None:
                    child._alternate_tree.kill_tree_children(rhat)
                    self._pruned_alternate_trees += 1
                # Recursive delete of SplitNodes
                if isinstance(child, AdaSplitNodeForRegression):
                    child.kill_tree_children(rhat)

                if isinstance(child, ActiveLearningNode):
                    child = None
                    rhat._active_leaf_node_cnt -= 1
                elif isinstance(child, InactiveLearningNode):
                    child = None
                    rhat._inactive_leaf_node_cnt -= 1

    # override AdaNode
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts=False,
                                  found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        if update_splitter_counts:
            try:
                self._observed_class_distribution[0] += weight
                self._observed_class_distribution[1] += y * weight
                self._observed_class_distribution[2] += y * y * weight
            except KeyError:
                self._observed_class_distribution[0] = weight
                self._observed_class_distribution[1] = y * weight
                self._observed_class_distribution[2] = y * y * weight

        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, y, weight, parent,
                                                parent_branch,
                                                update_splitter_counts,
                                                found_nodes)
            else:
                found_nodes.append(FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(
                X, y, weight, self, -999, update_splitter_counts, found_nodes)
    # Fragment: nested learning-node class from a GRF_HoeffdingAdaptiveTree-style
    # implementation (note the HoeffdingTree.FoundNode reference below)
    class AdaLearningNodeForRegression(LearningNodePerceptron, NewNode):
        def __init__(self,
                     initial_class_observations,
                     perceptron_weight,
                     random_state=None):
            super().__init__(initial_class_observations, perceptron_weight,
                             random_state)
            self._estimation_error_weight = ADWIN()
            self._error_change = False
            self._randomSeed = 1
            self._classifier_random = check_random_state(self._randomSeed)

        def calc_byte_size(self):
            byte_size = self.__sizeof__()
            if self._estimation_error_weight is not None:
                byte_size += self._estimation_error_weight.get_length_estimation()
            return byte_size

        # Override NewNode
        def number_leaves(self):
            return 1

        # Override NewNode
        def get_error_estimation(self):
            return self._estimation_error_weight.estimation

        # Override NewNode
        def get_error_width(self):
            return self._estimation_error_weight.width

        # Override NewNode
        def is_null_error(self):
            return self._estimation_error_weight is None

        def kill_tree_children(self, hat):
            pass

        # Override NewNode
        def learn_from_instance(self, X, y, weight, rhat, parent,
                                parent_branch):

            super().learn_from_instance(X, y, weight, rhat)

            true_target = y
            target_prediction = rhat.predict([X])[0]

            normalized_error = rhat.get_normalized_error(
                target_prediction, true_target)

            if self._estimation_error_weight is None:
                self._estimation_error_weight = ADWIN()

            old_error = self.get_error_estimation()

            # Add element to ADWIN
            self._estimation_error_weight.add_element(normalized_error)
            # Detect change with ADWIN
            self._error_change = self._estimation_error_weight.detected_change()

            if self._error_change and old_error > self.get_error_estimation():
                self._error_change = False

            # call ActiveLearningNode
            weight_seen = self.get_weight_seen()

            if weight_seen - self.get_weight_seen_at_last_split_evaluation() \
                    >= rhat.grace_period:
                rhat._attempt_to_split(self, parent, parent_branch)
                self.set_weight_seen_at_last_split_evaluation(weight_seen)

        # Override NewNode, New for option votes
        def filter_instance_to_leaves(self,
                                      X,
                                      y,
                                      weight,
                                      parent,
                                      parent_branch,
                                      update_splitter_counts,
                                      found_nodes=None):
            if found_nodes is None:
                found_nodes = []
            found_nodes.append(
                HoeffdingTree.FoundNode(self, parent, parent_branch))
    # Fragment: assumes xgboost_model, adwin (an ADWIN instance), the date
    # variables, training_flag, and list_drift are defined earlier in the
    # original script.
    end_test_date = start_test_date + pd.DateOffset(months=3)

    X_train, y_train, X_test, y_test = xgboost_model.generate_data(data, start_train_date, end_train_date,
                                                                       start_test_date, end_test_date, verbose=1)

    if training_flag:
        xgboost_model.fit_model(X_train, y_train)

    results_dict = xgboost_model.compute_predictions(X_test, y_test)

    label_transformed = transform_label(results_dict['y_true'][-1], results_dict['Predictions'][-1])

    temp_drifts = []

    for i in range(label_transformed.shape[0]):
        adwin.add_element(label_transformed['conversion'].iloc[i])
        if adwin.detected_change():
            print('Change detected ADWIN in data: ' + str(results_dict['y_true'][-1].iloc[i]) + ' - at date: ' + str(results_dict['Date'][-1].iloc[i]))
            temp_drifts.append(results_dict['Date'][-1].iloc[i])
            adwin.reset()


    if not temp_drifts:
        print('No Drift Detected - Predict next three months')
        start_test_date = start_test_date + pd.DateOffset(months=3)
        training_flag = False

    if temp_drifts:
        print('Drift detected - Retrain model')
        list_drift.append(temp_drifts[0])
        start_train_date = temp_drifts[0] - pd.DateOffset(years=2)
# Simulate a data stream of size 1000 from a standard normal distribution
import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN

A = ADWIN()
stream = np.random.randn(1000)

stream[:10]
# Output:
#array([-1.0856306 ,  0.99734545,  0.2829785 , -1.50629471, -0.57860025,
#        1.65143654, -2.42667924, -0.42891263,  1.26593626, -0.8667404 ])

# The data concept changes from index 599 to 999
for j in range(599, 1000):
    stream[j] = np.random.randint(5, high=9)

# Stream elements are added to ADWIN, checking whether drift occurred
for j in range(1000):
    A.add_element(stream[j])
    if A.detected_change():
        print('Concept Drift detected in data: ' + str(stream[j]) +
              ' - at index: ' + str(j))
### Output:
#Concept Drift detected in data: 8.0 - at index: 607
#Concept Drift detected in data: 5.0 - at index: 639
#Concept Drift detected in data: 6.0 - at index: 671

########

### DDM code
import numpy as np
from skmultiflow.drift_detection import DDM

# call the DDM object
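The example is cut off at this point. A minimal completion, assuming it mirrors the ADWIN loop above and uses skmultiflow's documented DDM API (add_element, detected_warning_zone, detected_change). DDM expects a stream of prediction errors, so the raw stream is binarized first; the threshold is an illustrative choice:

ddm = DDM()

# Binarize the stream from the ADWIN example: ~0 before the shift, 1 after it
errors = (stream > 4).astype(int)

for j in range(1000):
    ddm.add_element(errors[j])
    if ddm.detected_warning_zone():
        print('Warning zone detected in data: ' + str(errors[j]) +
              ' - at index: ' + str(j))
    if ddm.detected_change():
        print('Change detected in data: ' + str(errors[j]) +
              ' - at index: ' + str(j))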
Example #12
class AdaSplitNodeRegressor(AdaSplitNode):
    """ Node that splits the data in a Hoeffding Adaptive Tree regressor.

    Parameters
    ----------
    split_test: skmultiflow.split_test.InstanceConditionalTest
        Split test.
    stats: dict
        In regression tasks this dictionary carries the sufficient statistics
        to perform online variance calculation: the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """
    def __init__(self, split_test, stats=None, random_state=None):
        super().__init__(split_test, stats, random_state)
        # Normalization of info monitored by drift detectors (using Welford's algorithm)
        self._n = 0

    # Override AdaSplitNode
    def learn_one(self, X, y, weight, tree, parent, parent_branch):
        normalized_error = 0.0

        leaf = self.filter_instance_to_leaf(X, parent, parent_branch).node
        if leaf is not None:
            y_pred = leaf.predict_one(X, tree=tree)
            normalized_error = get_normalized_error(y, y_pred, self)
        if self._adwin is None:
            self._adwin = ADWIN()

        old_error = self.error_estimation

        # Add element to change detector
        self._adwin.add_element(normalized_error)

        # Detect change
        self.error_change = self._adwin.detected_change()

        if self.error_change and old_error > self.error_estimation:
            self.error_change = False

        # Check condition to build a new alternate tree
        if self.error_change:
            self._alternate_tree = tree._new_learning_node()
            tree.alternate_trees_cnt += 1

        # Condition to replace alternate tree
        elif self._alternate_tree is not None \
                and not self._alternate_tree.error_is_null():
            if self.error_width > tree._ERROR_WIDTH_THRESHOLD \
                    and self._alternate_tree.error_width > tree._ERROR_WIDTH_THRESHOLD:
                old_error_rate = self.error_estimation
                alt_error_rate = self._alternate_tree.error_estimation
                fDelta = .05
                fN = 1.0 / self._alternate_tree.error_width + 1.0 / self.error_width

                sq_term = 2.0 * old_error_rate * (1.0 - old_error_rate) * math.log(2.0 / fDelta) \
                    * fN
                bound = math.sqrt(sq_term) if sq_term > 0 else 0.0

                if bound < (old_error_rate - alt_error_rate):
                    tree._active_leaf_node_cnt -= self.n_leaves
                    tree._active_leaf_node_cnt += self._alternate_tree.n_leaves
                    self.kill_tree_children(tree)

                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        tree._tree_root = tree._tree_root._alternate_tree
                    tree.switch_alternate_trees_cnt += 1
                elif bound < alt_error_rate - old_error_rate:
                    if isinstance(self._alternate_tree, ActiveLeaf):
                        self._alternate_tree = None
                    elif isinstance(self._alternate_tree, InactiveLeaf):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_children(tree)
                    tree.pruned_alternate_trees_cnt += 1

        # Learn one sample in alternate tree and child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_one(X, y, weight, tree, parent,
                                           parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)

        if child is not None:
            child.learn_one(X,
                            y,
                            weight,
                            tree,
                            parent=self,
                            parent_branch=child_branch)
        # Instance contains a categorical value previously unseen by the split node
        else:
            # Creates a new learning node to encompass the new observed feature
            # value
            leaf_node = tree._new_learning_node()
            branch_id = self.split_test.add_new_branch(
                X[self.split_test.get_atts_test_depends_on()[0]])
            self.set_child(branch_id, leaf_node)
            tree._active_leaf_node_cnt += 1
            leaf_node.learn_one(X, y, weight, tree, parent, parent_branch)

    def predict_one(self, X, *, tree=None):
        # Called in case an emerging categorical feature has no path down the
        # split node to be sorted
        return self.stats[1] / self.stats[0] \
            if len(self.stats) > 0 and self.stats[0] > 0 else 0.0

    # override AdaNode
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts=False,
                                  found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        if update_splitter_counts:
            try:
                self._stats[0] += weight
                self._stats[1] += y * weight
                self._stats[2] += y * y * weight
            except KeyError:
                self._stats[0] = weight
                self._stats[1] = y * weight
                self._stats[2] = y * y * weight

        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                child.filter_instance_to_leaves(X, y, weight, parent,
                                                parent_branch,
                                                update_splitter_counts,
                                                found_nodes)
            else:
                found_nodes.append(FoundNode(None, self, child_index))
        if self._alternate_tree is not None:
            self._alternate_tree.filter_instance_to_leaves(
                X, y, weight, self, -999, update_splitter_counts, found_nodes)
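The alternate-tree replacement test above relies on a Hoeffding-like bound, bound = sqrt(2 * e_old * (1 - e_old) * ln(2 / delta) * (1 / w_alt + 1 / w)); the alternate tree replaces the current subtree when the error drop exceeds this bound. A quick numeric sketch with illustrative values:

import math

old_error_rate, alt_error_rate = 0.30, 0.18
w, w_alt = 400, 350  # error-window widths, both above the width threshold
delta = 0.05
f_n = 1.0 / w_alt + 1.0 / w
bound = math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate)
                  * math.log(2.0 / delta) * f_n)
print(bound)                                    # ~0.091
print(old_error_rate - alt_error_rate > bound)  # True: switch to the alternate tree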
Example #13
class SADWINIsolationForestStream(BaseSKMObject, ClassifierMixin):
    """
  This code implements  Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming Data Using Sliding Window (Ding \& Fei, 2013) [3]

    Each sample has an anomaly score is computed based on Isolation Forest anomaly based approach [2]. The concept of Isolation forest [1]
    consists on  isolating observations by randomly selecting a feature
    and then randomly selecting a split value between the maximum and minimum
    values of the selected feature.
    
    Model is updated of a Drift has been detected based on a input drift threshold. The drift detection approach is proposed by [2] 
    and works as follow : if the averaged anomaly score between two successive sliding windows is highter than the drift threshold (u), 
    then the previous model is completely discarded and a new model is build as an isolation forest on latest sliding windows stream.


  Parameters

    ---------

    n_estimators: int, optional (default=25)

       Number of trees in the ensemble.

       't' in the original paper.



    window_size: int, optional (default=100)

        The window size of the stream.

        ψ, 'Psi' in the original paper.   

## Optional       

    anomaly_threshold: double, optional (default=0.5)

        The threshold for declaring anomalies.

        Any instance prediction probability above this threshold will be declared as an anomaly.

    drift_threshold: double, optional (default=0.5)

        The threshold for detecting Drift and update the model.

       If the averaged anomaly score between two successive sliding windows is highter than the threshold (u), 
    then the previous model is completely discarded and a new model is build as an isolation forest on latest sliding windows stream.
    This parameters is supposed to be know by an expert domain, depending on data set.

## Other Attributes

    ensemble : Isolation Tree Ensemble

        Contain an Isolation Tree Ensemble object, current model for   IsolationForestStream

    sample_size : int

        Number of sample seen since the update

    anomaly_rate : float

        Rate of the anomalies in the previous sliding window (AnomalyRate in the original paper iForestASD)

    prec_window & window : numpy.ndarray of shape (n_samples, self.window_size)

        The previous and current window of data

    cpt : int

        Counter, if the n_estimator is higher than its, it will fit

    References
    ----------

    [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua.        
“Isolation forest.” Data Mining, 2008. ICDM’08. Eighth IEEE International Conference on.

    [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. “Isolation-based anomaly detection.” ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 
self.n_estimators

    [3] Ding, Zhiguo. (2013) An Anomaly Detection Approach Based on Isolation Forest Algorithm for Streaming Data Using Sliding Window. 12-17. 10.3182/20130902-3-CN-3020.00044. 

    """
    def __init__(
            self,
            window_size=100,
            n_estimators=25,
            anomaly_threshold=0.5,
            drift_threshold=0.5,
            random_state=None,
            version="AnomalyRate",
            #Parameters for partial model update
            n_estimators_updated=0.5,
            updated_randomly=True,
            #Parameters for NDKSWIN
            alpha=0.01,
            data=None,
            n_dimensions=1,
            n_tested_samples=0.1,
            fixed_checked_dimension=False,
            fixed_checked_sample=False):

        super().__init__()

        self.n_estimators = n_estimators

        self.ensemble = None

        self.random_state = random_state

        self.window_size = window_size

        self.samples_seen = 0

        self.anomaly_rate = 0.20

        self.anomaly_threshold = anomaly_threshold

        self.drift_threshold = drift_threshold

        self.window = None

        self.prec_window = None

        self.cpt = 0
        self.version = version
        # Track model updates per window: 0 = not updated, 1 = updated
        self.model_update = []
        self.model_update_windows = []
        # Initialisation: record the IForestASD version in use
        self.model_update.append(version)
        # Initialisation: record the number of samples seen in the window
        self.model_update_windows.append("samples_seen_" + version)

        if n_estimators_updated <= 0.0 or n_estimators_updated > 1.0:
            raise ValueError("n_estimators_updated must be > 0 and <= 1")
        # Fraction of new trees to compute when updating on a new window
        self.n_estimators_updated = int(self.n_estimators * n_estimators_updated)

        # If True, pick the trees to replace at random; if False, keep the
        # first (n_estimators - int(n_estimators * n_estimators_updated)) trees
        self.updated_randomly = updated_randomly

        self.alpha = alpha
        self.n_dimensions = n_dimensions
        self.n_tested_samples = n_tested_samples
        self.fixed_checked_dimension = fixed_checked_dimension
        self.fixed_checked_sample = fixed_checked_sample
        self.first_time_fit = True

        # TODO Maurras 27112020: Find a way to optimize the use of ADWIN()
        self.adwin = ADWIN()

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Partially (incrementally) fit the model.
          Parameters
          ----------
          X : numpy.ndarray of shape (n_samples, n_features)
              The features to train the model.
          y: numpy.ndarray of shape (n_samples)
              An array-like with the class labels of all samples in X.
          classes: None
              Not used by this method.
          sample_weight: None
              Not used by this method.
          Returns
          -------
          self
          """

        ## get the number of observations
        number_instances, _ = X.shape

        if self.samples_seen == 0:
            ## ToDo ? Give a sample of self.window_size in attribute of iForest
            iforest = IsolationTreeEnsemble(self.window_size,
                                            self.n_estimators,
                                            self.random_state)
            self.ensemble = iforest

        for i in range(number_instances):
            self._partial_fit(X[i], y[i])

        return self

    def _partial_fit(self, X, y):
        """ Trains the model on samples X and corresponding targets y.
          Private function where actual training is carried on.
          Parameters
          ----------
          X: numpy.ndarray of shape (1, n_features)
              Instance attributes.
          y: int
              Class label for sample X. Not used in this implementation,
              which is unsupervised.
          """
        """
          Reshape X and add it to our window if it isn't full.
          If it's full, give window to our precedent_window.
          If we are at the end our window, fit if we're learning 
          Check the anomaly score of our window 
          Update if self.anomaly_rate > self.drift_threshold

          """
        X = np.reshape(X, (1, len(X)))

        if self.samples_seen % self.window_size == 0:
            ## Update the two windows (precedent one and current windows)
            self.prec_window = self.window
            self.window = X
        else:
            self.window = np.concatenate((self.window, X))

        if self.samples_seen % self.window_size == 0 and self.samples_seen != 0:
            if self.first_time_fit:  # It is the first window
                self.ensemble.fit(self.prec_window)
                self.first_time_fit = False

            elif self.version == "SADWIN":
                # Concept drift detection via the ADWIN() detector available
                # in scikit-multiflow: feed it the anomaly scores of the
                # previous window and flag a drift on the first change.
                prec_window_scores = self.ensemble.anomaly_score(
                    self.prec_window)
                drift_detected = False
                for score in prec_window_scores:
                    self.adwin.add_element(score[0])
                    if self.adwin.detected_change():
                        drift_detected = True
                        break
                if drift_detected:
                    self.model_update.append(1)
                    self.model_update_windows.append(self.samples_seen)
                    self.update_model(self.prec_window)
                    self.adwin.reset()
                else:
                    self.model_update.append(0)
                    self.model_update_windows.append(self.samples_seen)

        self.samples_seen += 1

    def update_model(self, window):
        """ Update the model (fit a new isolation forest) if the current anomaly rate (in the previous sliding window)
     is higher than self.drift_threshold
        Parameters: 
          window: numpy.ndarray of shape (self.window_size, n_features)
        Re-Initialize our attributes and our ensemble, fit with the current window

    """

        ## ToDo? Give a sample of self.window_size in attribute of iForest
        # MAJ Maurras 03112020: No, leave it like that. The whole window must
        # be given to construct the forest of itrees.
        self.is_learning_phase_on = True
        iforest = IsolationTreeEnsemble(self.window_size, self.n_estimators,
                                        self.random_state)
        self.ensemble = iforest
        self.ensemble.fit(window)
        print("")
        print("The model was updated by training a new iForest with version: "
              + self.version)

    def anomaly_scores_rate(self, window):
        """
    Given a 2D matrix of observations, compute the anomaly rate 
    for all instances in the window and return an anomaly rate of the given window.

    Parameters :
    window: numpy.ndarray of shape (self.window_size, n_features)
    """

        score_tab = 2.0**(-1.0 * self.ensemble.path_length(window) /
                          c(len(window)))
        score = 0
        for x in score_tab:
            if x > self.anomaly_threshold:
                score += 1
        return score / len(score_tab)

    # MAJ 2020-11-21 (Maurras): added a function to classify instances
    # (anomaly or normal)

    def predict_simple(self, X):
        """
        Given a window, predict the class (1 or 0) of each instance using
        predict_from_instances_scores on the current model.
        """
        prediction = self.ensemble.predict_from_instances_scores(
            self.ensemble.anomaly_score(X),
            self.anomaly_threshold)  # predictions for all instances
        return prediction

    def predict(self, X):
        """
        Given an instance, predict the anomaly (1 or 0) based on the last
        sample of the window using predict_proba if the model has been fit;
        otherwise return [-1].
        """
        if self.samples_seen <= self.window_size:
            return [-1]  # The model has not been fit yet

        X = np.reshape(X, (1, len(X[0])))
        # Append the instance to the sliding window
        self.prec_window = np.concatenate((self.prec_window, X))

        prediction = self.ensemble.predict_from_anomaly_scores(
            self.predict_proba(self.prec_window),
            self.anomaly_threshold)  # returns 0 or 1

        return [prediction]

    def predict_proba(self, X):
        """
        Return the anomaly score of the last instance in the window if the
        model has been fit; otherwise return [-1].

        Parameters
        ----------
        X: numpy.ndarray of shape (self.window_size, n_features)
        """
        if self.samples_seen <= self.window_size:
            return [-1]
        # anomaly_score returns the scores of all data in the window;
        # index -1 gives the score of the last instance (X)
        return self.ensemble.anomaly_score(self.prec_window)[-1]
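The docstring's drift rule (compare the previous window's anomaly rate against drift_threshold) reduces to a few lines. A minimal sketch of that decision, assuming scores in [0, 1] as produced by anomaly_scores_rate above; the score values are illustrative:

import numpy as np

anomaly_threshold, drift_threshold = 0.5, 0.5
prec_window_scores = np.array([0.2, 0.7, 0.8, 0.4, 0.9])

# Fraction of instances whose score exceeds the anomaly threshold
anomaly_rate = np.mean(prec_window_scores > anomaly_threshold)  # 0.6

if anomaly_rate > drift_threshold:
    print('Drift: discard the current model and refit on the latest window')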
    # Fragment: nested split-node class from the GRF_HoeffdingAdaptiveTree
    # implementation referenced below
    class AdaSplitNode(SplitNode, NewNode):
        def __init__(self, split_test, class_observations):
            super().__init__(split_test, class_observations)
            self._estimation_error_weight = ADWIN()
            self._alternate_tree = None
            self.error_change = False
            self._random_seed = 1
            self._classifier_random = check_random_state(self._random_seed)

        # Override NewNode
        def number_leaves(self):
            num_of_leaves = 0
            for child in self._children:
                if child is not None:
                    num_of_leaves += child.number_leaves()

            return num_of_leaves

        # Override NewNode
        def get_error_estimation(self):
            return self._estimation_error_weight.estimation

        # Override NewNode
        def get_error_width(self):
            w = 0.0
            if self.is_null_error() is False:
                w = self._estimation_error_weight.width

            return w

        # Override NewNode
        def is_null_error(self):
            return self._estimation_error_weight is None

        # Override NewNode
        def learn_from_instance(self, X, y, weight, hat, parent,
                                parent_branch):
            true_class = y
            class_prediction = 0

            leaf = self.filter_instance_to_leaf(X, parent, parent_branch)
            if leaf.node is not None:
                class_prediction = get_max_value_key(
                    leaf.node.get_class_votes(X, hat))

            bl_correct = (true_class == class_prediction)

            if self._estimation_error_weight is None:
                self._estimation_error_weight = ADWIN()

            old_error = self.get_error_estimation()

            # Add element to ADWIN
            add = 0.0 if (bl_correct is True) else 1.0

            self._estimation_error_weight.add_element(add)
            # Detect change with ADWIN
            self.error_change = self._estimation_error_weight.detected_change()

            if self.error_change and old_error > self.get_error_estimation():
                self.error_change = False

            # Check condition to build a new alternate tree
            if self.error_change is True:
                self._alternate_tree = hat._new_learning_node()
                hat.alternate_trees_cnt += 1

            # Condition to replace alternate tree
            elif self._alternate_tree is not None \
                    and not self._alternate_tree.is_null_error():
                if self.get_error_width() > error_width_threshold \
                        and self._alternate_tree.get_error_width() > error_width_threshold:
                    old_error_rate = self.get_error_estimation()
                    alt_error_rate = self._alternate_tree.get_error_estimation()
                    fDelta = .05
                    fN = 1.0 / self._alternate_tree.get_error_width() + 1.0 / (
                        self.get_error_width())

                    bound = math.sqrt(2.0 * old_error_rate *
                                      (1.0 - old_error_rate) *
                                      math.log(2.0 / fDelta) * fN)
                    # Promote the alternate tree when the error improvement
                    # exceeds the bound
                    if bound < (old_error_rate - alt_error_rate):
                        hat._active_leaf_node_cnt -= self.number_leaves()
                        hat._active_leaf_node_cnt += \
                            self._alternate_tree.number_leaves()
                        self.kill_tree_children(hat)

                        if parent is not None:
                            parent.set_child(parent_branch,
                                             self._alternate_tree)
                        else:
                            # Switch tree root
                            hat._tree_root = hat._tree_root._alternate_tree
                        hat.switch_alternate_trees_cnt += 1
                    elif bound < alt_error_rate - old_error_rate:
                        if isinstance(
                                self._alternate_tree,
                                GRF_HoeffdingAdaptiveTree.ActiveLearningNode):
                            self._alternate_tree = None
                        elif isinstance(
                                self._alternate_tree,
                                GRF_HoeffdingAdaptiveTree.InactiveLearningNode
                        ):
                            self._alternate_tree = None
                        else:
                            self._alternate_tree.kill_tree_children(hat)
                        hat.pruned_alternate_trees_cnt += 1

            # Learn_From_Instance alternate Tree and Child nodes
            if self._alternate_tree is not None:
                self._alternate_tree.learn_from_instance(
                    X, y, weight, hat, parent, parent_branch)
            child_branch = self.instance_child_index(X)
            child = self.get_child(child_branch)
            if child is not None:
                child.learn_from_instance(X, y, weight, hat, parent,
                                          parent_branch)

        # Override NewNode
        def kill_tree_children(self, hat):
            for child in self._children:
                if child is not None:
                    # Delete alternate tree if it exists
                    if isinstance(child, GRF_HoeffdingAdaptiveTree.AdaSplitNode
                                  ) and child._alternate_tree is not None:
                        child._alternate_tree.kill_tree_children(hat)
                        self._pruned_alternate_trees += 1
                    # Recursive delete of SplitNodes
                    if isinstance(child,
                                  GRF_HoeffdingAdaptiveTree.AdaSplitNode):
                        child.kill_tree_children(hat)

                    if isinstance(
                            child,
                            GRF_HoeffdingAdaptiveTree.ActiveLearningNode):
                        child = None
                        hat._active_leaf_node_cnt -= 1
                    elif isinstance(
                            child,
                            GRF_HoeffdingAdaptiveTree.InactiveLearningNode):
                        child = None
                        hat._inactive_leaf_node_cnt -= 1

        # override NewNode
        def filter_instance_to_leaves(self,
                                      X,
                                      y,
                                      weight,
                                      parent,
                                      parent_branch,
                                      update_splitter_counts=False,
                                      found_nodes=None):
            if found_nodes is None:
                found_nodes = []
            if update_splitter_counts:
                try:
                    self._observed_class_distribution[
                        y] += weight  # Dictionary (class_value, weight)
                except KeyError:
                    self._observed_class_distribution[y] = weight
            child_index = self.instance_child_index(X)
            if child_index >= 0:
                child = self.get_child(child_index)
                if child is not None:
                    child.filter_instance_to_leaves(X, y, weight, parent,
                                                    parent_branch,
                                                    update_splitter_counts,
                                                    found_nodes)
                else:
                    found_nodes.append(
                        HoeffdingTree.FoundNode(None, self, child_index))
            if self._alternate_tree is not None:
                self._alternate_tree.filter_instance_to_leaves(
                    X, y, weight, self, -999, update_splitter_counts,
                    found_nodes)
Example #15
# Purpose: Detecting concept drifts of case execution time to decide the period of training set.

import pandas as pd
from matplotlib import pyplot as plt
from skmultiflow.drift_detection.adwin import ADWIN

df = pd.read_csv('data/bpic2012_cet.csv')

# drift detection
adwin = ADWIN()
drift_ind = []

for idx, row in df.iterrows():
    cet = row['case_execution_time_seconds']
    adwin.add_element(cet)
    if adwin.detected_change():
        print('Change detected in data: ' + str(cet) + ' - at index: ' +
              str(idx))
        drift_ind.append(idx)

plt.plot(df['case_execution_time_seconds'])
for i in drift_ind:
    plt.axvline(i, color='black', linestyle='--', linewidth=1)

plt.show()
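Since the script's stated purpose is deciding the training period, a hedged follow-up sketch that keeps only the data after the last detected drift (names taken from the script above; the slicing policy is an assumption):

if drift_ind:
    train_df = df.loc[drift_ind[-1]:, ['case_execution_time_seconds']]
else:
    train_df = df[['case_execution_time_seconds']]
print('Training set size:', len(train_df))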
class AdaLearningNodeForRegression(ActiveLearningNodePerceptron, AdaNode):
    """ Learning Node of the Regression Hoeffding Adaptive Tree that always
    uses a linear perceptron model to provide predictions.

    Parameters
    ----------
    initial_class_observations: dict
        In regression tasks this dictionary carries the sufficient statistics
        to perform online variance calculation: the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    parent_node: AdaLearningNodeForRegression (default=None)
        A node containing statistics about observed data.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """
    def __init__(self,
                 initial_class_observations,
                 parent_node,
                 random_state=None):
        super().__init__(initial_class_observations, parent_node, random_state)
        self._estimation_error_weight = ADWIN()
        self._error_change = False

        # To normalize the observed errors in the [0, 1] range
        self._min_error = float('Inf')
        self._max_error = float('-Inf')

    # Override AdaNode
    def number_leaves(self):
        return 1

    # Override AdaNode
    def get_error_estimation(self):
        return self._estimation_error_weight.estimation

    # Override AdaNode
    def get_error_width(self):
        return self._estimation_error_weight.width

    # Override AdaNode
    def is_null_error(self):
        return self._estimation_error_weight is None

    def kill_tree_children(self, hat):
        pass

    # Override AdaNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):

        super().learn_from_instance(X, y, weight, rhat)

        y_pred = rhat.predict([X])[0]
        normalized_error = self.get_normalized_error(y, y_pred)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to ADWIN
        self._estimation_error_weight.add_element(normalized_error)
        # Detect change with ADWIN
        self._error_change = self._estimation_error_weight.detected_change()

        if self._error_change and old_error > self.get_error_estimation():
            self._error_change = False

        # call ActiveLearningNode
        weight_seen = self.get_weight_seen()

        if weight_seen - self.get_weight_seen_at_last_split_evaluation() \
                >= rhat.grace_period:
            rhat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(weight_seen)

    # Override AdaNode, New for option votes
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts,
                                  found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        found_nodes.append(FoundNode(self, parent, parent_branch))

    def get_normalized_error(self, y, y_pred):
        abs_error = abs(y - y_pred)

        # Incremental maintenance of the normalization ranges
        if abs_error < self._min_error:
            self._min_error = abs_error
        if abs_error > self._max_error:
            self._max_error = abs_error

        if self._min_error != self._max_error:
            return (abs_error - self._min_error) / (self._max_error -
                                                    self._min_error)
        else:
            return 0.0
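get_normalized_error above rescales each absolute error into [0, 1] with a running min/max. A standalone trace of the same rule, with illustrative error values:

min_error, max_error = float('inf'), float('-inf')

def normalized(abs_error):
    # Same incremental min/max rescaling as get_normalized_error above
    global min_error, max_error
    min_error = min(min_error, abs_error)
    max_error = max(max_error, abs_error)
    if min_error == max_error:
        return 0.0
    return (abs_error - min_error) / (max_error - min_error)

for e in [2.0, 0.0, 7.0]:
    print(normalized(e))  # 0.0 (range collapsed), 0.0, 1.0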
Example #17


import matplotlib.pyplot as plt

eda_discretizado = discretizar(ppa_result)  # SAX discretization of the EDA signal

plt.plot(eda_discretizado, color='blue', label="Discretized (SAX)")
plt.title("EDA signal (discretized)")
plt.xlabel("Time (minutes)")
plt.ylabel("µS")
plt.legend(loc="upper right")
#plt.show()

# 4) Change detection (classification)
from skmultiflow.drift_detection.adwin import ADWIN
adwin = ADWIN(delta=0.01)
cambios_detectados_x = []
cambios_detectados_y = []
for i in range(len(eda_discretizado)):
    adwin.add_element(eda_discretizado[i])
    if adwin.detected_change():
        print('Change detected in data: ' + str(eda_discretizado[i]) + ' - at index: ' + str(i))
        cambios_detectados_x.append(i)
        cambios_detectados_y.append(eda_discretizado[i])


plt.plot(cambios_detectados_x,cambios_detectados_y, 'x', label="Cambio detectado" )
plt.legend(loc="upper right")
plt.show()
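
# A hedged side note on the `delta=0.01` choice above: `delta` sets the
# confidence of ADWIN's cut test, so smaller values make detection more
# conservative. A minimal sketch on a synthetic step signal (the values and
# deltas below are illustrative assumptions):
from skmultiflow.drift_detection.adwin import ADWIN

step_signal = [0.0] * 300 + [1.0] * 300
for delta in (0.1, 0.01, 0.001):
    detector = ADWIN(delta=delta)
    detections = []
    for i, v in enumerate(step_signal):
        detector.add_element(v)
        if detector.detected_change():
            detections.append(i)
    print('delta=' + str(delta) + ' -> detections at ' + str(detections))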


        # Add date of updating to update_dates
        xgboost_model.results['Update'].append(end_train_date)

    results_dict = xgboost_model.compute_predictions(X_test, y_test)

    temp_drifts = []

    df_results = pd.DataFrame({
        'y_true': results_dict['y_true'][-1],
        'y_pred': results_dict['Predictions'][-1]
    })
    df_results['Correct'] = (df_results['y_true'] == df_results['y_pred'])

    # `adwin` is instantiated earlier in the original script.
    for i in range(df_results.shape[0]):
        adwin.add_element(df_results['Correct'].iloc[i])
        if adwin.detected_change():
            print('Change detected by ADWIN in data: ' +
                  str(df_results['Correct'].iloc[i]) + ' - at date: ' +
                  str(results_dict['Date'][-1].iloc[i]))
            temp_drifts.append(results_dict['Date'][-1].iloc[i])
            adwin.reset()

    if not temp_drifts:
        print('No drift detected - predict next three months')
        start_test_date = start_test_date + pd.DateOffset(months=3)
        training_flag = False
        update_flag = False
    else:
        print('Drift detected - choice on model')
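
# The snippet cuts off right after the drift branch. A minimal hedged sketch
# of one common policy (the helper and its return values are hypothetical,
# not from the original script): retrain from the first drifted date onward.
def handle_drift(temp_drifts):
    """Return a (start_train_date, training_flag, update_flag) triple."""
    first_drift_date = min(temp_drifts)
    # Illustrative policy: restart training at the earliest detected drift.
    return first_drift_date, True, True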
Ejemplo n.º 19
0
class KNNAdwin(KNN):
    """ K-Nearest Neighbors Classifier with ADWIN Change detector 
    
    This Classifier is an improvement from the regular KNN classifier, 
    as it is resistant to concept drift. It utilises the ADWIN change 
    detector to decide which samples to keep and which ones to forget, 
    and by doing so it regulates the sample window size.
     
    To know more about the ADWIN change detector, please visit 
    skmultiflow.classification.core.drift_detection.adwin

    It uses the regular KNN Classifier as a base class, with the 
    major difference that this class keeps a variable size window, 
    instead of a fixed size one and also it updates the adwin algorithm 
    at each partial_fit call.
    
    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.
        
    max_window_size: int
        The maximum size of the window storing the last viewed samples.
        
    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch to a
        brute-force approach. The bigger this number, the faster the tree
        construction, but the slower the queries will be.

    categorical_list: array-like
        Each entry is the index of a categorical feature. May be used for
        further filtering.
        
    Raises
    ------
    NotImplementedError: A few of the functions described here are not 
    implemented since they have no application in this context.
    
    ValueError: A ValueError is raised if the predict function is called 
    before at least k samples have been analyzed by the algorithm.
    
    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> # Setting up the stream
    >>> stream = FileStream('skmultiflow/datasets/covtype.csv', -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_sample(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_sample()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNNAdwin usage example')
    KNNAdwin usage example
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """
    def __init__(self,
                 k=5,
                 max_window_size=sys.maxsize,
                 leaf_size=30,
                 categorical_list=None):
        if categorical_list is None:
            categorical_list = []  # avoid a shared mutable default argument
        super().__init__(k=k,
                         max_window_size=max_window_size,
                         leaf_size=leaf_size,
                         categorical_list=categorical_list)
        self.adwin = ADWIN()
        self.window = None

    def reset(self):
        """ reset
        
        Resets the adwin algorithm as well as the base model 
        kept by the KNN base class.
        
        Returns
        -------
        KNNAdwin
            self
        
        """
        self.adwin = ADWIN()
        return super().reset()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit
        
        Partially fits the model. This is done by updating the window 
        with new samples while also updating the adwin algorithm. Then 
        we verify if a change was detected, and if so, the window is 
        correctly split at the drift moment.
        
        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.
            
        y: Array-like
            An array-like containing the classification targets for all 
            samples in X.
            
        classes: Not used.

        weight: Not used.
        
        Returns
        -------
        KNNAdwin
            self
        
        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)

        for i in range(r):
            self.window.add_element(np.asarray([X[i]]), np.asarray([[y[i]]]))
            if self.window._num_samples >= self.k:
                # Feed ADWIN 1 for a correct prediction, 0 otherwise
                add = 1 if self.predict(np.asarray([X[i]])) == y[i] else 0
                self.adwin.add_element(add)
            else:
                self.adwin.add_element(0)

        if self.window._num_samples >= self.k:
            changed = self.adwin.detected_change()
            if changed and self.adwin._width < self.window._num_samples:
                # Shrink the window to ADWIN's surviving width by
                # dropping the oldest samples
                for i in range(self.window._num_samples, self.adwin._width,
                               -1):
                    self.window.delete_element()
        return self
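
# A minimal hedged sketch of the same window-trimming idea outside the class:
# ADWIN is fed a 0/1 correctness stream, and on a detection the sample window
# is shrunk to ADWIN's surviving width. The correctness stream is synthetic.
from collections import deque
from skmultiflow.drift_detection.adwin import ADWIN

adwin = ADWIN()
window = deque()
correctness = [1] * 400 + [0] * 400  # accuracy collapses halfway through

for i, ok in enumerate(correctness):
    window.append(i)  # stand-in for the stored sample
    adwin.add_element(ok)
    if adwin.detected_change():
        while len(window) > adwin.width:
            window.popleft()  # forget samples from before the drift
        print('Drift at ' + str(i) + ', window shrunk to ' + str(len(window)))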
    class AdaLearningNode(LearningNodeNBAdaptive, NewNode):
        def __init__(self, initial_class_observations):
            super().__init__(initial_class_observations)
            self._estimation_error_weight = ADWIN()
            self.error_change = False
            self._randomSeed = 1
            self._classifier_random = check_random_state(self._randomSeed)

        # Override NewNode
        def number_leaves(self):
            return 1

        # Override NewNode
        def get_error_estimation(self):
            return self._estimation_error_weight.estimation

        # Override NewNode
        def get_error_width(self):
            return self._estimation_error_weight.width

        # Override NewNode
        def is_null_error(self):
            return self._estimation_error_weight is None

        def kill_tree_children(self, hat):
            pass

        # Override NewNode
        def learn_from_instance(self, X, y, weight, hat, parent,
                                parent_branch):
            true_class = y

            # k = self._classifier_random.poisson(1.0)
            # if k > 0:
            #     weight = weight * k

            tmp = self.get_class_votes(X, hat)

            class_prediction = get_max_value_key(tmp)

            bl_correct = (true_class == class_prediction)

            if self._estimation_error_weight is None:
                self._estimation_error_weight = ADWIN()

            old_error = self.get_error_estimation()

            # Add element to ADWIN: 0.0 for a correct prediction, 1.0 otherwise
            add = 0.0 if bl_correct else 1.0
            self._estimation_error_weight.add_element(add)
            # Detect change with ADWIN
            self.error_change = self._estimation_error_weight.detected_change()

            # Ignore detections where the error actually decreased
            if self.error_change and old_error > self.get_error_estimation():
                self.error_change = False

            # Update statistics
            super().learn_from_instance(X, y, weight, hat)

            # Attempt to split once enough weight has been seen since the
            # last evaluation (ActiveLearningNode behaviour)
            weight_seen = self.get_weight_seen()
            if (weight_seen -
                    self.get_weight_seen_at_last_split_evaluation()) >= hat.grace_period:
                hat._attempt_to_split(self, parent, parent_branch)
                self.set_weight_seen_at_last_split_evaluation(weight_seen)

        # Override LearningNodeNBAdaptive
        def get_class_votes(self, X, ht):
            prediction_option = ht.leaf_prediction
            # Majority class
            if prediction_option == MAJORITY_CLASS:
                dist = self.get_observed_class_distribution()
            # Naive Bayes
            elif prediction_option == NAIVE_BAYES:
                dist = do_naive_bayes_prediction(
                    X, self._observed_class_distribution,
                    self._attribute_observers)
            # Naive Bayes Adaptive
            else:
                if self._mc_correct_weight > self._nb_correct_weight:
                    dist = self.get_observed_class_distribution()
                else:
                    dist = do_naive_bayes_prediction(
                        X, self._observed_class_distribution,
                        self._attribute_observers)

            # Weight the votes by the node's squared error estimation
            dist_sum = sum(dist.values())
            normalization_factor = (dist_sum * self.get_error_estimation()
                                    * self.get_error_estimation())
            if normalization_factor > 0.0:
                normalize_values_in_dict(dist, normalization_factor)

            return dist

        # Override NewNode, New for option votes
        def filter_instance_to_leaves(self,
                                      X,
                                      y,
                                      weight,
                                      parent,
                                      parent_branch,
                                      update_splitter_counts,
                                      found_nodes=None):
            if found_nodes is None:
                found_nodes = []
            found_nodes.append(
                HoeffdingTree.FoundNode(self, parent, parent_branch))
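
# A hedged aside on the vote weighting in get_class_votes above: each vote is
# divided by (sum_of_votes * error^2), so leaves with lower estimated error
# contribute larger normalized votes. A tiny self-contained check with
# made-up numbers (the helper mirrors normalize_values_in_dict, which divides
# each value by the factor):
def weight_votes(dist, error_estimation):
    factor = sum(dist.values()) * error_estimation * error_estimation
    return {k: v / factor for k, v in dist.items()} if factor > 0.0 else dist

print(weight_votes({0: 3.0, 1: 1.0}, 0.5))   # {0: 3.0, 1: 1.0}
print(weight_votes({0: 3.0, 1: 1.0}, 0.25))  # {0: 12.0, 1: 4.0}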
class AdaLearningNodeForRegression(ActiveLearningNodePerceptron, AdaNode):
    """ Learning Node of the Regression Hoeffding Adaptive Tree that always use
    a linear perceptron model to provide responses.

    Parameters
    ----------
    initial_class_observations: dict
        In regression tasks this dictionary carries the sufficient to perform
        online variance calculation. They refer to the number of observations
        (key '0'), the sum of the target values (key '1'), and the sum of the
        squared target values (key '2').
    perceptron_weight: np.ndarray(n_features) or None, optional (default=None)
        (default=None)
        The weights for the linear models. If
        not passed, uniform values in the range [-1, 1] are used.
    random_state: int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    """
    def __init__(self,
                 initial_class_observations,
                 perceptron_weight,
                 random_state=None):
        super().__init__(initial_class_observations, perceptron_weight,
                         random_state)
        self._estimation_error_weight = ADWIN()
        self._error_change = False
        self._randomSeed = 1
        self._classifier_random = check_random_state(self._randomSeed)

    # Override AdaNode
    def number_leaves(self):
        return 1

    # Override AdaNode
    def get_error_estimation(self):
        return self._estimation_error_weight.estimation

    # Override AdaNode
    def get_error_width(self):
        return self._estimation_error_weight.width

    # Override AdaNode
    def is_null_error(self):
        return self._estimation_error_weight is None

    def kill_tree_children(self, hat):
        pass

    # Override AdaNode
    def learn_from_instance(self, X, y, weight, rhat, parent, parent_branch):
        super().learn_from_instance(X, y, weight, rhat)

        true_target = y
        target_prediction = rhat.predict([X])[0]
        normalized_error = rhat.get_normalized_error(target_prediction,
                                                     true_target)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add the normalized error to ADWIN
        self._estimation_error_weight.add_element(normalized_error)
        # Detect change with ADWIN
        self._error_change = self._estimation_error_weight.detected_change()

        # Ignore detections where the error actually decreased
        if self._error_change and old_error > self.get_error_estimation():
            self._error_change = False

        # Attempt to split once enough weight has been seen since the last
        # evaluation (ActiveLearningNode behaviour)
        weight_seen = self.get_weight_seen()
        if (weight_seen -
                self.get_weight_seen_at_last_split_evaluation()) >= rhat.grace_period:
            rhat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(weight_seen)

    # Override AdaNode, New for option votes
    def filter_instance_to_leaves(self,
                                  X,
                                  y,
                                  weight,
                                  parent,
                                  parent_branch,
                                  update_splitter_counts,
                                  found_nodes=None):
        if found_nodes is None:
            found_nodes = []
        found_nodes.append(FoundNode(self, parent, parent_branch))
Ejemplo n.º 22
0
signalsWithinPortArea = gdf_ais.loc[
    gdf_ais.within(port_area.values[0].envelope), :]

gdf_aisByHourPortOfInterest = []
for hour in uniqueHours:
    gdf_aisByHourPortOfInterest.append(signalsWithinPortArea.loc[
        signalsWithinPortArea['hour'] == hour].shape[0])

hours = range(len(uniqueHours))

plt.plot(hours, gdf_aisByHourPortOfInterest)
plt.show()

## Q5
print(
    "******************************* Q5 *************************************")
## reference - lab 7
import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN
adwin = ADWIN()

for i in hours:
    adwin.add_element(gdf_aisByHourPortOfInterest[i])
    if adwin.detected_change():
        print('Change detected in data: ' +
              str(gdf_aisByHourPortOfInterest[i]) + ' - at index: ' + str(i))

## Q6
print(
    "******************************* Q6 *************************************")
# Clustering ports based on message density. We are using data from Q1 where
from sklearn.cluster import DBSCAN
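
# The snippet ends right after the DBSCAN import. A minimal hedged sketch of
# density-based clustering on per-port message counts (the counts, eps and
# min_samples below are illustrative assumptions, not from the original):
import numpy as np
from sklearn.cluster import DBSCAN

messages_per_hour = np.array([[1200.0], [1150.0], [300.0], [280.0], [5.0]])
labels = DBSCAN(eps=100.0, min_samples=2).fit_predict(messages_per_hour)
print(labels)  # e.g. [0 0 1 1 -1]: busy ports, quiet ports, one outlier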
    class AdaSplitNodeForRegression(SplitNode, NewNode):
        def __init__(self, split_test, class_observations):
            super().__init__(split_test, class_observations)
            self._estimation_error_weight = ADWIN()
            self._alternate_tree = None
            self.error_change = False
            self._random_seed = 1
            self._classifier_random = check_random_state(self._random_seed)

        # Override SplitNode
        def calc_byte_size_including_subtree(self):
            byte_size = self.__sizeof__()
            if self._alternate_tree is not None:
                byte_size += self._alternate_tree.calc_byte_size_including_subtree()
            if self._estimation_error_weight is not None:
                byte_size += self._estimation_error_weight.get_length_estimation()

            for child in self._children:
                if child is not None:
                    byte_size += child.calc_byte_size_including_subtree()

            return byte_size

        # Override NewNode
        def number_leaves(self):
            num_of_leaves = 0
            for child in self._children:
                if child is not None:
                    num_of_leaves += child.number_leaves()

            return num_of_leaves

        # Override NewNode
        def get_error_estimation(self):
            return self._estimation_error_weight.estimation

        # Override NewNode
        def get_error_width(self):
            w = 0.0
            if self.is_null_error() is False:
                w = self._estimation_error_weight.width

            return w

        # Override NewNode
        def is_null_error(self):
            return self._estimation_error_weight is None

        # Override NewNode
        def learn_from_instance(self, X, y, weight, rhat, parent,
                                parent_branch):
            true_target = y
            normalized_error = 0.0

            if self.filter_instance_to_leaf(X, parent,
                                            parent_branch).node is not None:
                target_prediction = rhat.predict([X])[0]
                normalized_error = rhat.get_normalized_error(
                    target_prediction, true_target)

            if self._estimation_error_weight is None:
                self._estimation_error_weight = ADWIN()

            old_error = self.get_error_estimation()

            # Add element to the change detector
            self._estimation_error_weight.add_element(normalized_error)

            # Detect change
            self.error_change = self._estimation_error_weight.detected_change()

            # Ignore detections where the error actually decreased
            if self.error_change and old_error > self.get_error_estimation():
                self.error_change = False

            # Check condition to build a new alternate tree
            if self.error_change:
                self._alternate_tree = rhat._new_learning_node()
                rhat.alternate_trees_cnt += 1

            # Condition to replace the alternate tree
            elif (self._alternate_tree is not None
                    and not self._alternate_tree.is_null_error()):
                if self.get_error_width() > error_width_threshold \
                        and self._alternate_tree.get_error_width() > error_width_threshold:
                    old_error_rate = self.get_error_estimation()
                    alt_error_rate = self._alternate_tree.get_error_estimation()
                    f_delta = .05
                    f_n = (1.0 / self._alternate_tree.get_error_width()
                           + 1.0 / self.get_error_width())

                    bound = math.sqrt(2.0 * old_error_rate *
                                      (1.0 - old_error_rate) *
                                      math.log(2.0 / f_delta) * f_n)
                    # Replace the subtree when the alternate tree is better
                    # by more than the bound
                    if bound < (old_error_rate - alt_error_rate):
                        rhat._active_leaf_node_cnt -= self.number_leaves()
                        rhat._active_leaf_node_cnt += \
                            self._alternate_tree.number_leaves()
                        self.kill_tree_children(rhat)

                        if parent is not None:
                            parent.set_child(parent_branch,
                                             self._alternate_tree)
                        else:
                            rhat._tree_root = rhat._tree_root._alternate_tree
                        rhat.switch_alternate_trees_cnt += 1
                    # Prune the alternate tree when it is worse by more
                    # than the bound
                    elif bound < alt_error_rate - old_error_rate:
                        if isinstance(self._alternate_tree,
                                      HoeffdingTree.ActiveLearningNode):
                            self._alternate_tree = None
                        elif isinstance(self._alternate_tree,
                                        HoeffdingTree.InactiveLearningNode):
                            self._alternate_tree = None
                        else:
                            self._alternate_tree.kill_tree_children(rhat)
                        rhat.pruned_alternate_trees_cnt += 1

            # Learn from the instance in the alternate tree and child nodes
            if self._alternate_tree is not None:
                self._alternate_tree.learn_from_instance(
                    X, y, weight, rhat, parent, parent_branch)
            child_branch = self.instance_child_index(X)
            child = self.get_child(child_branch)
            if child is not None:
                child.learn_from_instance(X, y, weight, rhat, parent,
                                          parent_branch)

        # Override NewNode
        def kill_tree_children(self, rhat):
            for child in self._children:
                if child is not None:
                    # Delete alternate tree if it exists
                    if isinstance(child, rhat.AdaSplitNodeForRegression
                                  ) and child._alternate_tree is not None:
                        self._pruned_alternate_trees += 1
                    # Recursive delete of SplitNodes
                    if isinstance(child, rhat.AdaSplitNodeForRegression):
                        child.kill_tree_children(rhat)

                    if isinstance(child, HoeffdingTree.ActiveLearningNode):
                        child = None
                        rhat._active_leaf_node_cnt -= 1
                    elif isinstance(child, HoeffdingTree.InactiveLearningNode):
                        child = None
                        rhat._inactive_leaf_node_cnt -= 1

        # override NewNode
        def filter_instance_to_leaves(self,
                                      X,
                                      y,
                                      weight,
                                      parent,
                                      parent_branch,
                                      update_splitter_counts=False,
                                      found_nodes=None):
            if found_nodes is None:
                found_nodes = []
            if update_splitter_counts:
                try:
                    self._observed_class_distribution[0] += weight
                    self._observed_class_distribution[1] += y * weight
                    self._observed_class_distribution[2] += y * y * weight
                except KeyError:
                    self._observed_class_distribution[0] = weight
                    self._observed_class_distribution[1] = y * weight
                    self._observed_class_distribution[2] = y * y * weight

            child_index = self.instance_child_index(X)
            if child_index >= 0:
                child = self.get_child(child_index)
                if child is not None:
                    child.filter_instance_to_leaves(X, y, weight, parent,
                                                    parent_branch,
                                                    update_splitter_counts,
                                                    found_nodes)
                else:
                    found_nodes.append(
                        HoeffdingTree.FoundNode(None, self, child_index))
            if self._alternate_tree is not None:
                self._alternate_tree.filter_instance_to_leaves(
                    X, y, weight, self, -999, update_splitter_counts,
                    found_nodes)
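
# A hedged aside on the replacement test above: the subtree is swapped for its
# alternate only when the error difference exceeds a Hoeffding-style bound,
# bound = sqrt(2 * e_old * (1 - e_old) * ln(2 / delta) * (1/w_alt + 1/w_old)).
# A small self-contained sketch with illustrative numbers:
import math

def swap_bound(old_error_rate, alt_width, old_width, delta=0.05):
    f_n = 1.0 / alt_width + 1.0 / old_width
    return math.sqrt(2.0 * old_error_rate * (1.0 - old_error_rate)
                     * math.log(2.0 / delta) * f_n)

b = swap_bound(old_error_rate=0.30, alt_width=400, old_width=600)
# Swap if the alternate's error (0.18 here) wins by more than the bound
print(b, b < 0.30 - 0.18)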
Ejemplo n.º 24
0
    def _partial_fit(self, X, y):
        """ Trains the model on sample X and corresponding target y.
        Private function where the actual training is carried out.

        Reshape X and add it to the current window if the window isn't full.
        If it is full, hand the window over to the precedent window. At the
        end of a window: fit while the ensemble is still growing, score the
        precedent window, and update the model if drift is detected.

        Parameters
        ----------
        X: numpy.ndarray of shape (1, n_features)
            Instance attributes.
        y: int
            Class label for sample X. Not used in this implementation,
            which is unsupervised.
        """
        X = np.reshape(X, (1, len(X)))

        if self.samples_seen % self.window_size == 0:
            # Update the two windows (precedent and current)
            self.prec_window = self.window
            self.window = X
        else:
            self.window = np.concatenate((self.window, X))

        if self.samples_seen % self.window_size == 0 and self.samples_seen != 0:
            # Fit the ensemble while it is not yet full
            if self.cpt < self.n_estimators:
                self.ensemble.fit(self.prec_window)
                self.cpt += 1

            if self.version == "AnomalyRate":
                # Update the current anomaly rate
                self.anomaly_rate = self.anomaly_scores_rate(self.prec_window)
                # Update the model if the anomaly rate is greater than the
                # threshold (u in the original paper [3])
                if self.anomaly_rate > self.drift_threshold:
                    # Discard the old model completely and create a new one
                    self.update_model(self.prec_window)

            elif self.version == "ADWIN":
                # TODO (Maurras, 04112020): revisit how the concept drift is
                # detected with the ADWIN() detector available in
                # scikit-multiflow
                from skmultiflow.drift_detection.adwin import ADWIN
                adwin = ADWIN()
                prec_window_scores = self.ensemble.anomaly_score(
                    self.prec_window)
                drift_detected = False
                for ind, score in enumerate(prec_window_scores):
                    adwin.add_element(score)
                    if adwin.detected_change():
                        print('Change detected at index = ' + str(ind))
                        drift_detected = True
                        break
                if drift_detected:
                    self.update_model(self.prec_window)

        self.samples_seen += 1
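
# A hedged aside on the TODO above: creating ADWIN() inside _partial_fit
# restarts the detector for every window, so it never accumulates history
# across windows. A minimal standalone sketch of a detector that persists
# across windows instead (the score values below are synthetic):
from skmultiflow.drift_detection.adwin import ADWIN

adwin = ADWIN()  # created once, reused for every window
windows = [[0.2] * 50, [0.2] * 50, [0.8] * 50]  # the last window drifts

for w_idx, scores in enumerate(windows):
    for score in scores:
        adwin.add_element(score)
        if adwin.detected_change():
            print('Drift detected in window ' + str(w_idx))
            break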
Ejemplo n.º 25
0
import math
from random import gauss

import matplotlib.pyplot as plt
import numpy as np
from skmultiflow.drift_detection.adwin import ADWIN
from skmultiflow.drift_detection.eddm import EDDM

adwin = ADWIN()
# eddm = EDDM()

# Simulating a data stream as a normal distribution around mean 10
my_mean = 10
my_variance = 0.1
data_stream = [gauss(my_mean, math.sqrt(my_variance)) for i in range(500)]

# Alternative streams to experiment with:
# data_stream = np.random.randint(10, size=100)
# data_stream = [1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1]
# Changing the data concept from index 50 to 100:
# for i in range(50, 100):
#     data_stream[i] = np.random.randint(4, high=8)

# Adding stream elements to ADWIN and verifying if drift occurred
print(np.mean(data_stream))
plt.plot(data_stream)
plt.show()

for i in range(len(data_stream)):
    adwin.add_element(data_stream[i])
    if adwin.detected_change():
        print('Change detected in data: ' + str(data_stream[i]) +
              ' - at index: ' + str(i))
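
# As written, the stream above is stationary, so ADWIN typically reports no
# change. A minimal hedged variant that injects a mean shift ADWIN can find
# (the shift position and magnitude are illustrative):
drifting_stream = ([gauss(10, math.sqrt(0.1)) for _ in range(250)] +
                   [gauss(12, math.sqrt(0.1)) for _ in range(250)])

adwin2 = ADWIN()
for i, value in enumerate(drifting_stream):
    adwin2.add_element(value)
    if adwin2.detected_change():
        print('Change detected in data: ' + str(value) +
              ' - at index: ' + str(i))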