Exemple #1
0
def test_external_cluster():
    """Run a hard-threshold classifier on top of an externally built cluster.

    A MeanShiftCluster is created and filled outside of the classifier, then
    handed to SetSimilarityHardThreshold. The decisions and scores returned
    for two query points are printed; nothing is asserted.
    """
    # The cluster is owned by the caller, not by the classifier.
    external_cluster = MeanShiftCluster()
    external_cluster.update([[2, 1], [4, 65], [4, 3]])

    classifier = SetSimilarityHardThreshold(
        metric='ABOD', threshold=0.7, cluster=external_cluster)

    decisions, scores = classifier.predict([[2.2, 1], [4, 1]])
    # Same output as the two-argument print statement: both values
    # converted with str() and separated by a single space.
    print("%s %s" % (decisions, scores))
Exemple #2
0
 def __init__(self, test_offline=False, cluster=None):
     """Initialise the ABOD base class and attach the sample cluster.

     :param test_offline: value stored in the private test-offline flag
     :param cluster: ClusterBase instance used as data store; when None a
         new MeanShiftCluster is created
     :raises TypeError: if `cluster` is given but is not a ClusterBase
         instance
     """
     ABOD.__init__(self)
     self.__test_offline = test_offline
     if cluster is None:
         cluster = MeanShiftCluster()
     elif not isinstance(cluster, ClusterBase):
         # `cluster` is an object, not a class: issubclass() would raise
         # TypeError for every instance, so isinstance() is the right check.
         raise TypeError("cluster must be a ClusterBase instance")
     self.data_cluster = cluster
Exemple #3
0
 def __init__(self, random_data, cluster=None):
     """Store the random reference data, create the SVM and attach the cluster.

     :param random_data: reference samples kept in `self.random_data`
     :param cluster: ClusterBase instance used as data store; when None a
         new MeanShiftCluster is created
     :raises TypeError: if `cluster` is given but is not a ClusterBase
         instance
     """
     # load random data
     self.random_data = random_data
     self.clf = SVC(kernel='linear', probability=True, C=1)
     if cluster is None:
         cluster = MeanShiftCluster()
     elif not isinstance(cluster, ClusterBase):
         # `cluster` is an object, not a class: issubclass() would raise
         # TypeError for every instance, so isinstance() is the right check.
         raise TypeError("cluster must be a ClusterBase instance")
     self.data_cluster = cluster
    def __init__(self, cluster=None, metric='ABOD'):
        """Attach a data cluster and remember the metric to use.

        :param cluster: cluster object to use; when None a new
            MeanShiftCluster is created and the instance is flagged as not
            having an external cluster
        :param metric: metric name stored in `self.metric`
        """
        if cluster is not None:
            # Caller-supplied cluster: use it as is.
            self.data_cluster = cluster
        else:
            print("No data cluster linked. Using new MeanShiftCluster.")
            self.data_cluster = MeanShiftCluster()
            self.__external_cluster = False

        self.metric = metric
        # Creation time of this object, taken after the cluster is set up.
        self.cluster_timestamp = time.time()
class BinaryThreshold:
    """Thresholds the averaged squared euclidean distance to a data cluster.

    Samples are accumulated in `data_cluster` through `partial_fit`;
    `predict` compares the averaged squared distance against a threshold.
    """

    __verbose = False

    clf = None
    # Default decision threshold, used when predict() gets no `thresh`.
    thresh = 0.99

    random_data = None
    data_cluster = None

    # Averaged squared distance computed by the most recent predict() call.
    avg = None

    def __init__(self, cluster=None):
        """Attach the cluster holding the reference samples.

        :param cluster: ClusterBase instance used as data store; when None
            a new MeanShiftCluster is created
        :raises TypeError: if `cluster` is given but is not a ClusterBase
            instance
        """
        if cluster is None:
            cluster = MeanShiftCluster()
        elif not isinstance(cluster, ClusterBase):
            # `cluster` is an object, not a class: issubclass() would raise
            # TypeError for every instance, so isinstance() is the right check.
            raise TypeError("cluster must be a ClusterBase instance")
        self.data_cluster = cluster

    def partial_fit(self, samples):
        """Feed `samples` into the underlying data cluster."""
        self.data_cluster.update(samples)

    def class_mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's `class_mean_dist` and return its result."""
        return self.data_cluster.class_mean_dist(samples, metric)

    def predict(self, samples, class_mean=False, thresh=None):
        """Compare the averaged squared euclidean distance with a threshold.

        :param samples: samples to classify
        :param class_mean: when True, distances come from the cluster's
            `class_mean_dist`; otherwise pairwise distances between
            `samples` and the cluster data are used
        :param thresh: threshold overriding `self.thresh` when not None
        :return: result of `avg < threshold`; the average is also stored in
            `self.avg`
        """
        print("--- classifying {} samples...".format(len(samples)))

        if class_mean is True:
            # Distance to class mean for every sample
            dist = self.data_cluster.class_mean_dist(samples, 'euclidean')
        else:
            dist = pairwise_distances(samples,
                                      self.data_cluster.data,
                                      metric='euclidean')

        # NOTE(review): axis=0 averages over the first axis of `dist`. For
        # the pairwise matrix that is presumably the sample axis, which
        # yields one value per cluster point rather than per sample -
        # confirm this is intended.
        avg = np.average(np.square(dist), axis=0)
        self.avg = avg

        if thresh is None:
            thresh = self.thresh
        return avg < thresh

    def decision_function(self, samples):
        """Not implemented; returns None."""
        pass
Exemple #6
0
 def add_samples(self, user_id, new_samples):
     """Add `new_samples` to the cluster kept for `user_id`.

     A MeanShiftCluster with max_size=60 is created on first use of a
     `user_id`; the samples are then passed to that cluster's `update`.
     """
     clusters = self.class_clusters
     if user_id not in clusters:
         # First samples for this user: create the cluster before updating.
         clusters[user_id] = MeanShiftCluster(max_size=60)
     clusters[user_id].update(new_samples)
Exemple #7
0
class IABOD(ABOD):
    """ABOD subclass that is fitted incrementally through `partial_fit`.

    Incoming samples are normally routed through `data_cluster`, and
    `self.data` mirrors the cluster content. In test-offline mode the
    samples are accumulated directly in `self.data` instead.
    """

    # todo: refactor - avoid copying data from HullCluster to class
    # (originally needed for superclass to access it)
    data = []
    __verbose = False
    __test_offline = False
    data_cluster = None

    # todo: remove test online
    def __init__(self, test_offline=False, cluster=None):
        """Initialise the ABOD base class and attach the sample cluster.

        :param test_offline: when True, `partial_fit` bypasses the cluster
            and accumulates samples in `self.data`
        :param cluster: ClusterBase instance used as data store; when None
            a new MeanShiftCluster is created
        :raises TypeError: if `cluster` is given but is not a ClusterBase
            instance
        """
        ABOD.__init__(self)
        self.__test_offline = test_offline
        if cluster is None:
            cluster = MeanShiftCluster()
        elif not isinstance(cluster, ClusterBase):
            # `cluster` is an object, not a class: issubclass() would raise
            # TypeError for every instance, so isinstance() is the right check.
            raise TypeError("cluster must be a ClusterBase instance")
        self.data_cluster = cluster

    def fit(self, data, dim_reduction=False):
        """Unsupported; always raises NotImplementedError."""
        raise NotImplementedError("Use 'partial_fit' instead of 'fit'")

    def partial_fit(self, samples):
        """Add `samples` to the reference data.

        In test-offline mode the samples are appended to `self.data` until
        it holds 40 or more rows, after which further samples are ignored.
        Otherwise the cluster is updated and `self.data` is refreshed from
        the cluster's `get_data()`.
        """
        if self.__test_offline is True:
            if len(self.data) == 0:
                self.data = samples
            elif len(self.data) < 40:
                self.data = np.concatenate((self.data, samples))
        else:
            self.data_cluster.update(samples)
            self.data = self.data_cluster.get_data()

    def mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's `mean_dist` and return its result."""
        return self.data_cluster.mean_dist(samples, metric)

    def class_mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's `class_mean_dist` and return its result."""
        return self.data_cluster.class_mean_dist(samples, metric)
class SetSimilarityThresholdBase:
    """
    SetSimilarityThreshold calculates a per-sample outlier/similarity score which is thresholded for classification
    """

    __verbose = False
    __external_cluster = True  # has an external data model
    data_cluster = None

    # Hashed result buffer (sample hash -> score). Only a placeholder at
    # class level; every instance gets its own dict in __init__ so cached
    # scores are never shared between classifiers.
    decision_fn_buffer = None

    def __init__(self, cluster=None, metric='ABOD'):
        """Attach a data cluster and remember the metric to use.

        :param cluster: cluster object to use; when None a new
            MeanShiftCluster is created and treated as internal
        :param metric: metric name forwarded to the cluster when scoring
        """
        if cluster is None:
            print("No data cluster linked. Using new MeanShiftCluster.")
            self.data_cluster = MeanShiftCluster()
            self.__external_cluster = False
        else:
            self.data_cluster = cluster

        self.metric = metric
        self.cluster_timestamp = time.time()
        # Per-instance buffer: a class-level dict would be mutated in place
        # by decision_function() and leak scores across instances.
        self.decision_fn_buffer = {}

    def partial_fit(self, samples):
        """Update an internal cluster with `samples` and drop buffered scores.

        External clusters are never updated from here; the buffer is reset
        in both cases.
        """
        if not self.__external_cluster:
            # UPDATE INTERNAL CLUSTER (mainly for testing)
            self.data_cluster.update(samples)
        # invalidate buffered decision function
        self.decision_fn_buffer = {}

    def get_hash(self, arr):
        """Return a content-based hash of `arr`, used as result-buffer key.

        Works for arrays and plain sequences alike and does not touch the
        flags of the caller's array.
        """
        return hash(np.asarray(arr).tobytes())

    def decision_function(self, samples):
        """
        Distance of the samples X to the target class distribution

        Scores are buffered per sample hash; samples already seen since the
        last `partial_fit` are served from the buffer.

        :param samples: indexable collection of samples
        :return: flat numpy array of scores
        :raises NotImplementedError: if the cluster is not a MeanShiftCluster
        """

        cluster_type = self.data_cluster.__class__.__name__

        if cluster_type != 'MeanShiftCluster':
            log.severe(
                "Prediction for cluster type '{}' is not implemented yet! Add custom decision_function() first."
                .format(cluster_type))
            raise NotImplementedError(
                "Implement threshold prediction for specific cluster type.")

        # NOTE(review): an external cluster may be updated without going
        # through partial_fit(); buffered scores would then be stale -
        # confirm how callers invalidate the buffer in that case.
        buffer = self.decision_fn_buffer
        if buffer is None:
            # Instance created without running __init__.
            buffer = self.decision_fn_buffer = {}

        # calc hashes
        hashed = [self.get_hash(s) for s in samples]

        if not buffer:
            # Nothing buffered yet: score the whole batch with one call.
            similarity_scores = self.data_cluster.sample_set_similarity_scores(
                samples, self.metric)
            for i, h in enumerate(hashed):
                buffer[h] = similarity_scores[i]
            return np.array(similarity_scores).flatten()

        # Reuse buffered results and score only the unseen samples.
        collected = []
        for i, h in enumerate(hashed):
            if h not in buffer:
                buffer[h] = self.data_cluster.sample_set_similarity_scores(
                    np.array([samples[i]]), self.metric)
            # Buffered entries can be scalars (batch path) or one-element
            # arrays (single-sample path); ravel makes them uniform.
            collected.append(np.ravel(buffer[h]))

        if not collected:
            return np.array([])
        return np.concatenate(collected)

    @abstractmethod
    def predict(self, samples):
        """
        Classify the incoming samples; must be implemented by subclasses
        """
        raise NotImplementedError("Implement predict() in subclass.")
 def __init__(self, cluster=None):
     """Attach the cluster holding the reference samples.

     :param cluster: ClusterBase instance used as data store; when None a
         new MeanShiftCluster is created
     :raises TypeError: if `cluster` is given but is not a ClusterBase
         instance
     """
     if cluster is None:
         cluster = MeanShiftCluster()
     elif not isinstance(cluster, ClusterBase):
         # `cluster` is an object, not a class: issubclass() would raise
         # TypeError for every instance, so isinstance() is the right check.
         raise TypeError("cluster must be a ClusterBase instance")
     self.data_cluster = cluster
Exemple #10
0
class ISVM:
    """Incrementally refitted linear SVM separating class data from random data.

    Class samples are accumulated in `data_cluster`; after every
    `partial_fit` the SVM is refitted on the cluster data (label 1) versus
    `random_data` (label -1).
    """

    __verbose = False

    clf = None
    # A sample is "uncertain" (label 0) when both class probabilities are
    # below this value.
    uncertainty_thresh = 0.7

    random_data = None
    data_cluster = None

    # Results of the most recent predict() call.
    prediction = None
    probability = None

    def __init__(self, random_data, cluster=None):
        """Store the random reference data, create the SVM and attach the cluster.

        :param random_data: samples used as the "unknown" (-1) class
        :param cluster: ClusterBase instance used as data store; when None
            a new MeanShiftCluster is created
        :raises TypeError: if `cluster` is given but is not a ClusterBase
            instance
        """
        # load random data
        self.random_data = random_data
        self.clf = SVC(kernel='linear', probability=True, C=1)
        if cluster is None:
            cluster = MeanShiftCluster()
        elif not isinstance(cluster, ClusterBase):
            # `cluster` is an object, not a class: issubclass() would raise
            # TypeError for every instance, so isinstance() is the right check.
            raise TypeError("cluster must be a ClusterBase instance")
        self.data_cluster = cluster

    def decision_function(self, samples):
        """Not implemented; returns None."""
        pass

    def get_proba(self):
        """Return the mean confidence of the last prediction.

        Uses the column-1 probability for samples predicted 1 and its
        complement for samples predicted -1; uncertain samples (0) are not
        counted. Requires a prior `predict` call. When every sample was
        uncertain the division is by zero.
        """
        class_proba = self.probability[:, 1]
        prob = 0
        prob += np.sum(class_proba[self.prediction == 1])
        prob += np.sum(1 - class_proba[self.prediction == -1])
        prob /= len(class_proba[self.prediction != 0])
        return prob

    def mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's `mean_dist` and return its result."""
        return self.data_cluster.mean_dist(samples, metric)

    def class_mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's `class_mean_dist` and return its result."""
        return self.data_cluster.class_mean_dist(samples, metric)

    def predict(self, samples):
        """Predict 1, -1 or 0 (uncertain) for every sample.

        The probabilities and the labels are kept in `self.probability`
        and `self.prediction`.

        :param samples: samples passed to the SVM's `predict_proba`
        :return: integer array of labels
        """
        proba = self.clf.predict_proba(samples)
        self.probability = proba
        # Uncertain: both class probabilities are below the threshold.
        uncertain = np.sum(proba < self.uncertainty_thresh, axis=1) == 2
        # Column 0 above 0.5 means -1, otherwise 1.
        pred = np.where(proba[:, 0] > 0.5, -1, 1)
        pred[uncertain] = 0
        self.prediction = pred
        return pred

    def __fit_vs_random(self, class_data):
        """Fit the SVM on `class_data` (label 1) against the random data (-1)."""
        label_class = np.repeat(1, np.shape(class_data)[0])
        label_unknown = np.repeat(-1, np.shape(self.random_data)[0])
        training_embeddings = np.concatenate((class_data, self.random_data))
        training_labels = np.concatenate((label_class, label_unknown))
        self.clf.fit(training_embeddings, training_labels)

    def partial_fit(self, samples):
        """Add `samples` to the cluster and refit the SVM on the cluster data."""
        self.data_cluster.update(samples)
        reduced_data = self.data_cluster.get_data()
        # refit SVM one vs random
        self.__fit_vs_random(reduced_data)