def test_external_cluster():
    """Smoke-test a hard-threshold classifier that is backed by an external cluster.

    Fills a MeanShiftCluster by hand, links it to a
    SetSimilarityHardThreshold and prints the prediction result.
    """
    shared_cluster = MeanShiftCluster()
    shared_cluster.update([[2, 1], [4, 65], [4, 3]])

    classifier = SetSimilarityHardThreshold(
        metric='ABOD', threshold=0.7, cluster=shared_cluster)
    decisions, similarity = classifier.predict([[2.2, 1], [4, 1]])

    # Single-argument print call: emits "<decisions> <similarity>" exactly
    # like the print statement did.
    print("{} {}".format(decisions, similarity))
def __init__(self, test_offline=False, cluster=None):
    """
    Initialise the ABOD base class and attach the data cluster.

    :param test_offline: stored as the offline-test flag of the instance
    :param cluster: optional ClusterBase instance used as data model;
        a new MeanShiftCluster is created when omitted
    :raises TypeError: if cluster is given but is not a ClusterBase instance
    """
    ABOD.__init__(self)
    self.__test_offline = test_offline
    if cluster is None:
        self.data_cluster = MeanShiftCluster()
    else:
        # 'cluster' is an object, not a class: issubclass() raised a
        # TypeError for every instance passed in, so use isinstance().
        if not isinstance(cluster, ClusterBase):
            raise TypeError(
                "cluster must be a ClusterBase instance, got '{}'".format(
                    type(cluster).__name__))
        self.data_cluster = cluster
def __init__(self, random_data, cluster=None):
    """
    Store the random (negative) data, create the SVM and attach the cluster.

    :param random_data: samples kept on the instance as 'random_data'
    :param cluster: optional ClusterBase instance used as data model;
        a new MeanShiftCluster is created when omitted
    :raises TypeError: if cluster is given but is not a ClusterBase instance
    """
    # load random data
    self.random_data = random_data
    self.clf = SVC(kernel='linear', probability=True, C=1)
    if cluster is None:
        self.data_cluster = MeanShiftCluster()
    else:
        # 'cluster' is an object, not a class: issubclass() raised a
        # TypeError for every instance passed in, so use isinstance().
        if not isinstance(cluster, ClusterBase):
            raise TypeError(
                "cluster must be a ClusterBase instance, got '{}'".format(
                    type(cluster).__name__))
        self.data_cluster = cluster
def __init__(self, cluster=None, metric='ABOD'):
    """
    Link the data cluster and remember the similarity metric.

    :param cluster: optional externally managed cluster; when omitted a
        private MeanShiftCluster is created and the instance is marked as
        not using an external cluster
    :param metric: metric name stored on the instance
    """
    if cluster is None:
        print("No data cluster linked. Using new MeanShiftCluster.")
        self.data_cluster = MeanShiftCluster()
        self.__external_cluster = False
    else:
        self.data_cluster = cluster
    self.metric = metric
    self.cluster_timestamp = time.time()
    # Give every instance its own result buffer. Without this, in-place
    # writes land in a dict shared on the class and cached scores leak
    # between instances that use different clusters.
    self.decision_fn_buffer = {}
class BinaryThreshold:
    """
    Thresholds the averaged squared euclidean distance between incoming
    samples and the data held by the linked cluster.
    """
    __verbose = False
    clf = None
    thresh = 0.99  # default decision threshold used by predict()
    random_data = None
    data_cluster = None
    avg = None  # averaged squared distances of the last predict() call

    def __init__(self, cluster=None):
        """
        :param cluster: optional ClusterBase instance used as data model;
            a new MeanShiftCluster is created when omitted
        :raises TypeError: if cluster is given but is not a ClusterBase
            instance
        """
        if cluster is None:
            self.data_cluster = MeanShiftCluster()
        else:
            # 'cluster' is an object, not a class: issubclass() raised a
            # TypeError for every instance passed in, so use isinstance().
            if not isinstance(cluster, ClusterBase):
                raise TypeError(
                    "cluster must be a ClusterBase instance, got '{}'".format(
                        type(cluster).__name__))
            self.data_cluster = cluster

    def partial_fit(self, samples):
        """Forward the new samples to the data cluster."""
        self.data_cluster.update(samples)

    def class_mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's class_mean_dist and return its result."""
        return self.data_cluster.class_mean_dist(samples, metric)

    def predict(self, samples, class_mean=False, thresh=None):
        """
        Compare the averaged squared distance against a threshold.

        :param samples: samples to classify
        :param class_mean: if True use the cluster's class-mean distance,
            otherwise the pairwise distance to all cluster data
        :param thresh: optional threshold overriding self.thresh
        :return: boolean result of 'average < threshold'
        """
        print("--- classifying {} samples...".format(len(samples)))

        if class_mean is True:
            # Distance to class mean for every sample
            dist = self.data_cluster.class_mean_dist(samples, 'euclidean')
        else:
            dist = pairwise_distances(
                samples, self.data_cluster.data, metric='euclidean')

        # NOTE(review): axis=0 averages over the first axis of 'dist'; for
        # the pairwise branch that looks like an average over the samples
        # rather than per sample - confirm this is intended.
        avg = np.average(np.square(dist), axis=0)
        self.avg = avg

        limit = self.thresh if thresh is None else thresh
        return avg < limit

    def decision_function(self, samples):
        """Not implemented; returns None."""
        pass
def add_samples(self, user_id, new_samples):
    """
    Feed new samples (array of embeddings) into the cluster of a user.

    A MeanShiftCluster with max_size=60 is created the first time a
    user_id is seen; afterwards the existing cluster is updated.
    """
    clusters = self.class_clusters
    if user_id not in clusters:
        # first contact with this user: set up an empty cluster
        clusters[user_id] = MeanShiftCluster(max_size=60)
    clusters[user_id].update(new_samples)
class IABOD(ABOD):
    """
    ABOD variant that is trained through partial_fit and keeps its data
    in a linked cluster object.
    """
    # TODO: refactor - avoid copying data from the cluster to this class
    # (originally needed for the superclass to access it)
    data = []
    __verbose = False
    __test_offline = False
    data_cluster = None

    # TODO: remove test_offline
    def __init__(self, test_offline=False, cluster=None):
        """
        :param test_offline: if True, partial_fit accumulates raw samples on
            the instance instead of updating the cluster
        :param cluster: optional ClusterBase instance used as data model;
            a new MeanShiftCluster is created when omitted
        :raises TypeError: if cluster is given but is not a ClusterBase
            instance
        """
        ABOD.__init__(self)
        self.__test_offline = test_offline
        if cluster is None:
            self.data_cluster = MeanShiftCluster()
        else:
            # 'cluster' is an object, not a class: issubclass() raised a
            # TypeError for every instance passed in, so use isinstance().
            if not isinstance(cluster, ClusterBase):
                raise TypeError(
                    "cluster must be a ClusterBase instance, got '{}'".format(
                        type(cluster).__name__))
            self.data_cluster = cluster

    def fit(self, data, dim_reduction=False):
        """Unsupported; always raises NotImplementedError."""
        raise NotImplementedError("Use 'partial_fit' instead of 'fit'")

    def partial_fit(self, samples):
        """
        Add samples to the model.

        In offline-test mode the raw samples are collected in self.data and
        nothing more is added once 40 or more rows are stored. Otherwise the
        cluster is updated and self.data mirrors the cluster's data.
        """
        if self.__test_offline is True:
            if len(self.data) == 0:
                self.data = samples
            elif len(self.data) < 40:
                self.data = np.concatenate((self.data, samples))
        else:
            self.data_cluster.update(samples)
            self.data = self.data_cluster.get_data()

    def mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's mean_dist and return its result."""
        return self.data_cluster.mean_dist(samples, metric)

    def class_mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's class_mean_dist and return its result."""
        return self.data_cluster.class_mean_dist(samples, metric)
class SetSimilarityThresholdBase:
    """
    SetSimilarityThreshold calculates a per-sample outlier/similarity score
    which is thresholded for classification
    """
    __verbose = False
    __external_cluster = True  # has an external data model
    data_cluster = None
    # Hashed result buffer. Class-level fallback only: __init__ gives every
    # instance its own dict so cached scores are never shared.
    decision_fn_buffer = {}

    def __init__(self, cluster=None, metric='ABOD'):
        """
        :param cluster: optional externally managed cluster; when omitted a
            private MeanShiftCluster is created
        :param metric: metric name handed to the cluster when scoring
        """
        if cluster is None:
            print("No data cluster linked. Using new MeanShiftCluster.")
            self.data_cluster = MeanShiftCluster()
            self.__external_cluster = False
        else:
            self.data_cluster = cluster
        self.metric = metric
        self.cluster_timestamp = time.time()
        # Per-instance buffer: in-place writes must not end up in the dict
        # that is shared on the class.
        self.decision_fn_buffer = {}

    def partial_fit(self, samples):
        """
        Update the internal cluster (external clusters are never updated
        from here) and drop all buffered scores.
        """
        if not self.__external_cluster:
            # UPDATE INTERNAL CLUSTER (mainly for testing)
            self.data_cluster.update(samples)
        # invalidate buffered decision function
        self.decision_fn_buffer = {}

    def get_hash(self, arr):
        """
        Hash the raw bytes of an array.

        Hashing a byte copy leaves the array's 'writeable' flag untouched,
        so read-only inputs and views of read-only data are handled too.

        :param arr: array-like sample
        :return: integer hash of the sample's bytes
        """
        return hash(np.asarray(arr).tobytes())

    def decision_function(self, samples):
        """
        Distance of the samples X to the target class distribution.

        Scores are cached per sample hash; only samples without a buffered
        score are sent to the cluster.

        :param samples: samples to score
        :return: flat numpy array with the scores, in sample order
        :raises NotImplementedError: if the cluster is not a MeanShiftCluster
        """
        cluster_type = self.data_cluster.__class__.__name__
        if cluster_type != 'MeanShiftCluster':
            log.severe(
                "Prediction for cluster type '{}' is not implemented yet! Add custom decision_function() first."
                .format(cluster_type))
            raise NotImplementedError(
                "Implement threshold prediction for specific cluster type.")

        # calc hashes
        hashed = [self.get_hash(s) for s in samples]

        if self.decision_fn_buffer:
            # reuse buffered results, compute only the missing ones
            similarity_scores = []
            for i, h in enumerate(hashed):
                if h not in self.decision_fn_buffer:
                    single = self.data_cluster.sample_set_similarity_scores(
                        np.array([samples[i]]), self.metric)
                    # Store the entry of the one sample, not the whole
                    # length-1 result, so buffered values have the same
                    # form as the ones stored by the batch branch below.
                    # Assumes the result is indexable per sample, as the
                    # batch branch already does.
                    self.decision_fn_buffer[h] = single[0]
                similarity_scores.append(self.decision_fn_buffer[h])
        else:
            similarity_scores = self.data_cluster.sample_set_similarity_scores(
                samples, self.metric)
            # add to buffer
            for i, h in enumerate(hashed):
                self.decision_fn_buffer[h] = similarity_scores[i]

        return np.array(similarity_scores).flatten()

    @abstractmethod
    def predict(self, samples):
        """
        Classify the samples; has to be provided by subclasses.
        """
        raise NotImplementedError("Implement Cluster Update.")
def __init__(self, cluster=None):
    """
    Attach the data cluster.

    :param cluster: optional ClusterBase instance used as data model;
        a new MeanShiftCluster is created when omitted
    :raises TypeError: if cluster is given but is not a ClusterBase instance
    """
    if cluster is None:
        self.data_cluster = MeanShiftCluster()
    else:
        # 'cluster' is an object, not a class: issubclass() raised a
        # TypeError for every instance passed in, so use isinstance().
        if not isinstance(cluster, ClusterBase):
            raise TypeError(
                "cluster must be a ClusterBase instance, got '{}'".format(
                    type(cluster).__name__))
        self.data_cluster = cluster
class ISVM:
    """
    Linear SVM that is refitted 'class data vs. random data' every time new
    samples arrive; predictions below the uncertainty threshold are marked
    as uncertain (0).
    """
    __verbose = False

    clf = None
    uncertainty_thresh = 0.7
    random_data = None
    data_cluster = None

    # results of the last predict() call
    prediction = None
    probability = None

    def __init__(self, random_data, cluster=None):
        """
        :param random_data: samples used as the 'unknown' class (label -1)
        :param cluster: optional ClusterBase instance used as data model;
            a new MeanShiftCluster is created when omitted
        :raises TypeError: if cluster is given but is not a ClusterBase
            instance
        """
        # load random data
        self.random_data = random_data
        self.clf = SVC(kernel='linear', probability=True, C=1)
        if cluster is None:
            self.data_cluster = MeanShiftCluster()
        else:
            # 'cluster' is an object, not a class: issubclass() raised a
            # TypeError for every instance passed in, so use isinstance().
            if not isinstance(cluster, ClusterBase):
                raise TypeError(
                    "cluster must be a ClusterBase instance, got '{}'".format(
                        type(cluster).__name__))
            self.data_cluster = cluster

    def decision_function(self, samples):
        """Not implemented; returns None."""
        pass

    def get_proba(self):
        """
        Probability that the last predicted batch belongs to the class.

        Uncertain samples (prediction 0) are not counted. The result is NaN
        when every sample of the last batch was uncertain.

        :return: mean confidence over all certain predictions
        """
        class_proba = self.probability[:, 1]
        accepted = np.sum(class_proba[self.prediction == 1])
        rejected = np.sum(1 - class_proba[self.prediction == -1])
        return (accepted + rejected) / len(class_proba[self.prediction != 0])

    def mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's mean_dist and return its result."""
        return self.data_cluster.mean_dist(samples, metric)

    def class_mean_dist(self, samples, metric='cosine'):
        """Delegate to the cluster's class_mean_dist and return its result."""
        return self.data_cluster.class_mean_dist(samples, metric)

    def predict(self, samples):
        """
        Predict a label per sample and remember labels and probabilities.

        :param samples: samples to classify
        :return: array with 1 (class), -1 (not class) or 0 (uncertain: both
            probabilities are below uncertainty_thresh)
        """
        proba = self.clf.predict_proba(samples)
        self.probability = proba

        uncertain = np.sum(proba < self.uncertainty_thresh, axis=1) == 2
        # first probability column above 0.5 -> -1, everything else -> 1
        pred = np.where(proba[:, 0] > 0.5, -1, 1)
        pred[uncertain] = 0

        self.prediction = pred
        return pred

    def __fit_vs_random(self, class_data):
        """Fit the SVM with class_data as label 1 and random_data as -1."""
        label_class = np.repeat(1, np.shape(class_data)[0])
        label_unknown = np.repeat(-1, np.shape(self.random_data)[0])
        training_embeddings = np.concatenate((class_data, self.random_data))
        training_labels = np.concatenate((label_class, label_unknown))
        self.clf.fit(training_embeddings, training_labels)

    def partial_fit(self, samples):
        """Update the cluster with the samples and refit the SVM on its data."""
        self.data_cluster.update(samples)
        reduced_data = self.data_cluster.get_data()
        # refit SVM one vs random
        self.__fit_vs_random(reduced_data)