def get_avgcodisp(instances):
    """Score every shingle of *instances* with a streaming random cut forest.

    Relies on the module-level settings ``num_trees``, ``shingle_size`` and
    ``tree_size``.

    Args:
        instances: the sequence handed to ``rrcf.shingle`` to build the
            rolling windows.

    Returns:
        dict mapping each shingle index to its collusive displacement
        averaged over all trees in the forest.
    """
    forest = [rrcf.RCTree() for _ in range(num_trees)]
    windows = rrcf.shingle(instances, size=shingle_size)

    avg_codisp = {}
    for index, point in enumerate(windows):
        for tree in forest:
            # Keep each tree bounded: evict the oldest point (FIFO) once
            # the tree has grown past the permitted size.
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Accumulate this tree's share of the forest-wide average.
            contribution = tree.codisp(index) / num_trees
            avg_codisp[index] = avg_codisp.get(index, 0) + contribution
    return avg_codisp
def robust_random_cut(self, sketch_vector):
    """Flag the top 5% of sketches by random-cut-forest anomaly score.

    The rows are sorted by ``graphid`` (descending), the ``sketch`` column is
    standardised with ``preprocessing.scale`` and streamed through a forest
    of 50 trees holding roughly 32 points each. A classification report of
    the predictions against the ``anomaly`` column is printed.

    Args:
        sketch_vector: table exposing ``graphid``, ``sketch`` and ``anomaly``
            columns (accessed through ``sort_values`` and column indexing).

    Returns:
        tuple ``(pred_rrcf, disp)``: a boolean Series that is True where the
        score exceeds the 0.95 quantile, and the Series of averaged scores.
    """
    # Forest parameters
    num_trees = 50
    shingle_size = 1  # args.win_size
    tree_size = 32

    sketch_vector = sketch_vector.sort_values(by='graphid', ascending=False)
    sketch = preprocessing.scale(sketch_vector['sketch'].tolist())

    forest = [rrcf.RCTree() for _ in range(num_trees)]
    windows = rrcf.shingle(sketch, size=shingle_size)

    avg_codisp = {}
    for index, point in enumerate(windows):
        # Progress indicator
        if index % 50 == 0:
            print("Index: ", index)
        for tree in forest:
            # Evict the oldest point (FIFO) once the tree is over capacity.
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            # Add this tree's share of the forest-wide average.
            share = tree.codisp(index) / num_trees
            avg_codisp[index] = avg_codisp.get(index, 0) + share

    disp = pd.Series(list(avg_codisp.values()))
    pred_rrcf = disp > disp.quantile(0.95)
    print(metrics.classification_report(np.array(sketch_vector['anomaly']),
                                        pred_rrcf))
    return pred_rrcf, disp
def find_anomalies(input):
    """Score a series of records with a streaming random cut forest.

    Each record's ``value`` is inserted, in order, into every tree of a
    40-tree forest; a tree that has grown past 256 leaves first drops its
    oldest point (FIFO). The anomaly score of a record is the collusive
    displacement of its point averaged over all trees.

    Args:
        input: sequence of mappings, each providing ``'value'`` and
            ``'timestamp'`` keys.

    Returns:
        list of dicts, one per input record and in the same order, with keys
        ``'value'``, ``'timestamp'``, ``'isAnomaly'`` (score above 40) and
        ``'codisp'`` (the averaged score).
    """
    # Forest parameters
    num_trees = 40
    tree_size = 256
    codisp_threshold = 40

    forest = [rrcf.RCTree() for _ in range(num_trees)]

    # Raw values are inserted one at a time; no shingling is applied.
    values = [record['value'] for record in input]

    scores = []
    for index, point in enumerate(values):
        score = 0
        for tree in forest:
            # Keep the tree bounded: drop the oldest point (FIFO).
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(point, index=index)
            score += tree.codisp(index) / num_trees
        scores.append(score)

    return [
        {
            'value': record['value'],
            'timestamp': record['timestamp'],
            'isAnomaly': codisp > codisp_threshold,
            'codisp': codisp,
        }
        for record, codisp in zip(input, scores)
    ]
def anomaly_score(self, points):
    '''
    Computes anomaly score for point by inserting point in each tree in the rcrf
    and computing the average collusive displacement. Higher scores indicate a
    higher displacement and thus a higher likelihood of anomaly.

    Each point is removed from every tree again right after it has been scored,
    so the state of the forest is maintained. (If you want new points to be added
    to trees, you can use the function stream_anomaly_scores with
    `new_forest` == False)

    Parameters:
        points: the points on which to calculate anomaly scores.
                np.ndarray of size (n x d)

    Returns:
        anomaly_score: pandas Series indexed 0..n-1 with the average collusive
                       displacement (anomaly score) for each point

    Raises:
        ValueError: if no forest has been fit, if `points` is not
                    two-dimensional, or if its second dimension differs from
                    `self.dimension`
    '''
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if getattr(self, 'forest', None) is None:
        raise ValueError("Cannot compute anomaly scores on forest that has not been fit")

    try:
        n_dims = len(points.shape)
    except (AttributeError, TypeError):
        # Inputs without a usable `.shape` are reported the same way.
        n_dims = None
    if n_dims != 2:
        raise ValueError("Points must have shape (n x d)")

    if points.shape[1] != self.dimension:
        raise ValueError("Forest and points must have the same dimension. Points dim: {} vs. Forest dim: {}".format(points.shape[1], self.dimension))

    # scale mean and variance of points
    scaled_points = preprocessing.scale(points)

    avg_codisp = pd.Series(0.0, index=np.arange(points.shape[0]))
    for index, point in enumerate(scaled_points):
        for tree in self.forest:
            # Score the point under a temporary label ...
            tree.insert_point(point, index='point')
            try:
                avg_codisp[index] += tree.codisp('point')
            finally:
                # ... and always remove it so the tree is left unchanged.
                tree.forget_point('point')
    return avg_codisp / self.num_trees
def stream_anomaly_scores(self, points, window_size, new_forest = False):
    '''
    Computes anomaly scores for all points in a stream by computing the average
    collusive displacement. The assumption is that each point in the stream is
    only observed sequentially. Higher scores indicate a higher displacement and
    thus a higher likelihood of anomaly.

    A new forest is created, starting with the first point in the stream, when
    no forest exists yet or when `new_forest` is True. Otherwise the existing
    forest is extended and indexing continues from `self.num_points`.

    Parameters:
        points: the stream of points on which to calculate anomaly scores;
                must expose `.shape[0]`. Points are used as given (no scaling).
        window_size: the window size in which to ingest points. points are
                     mapped as a n-dimensional window, where n = window_size
        new_forest: boolean that identifies whether to create a new forest or not

    Returns:
        anomaly_scores: pandas Series with index of points and average collusive
                        displacement (anomaly score) for each point. The Series
                        has one entry per input row; entries for which no
                        window was produced keep their initial value 0.

    Raises:
        ValueError: if a point cannot be inserted into a tree (the underlying
                    error is chained as the cause)
    '''
    # create a new empty forest if none exists or a fresh one was requested
    if self.forest is None or new_forest:
        self.num_points = 0
        forest = []
        for i in range(self.num_trees):
            forest.append(rrcf_base.RCTree())
            # indices currently held by tree i, used for FIFO eviction
            self.ixs[i] = []
        self.forest = forest

    # create rolling window of size window_size
    points_gen = rrcf_base.shingle(points, size=window_size)

    # calculate streaming anomaly scores
    initial_index = self.num_points
    avg_codisp = pd.Series(
        0.0, index=np.arange(initial_index, initial_index + points.shape[0]))

    for index, point in enumerate(points_gen, start=initial_index):
        for tree_idx, tree in enumerate(self.forest):
            # If tree is at permitted size, drop the oldest point (FIFO)
            # TODO: forget oldest point or another random point with prob
            if len(tree.leaves) >= self.tree_size:
                forget_index = min(self.ixs[tree_idx])
                tree.forget_point(forget_index)
                self.ixs[tree_idx] = np.delete(
                    self.ixs[tree_idx],
                    np.argwhere(self.ixs[tree_idx] == forget_index))

            # Insert the new point into the tree. A failure is reported
            # immediately; otherwise the codisp lookup below would fail
            # with an unrelated error for the missing leaf.
            try:
                tree.insert_point(point, index=index)
            except Exception as err:
                raise ValueError(
                    'failure for point {} at index {}'.format(point, index)
                ) from err
            self.ixs[tree_idx] = np.append(self.ixs[tree_idx], index)

            # Compute codisp on the new point; averaged over trees on return
            avg_codisp[index] += tree.codisp(index)

        # one more point of the stream has been consumed
        self.num_points += 1

    return avg_codisp / self.num_trees
sin_test = A * np.sin(T * t - phi * T) + center sin_test[235:255] = 80 sin_test[500:750] = 80 # Set tree parameters num_trees = 40 shingle_size = 6 tree_size = 256 # Create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) points = rrcf.shingle(sin, size=shingle_size) points_test = rrcf.shingle(sin_test, size=shingle_size) # Create a dict to store anomaly score of each point avg_codisp = {} # For each shingle... for index, point in enumerate(points): # For each tree in the forest... for tree in forest: # If tree is above permitted size... if len(tree.leaves) > tree_size: # Drop the oldest point (FIFO) tree.forget_point(index - tree_size) # Drop the newest point (LIFO)
def test_shingle():
    """Consecutive size-3 shingles of ``X`` overlap by one step.

    The second element of the first window must equal the first element of
    the following window.
    """
    windows = rrcf.shingle(X, 3)
    first_window = next(windows)
    second_window = next(windows)
    overlap_matches = first_window[1] == second_window[0]
    assert overlap_matches.all()
lc_data = ut_lc.getDataFromFile(fileName=listFile[101], height=height, duration=duration) # sketchInstances = ut_gen.genFixBin(binSize=binSize,instances=lc_data['instances']) # notEx = ut_gen.genFixBin(binSize=binSize,instances=lc_data['instances'],isExtract=False) # # Create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) # Use the "shingle" generator to create rolling window points = rrcf.shingle(lc_data['instances'], size=shingle_size) # Create a dict to store anomaly score of each point avg_codisp = {} # For each shingle... for index, point in enumerate(points): # For each tree in the forest... for tree in forest: # If tree is above permitted size... if len(tree.leaves) > tree_size: # Drop the oldest point (FIFO) tree.forget_point(index - tree_size) # Insert the new point into the tree tree.insert_point(point, index=index) # Compute codisp on the new point...
xseries = df.iloc[:, i].interpolate(method='linear').to_numpy() if all(np.isnan(xseries)): continue if np.isnan(xseries[0]): xseries[0] = xseries[1] #create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) #create rolling window points = rrcf.shingle(xseries, size=shingle_size) avg_codisp = {} for index, point in enumerate(points): if index % 2000 == 0: # if index > 16000: print('point' + str(index)) # if index == 17920: # raise ValueError('a') for tree in forest: #drop the oldest point (FIFO) if tree is too big if len(tree.leaves) > tree_size: tree.forget_point(index - tree_size) tree.insert_point(point, index=index)
plt.title("Luminol on Feature 1") plt.show() #%% Robust Random Cut Forest Algorithm import rrcf X = df["Feature1"].tolist() TREE_COUNT = 100 SHINGLE_COUNT = 8 TREE_SIZE = 200 forest = [] for i in range(TREE_COUNT): tree = rrcf.RCTree() forest.append(tree) points = rrcf.shingle(X, size=SHINGLE_COUNT) avg_codisp = {} for index, point in enumerate(points): for tree in forest: if len(tree.leaves) > TREE_SIZE: tree.forget_point(index - TREE_SIZE) tree.insert_point(point, index=index) if not index in avg_codisp: avg_codisp[index] = 0 avg_codisp[index] += tree.codisp(index) / TREE_COUNT time_periods = [] values = [] for key in avg_codisp:
# ax = plt.subplot(111) # plt.plot(stream_data) # Set tree parameters num_trees = 40 shingle_size = 4 tree_size = 256 # Create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) # Use the "shingle" generator to create rolling window points = rrcf.shingle(stream_data, size=shingle_size) # Create a dict to store anomaly score of each point avg_codisp = {} # For each shingle... for index, point in enumerate(points): # For each tree in the forest... for tree in forest: # If tree is above permitted size, drop the oldest point (FIFO) if len(tree.leaves) > tree_size: tree.forget_point(index - tree_size) # Insert the new point into the tree tree.insert_point(point, index=index) # Compute codisp on the new point and take the average among all trees if not index in avg_codisp:
sin = A * np.sin(T * t - phi * T) + center sin[235:255] = 80 # Set tree parameters num_trees = 40 shingle_size = 4 tree_size = 256 # Create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) # Use the "shingle" generator to create rolling window points = rrcf.shingle(sin, size=shingle_size) # Create a dict to store anomaly score of each point avg_codisp = {} # For each shingle... for index, point in enumerate(points): # For each tree in the forest... for tree in forest: # If tree is above permitted size... if len(tree.leaves) > tree_size: # Drop the oldest point (FIFO) tree.forget_point(index - tree_size) # Insert the new point into the tree tree.insert_point(point, index=index) # Compute codisp on the new point...
def anomaly_detect(df, flagdf, num_trees, shingle_size, tree_size):
    """Flag and blank out the most anomalous points of every column of *df*.

    Each column is linearly interpolated, shingled and streamed through its
    own random cut forest. The anomaly score of a shingle is its collusive
    displacement averaged over the trees. For every column, shingles scoring
    above that column's 0.98 quantile are treated as outliers: at the
    outlier positions ``flagdf`` is incremented by 2 and ``df`` is set to
    NaN, both in that same column.

    Columns that are entirely NaN, or too short to yield a single shingle,
    are skipped. Both frames are modified in place.

    Args:
        df: DataFrame of numeric series, one variable per column.
        flagdf: DataFrame of flags; must contain the columns of ``df`` and
            have at least as many rows as there are shingles.
        num_trees: number of trees per forest.
        shingle_size: rolling-window length passed to ``rrcf.shingle``.
        tree_size: number of leaves a tree may exceed before its oldest
            point is dropped (FIFO).

    Returns:
        tuple ``(df, flagdf)`` - the same objects that were passed in.
    """
    codisps = {}

    for i in range(df.shape[1]):
        print('var' + str(i))
        varname = df.columns[i]

        # Work on an own, writable copy of the interpolated values.
        xseries = df.iloc[:, i].interpolate(method='linear').to_numpy(copy=True)
        nan_mask = np.isnan(xseries)
        if nan_mask.all():
            continue
        # Interpolation here leaves leading NaNs untouched; backfill all of
        # them with the first valid value so no NaN reaches the trees.
        first_valid = int(np.argmin(nan_mask))
        xseries[:first_valid] = xseries[first_valid]

        # create a forest of empty trees
        forest = [rrcf.RCTree() for _ in range(num_trees)]

        # create rolling window
        points = rrcf.shingle(xseries, size=shingle_size)

        avg_codisp = {}
        for index, point in enumerate(points):
            # progress indicator
            if index % 2000 == 0:
                print('point' + str(index))

            for tree in forest:
                # drop the oldest point (FIFO) if tree is too big
                if len(tree.leaves) > tree_size:
                    tree.forget_point(index - tree_size)

                tree.insert_point(point, index=index)

                # the average codisp across all trees is the anomaly score
                if index not in avg_codisp:
                    avg_codisp[index] = 0
                avg_codisp[index] += tree.codisp(index) / num_trees

        codisps[varname] = avg_codisp

    for c, avg_codisp in codisps.items():
        if not avg_codisp:
            # No shingle was produced for this column; nothing to flag.
            continue

        # get top 2% of anomaly scores; flag those points with +2
        scores = pd.DataFrame.from_dict(
            avg_codisp, orient='index', columns=['score'])['score']
        thresh = scores.quantile(0.98)
        outl_inds_int = scores.index[scores > thresh]

        outl_vals = flagdf.loc[flagdf.index[outl_inds_int], c]
        flagdf.loc[flagdf.index[outl_inds_int], c] = outl_vals + 2
        # Blank the outliers in the column they were detected in.
        df.loc[df.index[outl_inds_int], c] = np.nan

    return (df, flagdf)
print(df.head()) # Set forest parameters num_trees = 100 tree_size = 256 shingle_size = 6 n = df.shape[0] sample_size_range = (n // tree_size, tree_size) # Create a forest of empty trees forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) points = rrcf.shingle(ndf, size=shingle_size) # Create a dict to store anomaly score of each point avg_codisp = {} # For each shingle... for index, point in enumerate(points): # For each tree in the forest... for tree in forest: # If tree is above permitted size... if len(tree.leaves) > tree_size: # Drop the oldest point (FIFO) tree.forget_point(index - tree_size) # Insert the new point into the tree tree.insert_point(point, index=index) # Compute codisp on the new point...
print() print("Total puntos con anomalias:", totalPtosAnomalia ) #Establecer parametros de arbol num_trees = 20 shingle_size = 2 tree_size = 15 #Crear un bosque de arboles vacios forest = [] for _ in range(num_trees): tree = rrcf.RCTree() forest.append(tree) #Se usa el generador de "shingle" para crear una ventana movil pointsTrain = rrcf.shingle(DataTrainWiTri, size=shingle_size) pointsTest = rrcf.shingle(DataTestWiTri, size=shingle_size) #Crear un dict para almacenar el puntaje de anomalía de cada punto avg_codisp = {} avg_codispTest = {} #Arbol para los datos de entrenamiento def train (forest): print("TRAIN") # Por cada shingle ... for index, point in enumerate(pointsTrain): #Por cada arbol en el bosque... for tree in forest: #Inserta el nuevo punto en el árbol tree.insert_point(point, index=index)
'2015-01-02 00:00:00'), 'blizzard' : ('2015-01-26 00:00:00', '2015-01-28 00:00:00') } taxi['event'] = np.zeros(len(taxi)) for event, duration in events.items(): start, end = duration taxi.loc[start:end, 'event'] = 1 # Set tree parameters num_trees = 200 shingle_size = 48 tree_size = 1000 # Use the "shingle" generator to create rolling window points = rrcf.shingle(data, size=shingle_size) points = np.vstack([point for point in points]) n = points.shape[0] sample_size_range = (n // tree_size, tree_size) forest = [] while len(forest) < num_trees: ixs = np.random.choice(n, size=sample_size_range, replace=False) trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs] forest.extend(trees) avg_codisp = pd.Series([0.0]*n, index=np.arange(n)) index = np.zeros(n)