Beispiel #1
0
def get_avgcodisp(instances):
    """Return the forest-averaged CoDisp of every shingle of ``instances``.

    Reads the module-level settings ``num_trees``, ``shingle_size`` and
    ``tree_size``. Each shingle is inserted into every tree under its
    position in the stream; once a tree holds more than ``tree_size``
    leaves, the point inserted ``tree_size`` positions earlier is removed
    before the new one goes in.

    Returns:
        dict mapping shingle position -> CoDisp averaged over all trees.
    """
    trees = [rrcf.RCTree() for _ in range(num_trees)]
    windows = rrcf.shingle(instances, size=shingle_size)
    scores = {}

    for position, window in enumerate(windows):
        for rct in trees:
            # Sliding-window eviction before the insert
            if len(rct.leaves) > tree_size:
                rct.forget_point(position - tree_size)
            rct.insert_point(window, index=position)
            # Each tree contributes 1/num_trees of its CoDisp to the mean
            share = rct.codisp(position) / num_trees
            scores[position] = scores.get(position, 0) + share
    return scores
    def robust_random_cut(self, sketch_vector):
        """Score graph sketches with a streaming robust random cut forest.

        Rows of ``sketch_vector`` are ordered by descending ``graphid``; their
        ``sketch`` values are passed through ``preprocessing.scale`` and fed
        one at a time into 50 trees. Once a tree holds more than 32 leaves,
        the point inserted 32 steps earlier is removed before the next
        insert. A classification report comparing the predictions with the
        ``anomaly`` column is printed.

        Returns:
            tuple: ``(pred_rrcf, disp)`` -- a boolean Series that is True
            where the averaged CoDisp exceeds its 0.95 quantile, and the
            Series of averaged CoDisp values.
        """
        ordered = sketch_vector.sort_values(by='graphid', ascending=False)
        scaled = preprocessing.scale(ordered['sketch'].tolist())

        # Forest settings
        n_trees = 50
        window = 1  # args.win_size
        capacity = 32

        trees = [rrcf.RCTree() for _ in range(n_trees)]
        shingles = rrcf.shingle(scaled, size=window)

        scores = {}
        for pos, shingle in enumerate(shingles):
            # Progress output every 50 points
            if pos % 50 == 0:
                print("Index: ", pos)
            for rct in trees:
                # Sliding-window eviction before the insert
                if len(rct.leaves) > capacity:
                    rct.forget_point(pos - capacity)
                rct.insert_point(shingle, index=pos)
                contribution = rct.codisp(pos) / n_trees
                scores[pos] = scores.get(pos, 0) + contribution

        disp = pd.Series(list(scores.values()))
        pred_rrcf = disp > disp.quantile(0.95)
        truth = np.array(ordered['anomaly'])
        print(metrics.classification_report(truth, pred_rrcf))
        return pred_rrcf, disp
Beispiel #3
0
def find_anomalies(input, num_trees=40, tree_size=256, threshold=40):
    """Score a sequence of timestamped values with a streaming random cut forest.

    Each record's ``'value'`` is inserted into every tree under its position
    in ``input``. Once a tree holds more than ``tree_size`` leaves, the point
    inserted ``tree_size`` positions earlier is removed first. A record's
    score is its CoDisp averaged over all trees.

    Parameters:
        input:      sequence of dicts with ``'value'`` and ``'timestamp'``
                    keys (the name shadows the builtin but is kept for
                    backward compatibility with keyword callers)
        num_trees:  number of trees in the forest (must be >= 1)
        tree_size:  leaf count above which the oldest point is evicted
        threshold:  averaged CoDisp above which a record is flagged

    Returns:
        list of dicts, one per record and in input order, with keys
        ``'value'``, ``'timestamp'``, ``'isAnomaly'`` and ``'codisp'``.

    Raises:
        ValueError: if ``num_trees`` is smaller than 1.
    """
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1")

    forest = [rrcf.RCTree() for _ in range(num_trees)]

    output = []
    for index, record in enumerate(input):
        value = record['value']
        codisp = 0
        for tree in forest:
            # If tree is above permitted size, drop the oldest point (FIFO)
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)
            tree.insert_point(value, index=index)
            # Accumulate this tree's share of the forest average
            codisp += tree.codisp(index) / num_trees

        output.append({
            'value': value,
            'timestamp': record['timestamp'],
            'isAnomaly': codisp > threshold,
            'codisp': codisp,
        })

    return output
    def anomaly_score(self, points):
        '''
        Computes an anomaly score for each point by temporarily inserting it into
        every tree of the fitted forest and averaging the collusive displacement.
        Higher scores indicate a higher displacement and thus a higher likelihood
        of anomaly. The point is removed from each tree again right after scoring,
        so the forest is left as it was. (If you want new points to be added to
        trees, you can use the function stream_anomaly_scores with
        `new_forest` == False)

        Parameters:
            points:   the points on which to calculate anomaly scores. np.ndarray of size (n x d)

        Returns:
            anomaly_score: pandas Series indexed 0..n-1 with the average collusive
                           displacement (anomaly score) for each point

        Raises:
            ValueError: if no forest has been fit, if `points` is not
                        two-dimensional, or if its second dimension differs
                        from `self.dimension`
        '''
        # Explicit checks rather than `assert`: assertions are stripped under
        # `python -O`, which would silently disable the validation.
        if getattr(self, 'forest', None) is None:
            raise ValueError("Cannot compute anomaly scores on forest that has not been fit")

        shape = getattr(points, 'shape', None)
        if shape is None or len(shape) != 2:
            raise ValueError("Points must have shape (n x d)")

        if shape[1] != self.dimension:
            raise ValueError("Forest and points must have the same dimension. Points dim: {} vs. Forest dim: {}".format(shape[1], self.dimension))

        # scale mean and variance of points
        scaled_points = preprocessing.scale(points)

        avg_codisp = pd.Series(0.0, index=np.arange(shape[0]))
        for index, point in enumerate(scaled_points):
            for tree in self.forest:
                # Insert under a fixed temporary key, score it, then remove it.
                tree.insert_point(point, index='point')
                try:
                    avg_codisp[index] += tree.codisp('point')
                finally:
                    # Always remove the temporary point, even if scoring fails,
                    # so the next insert under the same key does not collide.
                    tree.forget_point('point')
        return avg_codisp / self.num_trees
    def stream_anomaly_scores(self, points, window_size, new_forest = False):
        '''
        Computes anomaly scores for all points in a stream by computing the average
        collusive displacement. The assumption is that each point in the stream is only observed
        sequentially. Higher scores indicate a higher displacement and thus a
        higher likelihood of anomaly. If no forest exists yet, or `new_forest` is True,
        a new empty forest is created and the point counter restarts at 0; otherwise
        the existing forest keeps growing and indices continue from `self.num_points`.

        Parameters:
            points:         the stream of point on which to calculate anomaly scores
            window_size:    the window size in which to ingest points. points are mapped as a
                            n-dimensional window, where n = window_size
            new_forest:     boolean that identifies whether to create a new forest or not

        Returns:
            anomaly_scores: pandas Series with index of points and average collusive
                            displacement (anomaly score) for each point. The Series has
                            one entry per row of `points`; entries for which the shingle
                            generator yields no window (presumably the trailing
                            window_size - 1 rows -- TODO confirm) remain 0.0.

        Raises:
            ValueError: if a point cannot be inserted into a tree
        '''

        # create a new empty forest if none exists or a fresh one was requested
        if self.forest is None or new_forest:
            self.num_points = 0
            forest = []
            for i in range(self.num_trees):
                forest.append(rrcf_base.RCTree())
                # per-tree record of the indices currently stored in that tree
                self.ixs[i] = []
            self.forest = forest

        # NOTE: points are ingested unscaled here (no preprocessing.scale call).

        # create rolling window of size window_size
        points_gen = rrcf_base.shingle(points, size=window_size)

        # calculate streaming anomaly scores
        avg_codisp = pd.Series(0.0, index=np.arange(self.num_points, self.num_points + points.shape[0]))
        initial_index = self.num_points
        for offset, point in enumerate(points_gen):
            # continue numbering from the points already seen by the forest
            index = offset + initial_index
            for tree_idx, tree in enumerate(self.forest):
                # If tree is at permitted size, drop the oldest point (FIFO)
                # TODO: forget oldest point or another random point with prob
                if len(tree.leaves) >= self.tree_size:
                    forget_index = min(self.ixs[tree_idx])
                    tree.forget_point(forget_index)
                    self.ixs[tree_idx] = np.delete(self.ixs[tree_idx], np.argwhere(self.ixs[tree_idx] == forget_index))
                # Insert the new point into the tree. A failure is raised with
                # context instead of being swallowed: the codisp lookup below
                # cannot succeed for an index that was never inserted.
                try:
                    tree.insert_point(point, index=index)
                except Exception as err:
                    raise ValueError('failure for point {} at index {}'.format(point, index)) from err
                self.ixs[tree_idx] = np.append(self.ixs[tree_idx], index)
                # Compute codisp on the new point and take the average among all trees
                avg_codisp[index] += tree.codisp(index)

            self.num_points += 1
        return avg_codisp / self.num_trees
Beispiel #6
0
# Build the test signal: a sine wave with two index ranges overwritten by the
# constant 80. A, T, t, phi and center are not defined in this fragment.
sin_test = A * np.sin(T * t - phi * T) + center
sin_test[235:255] = 80
sin_test[500:750] = 80

# Set tree parameters
num_trees = 40
shingle_size = 6
tree_size = 256

# Create a forest of empty trees
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()
    forest.append(tree)

# Shingle generators over the reference series `sin` (defined outside this
# fragment) and over the test series built above.
points = rrcf.shingle(sin, size=shingle_size)
points_test = rrcf.shingle(sin_test, size=shingle_size)

# Create a dict to store anomaly score of each point
avg_codisp = {}

# For each shingle...
for index, point in enumerate(points):
    # For each tree in the forest...
    for tree in forest:
        # If tree is above permitted size...
        if len(tree.leaves) > tree_size:
            # Drop the oldest point (FIFO)
            tree.forget_point(index - tree_size)

            # Drop the newest point (LIFO)
            # NOTE(review): the code for this step is cut off in this fragment.
Beispiel #7
0
def test_shingle():
    """Check that consecutive shingles of ``X`` overlap.

    With a window of 3, the second element of the first window must equal
    the first element of the second window.
    """
    windows = rrcf.shingle(X, 3)
    first = next(windows)
    second = next(windows)
    overlap_matches = first[1] == second[0]
    assert overlap_matches.all()
Beispiel #8
0
# Load one light-curve record (entry 101 of listFile) through the project
# helper; height and duration are defined outside this fragment.
lc_data = ut_lc.getDataFromFile(fileName=listFile[101],
                                height=height,
                                duration=duration)

# sketchInstances = ut_gen.genFixBin(binSize=binSize,instances=lc_data['instances'])
# notEx = ut_gen.genFixBin(binSize=binSize,instances=lc_data['instances'],isExtract=False)
#

# Create a forest of empty trees
# (num_trees, shingle_size and tree_size are defined outside this fragment)
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()
    forest.append(tree)

# Use the "shingle" generator to create rolling window
points = rrcf.shingle(lc_data['instances'], size=shingle_size)

# Create a dict to store anomaly score of each point
avg_codisp = {}

# For each shingle...
for index, point in enumerate(points):
    # For each tree in the forest...
    for tree in forest:
        # If tree is above permitted size...
        if len(tree.leaves) > tree_size:
            # Drop the oldest point (FIFO)
            tree.forget_point(index - tree_size)
        # Insert the new point into the tree
        tree.insert_point(point, index=index)
        # Compute codisp on the new point...
        # NOTE(review): the fragment ends here; the codisp step is not visible.
Beispiel #9
0
    # Body of a loop over column positions `i` (the loop header is outside this
    # fragment). Column i of df, linearly interpolated, as a NumPy array.
    xseries = df.iloc[:, i].interpolate(method='linear').to_numpy()

    # Skip columns that contain only NaN.
    if all(np.isnan(xseries)):
        continue
    # If the first value is still NaN after interpolation, copy the second
    # value into it.
    if np.isnan(xseries[0]):
        xseries[0] = xseries[1]

    # Create a forest of empty trees
    forest = []
    for _ in range(num_trees):
        tree = rrcf.RCTree()
        forest.append(tree)

    # Create rolling window
    points = rrcf.shingle(xseries, size=shingle_size)

    avg_codisp = {}
    for index, point in enumerate(points):
        # Progress output every 2000 points
        if index % 2000 == 0:
            # if index > 16000:
            print('point' + str(index))
        # if index == 17920:
        #     raise ValueError('a')
        for tree in forest:
            # Drop the oldest point (FIFO) if tree is too big
            if len(tree.leaves) > tree_size:
                tree.forget_point(index - tree_size)

            # Insert the new point under its stream position
            tree.insert_point(point, index=index)
# Title and display the current figure (the plotting calls that precede this
# are outside this fragment).
plt.title("Luminol on Feature 1")
plt.show()

#%% Robust Random Cut Forest Algorithm

import rrcf
# Values of the "Feature1" column as a plain Python list
X = df["Feature1"].tolist()

# Forest settings: number of trees, shingle (window) length, and the leaf
# count above which the oldest point is evicted.
TREE_COUNT = 100
SHINGLE_COUNT = 8
TREE_SIZE = 200
# Create a forest of empty trees
forest = []
for i in range(TREE_COUNT):
    tree = rrcf.RCTree()
    forest.append(tree)
# Shingle generator over X with window length SHINGLE_COUNT
points = rrcf.shingle(X, size=SHINGLE_COUNT)

# Maps shingle position -> CoDisp averaged over all trees
avg_codisp = {}

for index, point in enumerate(points):
    for tree in forest:
        # Drop the oldest point (FIFO) once the tree is above TREE_SIZE leaves
        if len(tree.leaves) > TREE_SIZE:
            tree.forget_point(index - TREE_SIZE)
        tree.insert_point(point, index=index)
        # Each tree adds 1/TREE_COUNT of its CoDisp to the running average
        if not index in avg_codisp:
            avg_codisp[index] = 0
        avg_codisp[index] += tree.codisp(index) / TREE_COUNT

time_periods = []
values = []
# NOTE(review): the body of this loop is cut off in this fragment.
for key in avg_codisp:
Beispiel #11
0
# ax  = plt.subplot(111)
# plt.plot(stream_data)

# Set tree parameters
num_trees = 40
shingle_size = 4
tree_size = 256

# Create a forest of empty trees
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()
    forest.append(tree)

# Use the "shingle" generator to create rolling window
# (stream_data is defined outside this fragment)
points = rrcf.shingle(stream_data, size=shingle_size)

# Create a dict to store anomaly score of each point
avg_codisp = {}

# For each shingle...
for index, point in enumerate(points):
    # For each tree in the forest...
    for tree in forest:
        # If tree is above permitted size, drop the oldest point (FIFO)
        if len(tree.leaves) > tree_size:
            tree.forget_point(index - tree_size)
        # Insert the new point into the tree
        tree.insert_point(point, index=index)
        # Compute codisp on the new point and take the average among all trees
        # NOTE(review): the body of this `if` is cut off in this fragment.
        if not index in avg_codisp:
# Build a sine wave and overwrite indices 235..254 with the constant 80.
# A, T, t, phi and center are not defined in this fragment.
sin = A * np.sin(T * t - phi * T) + center
sin[235:255] = 80

# Set tree parameters
num_trees = 40
shingle_size = 4
tree_size = 256

# Create a forest of empty trees
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()
    forest.append(tree)

# Use the "shingle" generator to create rolling window
points = rrcf.shingle(sin, size=shingle_size)

# Create a dict to store anomaly score of each point
avg_codisp = {}

# For each shingle...
for index, point in enumerate(points):
    # For each tree in the forest...
    for tree in forest:
        # If tree is above permitted size...
        if len(tree.leaves) > tree_size:
            # Drop the oldest point (FIFO)
            tree.forget_point(index - tree_size)
        # Insert the new point into the tree
        tree.insert_point(point, index=index)
        # Compute codisp on the new point...
        # NOTE(review): the fragment ends here; the codisp step is not visible.
def anomaly_detect(df, flagdf, num_trees, shingle_size, tree_size):
    """Flag and blank out the highest-scoring points of every column of ``df``.

    Each column is linearly interpolated, streamed through its own robust
    random cut forest in shingles of ``shingle_size`` values, and scored by
    the CoDisp averaged over all trees. For every scored column, positions
    whose score exceeds that column's 0.98 quantile get ``+2`` added in
    ``flagdf`` and are set to NaN in ``df``. A score's shingle position is
    used directly as the row position in both frames.

    Both frames are modified in place and also returned.

    Parameters:
        df:            DataFrame of numeric series, one per column
        flagdf:        DataFrame of numeric flags with the same columns as
                       ``df`` (assumed to be row-aligned with it -- confirm
                       against callers)
        num_trees:     number of trees per forest
        shingle_size:  window length passed to ``rrcf.shingle``
        tree_size:     leaf count above which the oldest point is evicted

    Returns:
        tuple ``(df, flagdf)``.
    """
    codisps = {}

    for i in range(df.shape[1]):
        print('var' + str(i))
        varname = df.columns[i]

        # Request a copy so the array is writable for the NaN patch below.
        xseries = df.iloc[:, i].interpolate(method='linear').to_numpy(copy=True)

        nan_mask = np.isnan(xseries)
        # Nothing to score in a column without any data.
        if nan_mask.all():
            continue
        # Fill every NaN that is still at the start of the series with the
        # first valid value (a no-op when the series starts with data).
        first_valid = int(np.argmax(~nan_mask))
        xseries[:first_valid] = xseries[first_valid]

        # Create a forest of empty trees
        forest = [rrcf.RCTree() for _ in range(num_trees)]

        # Create rolling window
        points = rrcf.shingle(xseries, size=shingle_size)

        avg_codisp = {}
        for index, point in enumerate(points):
            # Progress output every 2000 points
            if index % 2000 == 0:
                print('point' + str(index))
            score = 0
            for tree in forest:
                # Drop the oldest point (FIFO) if tree is too big
                if len(tree.leaves) > tree_size:
                    tree.forget_point(index - tree_size)

                tree.insert_point(point, index=index)

                # Average codisp across all trees; that's the anomaly score
                score += tree.codisp(index) / num_trees
            # Record a score only when at least one tree contributed
            if forest:
                avg_codisp[index] = score

        codisps[varname] = avg_codisp

    for c, avg_codisp in codisps.items():
        # Get top 2% of anomaly scores; flag those points with +2
        avg_codisp_df = pd.DataFrame.from_dict(avg_codisp, orient='index',
            columns=['score'])
        scores = avg_codisp_df['score']
        thresh = scores.quantile(0.98)
        outl_inds_bool = scores > thresh
        outl_inds_int = outl_inds_bool[outl_inds_bool].index

        flag_rows = flagdf.index[outl_inds_int]
        flagdf.loc[flag_rows, c] = flagdf.loc[flag_rows, c] + 2

        # Blank the outliers in the column being processed (`c`), not in
        # whichever column the scoring loop happened to visit last.
        df.loc[df.index[outl_inds_int], c] = np.nan

    return (df, flagdf)
Beispiel #14
0
# Show the first rows of the data frame
print(df.head())

# Set forest parameters
num_trees = 100
tree_size = 256
shingle_size = 6
n = df.shape[0]
# NOTE(review): sample_size_range is not used in the visible part of this script.
sample_size_range = (n // tree_size, tree_size)

# Create a forest of empty trees
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()
    forest.append(tree)

# Shingle generator over `ndf` (defined outside this fragment; note that `n`
# above is taken from `df`, not `ndf`).
points = rrcf.shingle(ndf, size=shingle_size)

# Create a dict to store anomaly score of each point
avg_codisp = {}

# For each shingle...
for index, point in enumerate(points):
    # For each tree in the forest...
    for tree in forest:
        # If tree is above permitted size...
        if len(tree.leaves) > tree_size:
            # Drop the oldest point (FIFO)
            tree.forget_point(index - tree_size)
        # Insert the new point into the tree
        tree.insert_point(point, index=index)
        # Compute codisp on the new point...
        # NOTE(review): the fragment ends here; the codisp step is not visible.
Beispiel #15
0
# Print a blank line, then the total number of anomalous points
# (totalPtosAnomalia is computed outside this fragment).
print()
print("Total puntos con anomalias:", totalPtosAnomalia )

# Set tree parameters
num_trees = 20
shingle_size = 2
tree_size = 15

# Create a forest of empty trees
forest = []
for _ in range(num_trees):
    tree = rrcf.RCTree()
    forest.append(tree)

# Use the "shingle" generator to create a rolling window
pointsTrain = rrcf.shingle(DataTrainWiTri, size=shingle_size)
pointsTest = rrcf.shingle(DataTestWiTri, size=shingle_size)

# Create dicts to store the anomaly score of each point
avg_codisp = {}
avg_codispTest = {}

# Tree for the training data
# NOTE(review): the rest of this function is cut off in this fragment.
def train (forest):
    print("TRAIN")
    # For each shingle...
    for index, point in enumerate(pointsTrain):
        # For each tree in the forest...
        for tree in forest:
            # Insert the new point into the tree
            tree.insert_point(point, index=index)
Beispiel #16
0
                      '2015-01-02 00:00:00'),
'blizzard'         : ('2015-01-26 00:00:00',
                      '2015-01-28 00:00:00')
}
# Add an 'event' column initialised to 0, then set it to 1 for every row whose
# label falls inside an event's (start, end) range.
taxi['event'] = np.zeros(len(taxi))
for event, duration in events.items():
    start, end = duration
    taxi.loc[start:end, 'event'] = 1

# Set tree parameters
num_trees = 200
shingle_size = 48
tree_size = 1000

# Use the "shingle" generator to create rolling window, then stack all
# windows into one 2-D array (one row per shingle).
points = rrcf.shingle(data, size=shingle_size)
points = np.vstack([point for point in points])
n = points.shape[0]
# Shape of one sampling batch: n // tree_size index sets of tree_size rows each
sample_size_range = (n // tree_size, tree_size)

# Build the forest in batches until it holds at least num_trees trees. Each
# batch samples row indices without replacement and builds one tree per index
# set, labelling the leaves with the sampled row indices.
forest = []
while len(forest) < num_trees:
    ixs = np.random.choice(n, size=sample_size_range,
                           replace=False)
    trees = [rrcf.RCTree(points[ix], index_labels=ix)
             for ix in ixs]
    forest.extend(trees)

# Per-row accumulators of length n, both starting at zero
avg_codisp = pd.Series([0.0]*n, index=np.arange(n))
index = np.zeros(n)