Code Example #1
import time

import pandas as pd

# DenStream, Sample, groundTruth, and normalize_matrix are the project's own
# helpers, imported from its local modules.

def main(configuration):
    totalExecutionTime = []

    for dataset in configuration['dataset']['availableDataset']:
        print(dataset + '\n')

        truth = groundTruth('GrounTruth/' + dataset + '.txt', fileType='csv')

        for node in configuration['nodes']:
            print('Dataset {} - Node: {} loading ...'.format(dataset, node))

            df = pd.read_csv(configuration['dataset']['path'] + node + dataset + '.csv', low_memory=False).dropna() \
                .drop('Unnamed: 0', axis=1)
            print('Done.')

            times = df['time'].astype('int')
            df = df.drop(['time'], axis=1)

            # Drop constant features (zero standard deviation), then normalize
            df = df.loc[:, df.std() != 0]
            dfNormalized = normalize_matrix(df).dropna(axis=1)

            bufferDF = dfNormalized[0: configuration['sampleSkip']]
            testDF = dfNormalized[configuration['sampleSkip']:]

            # Anomaly DenStream initialization with the parameters in the configuration file
            aden = DenStream(lamb=configuration['denstreamParameters']['lambda'],
                             epsilon=configuration['denstreamParameters']['epsilon'],
                             beta=configuration['denstreamParameters']['beta'],
                             mu=configuration['denstreamParameters']['mu'],
                             startingBuffer=bufferDF,
                             tp=configuration['denstreamParameters']['tp'])
            aden.runInitialization()

            print('Running algorithm ...')
            outputCurrentNode = []
            startingSimulation = time.time()
            for sampleNumber in range(len(testDF)):
                sample = testDF.iloc[sampleNumber]
                result = aden.runOnNewSample(Sample(sample.values, times.iloc[sampleNumber]))
                outputCurrentNode.append(result)
            endSimulation = time.time() - startingSimulation
            totalExecutionTime.append(endSimulation)
            print('Done in {}'.format(endSimulation))

            df['result'] = [False] * configuration['sampleSkip'] + outputCurrentNode

            print("Number of anomalies in " + str(node) + " is: ", outputCurrentNode.count(True),
                  len(outputCurrentNode))

            if configuration['detectionCriterion'] == 'spatialDetection':
                df['time'] = times
                df[['result', 'time']].to_csv('Data/ResultsSpatialDetection/'
                                              + configuration['featureModel'] + '/'
                                              + dataset + '_DENSTREAM_' + node + '.csv',
                                              sep=',')
    return aden, truth, df, times, dfNormalized, testDF
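
For reference, here is a minimal configuration sketch containing the keys this main() reads. Every concrete value is an illustrative assumption (several are borrowed from the other snippets below, and the node name is made up), not taken from the project's real configuration file:

# Hypothetical configuration; all values are assumptions.
configuration = {
    'dataset': {'availableDataset': ['BGP_CLEAR'], 'path': 'Data/'},
    'nodes': ['leaf1'],                       # node name is made up
    'sampleSkip': 40,                         # warm-up buffer size
    'denstreamParameters': {'lambda': 0.03, 'epsilon': 'auto',
                            'beta': 0.03, 'mu': 'auto', 'tp': 36},
    'detectionCriterion': 'spatialDetection',
    'featureModel': 'CompleteFeatures',
}

aden, truth, df, times, dfNormalized, testDF = main(configuration)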
Code Example #2
import json
import sys
import time

import numpy as np
import pandas as pd

# DenStream, Sample, Statistics, groundTruth, and normalize_matrix are the
# project's own helpers, imported from its local modules.

def main(configuration):

    resultByNode = {}
    totalExecutionTime = []

    # Iterate over all the datasets chosen in the configuration file and read
    # the corresponding ground-truth file.
    for dataset in configuration['dataset']['list']:
        truth = groundTruth('GrounTruth/' + dataset + '.txt', fileType='csv')

        # Iterate over all the nodes chosen in the configuration file.
        for node in configuration['nodes']:

            # Read the node's dataset.
            print('Dataset {} - Node: {} loading ...'.format(dataset, node), end=' ')
            df = pd.read_csv(configuration['dataset']['path'] + node + dataset + '.csv',
                             low_memory=False) \
                .dropna() \
                .drop('Unnamed: 0', axis=1)
            print('Done.')

            times = df['time'].astype('int')
            df = df.drop(['time'], axis=1)

            # Select the features chosen in the configuration file. By default
            # the dataset contains all the features:
            # - ControlPlane: keep only the control-plane (CP) features;
            # - DataPlane: discard the CP features, keeping only the data plane;
            # - CompleteFeatures: keep everything.
            if configuration['featureModel'] == 'ControlPlane':
                df = df[configuration['featureList']]
            elif configuration['featureModel'] == 'DataPlane':
                df = df.drop(configuration['featureList'], axis=1)
            elif configuration['featureModel'] == 'CompleteFeatures':
                pass
            else:
                sys.exit('Something wrong in configuration feature model')

            # Dataset normalization: drop constant features, then normalize.
            df = df.loc[:, df.std() != 0]
            dfNormalized = normalize_matrix(df).dropna(axis=1)

            bufferDF = dfNormalized[0:configuration['sampleSkip']]
            testDF = dfNormalized[configuration['sampleSkip']:]

            # Anomaly DenStream initialization with the parameters in the
            # configuration file.
            aden = DenStream(lamb=configuration['denstreamParameters']['lambda'],
                             epsilon=configuration['denstreamParameters']['epsilon'],
                             beta=configuration['denstreamParameters']['beta'],
                             mu=configuration['denstreamParameters']['mu'],
                             startingBuffer=bufferDF,
                             tp=configuration['denstreamParameters']['tp'])
            aden.runInitialization()

            # Iterate over all the rows in the dataset and run the algorithm's
            # .runOnNewSample() method. The algorithm tries to merge each new
            # sample into the existing clusters: if the sample is merged into a
            # core-mc it is considered normal and False is returned; if it is
            # merged into an outlier-mc, or a new outlier-mc is created, it is
            # considered anomalous and True is returned.
            print('Running algorithm ...', end=' ')
            startingSimulation = time.time()
            outputCurrentNode = []
            for sampleNumber in range(len(testDF)):
                sample = testDF.iloc[sampleNumber]
                result = aden.runOnNewSample(Sample(sample.values, times.iloc[sampleNumber]))
                outputCurrentNode.append(result)
            ### END Running ###
            endSimulation = time.time() - startingSimulation
            totalExecutionTime.append(endSimulation)
            print('Done in {}'.format(endSimulation))

            df['result'] = [False] * configuration['sampleSkip'] + outputCurrentNode

            # Depending on the detection criterion chosen in the configuration
            # file, the script produces:
            # 1. results and statistics compared to the ground truth, if
            #    timeDetection is chosen;
            # 2. per-node results, if spatialDetection is chosen (to compare
            #    them with the ground truth, run spatialPerformance.py).
            if configuration['detectionCriterion'] == 'spatialDetection':
                df['time'] = times
                df[['result', 'time']].to_csv('Data/ResultsSpatialDetection/'
                                              + configuration['featureModel'] + '/'
                                              + dataset + '_DENSTREAM_' + node + '.csv',
                                              sep=',')
            elif configuration['detectionCriterion'] == 'timeDetection':
                statistics = Statistics(node, truth)
                resultByNode[node + dataset] = statistics.getNodeResult(df, times, kMAX=5)
            else:
                sys.exit('Error detectionCriterion')

    # Print the results to file if multicoreAnalysis is ON. Used only for grid
    # optimization; a very long task.
    if configuration['multicoreAnalysis']['ON'] == 'YES':
        path = "DataPlane/"
        prefix = ("Results/" + path
                  + str(configuration["algorithmParameters"]["lambda"]) + "_"
                  + str(configuration["algorithmParameters"]["beta"]))

        with open(prefix + "_PRF.json", "w") as outputfile:
            json.dump(statistics.getPrecisionRecallFalseRate(resultByNode, kMAX=5, plot=False),
                      outputfile, indent=4, sort_keys=True)

        resultdelay = statistics.getDelay(resultByNode, kMAX=5, plot=False)

        record = {}
        for row in range(len(resultdelay[0])):
            record['k' + str(row + 1)] = list(resultdelay[0][row])
        for row in range(len(resultdelay[1])):
            record['hop' + str(row)] = list(resultdelay[1]['hop' + str(row)])

        with open(prefix + "_delay.json", "w") as outputfile:
            json.dump(record, outputfile, indent=4, sort_keys=True)

        with open(prefix + "_execTime.json", "w") as outputfile:
            json.dump({'execTime': totalExecutionTime}, outputfile, indent=4, sort_keys=True)

    else:
        # Compute statistics if time detection was chosen: compare the results
        # with the ground truth, compute precision/recall, and write them to
        # "resultsKT_<featureModel>.json" in the "Visualization" folder.
        if configuration['detectionCriterion'] == 'timeDetection':
            resStatistics = statistics.getPrecisionRecallFalseRate(resultByNode, kMAX=5, plot=True)
            resDelay = statistics.getDelay(resultByNode, kMAX=5, plot=True)

            print(resStatistics)
            print(resDelay)

            resStatistics['Delay'] = resDelay[0][:, 0].tolist()
            resStatistics['errDelay'] = resDelay[1]['hop0'].tolist()

            with open('Visualization/resultsKT_' + configuration['featureModel'] + '.json', 'w') as outfile:
                json.dump(resStatistics, outfile, indent=2)

            print('Time: {}'.format(np.sum(totalExecutionTime)))

        # Return all the variables.
        return aden, truth, df, times, dfNormalized
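
The same kind of configuration sketch, adjusted for the keys this second main() reads. Again, all values are assumptions for illustration only:

# Hypothetical timeDetection configuration; all values are assumptions.
configuration = {
    'dataset': {'list': ['BGP_CLEAR'], 'path': 'Data/'},
    'nodes': ['leaf1'],                       # node name is made up
    'sampleSkip': 40,
    'featureModel': 'CompleteFeatures',       # or 'ControlPlane' / 'DataPlane'
    'featureList': [],                        # control-plane feature names
    'denstreamParameters': {'lambda': 0.03, 'epsilon': 'auto',
                            'beta': 0.03, 'mu': 'auto', 'tp': 36},
    'detectionCriterion': 'timeDetection',
    'multicoreAnalysis': {'ON': 'NO'},
}

aden, truth, df, times, dfNormalized = main(configuration)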
Code Example #3
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    denstream = DenStream(eps=0.3, lambd=0.1, beta=0.5, mu=11)
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'],
        preference=params['preference'],
        random_state=None)
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (
        ('DenStream', denstream),
        ('MiniBatchKMeans', two_means),
        # The remaining entries are reconstructed from the estimators defined
        # above; the original snippet is truncated at this point.
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan),
        ('Birch', birch),
        ('GaussianMixture', gmm),
    )
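
This fragment mirrors scikit-learn's cluster-comparison example, so the tuple above is presumably iterated with the usual fit/labels_ pattern. A minimal sketch, assuming X is the 2-D dataset prepared earlier in that script and that the DenStream variant used here follows the same scikit-learn-style estimator interface:

import time

for name, algorithm in clustering_algorithms:
    t0 = time.time()
    algorithm.fit(X)                          # X: dataset prepared earlier (assumed)
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    print('{:<25s} {:.3f}s'.format(name, time.time() - t0))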
Code Example #4
import time

import numpy as np
import pandas as pd

# DenStream, Sample, Statistics, groundTruth, normalize_matrix, and the
# global `nodes` list come from the project's own modules.

def worker(beta):

    # Grid of lambda values to sweep; currently overridden with a single value.
    lambdas = np.arange(0.02, 0.03, 0.01)
    lambdas = [0.03]

    results = []
    for lamb in lambdas:

        resultSimulation = {}

        for simulationDataset in ['BGP_CLEAR']:

            for node in nodes:

                if simulationDataset == 'PortFlap':
                    df = pd.read_csv('Data/' + node + 'base_no_traffic.csv').dropna() \
                            .drop('Unnamed: 0', axis=1)
                    # .drop('MgmtEth0/RP0/CPU0/0reliability', axis=1)
                    # .drop('MgmtEth0/RP0/CPU0/0packets-sent', axis=1)

                elif simulationDataset == 'BGP_CLEAR':
                    ### DATA PLANE
                    df = pd.read_csv('Data/' + node + '_clearbgp.csv').dropna() \
                            .drop('Unnamed: 0', axis=1)
                    # .drop('MgmtEth0/RP0/CPU0/0packets-sent', axis=1)
                    # .drop('HundredGigE0/0/0/30packets-sent', axis=1)

                elif simulationDataset == 'BGP_CLEAR2':
                    ### DATA PLANE
                    df = pd.read_csv('Data/' + node + 'bgp_clear_NoDataPlane.csv').dropna() \
                            .drop('Unnamed: 0', axis=1)

                else:
                    print('CHECK DATASET')

                df = df[:500]

                df['time'] = df['time'] / 1000000
                times = df['time']
                df = df.drop(['time'], axis=1)

                # Drop management-interface, reliability, and HundredGigE counters.
                featuresDrop = []
                for feature in df.columns:
                    if ('MgmtEth' in feature or 'reliability' in feature
                            or 'Hundred' in feature):
                        featuresDrop.append(feature)

                df = df.drop(featuresDrop, axis=1)

                if 'cluster' in df:
                    df = df.drop('cluster', axis=1)

                dfNormalized = normalize_matrix(df).dropna(axis=1)

                sampleSkip = 41
                bufferDf = dfNormalized[0:sampleSkip]
                testDf = dfNormalized[sampleSkip:]

                den = DenStream(lamb=lamb,
                                epsilon='auto',
                                beta=beta,
                                mu='auto',
                                startingBuffer=bufferDf,
                                tp=12)
                den.runInitialization()

                outputNEW = []
                startingSimulation = time.time()
                for sampleNumber in range(len(testDf)):
                    sample = testDf.iloc[sampleNumber]
                    result = den.runOnNewSample(
                        Sample(sample.values, times.iloc[sampleNumber]))
                    outputNEW.append(result)
                ### END SIMULATION ###
                print(time.time() - startingSimulation)

                df['result'] = [False] * sampleSkip + outputNEW

                truth = groundTruth()
                if simulationDataset == 'PortFlap':
                    truth.simulationPORT_FLAP()
                    truth.simulationPORT_FLAP_CLEAR()
                elif simulationDataset == 'BGP_CLEAR':
                    truth.simulationBGP_CLEAR3_TwoMin()
                    truth.simulationBGP_CLEAR_CLEAR()
                elif simulationDataset == 'BGP_CLEAR2':
                    truth.simulationBGP_CLEAR_Second_DATASET()
                    truth.simulationBGP_CLEAR2_CLEAR()

        #        visual=Visualization()
        #        visual.plotResults(df, times, truth)

                statistics = Statistics(node, truth)
                resultSimulation[node + simulationDataset] = \
                    statistics.getNodeResult(df, times, kMAX=5)

        statistics.getPrecisionRecallFalseRate(resultSimulation,
                                               kMAX=5,
                                               plot=True)
        statistics.getDelay(resultSimulation, kMAX=5, plot=True)

        #    result = {
        #        'PRF': statistics.getPrecisionRecallFalseRate(resultSimulation, kMAX=5, plot=False),
        #        'Delay': statistics.getDelay(resultSimulation, kMAX=5, plot=False)
        #    }


#        results.append(statistics.getPrecisionRecallFalseRate(resultSimulation, kMAX=5, plot=True))

#    filename = 'beta'+str(beta)
#    df = pd.DataFrame(results)
#    df.to_csv(filename+'.csv')

#    filename = 'beta'+str(beta)
#    import json
#    with open('Results/'+filename+'.txt', 'w') as outfile:
#       json.dump({beta:results}, outfile)

    return results
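
Since worker() takes a single beta and Example #2 notes that this kind of sweep is "used only for grid optimization", the function was presumably fanned out over a grid of beta values. A hedged driver sketch only; the beta grid and pool size are assumptions, not part of the original file:

from multiprocessing import Pool

import numpy as np

if __name__ == '__main__':
    betas = np.arange(0.1, 0.6, 0.1)          # hypothetical beta grid
    with Pool(processes=4) as pool:
        allResults = pool.map(worker, list(betas))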
Code Example #5
    # df = df.drop(deleteFeatureList, axis=1)

    times = df['time']
    df = df.drop(['time'], axis=1)

    dfNormalized = normalize_matrix(df).dropna(axis=1)

    sampleSkip = 40
    bufferDf = dfNormalized[0:sampleSkip]
    testDf = dfNormalized[sampleSkip:]

    den = DenStream(lamb=0.03,
                    epsilon='auto',
                    beta=0.03,
                    mu='auto',
                    startingBuffer=bufferDf,
                    tp=36)
    den.runInitialization()

    outputCurrentNode = []
    startingSimulation = time.time()
    for sampleNumber in range(len(testDf)):
        sample = testDf.iloc[sampleNumber]
        result = den.runOnNewSample(
            Sample(sample.values, times.iloc[sampleNumber]))
        outputCurrentNode.append(result)
    ### END SIMULATION ###
    print(time.time() - startingSimulation)  # elapsed wall-clock time

    df['result'] = [False] * sampleSkip + outputCurrentNode
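
A small follow-up sketch for inspecting the flagged rows; nothing below is from the original snippet:

# Boolean indexing on the 'result' column selects the anomalous samples.
anomalies = df[df['result']]
print('{} anomalous samples out of {}'.format(len(anomalies), len(df)))
print(times[df['result']].tolist())           # timestamps of the flagged samples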
Code Example #6
File: test_stream.py  Project: scikit-ika/scikit-ika
import numpy as np
import pandas as pd
from DenStream import DenStream

data = pd.read_csv("seqs.csv", header=None)

# Collect the distinct values that appear in the stream.
intervals = set()
for _, val in np.ndenumerate(data):
    intervals.add(val)
print(f"intervals: {intervals} interval len: {len(intervals)}")

outliers = set()
centers = set()

# Feed the stream one value at a time; label -1 marks an outlier.
clusterer = DenStream(lambd=0.1, eps=10, beta=0.5, mu=10)
for _, val in np.ndenumerate(data):
    label = clusterer.fit_predict([np.array([val])])[0]

    if label == -1:
        outliers.add(val)
        # print(f"prediction: {val}")
    else:
        centers.add(int(round(clusterer.p_micro_clusters[label].center()[0])))
        # print(f"prediction: {clusterer.p_micro_clusters[label].center()}")

p_micro_cluster_centers = np.array([
    p_micro_cluster.center() for p_micro_cluster in clusterer.p_micro_clusters
])

print(f"p_micro_centers: {p_micro_cluster_centers}")