def main(configuration):
    """Run the DenStream detector over every configured dataset/node pair.

    For each dataset in ``configuration['dataset']['availableDataset']`` and
    each node in ``configuration['nodes']`` the node CSV is loaded, columns
    with zero standard deviation are removed, the frame is normalized with
    ``normalize_matrix`` and split at ``configuration['sampleSkip']``: the
    leading rows initialise the detector, the remaining rows are submitted
    to it one at a time.

    When ``configuration['detectionCriterion']`` equals ``'spatialDetection'``
    the ``result`` and ``time`` columns are written to a CSV file below
    ``Data/ResultsSpatialDetection/<featureModel>/``.

    Returns:
        The tuple ``(aden, truth, df, times, dfNormalized, testDF)`` holding
        the objects of the last dataset/node pair that was processed.
    """
    totalExecutionTime = []

    for dataset in configuration['dataset']['availableDataset']:
        print(dataset + '\n')
        truth = groundTruth('GrounTruth/' + dataset + '.txt', fileType='csv')

        for node in configuration['nodes']:
            print('Dataset {} - Node: {} loading ...'.format(dataset, node))
            csvPath = configuration['dataset']['path'] + node + dataset + '.csv'
            rawFrame = pd.read_csv(csvPath, low_memory=False)
            df = rawFrame.dropna().drop('Unnamed: 0', axis=1)
            print('Done.')

            # Keep the timestamps aside; only the features are clustered.
            times = df['time'].astype('int')
            df = df.drop(['time'], axis=1)

            # Discard constant columns before normalizing.
            hasVariance = df.std() != 0
            df = df.loc[:, hasVariance]
            dfNormalized = normalize_matrix(df).dropna(axis=1)

            skip = configuration['sampleSkip']
            bufferDF = dfNormalized[:skip]
            testDF = dfNormalized[skip:]

            # Build the detector from the parameters in the configuration.
            parameters = configuration['denstreamParameters']
            aden = DenStream(lamb=parameters['lambda'],
                             epsilon=parameters['epsilon'],
                             beta=parameters['beta'],
                             mu=parameters['mu'],
                             startingBuffer=bufferDF,
                             tp=parameters['tp'])
            aden.runInitialization()

            print('Running algorithm ...')
            startingSimulation = time.time()
            # NOTE(review): times.iloc[position] is taken from the start of
            # the full series while testDF begins at row `sampleSkip` --
            # confirm that this pairing of sample and timestamp is intended.
            outputCurrentNode = [
                aden.runOnNewSample(
                    Sample(testDF.iloc[position].values, times.iloc[position]))
                for position in range(len(testDF))
            ]
            endSimulation = time.time() - startingSimulation
            totalExecutionTime.append(endSimulation)
            print('Done in {}'.format(endSimulation))

            # Rows consumed by the initial buffer are reported as False.
            df['result'] = [False] * skip + outputCurrentNode
            print("Number of anomalies in " + str(node) + " is: ",
                  outputCurrentNode.count(True), len(outputCurrentNode))

            if configuration['detectionCriterion'] == 'spatialDetection':
                df['time'] = times
                outputPath = ('Data/ResultsSpatialDetection/'
                              + configuration['featureModel'] + '/'
                              + dataset + '_DENSTREAM_' + node + '.csv')
                df[['result', 'time']].to_csv(outputPath, sep=',')

    return aden, truth, df, times, dfNormalized, testDF
def main(configuration):
    """Run DenStream on every configured dataset/node and export the results.

    For each dataset in ``configuration['dataset']['list']`` and each node in
    ``configuration['nodes']`` the node CSV is loaded, reduced to the features
    selected by ``configuration['featureModel']``, normalized and split at
    ``configuration['sampleSkip']``. The leading rows initialise the detector;
    each remaining row is passed to ``runOnNewSample`` and its return value is
    collected as that row's result.

    Output depends on ``configuration['detectionCriterion']``:

    * ``'spatialDetection'``: one CSV per dataset/node with the ``result``
      and ``time`` columns.
    * ``'timeDetection'``: per-node results are collected through
      ``Statistics`` and summarised after the loops, either into the
      ``Results/DataPlane/`` JSON files (``multicoreAnalysis`` ON) or into
      ``Visualization/resultsKT_<featureModel>.json``.

    Returns:
        The tuple ``(aden, truth, df, times, dfNormalized)`` of the last
        dataset/node pair that was processed.

    Raises:
        SystemExit: on an unknown ``featureModel`` or ``detectionCriterion``,
            or when the multicore export is requested but no time-detection
            statistics were produced.
    """
    resultByNode = {}
    totalExecutionTime = []
    # Bound only by the 'timeDetection' branch below. Keeping an explicit
    # None lets the multicore export fail with a clear message instead of a
    # NameError when another detection criterion is configured.
    statistics = None

    # Iterate over all the datasets chosen in the configuration list.
    for dataset in configuration['dataset']['list']:
        truth = groundTruth('GrounTruth/' + dataset + '.txt', fileType='csv')

        # Iterate over all the nodes chosen in the configuration file.
        for node in configuration['nodes']:
            # Read the node dataset.
            print('Dataset {} - Node: {} loading ...'.format(dataset, node),
                  end=' ')
            df = pd.read_csv(
                configuration['dataset']['path'] + node + dataset + '.csv',
                low_memory=False,
            ).dropna().drop('Unnamed: 0', axis=1)
            print('Done.')

            times = df['time'].astype('int')
            df = df.drop(['time'], axis=1)

            # Select the features chosen in the configuration file.
            #   ControlPlane:     keep only the columns in featureList.
            #   DataPlane:        drop the columns in featureList.
            #   CompleteFeatures: keep every column.
            featureModel = configuration['featureModel']
            if featureModel == 'ControlPlane':
                df = df[configuration['featureList']]
            elif featureModel == 'DataPlane':
                df = df.drop(configuration['featureList'], axis=1)
            elif featureModel != 'CompleteFeatures':
                sys.exit('Something wrong in configuration feature model')

            # Dataset normalization (constant columns are removed first).
            df = df.loc[:, df.std() != 0]
            dfNormalized = normalize_matrix(df).dropna(axis=1)

            sampleSkip = configuration['sampleSkip']
            bufferDF = dfNormalized[0:sampleSkip]
            testDF = dfNormalized[sampleSkip:]

            # DenStream initialization with the configured parameters.
            parameters = configuration['denstreamParameters']
            aden = DenStream(lamb=parameters['lambda'],
                             epsilon=parameters['epsilon'],
                             beta=parameters['beta'],
                             mu=parameters['mu'],
                             startingBuffer=bufferDF,
                             tp=parameters['tp'])
            aden.runInitialization()

            # Feed every remaining row to runOnNewSample(); its return value
            # is stored as the per-row result.
            # NOTE(review): times.iloc[sampleNumber] starts at the beginning
            # of the series while testDF starts at row `sampleSkip` --
            # confirm that this pairing is intended.
            print('Running algorithm ...', end=' ')
            startingSimulation = time.time()
            outputCurrentNode = []
            for sampleNumber in range(len(testDF)):
                sample = testDF.iloc[sampleNumber]
                result = aden.runOnNewSample(
                    Sample(sample.values, times.iloc[sampleNumber]))
                outputCurrentNode.append(result)
            endSimulation = time.time() - startingSimulation
            totalExecutionTime.append(endSimulation)
            print('Done in {}'.format(endSimulation))

            # Rows used as starting buffer are reported as False.
            df['result'] = [False] * sampleSkip + outputCurrentNode

            # spatialDetection: dump per-node results (they are compared with
            # the ground truth by a separate script).
            # timeDetection: collect per-node results for the summary below.
            if configuration['detectionCriterion'] == 'spatialDetection':
                df['time'] = times
                df[['result', 'time']].to_csv(
                    'Data/ResultsSpatialDetection/'
                    + configuration['featureModel'] + '/'
                    + dataset + '_DENSTREAM_' + node + '.csv',
                    sep=',')
            elif configuration['detectionCriterion'] == 'timeDetection':
                statistics = Statistics(node, truth)
                resultByNode[node + dataset] = statistics.getNodeResult(
                    df, times, kMAX=5)
            else:
                sys.exit('Error detectionCriterion')

    if configuration['multicoreAnalysis']['ON'] == 'YES':
        # Results written to file for the grid optimization (very long task).
        if statistics is None:
            sys.exit('Error multicoreAnalysis: '
                     'no timeDetection statistics available')

        path = "DataPlane/"
        # NOTE(review): the file names read 'algorithmParameters' whereas the
        # detector is built from 'denstreamParameters' -- confirm both keys
        # exist in the configuration.
        prefix = ("Results/" + path
                  + str(configuration["algorithmParameters"]["lambda"])
                  + "_"
                  + str(configuration["algorithmParameters"]["beta"]))

        with open(prefix + "_PRF.json", "w") as outputfile:
            json.dump(
                statistics.getPrecisionRecallFalseRate(
                    resultByNode, kMAX=5, plot=False),
                outputfile, indent=4, sort_keys=True)

        resultdelay = statistics.getDelay(resultByNode, kMAX=5, plot=False)
        record = {}
        for row in range(len(resultdelay[0])):
            record['k' + str(row + 1)] = list(resultdelay[0][row])
        for row in range(len(resultdelay[1])):
            hopKey = 'hop' + str(row)
            record[hopKey] = list(resultdelay[1][hopKey])

        with open(prefix + "_delay.json", "w") as outputfile:
            json.dump(record, outputfile, indent=4, sort_keys=True)

        with open(prefix + "_execTime.json", "w") as outputfile:
            json.dump({'execTime': totalExecutionTime}, outputfile,
                      indent=4, sort_keys=True)
    else:
        # Time detection: compute the summary statistics and write them to
        # the 'resultsKT_<featureModel>.json' file of the Visualization
        # folder.
        if configuration['detectionCriterion'] == 'timeDetection':
            resStatistics = statistics.getPrecisionRecallFalseRate(
                resultByNode, kMAX=5, plot=True)
            resDelay = statistics.getDelay(resultByNode, kMAX=5, plot=True)

            print(resStatistics)
            print(resDelay)

            resStatistics['Delay'] = resDelay[0][:, 0].tolist()
            resStatistics['errDelay'] = resDelay[1]['hop0'].tolist()

            with open('Visualization/resultsKT_'
                      + configuration['featureModel'] + '.json',
                      'w') as outfile:
                json.dump(resStatistics, outfile, indent=2)

    print('Time: {}'.format(np.sum(totalExecutionTime)))

    return aden, truth, df, times, dfNormalized
# make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # ============ # Create cluster objects # ============ ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) denstream = DenStream(eps=0.3, lambd=0.1, beta=0.5, mu=11) affinity_propagation = cluster.AffinityPropagation( damping=params['damping'], preference=params['preference'], random_state=None) average_linkage = cluster.AgglomerativeClustering( linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = cluster.Birch(n_clusters=params['n_clusters']) gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = (('DenStream', denstream), ('MiniBatchKMeans', two_means),
def worker(beta):
    """Run the DenStream simulation for one ``beta`` on every node.

    For each lambda value, each simulation dataset and each entry of the
    module-level ``nodes``, the node CSV is loaded and truncated to its first
    500 rows, interface-specific columns are dropped, the frame is normalized
    and split at ``sampleSkip``. The leading rows initialise the detector and
    the remaining rows are submitted to it one at a time. Per-node results
    are collected through ``Statistics`` and summarised (with plots) once
    all nodes of a lambda value have been processed.

    Args:
        beta: value forwarded to ``DenStream`` as its ``beta`` argument.

    Returns:
        The ``results`` list. Nothing in this function appends to it, so it
        is currently always empty.

    Raises:
        ValueError: if a simulation dataset has no associated CSV file.
    """
    # Suffix appended to 'Data/<node>' for every supported dataset.
    datasetFiles = {
        'PortFlap': 'base_no_traffic.csv',
        'BGP_CLEAR': '_clearbgp.csv',
        'BGP_CLEAR2': 'bgp_clear_NoDataPlane.csv',
    }
    # Columns whose name contains one of these tags are discarded.
    droppedTags = ('MgmtEth', 'reliability', 'Hundred')

    lambdas = [0.03]
    results = []

    for lamb in lambdas:
        resultSimulation = {}

        for simulationDataset in ['BGP_CLEAR']:
            for node in nodes:
                if simulationDataset not in datasetFiles:
                    # Previously this only printed a message and then went on
                    # with an unbound (or stale) DataFrame.
                    raise ValueError('CHECK DATASET: ' + simulationDataset)

                df = pd.read_csv(
                    'Data/' + node + datasetFiles[simulationDataset]
                ).dropna().drop('Unnamed: 0', axis=1)

                # Work on an explicit copy of the first 500 rows so that the
                # column assignment below does not target a slice.
                df = df[:500].copy()
                # presumably converts microseconds to seconds -- confirm.
                df['time'] = df['time'] / 1000000

                times = df['time']
                df = df.drop(['time'], axis=1)

                featuresDrop = [
                    feature for feature in df.columns
                    if any(tag in feature for tag in droppedTags)
                ]
                df = df.drop(featuresDrop, axis=1)

                if 'cluster' in df:
                    df = df.drop('cluster', axis=1)

                dfNormalized = normalize_matrix(df).dropna(axis=1)

                sampleSkip = 41
                bufferDf = dfNormalized[0:sampleSkip]
                testDf = dfNormalized[sampleSkip:]

                den = DenStream(lamb=lamb, epsilon='auto', beta=beta,
                                mu='auto', startingBuffer=bufferDf, tp=12)
                den.runInitialization()

                # NOTE(review): times.iloc[sampleNumber] starts at the
                # beginning of the series while testDf starts at row
                # `sampleSkip` -- confirm that this pairing is intended.
                outputNEW = []
                startingSimulation = time.time()
                for sampleNumber in range(len(testDf)):
                    sample = testDf.iloc[sampleNumber]
                    result = den.runOnNewSample(
                        Sample(sample.values, times.iloc[sampleNumber]))
                    outputNEW.append(result)
                print(time.time() - startingSimulation)

                # Rows used as starting buffer are reported as False.
                df['result'] = [False] * sampleSkip + outputNEW

                truth = groundTruth()
                if simulationDataset == 'PortFlap':
                    truth.simulationPORT_FLAP()
                    truth.simulationPORT_FLAP_CLEAR()
                elif simulationDataset == 'BGP_CLEAR':
                    truth.simulationBGP_CLEAR3_TwoMin()
                    truth.simulationBGP_CLEAR_CLEAR()
                elif simulationDataset == 'BGP_CLEAR2':
                    truth.simulationBGP_CLEAR_Second_DATASET()
                    truth.simulationBGP_CLEAR2_CLEAR()

                statistics = Statistics(node, truth)
                resultSimulation[node + simulationDataset] = \
                    statistics.getNodeResult(df, times, kMAX=5)

        # Summaries over all the nodes processed for this lambda value.
        statistics.getPrecisionRecallFalseRate(resultSimulation, kMAX=5,
                                               plot=True)
        statistics.getDelay(resultSimulation, kMAX=5, plot=True)

    return results
# # # df = df.drop(deleteFeatureList, axis=1)

# Keep the timestamps aside; only the remaining columns are normalized.
times = df['time']
df = df.drop(['time'], axis=1)

dfNormalized = normalize_matrix(df).dropna(axis=1)

# The first `sampleSkip` rows initialise the detector, the others are tested.
sampleSkip = 40
bufferDf = dfNormalized[0:sampleSkip]
testDf = dfNormalized[sampleSkip:]

den = DenStream(lamb=0.03, epsilon='auto', beta=0.03, mu='auto',
                startingBuffer=bufferDf, tp=36)
den.runInitialization()

# Submit the test rows one by one and collect the value returned for each.
# NOTE(review): times.iloc[sampleNumber] starts at the beginning of the
# series while testDf starts at row `sampleSkip` -- confirm that this pairing
# is intended.
outputCurrentNode = []
startingSimulation = time.time()
for sampleNumber in range(len(testDf)):
    sample = testDf.iloc[sampleNumber]
    result = den.runOnNewSample(
        Sample(sample.values, times.iloc[sampleNumber]))
    outputCurrentNode.append(result)
# Elapsed wall-clock time of the loop above (print() works on Python 3).
print(time.time() - startingSimulation)

# Rows used as starting buffer are reported as False.
df['result'] = [False] * sampleSkip + outputCurrentNode
import math
import numpy as np
import pandas as pd
from DenStream import DenStream

# Load the table of values; the file has no header row.
data = pd.read_csv("seqs.csv", header=None)

# Distinct values found anywhere in the table.
intervals = {val for _coords, val in np.ndenumerate(data)}
print(f"intervals: {intervals} interval len: {len(intervals)}")

outliers = set()
centers = set()
clusterer = DenStream(lambd=0.1, eps=10, beta=0.5, mu=10)

# Feed the values to the clusterer one at a time, in table order. A label of
# -1 files the value under `outliers`; any other label records the rounded
# first coordinate of the matching p-micro-cluster center.
for _coords, val in np.ndenumerate(data):
    point = np.array([val])
    label = clusterer.fit_predict([point])[0]
    if label == -1:
        outliers.add(val)
        continue
    matched_center = clusterer.p_micro_clusters[label].center()
    centers.add(int(round(matched_center[0])))

# Centers of every p-micro-cluster once all the values have been processed.
p_micro_cluster_centers = np.array(
    [micro_cluster.center() for micro_cluster in clusterer.p_micro_clusters]
)
print(f"p_micro_centers: {p_micro_cluster_centers}")