# NetFlow_ANN.py
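"""Train a PyBrain feed-forward network on NetFlow statistics (flows,
packets, pps, bpp, bps) and classify hosts as normal or P2P-like.

Usage: python NetFlow_ANN.py <sourcefile>

The source file is produced by the companion preprocessdump module and
is expected to sit next to <sourcefile>_sus and <sourcefile>_non_sus
dumps of suspicious and non-suspicious examples.
"""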
import preprocessdump
from pybrain.tools.shortcuts import buildNetwork
from pybrain.datasets import ClassificationDataSet
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.tools.customxml.networkwriter import NetworkWriter
from pybrain.tools.customxml.networkreader import NetworkReader
from pybrain.utilities import percentError
import sys
import getopt
import math
from random import randint
# Input record format (one dict per flow), as built by preprocessdump:
# record_dict = dict({"ip": ip, "duration": record_array[4],
#                     "flow": record_array[7], "packets": record_array[8],
#                     "bytes": record_array[9], "pps": record_array[10],
#                     "bps": record_array[11],
#                     "bpp": record_array[12].replace('\n', '')})
class NetFlow_ANN:
    def __init__(self):
        print "starting a new instance"
        self.loaded = False
        self.has_data_source = False
        # Try to restore a previously trained network from disk.
        try:
            self.net = NetworkReader.readFrom('pickled_ANN')
            print "ANN has been loaded from the ash jar"
            self.loaded = True
        except IOError:
            print "ash jar is empty, use training() to start a new ANN"
    def normalize_data(self, source, base):
        # Scale each feature against the base record, then squash the
        # ratio into (-1, 1) so all network inputs share a common range.
        return [self.normalize_f(float(source[key]) / float(base[key]))
                for key in ("flow", "packets", "pps", "bpp", "bps")]
    def normalize_f(self, x):
        # (e^x - e^-x) / (e^x + e^-x) is the hyperbolic tangent;
        # e.g. normalize_f(0.5) ~= 0.4621, normalize_f(3.0) ~= 0.9951.
        return math.tanh(x)
    def build_data_set(self, sourcefile, sus_example_count=100, non_sus_example_count=100):
        self.source_file = sourcefile
        self.data_source = preprocessdump.datasource(sourcefile)
        self.has_data_source = True
        # Base record used by normalize_data() to scale every feature.
        self.normalize_base = self.data_source.load_base_from_file(sourcefile)
        self.data_set = ClassificationDataSet(5, 1, nb_classes=2, class_labels=['good', 'p2p'])
        count = 0
        for sus_example in self.data_source.load_dump_from_file(sourcefile + "_sus"):
            if count == sus_example_count:
                break
            self.data_set.addSample(self.normalize_data(sus_example, self.normalize_base), 1)
            count += 1
            progress = count * 100 / sus_example_count
            if progress % 10 == 0:
                print '\r[{0}] {1}%'.format('#' * (progress / 10), progress),
        print "\n%d suspicious examples added" % count
        count = 0
        for normal_example in self.data_source.load_dump_from_file(sourcefile + "_non_sus"):
            if count == non_sus_example_count:
                break
            self.data_set.addSample(self.normalize_data(normal_example, self.normalize_base), 0)
            count += 1
            progress = count * 100 / non_sus_example_count
            if progress % 10 == 0:
                print '\r[{0}] {1}%'.format('#' * (progress / 10), progress),
        print "\n%d non-suspicious examples added" % count
        print "Training data ready"
        print "Number of training patterns: ", len(self.data_set)
        print "Input and output dimensions: ", self.data_set.indim, self.data_set.outdim
    def training(self):
        # Hold out 10% of the samples for validation.
        train_set, valid_set = self.data_set.splitWithProportion(0.9)
        # Classification needs one-of-many (one-hot) target encoding.
        valid_set._convertToOneOfMany()
        train_set._convertToOneOfMany()
        print train_set.calculateStatistics()
        if not self.loaded:
            self.net = buildNetwork(train_set.indim, train_set.outdim, recurrent=False)
            self.net.name = 'PieBrain'
            print "A new ANN has been created"
        trainer = BackpropTrainer(self.net, train_set, momentum=0.1, weightdecay=0.01)
        trainer.trainUntilConvergence()
        print 'training is over'
        # Spot-check the network on one random validation instance.
        ins = randint(0, len(valid_set) - 1)
        print "testing instance = " + str(valid_set['input'][ins])
        print "expected result = " + str(valid_set['target'][ins])
        print "got = " + str(self.net.activate(valid_set['input'][ins]))
    def set_datasource(self, sourcefile):
        # Prepare a data source (and its normalization base) without
        # building a training set, e.g. before classifying with a
        # network restored from the ash jar.
        self.data_source = preprocessdump.datasource(sourcefile)
        self.normalize_base = self.data_source.load_base_from_file(sourcefile)
        self.has_data_source = True
    def classifier(self, examples, count):
        # Score up to `count` records; a larger likelihood means the
        # record looks more P2P-like (suspicious minus normal output).
        finished = 0
        result = dict()
        for example in examples:
            if finished == count:
                break
            input_data = self.normalize_data(example, self.normalize_base)
            normal, sus = self.net.activate(input_data)
            likelihood = sus - normal
            print str(finished) + "/" + str(count)
            print example['ip'] + ": " + str(likelihood)
            result[example['ip']] = likelihood
            finished += 1
        return result
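
# A minimal end-to-end sketch (assumes "dump", "dump_sus" and "dump_non_sus"
# were produced by preprocessdump):
#   ann = NetFlow_ANN()
#   ann.build_data_set("dump", 100, 2000)
#   ann.training()
#   scores = ann.classifier(ann.data_source.load_dump_from_file("dump_sus"), 10)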
def loader(argv=sys.argv):
    """Command-line entry point: train on the given source file and
    persist the resulting network to 'pickled_ANN'.
    """
    # parse command line options
    try:
        opts, args = getopt.getopt(argv[1:], "h", ["help"])
    except getopt.error, msg:
        print msg
        print "for help use --help"
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            print __doc__
            sys.exit(0)
    if not args:
        print "for help use --help"
        sys.exit(2)
    ANN = NetFlow_ANN()
    ANN.build_data_set(args[0], 100, 2000)
    ANN.training()
    NetworkWriter.writeToFile(ANN.net, 'pickled_ANN')
if __name__ == "__main__":
    sys.exit(loader())