/
combined-ncdspam.py
executable file
·196 lines (156 loc) · 5.71 KB
/
combined-ncdspam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#######################################
# combined-ncdspam.py
# Functionality:
#
# 1. Use Cilibrasi and Vitanyi's Normalized Compression Distance
# to cluster a dataset of packet profiles.
# Author: Fahim Abbasi 20110316
# http://seat.massey.ac.nz/projects/honeynet
# Code based on wjt
# http://digitalhistoryhacks.blogspot.com
# 2. Calculate spamsum
# 3. Generate Graph
# 4. Generate UPGMA graph
# 5. Write combined spamsum + ncd score to file
#
# 20111205:
# Changed 'location' variable to be provided as command line parameter
# UPGMA file was redundant as the NCD output file is the same thing
# 20111220:
# Take 2 files as input and calculate NCD and spamsum scores and write
# the combined score to the result file
#
######################################
#!/usr/bin/env python
import re, math, collections, sys , jellyfish
import os
import random
import zlib
import time
from decimal import *
import spamsum
def tokenize( _str):
stopwords = [',']
tokens = collections.defaultdict(lambda: 0.)
for m in re.finditer(r"(\w+)", _str, re.UNICODE):
# m = m.group(1).lower()
m = m.group(1)
if len(m) < 2: continue
if m in stopwords: continue
tokens[m] += 1
return tokens
#end of tokenize
#pathstring = '/home/fimz/Dev/profiler/worm/exp-20110121-normal-n-worm/dataset/ncd'
## Function to calculate the Levenshtein, D-lev, Hamming, Jaro, Jaro-wink distance and KL and spamsum of the two files ##
def alldist(filex, filey):
xread = open(filex, 'r').read()
yread = open(filey, 'r').read()
xhash = spamsum.spamsum(xread)
yhash = spamsum.spamsum(yread)
spsum = spamsum.match(xhash,yhash)
spsum = 100 - spsum
spsum = float(spsum/100.00)
return spsum
## Function to calculate the NCD of two files based on zlib level 9 compression ##
def ncd(filex, filey):
xbytes = open(filex, 'r').read()
ybytes = open(filey, 'r').read()
xybytes = xbytes + ybytes
cx = zlib.compress(xbytes,9)
cy = zlib.compress(ybytes,9)
cxy = zlib.compress(xybytes)
if len(cy) > len(cx):
n = (len(cxy) - len(cx)) / float(len(cy))
else:
n = (len(cxy) - len(cy)) / float(len(cx))
return n
# Select all packet profiles from the path directory provided
# For each unique pair, calculate NCD
#
#
##############################################
##############################################
## Main program starts here ##
##############################################
##############################################
location = sys.argv[3]
pathstring = sys.argv[1]
iteration = sys.argv[2]
listing = os.listdir(pathstring)
selection = []
for infile in listing:
print "current file is: " + infile
selection.append( infile )
print len(selection), selection[0]
now = "%s"%int(time.time())
ncdstr = "%s-ncd-out.txt"%now
upgmastr = "%s-upgma-out.txt"%now
spsumstr = "%s-spsum-out.txt"%now
combinedstr = "%s-combined-out.txt"%now
output = "output/"
os.system("mkdir output")
ncdfile = open( output+ncdstr, 'w')
spsumfile = open( output+spsumstr,'w')
combinedfile = open( output+combinedstr,'w')
## Reduntant upgma file, ncd is sufficient
upgmafile = open (output+upgmastr, 'w')
getcontext().prec = 4
for i in range(0, len(selection)):
# print i
for j in range(0, len(selection)):
fx = pathstring + '/' + str(selection[i])
fy = pathstring + '/' + str(selection[j])
print fx, fy
ncdscore = ncd(fx,fy)
ncdfile.write(str(selection[i]) + " " + str(selection[j]) + " " + str(+Decimal(str(ncdscore))) + "\n")
# upgmafile.write("\'" + str(selection[i]) + "\'" + "," + "\'" + str(selection[j]) + "\'" + "," + str(ncd(fx, fy))+ "\n")
## Reduntant upgma file, ncd is sufficient
upgmafile.write(str(selection[i]) + "," + str(selection[j]) + "," + str(+Decimal(str(ncdscore))) + "\n" )
## Get Scores ##
spsum = alldist(fx, fy)
combscore = float(spsum) + float(ncdscore)
# print "scores: ",lev, dlev, jaro, jarowink, ham, kl, spsum
# print "type: ", type(spsum)
## Write them out ##
spsumfile.write(str(selection[i]) + " " + str(selection[j]) + " " + str(spsum) + "\n")
combinedfile.write(str(selection[i]) + " " + str(selection[j]) + " " + str(combscore) + "\n")
ncdfile.close()
upgmafile.close()
spsumfile.close()
combinedfile.close()
print "Distances Successfully calculated and written out to files"
print "##########################################################"
print "Calculating graph"
os.system("python ncd-fimz-graph.py " +ncdstr + " " + "0.65")
os.system("python ncd-fimz-graph.py " +spsumstr + " " + "0.95")
os.system("python ncd-fimz-graph.py " +combinedstr + " " + "1.55")
graphncd = "output/graph-" + ncdstr
graphspsum = "output/graph-" + spsumstr
graphcombined = "output/graph-" + combinedstr
ncdpng = graphncd + ".svg"
spsumpng = graphspsum + ".svg"
combinedpng = graphcombined + ".svg"
print "Graph files Created"
print "##########################################################"
print "Running Neato on ncd"
os.system("sfdp -Tsvg "+ graphncd + " -o " + ncdpng)
#os.system("sfdp -Tpng "+ graphncd + " -o " + ".png")
print "Running Neato on spsum"
os.system("sfdp -Tsvg "+ graphspsum + " -o " + spsumpng)
print "Running Neato on combined"
os.system("sfdp -Tsvg "+ graphcombined + " -o " + combinedpng)
## Adding UPGMA support
print "Calculating UPGMA"
os.system("python upgma.py output/" + upgmastr)
## Removing list500.txt ##
#os.system("rm -f list500.txt")
## Append to output to create the list ##
f = open("list500.txt",'a')
#location = "/home/fimz/Dev/datasets/500-results"
finalncd = location + "/" + iteration + "/" + ncdstr
finalspsum = location + "/" + iteration + "/" + spsumstr
finalcombined = location + "/" + iteration + "/" + combinedstr
f.write(finalncd + "\n")
f.write(finalspsum + "\n")
f.write(finalcombined + "\n")
f.close()