def processDomainsList(domains, table, threshold=70):
    """Compare each domain's page against stored signatures and print the verdict.

    For every domain, the elements returned by ``processWebpage`` are wrapped
    in a ``multiset``, serialized with ``serializeElements`` and hashed with
    ``spamsum.spamsum``. That hash is matched against the hash of every
    signature row with ``spamsum.match``; the first row whose score reaches
    ``threshold`` is reported and the remaining rows are skipped.

    Args:
        domains: iterable of domain strings (whitespace is stripped only for
            printing; ``processWebpage`` receives the value unchanged).
        table: rows retrieved from the database. Index 3 of each row is read
            as a sequence of pairs (presumably (type, element) -- confirm
            against the query); indexes 0, 1 and 2 are printed as notifier,
            detection date and signature ID respectively.
        threshold: minimum ``spamsum.match`` score that counts as a
            defacement. Defaults to 70, the previously hard-coded value.

    Returns:
        None. One line per domain is written to stdout.
    """
    # Prepare multisets of the retrieved (type, element) pairs and hash every
    # signature exactly once, instead of re-serializing and re-hashing each
    # row for every domain. Building a real list also keeps the rows
    # re-iterable across domains.
    signatures = []
    for row in table:
        elements = multiset([(str(pair[0]), pair[1]) for pair in row[3]])
        signatures.append((row, spamsum.spamsum(serializeElements(elements))))

    for domain in domains:
        # TODO: map from domain to webpage URL. Is it needed?
        page_elements = multiset(processWebpage(domain))
        page_hash = spamsum.spamsum(serializeElements(page_elements))
        label = domain.strip()

        for row, signature_hash in signatures:
            similarity = spamsum.match(page_hash, signature_hash)
            if similarity >= threshold:  # TODO: Comparison Strategy!!
                # Single pre-formatted argument: prints the same text whether
                # print is a statement or a function.
                print("Defacement found at %s -> Notifier: %s, Signature ID: %s, Detected on: %s (%s%%)" %
                      (label, row[0], row[2], row[1], similarity))
                break
        else:
            # Loop finished without a break: no signature reached the threshold.
            print("No defacement found (%s)" % (label, ))
def alldist(filex, filey):
    """Compute several distance measures between the contents of two files.

    Args:
        filex: path of the first file.
        filey: path of the second file.

    Returns:
        A 7-tuple ``(res, dres, jaro, jarowink, ham, kl, spsum)``:
          * res      -- ``jellyfish.levenshtein_distance`` divided by 100
          * dres     -- ``jellyfish.damerau_levenshtein_distance`` divided by 100
          * jaro     -- 1.0 minus ``jellyfish.jaro_distance``
          * jarowink -- 1.0 minus ``jellyfish.jaro_winkler``
          * ham      -- ``jellyfish.hamming_distance`` divided by 100
          * kl       -- ``kldiv`` of the tokenized contents (x against y)
          * spsum    -- (100 minus ``spamsum.match``) divided by 100

    Side effects:
        Prints the scaled Hamming distance to stdout.
    """
    # Context managers close both handles deterministically; the previous
    # open(...).read() form left them to the garbage collector.
    with open(filex, 'r') as handle_x:
        xread = handle_x.read()
    with open(filey, 'r') as handle_y:
        yread = handle_y.read()

    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)

    # NOTE(review): match() receives the raw file contents here rather than
    # spamsum.spamsum() digests -- confirm this is intended.
    spsum = (100 - spamsum.match(xread, yread)) / 100.00

    res = lvd / 100.00
    dres = dlvd / 100.00

    # Similarities are flipped into distances (0.0 means identical).
    jaro = 1.0 - jellyfish.jaro_distance(xread, yread)
    # Added jaro-winkler distance by fahim 20111011
    jarowink = 1.0 - jellyfish.jaro_winkler(xread, yread)

    ham = jellyfish.hamming_distance(xread, yread) / 100.00
    # Pre-formatted single argument: same text as `print "Hamming Distance = ", ham`
    # whether print is a statement or a function.
    print("%s %s" % ("Hamming Distance = ", ham))

    kl = kldiv(tokenize(xread), tokenize(yread))
    return res, dres, jaro, jarowink, ham, kl, spsum
def test_match(self):
    """Check ``spamsum.match`` scores between digests of fixtures s1, s2 and s3.

    Expected scores: 100 for s1 against itself, 72 for s1 against s2 in
    either argument order, and 0 for s3 against both s1 and s2.
    """
    def score(left, right):
        # Digest both inputs, then compare the digests.
        return spamsum.match(spamsum.spamsum(left), spamsum.spamsum(right))

    # Identical input.
    self.assertEqual(score(self.s1, self.s1), 100)

    # s1 / s2 give the same score in both argument orders.
    self.assertEqual(score(self.s1, self.s2), 72)
    self.assertEqual(score(self.s2, self.s1), 72)

    # s3 does not match either of the other fixtures.
    self.assertEqual(score(self.s1, self.s3), 0)
    self.assertEqual(score(self.s2, self.s3), 0)
def alldist(filex, filey):
    """Return a spamsum-based distance between the contents of two files.

    Each file's content is hashed with ``spamsum.spamsum`` and the two digests
    are compared with ``spamsum.match``.

    Args:
        filex: path of the first file.
        filey: path of the second file.

    Returns:
        ``(100 - match_score) / 100.0`` as a float, so a match score of 100
        yields 0.0.
    """
    # Context managers close both handles deterministically; the previous
    # open(...).read() form left them to the garbage collector.
    with open(filex, 'r') as handle_x:
        xread = handle_x.read()
    with open(filey, 'r') as handle_y:
        yread = handle_y.read()

    xhash = spamsum.spamsum(xread)
    yhash = spamsum.spamsum(yread)

    # Flip the similarity score into a distance and scale it by 1/100.
    return (100 - spamsum.match(xhash, yhash)) / 100.00
def alldist(filex, filey):
    """Return a spamsum-based distance between the contents of two files.

    The file contents are passed directly to ``spamsum.match``.

    Args:
        filex: path of the first file.
        filey: path of the second file.

    Returns:
        ``(100 - match_score) / 100.0`` as a float, so a match score of 100
        yields 0.0.
    """
    # Context managers close both handles deterministically; the previous
    # open(...).read() form left them to the garbage collector.
    with open(filex, 'r') as handle_x:
        xread = handle_x.read()
    with open(filey, 'r') as handle_y:
        yread = handle_y.read()

    # NOTE(review): match() receives the raw contents here, not
    # spamsum.spamsum() digests (an earlier revision hashed them first, and
    # also experimented with appending the reversed content) -- confirm that
    # comparing raw contents is intended.
    score = spamsum.match(xread, yread)

    # Flip the similarity score into a distance and scale it by 1/100.
    return (100 - score) / 100.00
def alldist(filex, filey):
    """Return a spamsum-based distance between two files, using mirrored content.

    Each file's content is concatenated with its own reverse, the result is
    hashed with ``spamsum.spamsum``, and the two digests are compared with
    ``spamsum.match``.

    Args:
        filex: path of the first file.
        filey: path of the second file.

    Returns:
        ``(100 - match_score) / 100.0`` as a float, so a match score of 100
        yields 0.0.
    """
    # Context managers close both handles deterministically; the previous
    # open(...).read() form left them to the garbage collector.
    with open(filex, "r") as handle_x:
        xread = handle_x.read()
    with open(filey, "r") as handle_y:
        yread = handle_y.read()

    # Take the reverse and append it to the original read.
    rxread = xread + xread[::-1]
    ryread = yread + yread[::-1]

    xhash = spamsum.spamsum(rxread)
    yhash = spamsum.spamsum(ryread)

    # Flip the similarity score into a distance and scale it by 1/100.
    return (100 - spamsum.match(xhash, yhash)) / 100.00
def similarityIndex(elementsWebpage, sdefaces):
    """Score how well a page's elements match a defacement signature's elements.

    A table of ``spamsum.match`` scores is built with one row per entry of
    ``sdefaces`` and one column per entry of ``elementsWebpage``. When there
    are more signature entries than page entries, ten random row subsets of
    size ``len(elementsWebpage)`` are handed to ``calculus``; otherwise the
    whole table is handed over once.

    Args:
        elementsWebpage: sized sequence of values accepted by
            ``spamsum.spamsum`` (elements extracted from the page).
        sdefaces: sized sequence of values accepted by ``spamsum.spamsum``
            (elements of the stored signature).

    Returns:
        ``mSum / len(sdefaces)`` as a float, or 0.0 when ``sdefaces`` is empty
        (this previously raised ZeroDivisionError).
    """
    if len(sdefaces) == 0:
        # Nothing to compare against; avoid dividing by zero below.
        return 0.0

    # Hash every page element once instead of once per signature element.
    page_hashes = [spamsum.spamsum(element) for element in elementsWebpage]

    matchesTable = []
    for deface in sdefaces:
        deface_hash = spamsum.spamsum(deface)
        matchesTable.append(
            [spamsum.match(deface_hash, page_hash) for page_hash in page_hashes]
        )

    # NOTE(review): maxSim is a local name in this function (there is no
    # `global maxSim`), and nothing visible here changes it after it is set
    # to 0, so mSum stays 0 unless calculus() raises. Presumably calculus()
    # was meant to report its best score through maxSim -- confirm against
    # calculus() and wire the result through explicitly.
    mSum = 0
    if len(sdefaces) > len(elementsWebpage):
        # More signature entries than page entries: try 10 random subsets of
        # rows, each sized to the page.
        for _ in range(10):
            picked = random.sample(range(len(sdefaces)), len(elementsWebpage))
            matchesTableP = [matchesTable[index] for index in picked]
            maxSim = 0
            calculus(matchesTableP, [], 0)
            if maxSim > mSum:
                mSum = maxSim
    else:
        maxSim = 0
        calculus(matchesTable, [], 0)
        mSum = maxSim

    return mSum * 1.0 / len(sdefaces)