# Run the recommendation pipeline from input items to the output file.
#
# Statements visible below, in order:
#   * an `if self.items is None:` guard, followed by parsing JSON from the file
#     named by options['--input'] into self.items;
#   * self.data = Data(self.items, ...) using options['--preprocess'];
#   * vsm = VectorSpace(self.data.texts, ...) using options['--method'];
#   * rec_items = self.generate_rec(vsm);
#   * json.dump(rec_items, ...) into the file named by options['--output'];
#   * a final Python 2 print statement emitting "[x] Finished.".
# Progress text goes through write(); most calls are gated on self.debug, the
# "Saving to output file" message is gated on options['--debug'] instead, and
# the last write("[OK]\n") carries no debug condition of its own.
#
# NOTE(review): this definition is collapsed onto one physical line here, so
#   which statements belong to the `if self.items is None:` body cannot be
#   determined from this view -- confirm against the original layout.
# NOTE(review): neither open() result is closed explicitly; consider `with`
#   blocks so the output file is flushed deterministically.
# NOTE(review): self.debug vs options['--debug'] -- confirm both are meant to
#   be the same switch.
def run(self): if self.items is None: write("[+] Loading items:") if self.debug else '' self.items = json.loads(open(options['--input']).read()) self.data = Data(self.items, preprocess=options['--preprocess'], debug=self.debug) write("\n[+] Creating the vector space:") if self.debug else '' vsm = VectorSpace(self.data.texts, method=options['--method'], debug=self.debug) write("[+] Generating recommendations".ljust( 54, '.')) if self.debug else '' rec_items = self.generate_rec(vsm) write("[OK]\n") if self.debug else '' write("[+] Saving to output file".ljust( 54, '.')) if options['--debug'] else '' json.dump(rec_items, open(options['--output'], 'w')) write("[OK]\n") print "[x] Finished."
request = request + line line = txt.readline() if line == '': break list_request.append(request) request = '' print len(list_request) return list_request # vector_space = VectorSpace(get_requests('testing_attacks.txt')) # print "list: ", vector_space.search(["select * from where drop table"]) list_request = get_requests('testing_attacks.txt') vector_space = VectorSpace(list_request) attacks = [ ("sql", "select * from where drop table statement odbc union"), ("lce", "dir c /winnt/system32/cmd.exe"), ("pt", "virtual include file"), ("SSIA", "virtual include statement odbc progra"), ("XI", "path count child text position comment"), ("CS", "document.cookie alert javascript document.location.replace url http"), ("LI", "had* objectclass *o brien* netscaperoot"), ] print "attacks[0][0]: ", attacks[0][0] comm = MPI.COMM_WORLD rank = comm.Get_rank()
from vector_space import VectorSpace

# Small demo corpus; each string is handed to VectorSpace as one document.
documents = [
    "The cat in the hat disabled",
    "A cat is a fine pet ponies.",
    "Dogs and cats make good pets.",
    "I haven't got a hat.",
]
vector_space = VectorSpace(documents)

print("start")

# Query the space with the single term "cat" and show whatever search() returns.
cat_hits = vector_space.search(["cat"])
print(cat_hits)

# NOTE(review): `ralated` looks like a misspelling of `related` -- confirm the
# method name against vector_space.VectorSpace before relying on this call.
first_document_scores = vector_space.ralated(0)
print(first_document_scores)
from vector_space import VectorSpace

# Two sample HTTP request lines whose query strings carry a URL-encoded
# "DROP TABLE ... SELECT * FROM ..." payload; each string is one document.
sample_requests = [
    'GET http:id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito HTTP/1.1',
    'POST httpid=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito',
]
vector_space = VectorSpace(sample_requests)

# Search the indexed requests for SQL-injection style keywords and show the
# result of search(). A single parenthesised argument prints identically under
# the Python 2 print statement.
sql_query = ["select * from where drop table"]
print(vector_space.search(sql_query))
from fastapi import FastAPI from vector_space import VectorSpace from org_dataset import OrgDataset from org_recommender import OrgRecommender from clusterer import Clusterer from keyword_finder import KeywordFinder from keyword_matcher import KeywordMatcher from gcd_utils import get_account_liked_tags app = FastAPI() dataset = OrgDataset.load_instance('./orgs.pkl') vs = VectorSpace.load_instance('./test_vs.pkl') recommender = OrgRecommender(dataset, vs) c = Clusterer(dataset, vs, 20) kw_finder = KeywordFinder(dataset, vs) matcher = KeywordMatcher(c, kw_finder, vs.data_centroid) @app.get('/get_init_recs/') async def get_init_recs(userId: str, numOrgs: int): keywords = get_account_liked_tags(userId) centroid = matcher.get_kw_centroid(keywords) orgids = recommender.centroid_recommend(centroid, numOrgs) return_arr = [] for id in orgids: entry = {'orgId': id} return_arr.append(entry) return return_arr """Example get request for api on local host: http://127.0.0.1:8000/get_recommendations/?userId=334614c0-7f55-11ea-b1bc-2f9730f51173&numOrgs=2
def __init__(self, dataCacheFilePath):
    """Build the document generator, the vector space and the vector list.

    The order matters: self._populateDocuments() is called after
    self._bingDocsGenerator has been assigned, and the vector list is read
    from the freshly built vector space.

    Args:
        dataCacheFilePath: passed unchanged to DocumentsGenerator.
    """
    docsGenerator = DocumentsGenerator(dataCacheFilePath)
    self._bingDocsGenerator = docsGenerator

    documents = self._populateDocuments()
    space = VectorSpace(documents)
    self._vectorSpace = space

    self._vectorsInfo = space.getAllDocumentVectors()
class KMeans:
    """K-means style clustering of document vectors with quality metrics.

    Documents are fetched for every label in ``_classList``, turned into
    vectors by a ``VectorSpace`` and repeatedly assigned to the nearest of
    ``_expectedNumberOfClusters`` centers. After every assignment pass the
    Rand index, the value returned by ``_calculateRSS`` and the purity are
    printed.

    All output uses single-argument ``print(...)`` calls so the emitted text
    is the same under the Python 2 print statement and Python 3.
    """

    _bingDocsGenerator = None
    _vectorSpace = None
    # Placeholder showing the element shape read by the methods below
    # ('vector' and 'class' keys); replaced per instance in __init__.
    _vectorsInfo = [{'vector': [], 'class': 'texas aggies'}]
    _classList = ['texas aggies', 'texas longhorns', 'duke blue devils',
                  'dallas cowboys', 'dallas mavericks']
    _expectedNumberOfClusters = 6
    # clusterPoints re-seeds the centers once its pass counter exceeds this.
    _iterationsBeforeRestart = 10
    # Placeholder showing the cluster record shape; replaced per instance by
    # _initializeClusters.
    _clusters = {'cluster1': {'vectorIndices': set(), 'center': [],
                              'assignedclasses': {'texas aggies': 0}}}

    def __init__(self, dataCacheFilePath):
        """Create the document generator, vector space and vector list.

        Args:
            dataCacheFilePath: passed unchanged to DocumentsGenerator.
        """
        self._bingDocsGenerator = DocumentsGenerator(dataCacheFilePath)
        self._vectorSpace = VectorSpace(self._populateDocuments())
        # Presumably a list of {'vector': ..., 'class': ...} dicts; that is
        # how every method below indexes it -- confirm in VectorSpace.
        self._vectorsInfo = self._vectorSpace.getAllDocumentVectors()

    def _populateDocuments(self):
        """Return the documents the generator yields for ``_classList``."""
        print('Getting documents for clustering')
        return self._bingDocsGenerator.getDocuments(self._classList)

    def _generateRandomPoint(self):
        """Return one uniformly chosen entry of ``_vectorsInfo``.

        Raises:
            ValueError: from ``randint`` when ``_vectorsInfo`` is empty.
        """
        randomIndex = randint(0, len(self._vectorsInfo) - 1)
        return self._vectorsInfo[randomIndex]

    def _getDistance(self, docVector1, docVector2):
        """Return the vector space's Euclidean distance between two vectors."""
        return self._vectorSpace.getEuclidianDistance(docVector1, docVector2)

    def _isContained(self, vectorList, vectorElement):
        """Return True if any entry of vectorList has a vector equal to
        vectorElement's, as judged by the vector space's ``_areEqual``."""
        target = vectorElement['vector']
        for candidate in vectorList:
            if self._vectorSpace._areEqual(candidate['vector'], target):
                return True
        return False

    def _initializeClusters(self):
        """Pick distinct random document vectors as the initial centers.

        Replaces ``self._clusters`` with records named 'cluster0',
        'cluster1', ... each holding its center, an empty member set and an
        empty class tally.

        Raises:
            ValueError: when there are fewer document vectors than requested
                clusters (the rejection loop below could never finish).
        """
        print('Initializing clusters')
        expected = self._expectedNumberOfClusters
        available = len(self._vectorsInfo)
        if available < expected:
            raise ValueError(
                'Cannot pick %d distinct centers from %d document vectors'
                % (expected, available))

        # Rejection sampling: redraw until enough distinct vectors are found.
        # NOTE: heavy duplication among the vectors can still make this slow
        # (or endless if fewer than `expected` distinct vectors exist).
        randomCenters = []
        while len(randomCenters) < expected:
            candidate = self._generateRandomPoint()
            if not self._isContained(randomCenters, candidate):
                randomCenters.append(candidate)

        freshClusters = {}
        for clusterIndex, center in enumerate(randomCenters):
            # 'assignedclasses' is created up front so readers never hit a
            # KeyError for a cluster that received no vectors.
            freshClusters['cluster' + str(clusterIndex)] = {
                'center': center,
                'vectorIndices': set(),
                'assignedclasses': {},
            }
        self._clusters = freshClusters

    def _calculateRSS(self):
        """Return the summed member-to-center distance divided by the number
        of clusters (0.0 when there are no clusters).

        NOTE(review): despite the name, distances are not squared here and
        the sum is averaged over clusters -- confirm this is the intended
        metric.
        """
        if not self._clusters:
            return 0.0
        distanceSum = 0
        for cluster in self._clusters.values():
            clusterCenter = cluster['center']['vector']
            for vectorIndex in cluster['vectorIndices']:
                distanceSum += self._getDistance(
                    clusterCenter, self._vectorsInfo[vectorIndex]['vector'])
        return distanceSum / len(self._clusters)

    def _findClosestCluster(self, docVector):
        """Return the name of the cluster whose center is nearest docVector.

        Ties go to the first such cluster in dictionary iteration order.

        Raises:
            ValueError: from ``min`` when there are no clusters.
        """
        distances = [
            (name, self._getDistance(cluster['center']['vector'], docVector))
            for name, cluster in self._clusters.items()
        ]
        return min(distances, key=itemgetter(1))[0]

    def _assignDocVectorToCluster(self, docVectorIndex, clusterName):
        """Add a vector index to a cluster and bump that cluster's tally for
        the vector's class."""
        cluster = self._clusters[clusterName]
        cluster['vectorIndices'].add(docVectorIndex)
        docVectorClass = self._vectorsInfo[docVectorIndex]['class']
        assignedClasses = cluster.setdefault('assignedclasses', {})
        assignedClasses[docVectorClass] = assignedClasses.get(docVectorClass, 0) + 1

    def _calculateCentroids(self):
        """Move every non-empty cluster's center to the centroid of its
        members; clusters without members keep their current center."""
        for cluster in self._clusters.values():
            vectors = [self._vectorsInfo[vectorIndex]['vector']
                       for vectorIndex in cluster['vectorIndices']]
            if vectors:
                cluster['center'] = {
                    'vector': self._vectorSpace.getCentroid(vectors)}

    def _removeEmptyClusters(self):
        """Drop every cluster that currently has no member vectors."""
        # Names are collected first so the dict is not resized mid-iteration.
        emptyClusterNames = [
            name for name, cluster in self._clusters.items()
            if not cluster['vectorIndices']
        ]
        for emptyClusterName in emptyClusterNames:
            self._clusters.pop(emptyClusterName)

    def _clearClusterMembers(self):
        """Empty every cluster's member set and class tally, keeping centers."""
        for cluster in self._clusters.values():
            cluster['vectorIndices'].clear()
            # setdefault: a cluster that never received a vector may have no
            # tally yet; treat that as an already-empty one.
            cluster.setdefault('assignedclasses', {}).clear()

    def clusterPoints(self, maxIterations=None):
        """Run assignment/update passes, printing metrics after each pass.

        Each pass assigns every vector to its nearest center, prints the
        per-cluster class tallies, Rand index, RSS value and purity, then
        recomputes centroids and clears memberships. Once the pass counter
        exceeds ``_iterationsBeforeRestart`` the centers are re-seeded at
        random and the counter starts again.

        Args:
            maxIterations: total number of assignment passes to run before
                returning. ``None`` (the default) keeps the original
                behaviour of looping until interrupted. When the limit is
                reached the method returns right after printing that pass's
                metrics, leaving the memberships in place for inspection.
        """
        print('Clustering Points')
        self._initializeClusters()
        iterCount = 0
        completedPasses = 0
        while maxIterations is None or completedPasses < maxIterations:
            # Assign all vectors to clusters
            for vectorIndex, vectorInfo in enumerate(self._vectorsInfo):
                closestClusterName = self._findClosestCluster(vectorInfo['vector'])
                self._assignDocVectorToCluster(vectorIndex, closestClusterName)
            completedPasses += 1

            print('Iteration---')
            for clusterName in self._clusters:
                assigned = self._clusters[clusterName].get('assignedclasses', {})
                print('%s ---> %s' % (clusterName, assigned))
            RI = self.getRandIndex()
            print('RI :  %s' % (RI,))
            RSS = self._calculateRSS()
            print('RSS: %s' % (RSS,))
            purity = self.getPurity()
            print('Purity is %s' % (purity,))

            if maxIterations is not None and completedPasses >= maxIterations:
                return

            if iterCount > self._iterationsBeforeRestart:
                print('Restarting !!----------------------------------------------------------------------')
                self._initializeClusters()
                iterCount = 0
                continue

            self._calculateCentroids()
            self._clearClusterMembers()
            iterCount += 1

    def _getVectorClassCounts(self):
        """Return ``{clusterName: {className: count}}`` for every cluster.

        Every label in ``_classList`` is present (0 when unseen); member
        vectors whose class is not in ``_classList`` are not counted.
        """
        vectorCountDict = {}
        for clusterName, cluster in self._clusters.items():
            classCounts = dict.fromkeys(self._classList, 0)
            for vectorIndex in cluster['vectorIndices']:
                docClass = self._vectorsInfo[vectorIndex]['class']
                if docClass in classCounts:
                    classCounts[docClass] += 1
            vectorCountDict[clusterName] = classCounts
        return vectorCountDict

    def getMaxCountClass(self, vectorCountDict, clusterName):
        """Return the largest per-class count recorded for clusterName
        (0 when the cluster has no counts)."""
        return max([0] + list(vectorCountDict[clusterName].values()))

    def _belongToSameCluster(self, vectorIndex1, vectorIndex2):
        """Return True if some cluster contains both vector indices."""
        for cluster in self._clusters.values():
            vectorIndices = cluster['vectorIndices']
            if vectorIndex1 in vectorIndices and vectorIndex2 in vectorIndices:
                return True
        return False

    def getPurity(self):
        """Return the sum of each cluster's dominant-class count divided by
        the number of vectors (0.0 when there are no vectors)."""
        print('Calculating purity')
        if not self._vectorsInfo:
            return 0.0
        vectorCountDict = self._getVectorClassCounts()
        maxCountSum = sum(
            self.getMaxCountClass(vectorCountDict, clusterName)
            for clusterName in vectorCountDict)
        return maxCountSum / float(len(self._vectorsInfo))

    def getRandIndex(self):
        """Return (TP + TN) / (TP + TN + FP + FN) over all vector pairs.

        A pair is "positive" when both vectors share a cluster and "true"
        when that agrees with whether they share a class. Returns 1.0 when
        there are no pairs to compare (fewer than two vectors).
        """
        print('Calculating Rand Index')
        truePositivesCount = 0
        trueNegativesCount = 0
        falsePositivesCount = 0
        falseNegativesCount = 0

        # Both relations are symmetric, so each unordered pair is visited
        # once instead of walking every ordered pair twice.
        vectorCount = len(self._vectorsInfo)
        for vectorIndex1 in range(vectorCount):
            class1 = self._vectorsInfo[vectorIndex1]['class']
            for vectorIndex2 in range(vectorIndex1 + 1, vectorCount):
                haveSameClass = class1 == self._vectorsInfo[vectorIndex2]['class']
                haveSameCluster = self._belongToSameCluster(vectorIndex1, vectorIndex2)
                if haveSameClass and haveSameCluster:
                    truePositivesCount += 1
                elif haveSameClass:
                    falseNegativesCount += 1
                elif haveSameCluster:
                    falsePositivesCount += 1
                else:
                    trueNegativesCount += 1

        pairCount = (truePositivesCount + trueNegativesCount
                     + falsePositivesCount + falseNegativesCount)
        # Reported as the ordered-pair count so the log line keeps the value
        # it has always shown.
        print('Number of iterations : %s' % (pairCount * 2,))
        print('TN: %s  TP:  %s' % (trueNegativesCount, truePositivesCount))
        print('FP:  %s FN:  %s' % (falsePositivesCount, falseNegativesCount))

        if pairCount == 0:
            return 1.0
        return float(truePositivesCount + trueNegativesCount) / pairCount
(line.find('GET http') == -1), (line.find('POST') == -1)): request = request + line line = txt.readline() if line == '': break list_request.append(request) request = '' # print len(list_request) return list_request # list_request = get_requests('anomalousTrafficTest.txt') list_request = get_requests('testing_attacks.txt') vector_space = VectorSpace(list_request) print "list sql: ", vector_space.search( ["select * from where drop table statement odbc union"]) print "list command execution: ", vector_space.search( ["dir c /winnt/system32/cmd.exe"]) print "list path traversal: ", vector_space.search( ["virtual include file"]) print "list SSI Atack: ", vector_space.search( ["virtual include statement odbc progra"]) print "list XPATH injection: ", vector_space.search( ["path count child text position comment"]) print "list Cross Site Scripting: ", vector_space.search( ["document.cookie alert javascript document.location.replace url http "]) print "list LDAP Injection: ", vector_space.search([ "had* objectclass *o brien* netscaperoot"])
from vector_space import VectorSpace

# Four short sample sentences; each one is indexed as a separate document.
sample_sentences = [
    "The cat in the hat disabled",
    "A cat is a fine pet ponies.",
    "Dogs and cats make good pets.",
    "I haven't got a hat.",
]
vector_space = VectorSpace(sample_sentences)

print("start")

# Show what search() returns for the one-term query "cat".
search_result = vector_space.search(["cat"])
print(search_result)

# NOTE(review): `ralated` may be a typo for `related` -- verify that
# VectorSpace really exposes a method with this spelling.
related_result = vector_space.ralated(0)
print(related_result)