Python VectorSpaceの例、vector_space.VectorSpace Pythonの例

コード例 #1

0

ファイルを表示

ファイル: generate.py プロジェクト: pi19404/cbrec

    def run(self):
        """Build the vector space from the items and write the recommendations.

        Steps, in order:
          1. If ``self.items`` is None, load it as JSON from the path in
             ``options['--input']``.
          2. Wrap the items in ``Data`` (stored on ``self.data``).
          3. Build a ``VectorSpace`` over ``self.data.texts``.
          4. Call ``self.generate_rec`` with that vector space.
          5. Dump the result as JSON to the path in ``options['--output']``.

        Progress messages go through ``write`` and are only emitted in debug
        mode; the final "[x] Finished." line is always printed.
        """
        if self.items is None:
            if self.debug:
                write("[+] Loading items:")
            # The context manager closes the input file deterministically
            # instead of leaving the handle to the garbage collector.
            with open(options['--input']) as input_file:
                self.items = json.load(input_file)
        self.data = Data(self.items,
                         preprocess=options['--preprocess'],
                         debug=self.debug)

        if self.debug:
            write("\n[+] Creating the vector space:")
        vsm = VectorSpace(self.data.texts,
                          method=options['--method'],
                          debug=self.debug)

        if self.debug:
            write("[+] Generating recommendations".ljust(54, '.'))
        rec_items = self.generate_rec(vsm)
        if self.debug:
            write("[OK]\n")

        # NOTE(review): this step reads options['--debug'] while every other
        # step reads self.debug -- confirm the two are always equal.
        report_saving = options['--debug']
        if report_saving:
            write("[+] Saving to output file".ljust(54, '.'))
        # Closing the file here guarantees the JSON is flushed to disk before
        # "[x] Finished." is reported.
        with open(options['--output'], 'w') as output_file:
            json.dump(rec_items, output_file)
        # The "[OK]" belongs to the label above, so it is emitted under the
        # same condition (it used to be printed even without its label).
        if report_saving:
            write("[OK]\n")
        # Single-argument call form prints the same text on Python 2 and 3.
        print("[x] Finished.")

コード例 #2

0

ファイルを表示

ファイル: parallel_process.py プロジェクト: laloxxx20/FinalParalelos

                request = request + line
                line = txt.readline()
                if line == '':
                    break
            list_request.append(request)
            request = ''
    print len(list_request)
    return list_request


# vector_space = VectorSpace(get_requests('testing_attacks.txt'))
# print "list: ", vector_space.search(["select * from where drop table"])


# Load the requests from the test file and index them in one vector space.
list_request = get_requests('testing_attacks.txt')
vector_space = VectorSpace(list_request)

# (label, query) pairs: a short tag for an attack category and the keyword
# query associated with it.
# NOTE(review): the tags presumably abbreviate SQL injection, command
# execution, path traversal, SSI attack, XPath injection, cross-site
# scripting and LDAP injection -- confirm with the author.
attacks = [
    ("sql", "select * from where drop table statement odbc union"),
    ("lce", "dir c /winnt/system32/cmd.exe"),
    ("pt", "virtual include file"),
    ("SSIA", "virtual include statement odbc progra"),
    ("XI", "path count child text position comment"),
    ("CS", "document.cookie alert javascript document.location.replace url http"),
    ("LI", "had* objectclass *o  brien* netscaperoot"),
]

# Debug output: prints the tag of the first attack entry.
print "attacks[0][0]: ", attacks[0][0]

# MPI world communicator and this process's rank within it.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()

コード例 #3

0

ファイルを表示

from vector_space import VectorSpace

# Toy corpus of four short sentences handed to VectorSpace.
documents = [
    "The cat in the hat disabled",
    "A cat is a fine pet ponies.",
    "Dogs and cats make good pets.",
    "I haven't got a hat.",
]
vector_space = VectorSpace(documents)

print("start")
# Query the space with the single term "cat" and show the result.
cat_matches = vector_space.search(["cat"])
print(cat_matches)
# NOTE(review): `ralated` looks like a misspelling of `related`; confirm the
# method name against the vector_space module before relying on this call.
print(vector_space.ralated(0))

コード例 #4

0

ファイルを表示

ファイル: testing.py プロジェクト: laloxxx20/FinalParalelos

from vector_space import VectorSpace


# Two sample HTTP request lines (one GET, one POST) whose parameters carry a
# URL-encoded "'; DROP TABLE usuarios; SELECT * FROM datos ..." payload.
sample_requests = [
    'GET http:id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito HTTP/1.1',
    'POST httpid=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantidad=%27%3B+DROP+TABLE+usuarios%3B+SELECT+*+FROM+datos+WHERE+nombre+LIKE+%27%25&B1=A%F1adir+al+carrito',
]
vector_space = VectorSpace(sample_requests)

# Search the indexed requests for SQL-injection keywords and print the result.
sql_keywords = ["select * from where drop table"]
print(vector_space.search(sql_keywords))

コード例 #5

0

ファイルを表示

ファイル: api.py プロジェクト: Benmckenzie96/csce-482-ml

from fastapi import FastAPI
from vector_space import VectorSpace
from org_dataset import OrgDataset
from org_recommender import OrgRecommender
from clusterer import Clusterer
from keyword_finder import KeywordFinder
from keyword_matcher import KeywordMatcher
from gcd_utils import get_account_liked_tags

# FastAPI application object; the route handler below is registered on it.
app = FastAPI()
# Module-level state, built once when this module is imported and then reused
# by the request handler.
# NOTE(review): the ".pkl" paths suggest pickled objects -- if so, only load
# files from a trusted source; confirm what load_instance does.
dataset = OrgDataset.load_instance('./orgs.pkl')
vs = VectorSpace.load_instance('./test_vs.pkl')
recommender = OrgRecommender(dataset, vs)

# Collaborators combined into the keyword matcher used by the handler.
# NOTE(review): 20 is presumably the number of clusters -- confirm against
# the Clusterer signature.
c = Clusterer(dataset, vs, 20)
kw_finder = KeywordFinder(dataset, vs)
matcher = KeywordMatcher(c, kw_finder, vs.data_centroid)

@app.get('/get_init_recs/')
async def get_init_recs(userId: str, numOrgs: int):
    """Return initial organisation recommendations for a user.

    Served at ``GET /get_init_recs/?userId=...&numOrgs=...``.

    The user's liked tags are fetched with ``get_account_liked_tags``,
    converted to a centroid by ``matcher.get_kw_centroid`` and passed, together
    with ``numOrgs``, to ``recommender.centroid_recommend``.

    Args:
        userId: Account identifier forwarded to ``get_account_liked_tags``.
        numOrgs: Forwarded to ``centroid_recommend``; presumably the number
            of organisations to return (confirm against OrgRecommender).

    Returns:
        A list with one ``{'orgId': <id>}`` dict per recommended id, in the
        order the recommender produced them.
    """
    liked_tags = get_account_liked_tags(userId)
    tag_centroid = matcher.get_kw_centroid(liked_tags)
    recommended_ids = recommender.centroid_recommend(tag_centroid, numOrgs)
    return [{'orgId': org_id} for org_id in recommended_ids]
"""Example get request for api on local host:

http://127.0.0.1:8000/get_recommendations/?userId=334614c0-7f55-11ea-b1bc-2f9730f51173&numOrgs=2

コード例 #6

0

ファイルを表示

ファイル: clustering.py プロジェクト: hbarthwal/course_projects

 def __init__(self, dataCacheFilePath):
     """Create the document source, the vector space and the vector records.

     dataCacheFilePath is handed unchanged to DocumentsGenerator.
     """
     # The generator is stored on the instance before _populateDocuments()
     # is called, exactly as in the original statement order.
     documentsGenerator = DocumentsGenerator(dataCacheFilePath)
     self._bingDocsGenerator = documentsGenerator
     documents = self._populateDocuments()
     self._vectorSpace = VectorSpace(documents)
     self._vectorsInfo = self._vectorSpace.getAllDocumentVectors()

コード例 #7

0

ファイルを表示

ファイル: clustering.py プロジェクト: hbarthwal/course_projects

class KMeans:
    """K-means clustering of document vectors, with evaluation helpers.

    Documents are fetched through ``DocumentsGenerator`` for every class in
    ``_classList`` and turned into vectors by ``VectorSpace``.

    Data layout used throughout the class:
      * ``_vectorsInfo`` -- list of dicts, each with a document ``'vector'``
        and its ground-truth ``'class'``.
      * ``_clusters`` -- dict mapping a cluster name to a dict with a
        ``'center'`` (a dict holding at least ``'vector'``), the member
        ``'vectorIndices'`` (set of positions in ``_vectorsInfo``) and,
        once a member has been assigned, ``'assignedclasses'`` (class name
        -> member count).

    The print calls use the single-argument form so that the output is the
    same on Python 2 and Python 3.
    """

    # Source of the raw documents; set in __init__.
    _bingDocsGenerator = None
    # VectorSpace built from the documents; set in __init__.
    _vectorSpace = None
    # Placeholder illustrating the record shape; replaced in __init__.
    _vectorsInfo = [{'vector':[], 'class':'texas aggies'}]
    # Ground-truth classes the documents are fetched for.
    _classList = ['texas aggies', 'texas longhorns',
                 'duke blue devils','dallas cowboys',
                 'dallas mavericks']
    # Number of clusters (k) created by _initializeClusters.
    _expectedNumberOfClusters = 6
    # Placeholder illustrating the cluster shape; replaced by
    # _initializeClusters.
    _clusters = {'cluster1' : {'vectorIndices':set(), 'center' : [],
                               'assignedclasses':{'texas aggies': 0}
                            }
                }

    def __init__(self, dataCacheFilePath):
        """Load the documents and build their vectors.

        dataCacheFilePath is passed unchanged to DocumentsGenerator.
        """
        self._bingDocsGenerator = DocumentsGenerator(dataCacheFilePath)
        self._vectorSpace = VectorSpace(self._populateDocuments())
        self._vectorsInfo =  self._vectorSpace.getAllDocumentVectors()

    def _populateDocuments(self):
        """Return the documents the generator yields for ``_classList``."""
        print('Getting documents for clustering')
        return self._bingDocsGenerator.getDocuments(self._classList)

    def _generateRandomPoint(self):
        """Return a uniformly chosen record from ``_vectorsInfo``."""
        randomIndex = randint(0, len(self._vectorsInfo) - 1)
        return self._vectorsInfo[randomIndex]

    def _getDistance(self, docVector1, docVector2):
        """Return the Euclidean distance reported by the vector space."""
        return self._vectorSpace.getEuclidianDistance(docVector1, docVector2)

    def _isContained(self, vectorList, vectorElement):
        """Return True if a record with an equal vector is in vectorList."""
        for vector in vectorList:
            if self._vectorSpace._areEqual(vector['vector'], vectorElement['vector']):
                return True
        return False

    def _initializeClusters(self):
        """Pick ``_expectedNumberOfClusters`` distinct records as centers.

        The records are visited in random order and a record is skipped when
        an equal vector was already chosen. Visiting each record at most
        once bounds the search; the previous rejection-sampling loop never
        ended when fewer distinct vectors than clusters were available.

        Raises:
            ValueError: if the requested number of clusters is smaller than
                one or larger than the number of distinct vectors.
        """
        # Function-scope import: only this method needs it.
        from random import shuffle

        print('Initializing clusters')
        wantedClusters = self._expectedNumberOfClusters
        candidateIndices = list(range(len(self._vectorsInfo)))
        shuffle(candidateIndices)

        randomCenters = []
        if wantedClusters >= 1:
            for candidateIndex in candidateIndices:
                candidate = self._vectorsInfo[candidateIndex]
                if not self._isContained(randomCenters, candidate):
                    randomCenters.append(candidate)
                if len(randomCenters) == wantedClusters:
                    break

        if wantedClusters < 1 or len(randomCenters) < wantedClusters:
            raise ValueError(
                'cannot pick %d distinct cluster centers from %d document vectors'
                % (wantedClusters, len(self._vectorsInfo)))

        tempClusters = {}
        for clusterIndex, randomCenter in enumerate(randomCenters):
            tempClusters['cluster' + str(clusterIndex)] = {
                'center' : randomCenter, 'vectorIndices' : set()}
        self._clusters = tempClusters

    def _calculateRSS(self):
        """Return the summed member-to-center distance divided by the
        number of clusters.

        NOTE(review): despite the name, the distances are not squared here;
        kept as is because callers only print the value.
        """
        distanceSum = 0
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            clusterCenter = cluster['center']['vector']
            for vectorIndex in cluster['vectorIndices']:
                distanceSum += self._getDistance(
                    clusterCenter, self._vectorsInfo[vectorIndex]['vector'])
        return distanceSum / len(self._clusters)

    def _findClosestCluster(self, docVector):
        """Return the name of the cluster whose center is nearest to
        docVector (the first one in iteration order wins a tie)."""
        distanceList = []
        for clusterName in self._clusters:
            clusterCenter = self._clusters[clusterName]['center']['vector']
            distanceList.append(
                (clusterName, self._getDistance(clusterCenter, docVector)))
        return min(distanceList, key=itemgetter(1))[0]

    def _assignDocVectorToCluster(self, docVectorIndex, clusterName):
        """Add the vector to the cluster and count its ground-truth class."""
        cluster = self._clusters[clusterName]
        cluster['vectorIndices'].add(docVectorIndex)
        docVectorClass = self._vectorsInfo[docVectorIndex]['class']
        assignedClasses = cluster.setdefault('assignedclasses', {})
        assignedClasses[docVectorClass] = assignedClasses.get(docVectorClass, 0) + 1

    def _calculateCentroids(self):
        """Move every non-empty cluster's center to its members' centroid.

        Empty clusters keep their previous center.
        """
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            vectors = [self._vectorsInfo[vectorIndex]['vector']
                       for vectorIndex in cluster['vectorIndices']]
            if len(vectors) > 0:
                centroid = self._vectorSpace.getCentroid(vectors)
                cluster['center'] = {'vector': centroid}

    def _removeEmptyClusters(self):
        """Drop every cluster that currently has no members."""
        emptyClusterNames = [clusterName for clusterName in self._clusters
                             if len(self._clusters[clusterName]['vectorIndices']) == 0]
        for emptyClusterName in emptyClusterNames:
            self._clusters.pop(emptyClusterName)

    def _clearClusterMembers(self):
        """Forget all memberships and class counts; centers are kept."""
        for clusterName in self._clusters:
            cluster = self._clusters[clusterName]
            cluster['vectorIndices'].clear()
            # A cluster that never received a member has no 'assignedclasses'
            # entry yet; setdefault avoids a KeyError for it.
            cluster.setdefault('assignedclasses', {}).clear()

    def clusterPoints(self):
        """Run K-means until the assignment of vectors to clusters is stable.

        Every iteration assigns each vector to its closest cluster, prints
        the per-cluster class counts, Rand index, RSS and purity, and then
        recomputes the centroids. The method returns (with the memberships
        left in ``_clusters``) as soon as an iteration reproduces the previous
        iteration's memberships; previously the loop had no exit at all.

        A run that has not stabilised after the iteration budget is
        discarded and restarted from fresh random centers, as before.
        """
        print('Clustering Points')
        self._initializeClusters()
        iterCount = 0
        previousAssignment = None
        while True:
            # Assign all vectors to clusters
            for vectorIndex, vectorInfo in enumerate(self._vectorsInfo):
                closestClusterName = self._findClosestCluster(vectorInfo['vector'])
                self._assignDocVectorToCluster(vectorIndex, closestClusterName)
            print('Iteration---')
            for clusterName in self._clusters:
                assignedClasses = self._clusters[clusterName].get('assignedclasses', {})
                print('%s ---> %s' % (clusterName, assignedClasses))

            RI = self.getRandIndex()
            print('RI :  %s' % (RI,))
            RSS = self._calculateRSS()
            print('RSS: %s' % (RSS,))
            purity = self.getPurity()
            print('Purity is %s' % (purity,))

            # Unchanged memberships mean the centroids would not move either,
            # so further iterations cannot change the result.
            currentAssignment = dict(
                (clusterName, frozenset(cluster['vectorIndices']))
                for clusterName, cluster in self._clusters.items())
            if currentAssignment == previousAssignment:
                return

            if iterCount > 10:
                print('Restarting !!----------------------------------------------------------------------')
                self._initializeClusters()
                iterCount = 0
                previousAssignment = None
                continue

            previousAssignment = currentAssignment
            self._calculateCentroids()
            self._clearClusterMembers()
            iterCount += 1

    def _getVectorClassCounts(self):
        """Return {cluster name: {class name: member count}}.

        Every class in ``_classList`` gets an entry (possibly 0); members
        whose class is not in ``_classList`` are not counted.
        """
        vectorCountDict = {}
        for clusterName in self._clusters:
            classCounts = dict((className, 0) for className in self._classList)
            # One pass over the members instead of one pass per class.
            for vectorIndex in self._clusters[clusterName]['vectorIndices']:
                className = self._vectorsInfo[vectorIndex]['class']
                if className in classCounts:
                    classCounts[className] += 1
            vectorCountDict[clusterName] = classCounts
        return vectorCountDict

    def getMaxCountClass(self, vectorCountDict, clusterName):
        """Return the largest class count of the cluster (0 if it has none)."""
        classCounts = vectorCountDict[clusterName]
        # The leading 0 keeps the result at 0 for an empty count dict.
        return max([0] + list(classCounts.values()))

    def _belongToSameCluster(self, vectorIndex1, vectorIndex2):
        """Return True if some cluster contains both vector indices."""
        for clusterName in self._clusters:
            vectorIndices = self._clusters[clusterName]['vectorIndices']
            if vectorIndex1 in vectorIndices and vectorIndex2 in vectorIndices:
                return True
        return False

    def getPurity(self):
        """Return the purity of the current clustering.

        Purity is the sum of each cluster's largest class count divided by
        the number of vectors. Returns 0.0 when there are no vectors.
        """
        print('Calculating purity')
        totalVectors = len(self._vectorsInfo)
        if totalVectors == 0:
            return 0.0
        vectorCountDict = self._getVectorClassCounts()
        maxCountSum = 0
        for clusterName in vectorCountDict:
            maxCountSum += self.getMaxCountClass(vectorCountDict, clusterName)
        return maxCountSum / float(totalVectors)

    def getRandIndex(self):
        """Return the Rand index of the current clustering.

        Every unordered pair of vectors is classified as a true/false
        positive/negative by comparing "same ground-truth class" with "same
        cluster"; the result is (TP + TN) / number of pairs. Visiting each
        pair once gives the same ratio as visiting both orders and halves the
        work; the printed counts are per unordered pair.

        Returns 1.0 when there are fewer than two vectors: with no pairs
        there is nothing the clustering can disagree on.
        """
        # Function-scope import: only this method needs it.
        from itertools import combinations

        print('Calculating Rand Index')
        falsePositivesCount = 0
        falseNegativesCount = 0
        truePositivesCount = 0
        trueNegativesCount = 0
        pairCount = 0
        for vectorIndex1, vectorIndex2 in combinations(range(len(self._vectorsInfo)), 2):
            pairCount += 1
            haveSameClass = (self._vectorsInfo[vectorIndex1]['class']
                             == self._vectorsInfo[vectorIndex2]['class'])
            haveSameCluster = self._belongToSameCluster(vectorIndex1, vectorIndex2)
            if haveSameClass and haveSameCluster:
                truePositivesCount += 1
            elif haveSameClass:
                falseNegativesCount += 1
            elif haveSameCluster:
                falsePositivesCount += 1
            else:
                trueNegativesCount += 1
        print('Number of iterations : %s' % (pairCount,))
        print('TN: %s  TP:  %s' % (trueNegativesCount, truePositivesCount))
        print('FP:  %s FN:  %s' % (falsePositivesCount, falseNegativesCount))
        if pairCount == 0:
            return 1.0
        return float(truePositivesCount + trueNegativesCount) / pairCount

コード例 #8

0

ファイルを表示

ファイル: single_process.py プロジェクト: laloxxx20/FinalParalelos

                        (line.find('GET http') == -1),
                        (line.find('POST') == -1)):
                request = request + line
                line = txt.readline()
                if line == '':
                    break
            list_request.append(request)
            request = ''
    # print len(list_request)
    return list_request


# list_request = get_requests('anomalousTrafficTest.txt')
# Load the requests from the test file and index them in one vector space.
list_request = get_requests('testing_attacks.txt')
vector_space = VectorSpace(list_request)

# (output label, keyword query) pairs; searched and printed in this order.
attack_queries = (
    ("list sql: ",
     "select * from where drop table statement odbc union"),
    ("list command execution: ",
     "dir c /winnt/system32/cmd.exe"),
    ("list path traversal: ",
     "virtual include file"),
    ("list SSI Atack: ",
     "virtual include statement odbc progra"),
    ("list XPATH injection: ",
     "path count child text position comment"),
    ("list Cross Site Scripting: ",
     "document.cookie alert javascript document.location.replace url http "),
    ("list LDAP Injection: ",
     "had* objectclass *o  brien* netscaperoot"),
)

for label, query in attack_queries:
    # "<label> <result>" reproduces the space that the two-item print
    # statement placed between the label and the search result.
    print("%s %s" % (label, vector_space.search([query])))

コード例 #9

0

ファイルを表示

ファイル: app.py プロジェクト: agune/kisa

from vector_space import VectorSpace

# Four short example sentences used as the corpus.
corpus = [
    "The cat in the hat disabled",
    "A cat is a fine pet ponies.",
    "Dogs and cats make good pets.",
    "I haven't got a hat.",
]
vector_space = VectorSpace(corpus)

print("start")
# Search the corpus for the term "cat".
search_result = vector_space.search(["cat"])
print(search_result)
# NOTE(review): `ralated` is probably meant to be `related`; verify the
# method name in the vector_space module.
related_result = vector_space.ralated(0)
print(related_result)