Example #1
def anomalyDetection(projectRoot, nmfCluster):
    # For each NMF cluster, score the contained datapoints and return them
    # sorted by descending anomaly (gamma) score. projectRoot is only needed
    # by the commented-out edit-distance variant below.
    import numpy
    from sklearn.metrics.pairwise import pairwise_distances

    anomalyCalculator = AnomalyCalculator()
    anomalyResults = []

    numberOfClusters = nmfCluster.getNumberOfClusters()
    for clusterId in range(numberOfClusters):
        (clusterMatrixLabels, clusterMatrix) = nmfCluster.getLowerDimensionalMatrixForCluster(clusterId)

        # editDistances = distanceMatrixFromLocations(projectRoot, clusterMatrixLabels)
        # if editDistances == None: continue

        nDatapointsInCluster = clusterMatrix.shape[1]
        if clusterMatrix.shape[0] > 0 and clusterMatrix.shape[1] > 0:
            averageDistanceInCluster = numpy.mean(pairwise_distances(clusterMatrix.T))
        else:
            averageDistanceInCluster = 0

        (gammaScores, zetaScores) = anomalyCalculator.anomalyAnalysis(clusterMatrix, nDatapointsInCluster)
        # (gammaScores, zetaScores) = anomalyCalculator.analyzeDistanceMatrix(editDistances, k=100)

        # distToPrototype = anomalyCalculator.dist2Prototype(nmfCluster.getPrototype(clusterId) , clusterMatrix)
        # D = [(gammaScores[i], zetaScores[i], distToPrototype[i], clusterMatrixLabels[i]) for i in range(len(clusterMatrixLabels))]
        # One entry per datapoint: gamma score (primary sort key), the cluster's
        # average pairwise distance, a zero placeholder and the datapoint label.
        D = [
            (gammaScores[i], averageDistanceInCluster, 0, clusterMatrixLabels[i])
            for i in range(len(clusterMatrixLabels))
        ]
        # D = [(gammaScores[i], clusterMatrixLabels[i]) for i in range(len(clusterMatrixLabels))]
        D.sort(reverse=True)
        anomalyResults.append(D)

    return anomalyResults
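AnomalyCalculator.anomalyAnalysis is not shown in these examples. Judging by the gamma computation in Example #3 below (the sum of a datapoint's k nearest-neighbour distances divided by k), the gammaScores are plausibly mean k-nearest-neighbour distances. A self-contained sketch of such a score for the columns of a cluster matrix, with an illustrative function name and k, could look like this:

import numpy
from sklearn.metrics.pairwise import pairwise_distances

def gammaScoresSketch(clusterMatrix, k=10):
    # Mean distance of each datapoint (column) to its k nearest neighbours.
    D = pairwise_distances(clusterMatrix.T)     # n x n distance matrix
    numpy.fill_diagonal(D, numpy.inf)           # a point is not its own neighbour
    k = min(k, D.shape[0] - 1)
    nearest = numpy.sort(D, axis=1)[:, :k]      # k smallest distances per row
    return nearest.mean(axis=1)                 # one score per datapoint

# Five datapoints living in a 3-dimensional cluster matrix.
X = numpy.random.rand(3, 5)
print(gammaScoresSketch(X, k=2))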
Example #3
def relevancyWeighting(checkVectors, featureDir):
    import pickle
    import numpy
    from scipy.spatial.distance import squareform

    k = 20

    termDocMatrix = pickle.load(open(featureDir + 'termDocMatrix.pickl', 'rb'))
    functionLocations = termDocMatrix.index2Doc

    # Using Euclidean distances here does not make much sense; it should be
    # L1, but L1 cannot be computed on the sparse matrices for now.
    D = squareform(pickle.load(open(featureDir + 'D_euclidean.pickl', 'rb')))
    anomalyCalculator = AnomalyCalculator()
    (NNV, NNI) = anomalyCalculator.calculateNearestNeighbours(k, D)

    WDict = NameToDictMap()
    for i in xrange(len(functionLocations)):

        location = functionLocations[i]
        if location not in checkVectors.d:
            continue

        WDict.d[location] = checkVectors.d[location]

        # Column i of NNI holds the indices, and column i of NNV the distances,
        # of the k nearest neighbours of datapoint i; gamma is its mean k-NN distance.
        indices = NNI[:, i]
        gamma = float(numpy.sum(NNV[:, i])) / k
        locations = [functionLocations[j] for j in indices]
        V = [checkVectors.d[l] for l in locations if l in checkVectors.d]
        distances = [
            NNV[j, i] for j in xrange(len(locations))
            if locations[j] in checkVectors.d
        ]

        # len(V) may differ from k if at least one of the nearest neighbours has
        # no checks. Such a neighbour is a null vector, so it is implicitly
        # included in the mean calculation.
        meanVector = {}
        for (v, d) in zip(V, distances):

            for (name, score) in v.iteritems():
                try:
                    meanVector[name] += (1 - d) * (float(score) / k)
                except KeyError:
                    meanVector[name] = (1 - d) * (float(score) / k)

        for (name, score) in checkVectors.d[location].iteritems():
            if name in meanVector:
                score -= meanVector[name]
                if score < 0:
                    score = 0
                WDict.setItem(name, location, score)
    return WDict
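The weighting step above can be illustrated in isolation: every check score of a function is reduced by the distance-weighted mean of the same check among its k nearest neighbours, and clipped at zero. A self-contained sketch with made-up check names and numbers:

k = 2
# Check vectors of the two nearest neighbours and their distances to the function.
V = [{'null_check': 1.0}, {'null_check': 1.0, 'len_check': 1.0}]
distances = [0.1, 0.4]

meanVector = {}
for v, d in zip(V, distances):
    for name, score in v.items():
        meanVector[name] = meanVector.get(name, 0.0) + (1 - d) * (float(score) / k)

# A check that all neighbours also perform ends up with a small residual score,
# while a check the neighbours never perform keeps its full score.
own = {'null_check': 1.0, 'free_check': 1.0}
weighted = {name: max(score - meanVector.get(name, 0.0), 0)
            for name, score in own.items()}
print(weighted)   # null_check is damped by the neighbourhood mean; free_check stays at 1.0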
Example #5
    def anomalyDetection(self, nmfCluster):

        anomalyCalculator = AnomalyCalculator()

        docs = nmfCluster.H
        nDocs = docs.shape[1]
        labels = nmfCluster.labels

        # scores = anomalyCalculator.anomalyAnalysis(docs, nDocs)
        # Scores come from the NMF error vectors rather than from anomalyAnalysis.
        scores = self.determineNMFErrorVecs(nmfCluster)
        anomalyResults = [(scores[i], labels[i]) for i in range(nDocs)]
        anomalyResults.sort(reverse=True)

        return anomalyResults
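determineNMFErrorVecs is also not shown. One plausible reading is that it scores every document (a column of H) by the reconstruction error of the NMF factorisation, i.e. how poorly W·H reproduces the corresponding column of the original term-document matrix. The sketch below is an assumption along those lines, not the project's implementation; X, W and H are illustrative:

import numpy

def nmfErrorVecsSketch(X, W, H):
    # Per-document (per-column) reconstruction error of an NMF factorisation.
    residual = X - numpy.dot(W, H)
    return numpy.linalg.norm(residual, axis=0)

# Tiny example: 4 terms, 3 documents, rank-2 factorisation.
X = numpy.abs(numpy.random.rand(4, 3))
W = numpy.abs(numpy.random.rand(4, 2))
H = numpy.abs(numpy.random.rand(2, 3))
scores = nmfErrorVecsSketch(X, W, H)
labels = ['doc0', 'doc1', 'doc2']
print(sorted(zip(scores, labels), reverse=True))   # most anomalous first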
Example #6
    def determineAnomaliesFromDistanceMatrix(self, anomalyScore, k):
        anomalyCalculator = AnomalyCalculator()
        scores = anomalyCalculator.analyzeDistanceMatrix(self.D, anomalyScore, k)
        return scores
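analyzeDistanceMatrix is likewise not shown. Assuming it derives a k-nearest-neighbour score directly from a precomputed square distance matrix self.D, with anomalyScore selecting the kind of score, a minimal sketch might be (the 'gamma' name mirrors the scores used in the other examples; everything else is illustrative):

import numpy
from scipy.spatial.distance import pdist, squareform

def analyzeDistanceMatrixSketch(D, anomalyScore='gamma', k=10):
    # Score every datapoint from a precomputed square distance matrix.
    D = numpy.array(D, dtype=float)
    numpy.fill_diagonal(D, numpy.inf)
    nearest = numpy.sort(D, axis=1)[:, :min(k, D.shape[0] - 1)]
    if anomalyScore == 'gamma':
        return nearest.mean(axis=1)      # mean k-NN distance, as in the other examples
    raise ValueError('unknown anomaly score: %s' % anomalyScore)

# Six random 2-d points; build the square distance matrix with scipy.
points = numpy.random.rand(6, 2)
D = squareform(pdist(points))
print(analyzeDistanceMatrixSketch(D, 'gamma', k=3))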