def anomalyDetection(projectRoot, nmfCluster):
    """Score the members of every cluster and rank them within the cluster.

    For each cluster id in ``range(nmfCluster.getNumberOfClusters())`` the
    cluster's lower-dimensional matrix is fetched, the mean pairwise distance
    between its columns is computed, and
    ``AnomalyCalculator.anomalyAnalysis`` is asked for per-member scores.

    Args:
        projectRoot: Not used by the current implementation (only the
            disabled edit-distance variant referenced it); kept so existing
            callers keep working.
        nmfCluster: Object providing ``getNumberOfClusters()`` and
            ``getLowerDimensionalMatrixForCluster(clusterId)``, the latter
            returning a ``(labels, matrix)`` pair.

    Returns:
        A list with one entry per cluster. Each entry is a list of
        ``(gammaScore, averageDistanceInCluster, 0, label)`` tuples sorted in
        descending tuple order.
    """
    from sklearn.metrics.pairwise import pairwise_distances

    anomalyCalculator = AnomalyCalculator()
    anomalyResults = []

    for clusterId in range(nmfCluster.getNumberOfClusters()):
        (clusterMatrixLabels, clusterMatrix) = \
            nmfCluster.getLowerDimensionalMatrixForCluster(clusterId)

        # Datapoints are the columns of the matrix, hence the transpose
        # below (pairwise_distances treats rows as samples).
        nDatapointsInCluster = clusterMatrix.shape[1]

        if clusterMatrix.shape[0] > 0 and clusterMatrix.shape[1] > 0:
            # Mean over the complete distance matrix, diagonal included.
            averageDistanceInCluster = numpy.mean(
                pairwise_distances(clusterMatrix.T))
        else:
            averageDistanceInCluster = 0

        # anomalyAnalysis returns (gammaScores, zetaScores); only the gamma
        # scores are reported.
        gammaScores, _ = anomalyCalculator.anomalyAnalysis(
            clusterMatrix, nDatapointsInCluster)

        # The third slot is a constant 0 placeholder; a disabled variant of
        # this code stored a distance-to-prototype value there.
        D = [
            (gammaScores[i], averageDistanceInCluster, 0, label)
            for i, label in enumerate(clusterMatrixLabels)
        ]
        D.sort(reverse=True)
        anomalyResults.append(D)

    return anomalyResults
def anomalyDetection(projectRoot, nmfCluster):
    """Score the members of every cluster and rank them within the cluster.

    For each cluster id in ``range(nmfCluster.getNumberOfClusters())`` the
    cluster's lower-dimensional matrix is fetched, the mean pairwise distance
    between its columns is computed, and
    ``AnomalyCalculator.anomalyAnalysis`` is asked for per-member scores.

    Args:
        projectRoot: Not used by the current implementation (only the
            disabled edit-distance variant referenced it); kept so existing
            callers keep working.
        nmfCluster: Object providing ``getNumberOfClusters()`` and
            ``getLowerDimensionalMatrixForCluster(clusterId)``, the latter
            returning a ``(labels, matrix)`` pair.

    Returns:
        A list with one entry per cluster. Each entry is a list of
        ``(gammaScore, averageDistanceInCluster, 0, label)`` tuples sorted in
        descending tuple order.
    """
    from sklearn.metrics.pairwise import pairwise_distances

    anomalyCalculator = AnomalyCalculator()
    anomalyResults = []

    for clusterId in range(nmfCluster.getNumberOfClusters()):
        (clusterMatrixLabels, clusterMatrix) = \
            nmfCluster.getLowerDimensionalMatrixForCluster(clusterId)

        # Datapoints are the columns of the matrix, hence the transpose
        # below (pairwise_distances treats rows as samples).
        nDatapointsInCluster = clusterMatrix.shape[1]

        if clusterMatrix.shape[0] > 0 and clusterMatrix.shape[1] > 0:
            # Mean over the complete distance matrix, diagonal included.
            averageDistanceInCluster = numpy.mean(
                pairwise_distances(clusterMatrix.T))
        else:
            averageDistanceInCluster = 0

        # anomalyAnalysis returns (gammaScores, zetaScores); only the gamma
        # scores are reported.
        gammaScores, _ = anomalyCalculator.anomalyAnalysis(
            clusterMatrix, nDatapointsInCluster)

        # The third slot is a constant 0 placeholder; a disabled variant of
        # this code stored a distance-to-prototype value there.
        D = [
            (gammaScores[i], averageDistanceInCluster, 0, label)
            for i, label in enumerate(clusterMatrixLabels)
        ]
        D.sort(reverse=True)
        anomalyResults.append(D)

    return anomalyResults
def relevancyWeighting(checkVectors, featureDir, k=20):
    """Reweight check scores by subtracting the neighbourhood's mean vector.

    For every function location that has an entry in ``checkVectors.d``, the
    check vectors of its ``k`` nearest neighbours are combined into a mean
    vector (each neighbour weighted by ``1 - distance``). That mean is
    subtracted from the location's own scores, with results below zero
    clamped to zero.

    Args:
        checkVectors: Object whose ``d`` attribute maps a location to a dict
            of ``name -> score``.
        featureDir: Directory prefix (concatenated as-is, so it must end with
            a path separator) containing ``termDocMatrix.pickl`` and
            ``D_euclidean.pickl``.
        k: Number of nearest neighbours to consider. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        A ``NameToDictMap`` holding the reweighted scores.
    """
    from scipy.spatial.distance import squareform

    # NOTE: pickle.load executes arbitrary code from the file; featureDir
    # must only ever point at trusted, locally generated data.
    with open(featureDir + 'termDocMatrix.pickl', 'rb') as stream:
        termDocMatrix = pickle.load(stream)
    functionLocations = termDocMatrix.index2Doc

    # It doesn't make much sense that Euclidean distances are used here; it
    # should be L1, but L1 cannot be calculated on the sparse matrices for
    # now.
    with open(featureDir + 'D_euclidean.pickl', 'rb') as stream:
        D = squareform(pickle.load(stream))

    anomalyCalculator = AnomalyCalculator()
    (NNV, NNI) = anomalyCalculator.calculateNearestNeighbours(k, D)

    WDict = NameToDictMap()
    for i in range(len(functionLocations)):
        location = functionLocations[i]
        if location not in checkVectors.d:
            continue

        # NOTE(review): this stores the caller's own dict, not a copy. If
        # NameToDictMap.setItem writes into WDict.d[location], checkVectors
        # is modified in place and later iterations see the reweighted
        # scores -- confirm that this is intended.
        WDict.d[location] = checkVectors.d[location]

        # Neighbours without checks are skipped, but the sum is still
        # divided by k: they count as null vectors in the mean.
        meanVector = {}
        for j, neighbourIndex in enumerate(NNI[:, i]):
            neighbour = functionLocations[neighbourIndex]
            if neighbour not in checkVectors.d:
                continue
            d = NNV[j, i]
            for (name, score) in checkVectors.d[neighbour].items():
                meanVector[name] = (meanVector.get(name, 0.0)
                                    + (1 - d) * (float(score) / k))

        # Iterate over a snapshot so setItem may write to the (possibly
        # shared) dict while we loop.
        for (name, score) in list(checkVectors.d[location].items()):
            if name in meanVector:
                score -= meanVector[name]
                # NOTE(review): the clamp is applied only after a
                # subtraction; confirm raw scores are never negative.
                if score < 0:
                    score = 0
            WDict.setItem(name, location, score)

    return WDict
def relevancyWeighting(checkVectors, featureDir, k=20):
    """Reweight check scores by subtracting the neighbourhood's mean vector.

    For every function location that has an entry in ``checkVectors.d``, the
    check vectors of its ``k`` nearest neighbours are combined into a mean
    vector (each neighbour weighted by ``1 - distance``). That mean is
    subtracted from the location's own scores, with results below zero
    clamped to zero.

    Args:
        checkVectors: Object whose ``d`` attribute maps a location to a dict
            of ``name -> score``.
        featureDir: Directory prefix (concatenated as-is, so it must end with
            a path separator) containing ``termDocMatrix.pickl`` and
            ``D_euclidean.pickl``.
        k: Number of nearest neighbours to consider. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        A ``NameToDictMap`` holding the reweighted scores.
    """
    from scipy.spatial.distance import squareform

    # NOTE: pickle.load executes arbitrary code from the file; featureDir
    # must only ever point at trusted, locally generated data.
    with open(featureDir + 'termDocMatrix.pickl', 'rb') as stream:
        termDocMatrix = pickle.load(stream)
    functionLocations = termDocMatrix.index2Doc

    # It doesn't make much sense that Euclidean distances are used here; it
    # should be L1, but L1 cannot be calculated on the sparse matrices for
    # now.
    with open(featureDir + 'D_euclidean.pickl', 'rb') as stream:
        D = squareform(pickle.load(stream))

    anomalyCalculator = AnomalyCalculator()
    (NNV, NNI) = anomalyCalculator.calculateNearestNeighbours(k, D)

    WDict = NameToDictMap()
    for i in range(len(functionLocations)):
        location = functionLocations[i]
        if location not in checkVectors.d:
            continue

        # NOTE(review): this stores the caller's own dict, not a copy. If
        # NameToDictMap.setItem writes into WDict.d[location], checkVectors
        # is modified in place and later iterations see the reweighted
        # scores -- confirm that this is intended.
        WDict.d[location] = checkVectors.d[location]

        # Neighbours without checks are skipped, but the sum is still
        # divided by k: they count as null vectors in the mean.
        meanVector = {}
        for j, neighbourIndex in enumerate(NNI[:, i]):
            neighbour = functionLocations[neighbourIndex]
            if neighbour not in checkVectors.d:
                continue
            d = NNV[j, i]
            for (name, score) in checkVectors.d[neighbour].items():
                meanVector[name] = (meanVector.get(name, 0.0)
                                    + (1 - d) * (float(score) / k))

        # Iterate over a snapshot so setItem may write to the (possibly
        # shared) dict while we loop.
        for (name, score) in list(checkVectors.d[location].items()):
            if name in meanVector:
                score -= meanVector[name]
                # NOTE(review): the clamp is applied only after a
                # subtraction; confirm raw scores are never negative.
                if score < 0:
                    score = 0
            WDict.setItem(name, location, score)

    return WDict
def anomalyDetection(self, nmfCluster):
    """Rank the documents of an NMF clustering by their score.

    The scores come from ``self.determineNMFErrorVecs(nmfCluster)``; the
    number of documents is the column count of ``nmfCluster.H``.

    Args:
        nmfCluster: Object exposing ``H`` (anything with a two-dimensional
            ``shape``) and ``labels`` (indexable, one label per column).

    Returns:
        A list of ``(score, label)`` tuples in descending tuple order.
    """
    nDocs = nmfCluster.H.shape[1]
    labels = nmfCluster.labels

    # A disabled alternative obtained the scores from
    # AnomalyCalculator.anomalyAnalysis(nmfCluster.H, nDocs) instead.
    scores = self.determineNMFErrorVecs(nmfCluster)

    return sorted(
        [(scores[i], labels[i]) for i in range(nDocs)],
        reverse=True,
    )
def determineAnomaliesFromDistanceMatrix(self, anomalyScore, k):
    """Run ``AnomalyCalculator.analyzeDistanceMatrix`` on ``self.D``.

    Args:
        anomalyScore: Forwarded unchanged as the second argument.
        k: Forwarded unchanged as the third argument (presumably the
            neighbourhood size -- confirm against AnomalyCalculator).

    Returns:
        Whatever ``analyzeDistanceMatrix`` returns.
    """
    return AnomalyCalculator().analyzeDistanceMatrix(self.D, anomalyScore, k)
def determineAnomaliesFromDistanceMatrix(self, anomalyScore, k):
    """Run ``AnomalyCalculator.analyzeDistanceMatrix`` on ``self.D``.

    Args:
        anomalyScore: Forwarded unchanged as the second argument.
        k: Forwarded unchanged as the third argument (presumably the
            neighbourhood size -- confirm against AnomalyCalculator).

    Returns:
        Whatever ``analyzeDistanceMatrix`` returns.
    """
    return AnomalyCalculator().analyzeDistanceMatrix(self.D, anomalyScore, k)