def _termDictsFromContext(self, context, symbol):
        """
        Build one expression vector per function location for the given
        symbol, over the context's neighbours (including the origin), plus a
        global occurrence counter over all expressions. Returns
        (vecs, allNgrams), or None if nothing was collected.
        """
        vecs = NameToDictMap()
        allNgrams = OccurrenceCounter()

        # temporarily treat the origin as one of its own neighbours
        context.neighbours.append(context.origin)

        for neighbour in context.neighbours:
            nOcc = neighbour.nOccurrences
            location = neighbour.location
            expressions = self.treeToExprConverter.getExpressionsForSymbol(
                location, symbol)
            # expressions.append('@+$_+@')
            # expressions.append('@+EXPR@+$_+@+@')

            # print 'FOO %s: %s: %s' % (symbol, location, expressions)

            neighbour.setExpressions(expressions)

            # add a null-vector for the function if it contains no expressions
            if len(expressions) == 0:
                vecs.add(None, location)

            for expr in expressions:
                # vecs.add(expr, location, 1.0/nOcc)
                # vecs.add(expr, location, 1.0)
                vecs.setItem(expr, location, 1.0)
                allNgrams.add(expr)

        context.neighbours.pop()

        if len(vecs.d) == 0 or len(allNgrams.d) == 0:
            return None

        return (vecs, allNgrams)

class SinkMatrixCreator:
    """
    Builds term-document matrices over the functions that call a given sink.
    """

    def __init__(self, projectRoot):
        self.projectRoot = projectRoot
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq(
            ['%s%s' % (self.programDir, c[1]) for c in callsToSink])

        return self.createMatrixForFunctionNames(functionNames)

    """
    This operation looses TF-IDF. I don't think that's the way to go.
    """

    def createMatrixForFunctionNames(self, functionNames):
        self._loadFunc2SubtreesMap()

        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()

        functions = [(doc, self.func2SubtreesMap.d[doc])
                     for doc in functionNames]

        for (doc, func) in functions:

            # repeat each n-gram once per occurrence, i.e. raw term counts
            for (ngram, nOccurrences) in func.iteritems():
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)

        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                             self.allSymbolsDict)
        newTermDocMatrix = nameDictMapToMatrix.termDocumentMatrix
        return newTermDocMatrix

    def _loadFunc2SubtreesMap(self):
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        with open(filename, 'rb') as f:
            self.func2SubtreesMap = pickle.load(f)

    def uniq(self, seq, idfun=None):
        # order preserving
        if idfun is None:

            def idfun(x):
                return x

        seen = {}
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen: continue
            seen[marker] = 1
            result.append(item)
        return result
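
# The note above observes that repeating every n-gram by its raw count drops
# any TF-IDF weighting. A minimal sketch of how a count matrix could be
# re-weighted afterwards with scikit-learn; the 'counts' array is placeholder
# data, not the project's termDocumentMatrix type.
import numpy
from sklearn.feature_extraction.text import TfidfTransformer

counts = numpy.array([[3, 0, 1],
                      [2, 0, 0],
                      [0, 5, 2]])  # rows: functions (documents), columns: n-grams
tfidfMatrix = TfidfTransformer().fit_transform(counts)  # sparse TF-IDF matrix
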
def calculateCheckVectors(WFuncs, CFuncs, F, binary=True, alpha=1, weighByF=False):
    """
    For every function, score each of its symbols: 0 if the symbol is never
    checked anywhere (not in F), occurrences*w if it is checked elsewhere but
    not in this function, and (occurrences - alpha*nChecks)*w if it is also
    checked here. With weighByF, w is the symbol's weight F[s], otherwise 1.0.
    """
    WDict = NameToDictMap()
    for (functionLocation, symbols) in WFuncs.d.iteritems():

        if functionLocation not in CFuncs.d:
            # The function does not contain any check, so projected onto the
            # check-space it is the null-vector.
            WDict.d[functionLocation] = {}
            continue

        CFunc = CFuncs.d[functionLocation]

        for (s, occurrences) in symbols.iteritems():
            if binary: occurrences = 1

            if s not in F:
                # This symbol is never checked
                WDict.setItem(s, functionLocation, 0)
            elif s in CFunc:
                w = F[s] if weighByF else 1.0
                nChecks = 1 if binary else CFunc[s]
                WDict.setItem(s, functionLocation,
                              (occurrences - alpha * nChecks) * w)
            else:
                w = F[s] if weighByF else 1.0
                WDict.setItem(s, functionLocation, occurrences * w)
    return WDict
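
# A self-contained sketch of the per-symbol scoring rule used above, with
# plain dicts instead of the project's NameToDictMap; the symbol names,
# weights and counts are made up for illustration only.
def _checkScore(occurrences, s, F, CFunc, binary=True, alpha=1, weighByF=False):
    if binary:
        occurrences = 1
    if s not in F:
        return 0                       # the symbol is never checked anywhere
    w = F[s] if weighByF else 1.0
    if s in CFunc:
        nChecks = 1 if binary else CFunc[s]
        return (occurrences - alpha * nChecks) * w
    return occurrences * w             # used here, but not checked here


# 'len' is used and checked in this function -> score 0;
# 'buf' is used but never checked in this function -> score 1.
assert _checkScore(1, 'len', F={'len': 0.5, 'buf': 0.9}, CFunc={'len': 1}) == 0
assert _checkScore(1, 'buf', F={'len': 0.5, 'buf': 0.9}, CFunc={'len': 1}) == 1
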
def relevancyWeighting(checkVectors, featureDir):
    """
    Down-weight check scores that are also common among a function's k
    nearest neighbours: subtract the distance-weighted mean score of the
    neighbours from the function's own scores, clamping at zero.
    """
    k = 20

    termDocMatrix = pickle.load(open(featureDir + 'termDocMatrix.pickl', 'rb'))
    functionLocations = termDocMatrix.index2Doc

    # It doesn't make much sense to use Euclidean distances here; it should
    # be L1, but I can't calculate L1 on the sparse matrices for now (see the
    # sketch after this function).
    from scipy.spatial.distance import squareform
    D = squareform(pickle.load(open(featureDir + 'D_euclidean.pickl', 'rb')))
    anomalyCalculator = AnomalyCalculator()
    (NNV, NNI) = anomalyCalculator.calculateNearestNeighbours(k, D)

    WDict = NameToDictMap()
    for i in xrange(len(functionLocations)):

        location = functionLocations[i]
        if location not in checkVectors.d:
            continue

        WDict.d[location] = checkVectors.d[location]

        indices = NNI[:, i]
        gamma = float(numpy.sum(NNV[:, i])) / k  # currently unused
        locations = [functionLocations[j] for j in indices]
        V = [checkVectors.d[l] for l in locations if l in checkVectors.d]
        distances = [
            NNV[j, i] for j in xrange(len(locations))
            if locations[j] in checkVectors.d
        ]

        # len(V) may differ from k if at least one of the nearest neighbours
        # has no checks. Such a neighbour is a null-vector, so it is
        # implicitly included in the mean calculation.
        meanVector = {}
        for (v, d) in zip(V, distances):

            for (name, score) in v.iteritems():
                try:
                    meanVector[name] += (1 - d) * (float(score) / k)
                except KeyError:
                    meanVector[name] = (1 - d) * (float(score) / k)

        for (name, score) in checkVectors.d[location].iteritems():
            if name in meanVector:
                score -= meanVector[name]
                if score < 0: score = 0
                WDict.setItem(name, location, score)
    return WDict
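
# The comment in relevancyWeighting notes that L1 would be preferable to
# Euclidean distances. A minimal sketch of computing an L1 (cityblock)
# distance matrix by densifying first, assuming the matrix fits in memory;
# 'X' is placeholder data, not the project's term-document matrix type.
import numpy
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform

X = csr_matrix(numpy.random.rand(5, 3))  # one row per function (placeholder)
D_l1 = squareform(pdist(X.toarray(), metric='cityblock'))
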
class FeatureArray(object):
    """
    Collects one feature vector per label (as string items) together with a
    global occurrence counter over all items. A label added with an empty
    item list is stored as a null-vector.
    """

    def __init__(self):
        self.vecs = NameToDictMap()
        self.allSymbols = OccurrenceCounter()

    def add(self, label, items):

        if len(items) == 0:
            # no items: register the label with a null-vector
            self.vecs.add(None, label)
            return

        for item in items:
            itemStr = str(item)
            self.vecs.add(itemStr, label)
            self.allSymbols.add(itemStr)

    def __iter__(self):
        for x in self.vecs.iteritems():
            yield x
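
# Hypothetical usage of FeatureArray, assuming NameToDictMap and
# OccurrenceCounter are importable from the surrounding project; the labels
# and items are placeholders.
features = FeatureArray()
features.add('src/parser.c:parse_header', ['ngram_a', 'ngram_b'])
features.add('src/io.c:read_block', [])  # stored as a null-vector
for entry in features:                   # iterates over the underlying vectors
    pass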