コード例 #1
0
    def _termDictsFromContext(self, context, symbol):
        """Collect the expressions involving `symbol` for a context.

        Every entry of ``context.neighbours`` plus ``context.origin`` is
        visited. For each one the expressions returned by
        ``self.treeToExprConverter.getExpressionsForSymbol(location, symbol)``
        are stored on the entry via ``setExpressions`` and recorded in the
        result containers.

        Returns:
            ``(vecs, allNgrams)`` -- a NameToDictMap keyed by location and an
            OccurrenceCounter of all expressions seen -- or ``None`` when
            ``vecs.d`` or ``allNgrams.d`` ends up empty.
        """
        vecs = NameToDictMap()
        allNgrams = OccurrenceCounter()

        # The origin is processed like any other neighbour. It is appended
        # only for the duration of the loop; the finally clause guarantees
        # the caller's list is restored even if a helper raises.
        context.neighbours.append(context.origin)
        try:
            for neighbour in context.neighbours:
                location = neighbour.location
                expressions = self.treeToExprConverter.getExpressionsForSymbol(location, symbol)

                neighbour.setExpressions(expressions)

                # add null-vector for function if it does not contain expressions
                if len(expressions) == 0:
                    vecs.add(None, location)

                for expr in expressions:
                    # Binary weighting: an expression counts as 1.0 for a
                    # location no matter how often it occurs there.
                    vecs.setItem(expr, location, 1.0)
                    allNgrams.add(expr)
        finally:
            context.neighbours.pop()

        if len(vecs.d) == 0 or len(allNgrams.d) == 0:
            return None

        return (vecs, allNgrams)
コード例 #2
0
    def _termDictsFromContext(self, context, symbol):
        """Build the per-location expression vectors for `symbol`.

        Walks ``context.neighbours`` followed by ``context.origin``. Each
        visited entry gets its expressions (as returned by
        ``self.treeToExprConverter.getExpressionsForSymbol``) attached through
        ``setExpressions``, and the expressions are recorded per location.

        Returns:
            A tuple ``(vecs, allNgrams)`` (NameToDictMap, OccurrenceCounter),
            or ``None`` if either ``vecs.d`` or ``allNgrams.d`` is empty.
        """
        vecs = NameToDictMap()
        allNgrams = OccurrenceCounter()

        # Temporarily treat the origin as one more neighbour. try/finally
        # makes sure it is removed again even when processing fails, so the
        # caller's context is never left with an extra entry.
        context.neighbours.append(context.origin)
        try:
            for neighbour in context.neighbours:
                location = neighbour.location
                expressions = self.treeToExprConverter.getExpressionsForSymbol(
                    location, symbol)

                neighbour.setExpressions(expressions)

                # add null-vector for function if it does not contain expressions
                if len(expressions) == 0:
                    vecs.add(None, location)

                for expr in expressions:
                    # Presence is recorded as a fixed 1.0 (not a count).
                    vecs.setItem(expr, location, 1.0)
                    allNgrams.add(expr)
        finally:
            context.neighbours.pop()

        if len(vecs.d) == 0 or len(allNgrams.d) == 0:
            return None

        return (vecs, allNgrams)
コード例 #3
0
class SinkMatrixCreator:
    """Creates term-document matrices for the functions that call a sink.

    The n-gram counts per function are read from the pickled
    ``func2SubtreesMap.pickl`` file located directly under ``projectRoot``.
    """

    def __init__(self, projectRoot):
        """Remember the project root and derive the paths built from it.

        ``projectRoot`` is combined with file names by plain string
        concatenation, so it must end with a '/'.
        """
        self.projectRoot = projectRoot
        # Drop the last three '/'-separated components of projectRoot (a
        # trailing '/' contributes one empty component) and re-terminate
        # the result with '/'.
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        """Return the term-document matrix for all callers of `sinkName`.

        The second element of each call record returned by the sink user
        provider is prefixed with ``self.programDir`` to form the function
        name; duplicates are removed while keeping first-seen order.
        """
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq(
            ['%s%s' % (self.programDir, c[1]) for c in callsToSink])

        return self.createMatrixForFunctionNames(functionNames)

    # NOTE (original author): this operation loses TF-IDF. I don't think
    # that's the way to go.
    def createMatrixForFunctionNames(self, functionNames):
        """Build a term-document matrix restricted to `functionNames`.

        Reloads the function-to-subtrees map, then replays every n-gram of
        each requested function once per recorded occurrence into fresh
        ``self.nameToDictMap`` / ``self.allSymbolsDict`` containers and
        converts those into a matrix.

        A name that is missing from the loaded map fails at lookup time,
        before anything has been counted.
        """
        self._loadFunc2SubtreesMap()

        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()

        functions = [(doc, self.func2SubtreesMap.d[doc])
                     for doc in functionNames]

        for (doc, func) in functions:
            for (ngram, nOccurrences) in func.iteritems():
                # One add() per occurrence so the containers do the counting.
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)

        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                             self.allSymbolsDict)
        return nameDictMapToMatrix.termDocumentMatrix

    def _loadFunc2SubtreesMap(self):
        """Load ``func2SubtreesMap.pickl`` into ``self.func2SubtreesMap``."""
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        # NOTE(security): unpickling can execute arbitrary code; this file
        # must only ever come from a trusted run of the tool itself.
        # Binary mode plus a context manager: the handle is always closed
        # and the data is read unmodified on every platform.
        with open(filename, 'rb') as pickleFile:
            self.func2SubtreesMap = pickle.load(pickleFile)

    def uniq(self, seq, idfun=None):
        """Return the items of `seq` without duplicates, order preserved.

        Args:
            seq: iterable of items.
            idfun: optional callable mapping an item to the hashable key
                used for duplicate detection; defaults to the item itself.

        Returns:
            A new list holding the first item seen for every distinct key.
        """
        if idfun is None:

            def idfun(x):
                return x

        seen = set()
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen:
                continue
            seen.add(marker)
            result.append(item)
        return result
コード例 #4
0
class SinkMatrixCreator:
    """Builds term-document matrices for the callers of a named sink.

    Per-function n-gram counts come from the pickled
    ``func2SubtreesMap.pickl`` file stored directly under ``projectRoot``.
    """

    def __init__(self, projectRoot):
        """Store `projectRoot` and derive the dependent paths.

        `projectRoot` is joined to file names by string concatenation and
        therefore has to end with '/'.
        """
        self.projectRoot = projectRoot
        # projectRoot minus its last three '/'-separated components (the
        # trailing '/' yields one empty component), terminated with '/'.
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        """Return the term-document matrix of every function calling `sinkName`.

        Function names are ``self.programDir`` followed by element 1 of each
        call record; repeated names are dropped, keeping the first occurrence.
        """
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq(['%s%s' % (self.programDir, c[1]) for c in callsToSink])

        return self.createMatrixForFunctionNames(functionNames)

    # NOTE (original author): this operation loses TF-IDF. I don't think
    # that's the way to go.
    def createMatrixForFunctionNames(self, functionNames):
        """Create a term-document matrix covering only `functionNames`.

        The function-to-subtrees map is reloaded on every call. Each n-gram
        of each requested function is added once per recorded occurrence to
        newly created ``self.nameToDictMap`` and ``self.allSymbolsDict``
        containers, which are then converted into the returned matrix.

        Looking up a name that is not in the loaded map fails before any
        n-gram has been added.
        """
        self._loadFunc2SubtreesMap()

        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()

        functions = [(doc, self.func2SubtreesMap.d[doc]) for doc in functionNames]

        for (doc, func) in functions:
            for (ngram, nOccurrences) in func.iteritems():
                # Repeat add() so the containers accumulate the occurrence count.
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)

        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap, self.allSymbolsDict)
        return nameDictMapToMatrix.termDocumentMatrix

    def _loadFunc2SubtreesMap(self):
        """Read ``func2SubtreesMap.pickl`` into ``self.func2SubtreesMap``."""
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        # NOTE(security): pickle.load can run arbitrary code; only load
        # files produced by a trusted run of this tool.
        # Opened in binary mode inside a with-block so the file handle is
        # closed deterministically instead of being leaked.
        with open(filename, 'rb') as pickleFile:
            self.func2SubtreesMap = pickle.load(pickleFile)

    def uniq(self, seq, idfun=None):
        """Remove duplicates from `seq` while preserving order.

        Args:
            seq: iterable of items.
            idfun: optional callable producing the hashable key that
                identifies an item; defaults to the item itself.

        Returns:
            A list with the first item encountered for each distinct key.
        """
        if idfun is None:
            def idfun(x):
                return x

        seen = set()
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen:
                continue
            seen.add(marker)
            result.append(item)
        return result
コード例 #5
0
class FeatureArray(object):
    """Accumulates stringified items per label plus a global symbol count."""

    def __init__(self):
        """Start with an empty NameToDictMap and an empty OccurrenceCounter."""
        self.vecs = NameToDictMap()
        self.allSymbols = OccurrenceCounter()

    def add(self, label, items):
        """Record `items` under `label`.

        Every item is converted with ``str`` and added both to ``self.vecs``
        (under `label`) and to ``self.allSymbols``. When `items` is empty,
        a single ``None`` entry is added to ``self.vecs`` for the label and
        ``self.allSymbols`` is left untouched.
        """
        if len(items) > 0:
            for entry in items:
                text = str(entry)
                self.vecs.add(text, label)
                self.allSymbols.add(text)
        else:
            # Nothing to record: register the label with a None item instead
            # (presumably so the label still gets an entry -- confirm against
            # NameToDictMap.add).
            self.vecs.add(None, label)

    def __iter__(self):
        """Yield, one by one, whatever ``self.vecs.iteritems()`` produces."""
        for pair in self.vecs.iteritems():
            yield pair
コード例 #6
0
    def createMatrixForFunctionNames(self, functionNames):
        """Return a term-document matrix limited to `functionNames`.

        Reloads the function-to-subtrees map, resets ``self.nameToDictMap``
        and ``self.allSymbolsDict``, adds every n-gram of each requested
        function once per recorded occurrence, and converts the two
        containers into a matrix.
        """
        self._loadFunc2SubtreesMap()

        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        converter = NameDictMapToMatrix()

        # Resolve all names first: an unknown name fails here, before any
        # n-gram has been added.
        perFunction = [(name, self.func2SubtreesMap.d[name])
                       for name in functionNames]

        for name, ngramCounts in perFunction:
            for ngram, count in ngramCounts.iteritems():
                # One add() per occurrence; the containers do the counting.
                for _ in xrange(count):
                    self.nameToDictMap.add(ngram, name)
                    self.allSymbolsDict.add(ngram)

        converter.convertFromDicts(self.nameToDictMap, self.allSymbolsDict)
        return converter.termDocumentMatrix
コード例 #7
0
 def createMatrixForFunctionNames(self, functionNames):
     """Return a term-document matrix limited to `functionNames`.

     Reloads the function-to-subtrees map, resets ``self.nameToDictMap``
     and ``self.allSymbolsDict``, adds every n-gram of each requested
     function once per recorded occurrence, and converts the two
     containers into a matrix.
     """
     self._loadFunc2SubtreesMap()

     self.nameToDictMap = NameToDictMap()
     self.allSymbolsDict = OccurrenceCounter()
     converter = NameDictMapToMatrix()

     # Resolve all names first: an unknown name fails here, before any
     # n-gram has been added.
     perFunction = [(name, self.func2SubtreesMap.d[name])
                    for name in functionNames]

     for name, ngramCounts in perFunction:
         for ngram, count in ngramCounts.iteritems():
             # One add() per occurrence; the containers do the counting.
             for _ in xrange(count):
                 self.nameToDictMap.add(ngram, name)
                 self.allSymbolsDict.add(ngram)

     converter.convertFromDicts(self.nameToDictMap, self.allSymbolsDict)
     return converter.termDocumentMatrix
コード例 #8
0
 def __init__(self):
     """Create the two empty containers this object accumulates into."""
     # Freshly constructed NameToDictMap stored as self.vecs.
     self.vecs = NameToDictMap()
     # Freshly constructed OccurrenceCounter stored as self.allSymbols.
     self.allSymbols = OccurrenceCounter()