def _filter(self, onlyForCallSite):
    """Replace ``self.models`` with the subset chosen by a new ModelSelector.

    The selector is configured with ``onlyForCallSite`` plus this object's
    ``minNumMembersInInvocCluster`` and ``topnInvocClusters`` settings, and
    is kept on ``self.modelSelector``.
    """
    selector = self.modelSelector = ModelSelector()
    selector.setCallSite(onlyForCallSite)
    selector.setMinNumMembers(self.minNumMembersInInvocCluster)
    selector.setTopN(self.topnInvocClusters)
    self.models = selector.select(self.models)
class CheckOverlayCreator:
    """Wraps models in CheckModel objects and attaches check data to them.

    All raw data comes from the ``contentProvider`` passed to the
    constructor. The three thresholds (``minCondObserved``,
    ``topnCheckHist``, ``minFracChecks``) start at module-level defaults
    and can be overridden through the setters below.
    """

    def __init__(self, contentProvider):
        """Store the content provider and load the default thresholds."""
        self.contentProvider = contentProvider
        self.minCondObserved = DEFAULT_MIN_COND_OBSERVED
        self.topnCheckHist = DEFAULT_TOP_N_CHECK_HIST
        self.minFracChecks = DEFAULT_MIN_FRAC_CHECKS

    def setMinCondObserved(self, val):
        """Override the ``minCondObserved`` value passed to pruneCheckHist."""
        self.minCondObserved = val

    def setTopnCheckHist(self, val):
        """Override the ``topnCheckHist`` value passed to pruneCheckHist."""
        self.topnCheckHist = val

    def setMinFracChecks(self, val):
        """Override the ``minFracChecks`` value used by label extraction."""
        self.minFracChecks = val

    def createForModels(self, models, onlyForSubChecks=None):
        """Build check overlays for ``models``; read results via getModels().

        Every incoming model is wrapped in a CheckModel. The preparation
        steps then run in a fixed order, each model gets its overlay, and
        the model list is finally filtered by ``onlyForSubChecks``.
        """
        self.models = [CheckModel(rawModel) for rawModel in models]

        self._generateChecksForAllModels()
        self._retrieveConditions()
        self._distributeChecksPerArg()
        self._createConditionClusters()

        for checkModel in self.models:
            self.createOverlayForModel(checkModel)

        self._filter(onlyForSubChecks)
        # Label generation is currently switched off:
        # self._generateLabels()

    def _generateLabels(self):
        """Fetch all AST node labels and let each model extract common ones."""
        # TODO: Optimization: It's probably possible here to generate
        # node-labels only for those ASTs that are part of one of the
        # models still left.
        labels = self.conditionLabels = (
            self.contentProvider.getAllASTNodeLabels())
        for checkModel in self.models:
            checkModel.extractCommonLabels(
                self.nodeIdToConditionIndex, labels, self.minFracChecks)

    def _createConditionClusters(self):
        """Cluster the models' conditions with a new ConditionClusterer."""
        clusterer = self.cndClusterTool = ConditionClusterer(
            self.contentProvider)
        self.conditionClusters = clusterer.cluster(self.models)

    def createOverlayForModel(self, model):
        """Give ``model`` the shared condition data, then build and prune
        its check histogram."""
        # The model receives references to the creator-wide condition data.
        model.setConditionClusters(self.conditionClusters)
        model.conditionsCode = self.conditionsCode
        model.nodeIdToConditionIndex = self.nodeIdToConditionIndex

        model.generateCheckHist()
        model.pruneCheckHist(self.topnCheckHist, self.minCondObserved)

    def getModels(self):
        """Return the current list of models."""
        return self.models

    def _generateChecksForAllModels(self):
        """Ask the provider to generate checks for every model member."""
        memberLists = [checkModel.members for checkModel in self.models]
        invocations = uniq(flatten(memberLists))
        self.contentProvider.generateChecksForInvocations(invocations)

    def _retrieveConditions(self):
        """Load conditions and their code from the content provider.

        ``nodeIdToConditionIndex`` maps each condition to its position in
        the list as returned by the provider (the last position wins for
        repeated entries). ``conditions`` is de-duplicated afterwards, so
        those indices refer to the provider's list, not ``self.conditions``.
        """
        allConditions = self.contentProvider.getAllConditions()
        self.nodeIdToConditionIndex = {
            condition: position
            for position, condition in enumerate(allConditions)
        }
        self.conditions = list(set(allConditions))
        self.conditionsCode = self.contentProvider.getAllConditionsCode()

    def _distributeChecksPerArg(self):
        """Append to each model's ``checks`` the entry for each member."""
        # list: index is invocation index
        self.checksPerArg = self.contentProvider.getAllChecksPerArg()
        for checkModel in self.models:
            # Look up by the member's value, not by its position in the model.
            for invocIndex in checkModel.members:
                checkModel.checks.append(self.checksPerArg[invocIndex])

    def _filter(self, onlyForSubChecks):
        """Replace ``self.models`` with those a new ModelSelector selects
        for ``onlyForSubChecks``."""
        selector = self.modelSelector = ModelSelector()
        selector.setSubChecks(onlyForSubChecks)
        self.models = selector.selectForChecks(self.models)
def _filter(self, onlyForSubChecks):
    """Replace ``self.models`` with those a new ModelSelector selects.

    The selector is given ``onlyForSubChecks`` and is kept on
    ``self.modelSelector``.
    """
    selector = self.modelSelector = ModelSelector()
    selector.setSubChecks(onlyForSubChecks)
    self.models = selector.selectForChecks(self.models)
class CheckOverlayCreator:
    """Wraps models in CheckModel objects and attaches check data to them.

    All raw data comes from the ``contentProvider`` passed to the
    constructor. The three thresholds (``minCondObserved``,
    ``topnCheckHist``, ``minFracChecks``) start at module-level defaults
    and can be overridden through the setters below.
    """

    def __init__(self, contentProvider):
        """Store the content provider and load the default thresholds."""
        self.contentProvider = contentProvider
        self.minCondObserved = DEFAULT_MIN_COND_OBSERVED
        self.topnCheckHist = DEFAULT_TOP_N_CHECK_HIST
        self.minFracChecks = DEFAULT_MIN_FRAC_CHECKS

    def setMinCondObserved(self, val):
        """Override the ``minCondObserved`` value passed to pruneCheckHist."""
        self.minCondObserved = val

    def setTopnCheckHist(self, val):
        """Override the ``topnCheckHist`` value passed to pruneCheckHist."""
        self.topnCheckHist = val

    def setMinFracChecks(self, val):
        """Override the ``minFracChecks`` value used by label extraction."""
        self.minFracChecks = val

    def createForModels(self, models, onlyForSubChecks=None):
        """Build check overlays for ``models``; read results via getModels().

        Every incoming model is wrapped in a CheckModel. The preparation
        steps then run in a fixed order, each model gets its overlay, and
        the model list is finally filtered by ``onlyForSubChecks``.
        """
        self.models = [CheckModel(rawModel) for rawModel in models]

        self._generateChecksForAllModels()
        self._retrieveConditions()
        self._distributeChecksPerArg()
        self._createConditionClusters()

        for checkModel in self.models:
            self.createOverlayForModel(checkModel)

        self._filter(onlyForSubChecks)
        # Label generation is currently switched off:
        # self._generateLabels()

    def _generateLabels(self):
        """Fetch all AST node labels and let each model extract common ones."""
        # TODO: Optimization: It's probably possible here to generate
        # node-labels only for those ASTs that are part of one of the
        # models still left.
        labels = self.conditionLabels = (
            self.contentProvider.getAllASTNodeLabels())
        for checkModel in self.models:
            checkModel.extractCommonLabels(
                self.nodeIdToConditionIndex, labels, self.minFracChecks)

    def _createConditionClusters(self):
        """Cluster the models' conditions with a new ConditionClusterer."""
        clusterer = self.cndClusterTool = ConditionClusterer(
            self.contentProvider)
        self.conditionClusters = clusterer.cluster(self.models)

    def createOverlayForModel(self, model):
        """Give ``model`` the shared condition data, then build and prune
        its check histogram."""
        # The model receives references to the creator-wide condition data.
        model.setConditionClusters(self.conditionClusters)
        model.conditionsCode = self.conditionsCode
        model.nodeIdToConditionIndex = self.nodeIdToConditionIndex

        model.generateCheckHist()
        model.pruneCheckHist(self.topnCheckHist, self.minCondObserved)

    def getModels(self):
        """Return the current list of models."""
        return self.models

    def _generateChecksForAllModels(self):
        """Ask the provider to generate checks for every model member."""
        memberLists = [checkModel.members for checkModel in self.models]
        invocations = uniq(flatten(memberLists))
        self.contentProvider.generateChecksForInvocations(invocations)

    def _retrieveConditions(self):
        """Load conditions and their code from the content provider.

        ``nodeIdToConditionIndex`` maps each condition to its position in
        the list as returned by the provider (the last position wins for
        repeated entries). ``conditions`` is de-duplicated afterwards, so
        those indices refer to the provider's list, not ``self.conditions``.
        """
        allConditions = self.contentProvider.getAllConditions()
        self.nodeIdToConditionIndex = {
            condition: position
            for position, condition in enumerate(allConditions)
        }
        self.conditions = list(set(allConditions))
        self.conditionsCode = self.contentProvider.getAllConditionsCode()

    def _distributeChecksPerArg(self):
        """Append to each model's ``checks`` the entry for each member."""
        # list: index is invocation index
        self.checksPerArg = self.contentProvider.getAllChecksPerArg()
        for checkModel in self.models:
            # Look up by the member's value, not by its position in the model.
            for invocIndex in checkModel.members:
                checkModel.checks.append(self.checksPerArg[invocIndex])

    def _filter(self, onlyForSubChecks):
        """Replace ``self.models`` with those a new ModelSelector selects
        for ``onlyForSubChecks``."""
        selector = self.modelSelector = ModelSelector()
        selector.setSubChecks(onlyForSubChecks)
        self.models = selector.selectForChecks(self.models)
class DataFlowModelCreator:
    """Creates DataFlowModel objects for the calls to a sink symbol.

    Data comes from the ``contentProvider`` passed to the constructor.
    The clustering distances and the filter thresholds start at
    module-level defaults and can be overridden through the setters.
    """

    def __init__(self, contentProvider):
        """Store the content provider and load the default settings."""
        self.contentProvider = contentProvider
        self.sourceDistInCluster = DEFAULT_SOURCE_DIST_IN_CLUSTER
        self.invocDistInCluster = DEFAULT_INVOC_DIST_IN_CLUSTER
        self.minNumMembersInInvocCluster = \
            DEFAULT_MIN_NUM_MEMBERS_IN_INVOC_CLUSTER
        self.topnInvocClusters = DEFAULT_TOPN_INVOC_CLUSTERS

    def setSourceDistInCluster(self, val):
        """Set the max in-cluster distance given to the SourceClusterer."""
        self.sourceDistInCluster = val

    def setInvocDistInCluster(self, val):
        """Set the max in-cluster distance given to the InvocationClusterer."""
        self.invocDistInCluster = val

    def setMinNumMemersInInvocCluster(self, val):
        """Set the minimum member count handed to the ModelSelector.

        The misspelled name is kept for existing callers; new code should
        use setMinNumMembersInInvocCluster.
        """
        self.minNumMembersInInvocCluster = val

    def setMinNumMembersInInvocCluster(self, val):
        """Correctly spelled alias of setMinNumMemersInInvocCluster."""
        # Delegate so that an override of the legacy name is still honoured.
        self.setMinNumMemersInInvocCluster(val)

    def setTopnInvocClusters(self, val):
        """Set the top-N value handed to the ModelSelector."""
        self.topnInvocClusters = val

    def createDataFlowModels(self, sinkSymbol, onlyForCallSite=None):
        """Build the models for calls to ``sinkSymbol``.

        Models are created from invocation clusters, filtered via
        ``_filter(onlyForCallSite)``, and each remaining model then has
        ``calculateSourcesPerArg`` called on it. Nothing is returned; read
        the result with getModels().
        """
        self.selector = self._sinkSymbolToSelector(sinkSymbol)
        self._createForSelector()
        self._filter(onlyForCallSite)

        for model in self.models:
            model.calculateSourcesPerArg(self.invocClusterTool)

    def getModels(self):
        """Return the current list of models."""
        return self.models

    def _sinkSymbolToSelector(self, sinkSymbol):
        """Return the ``getCallsTo("...")`` selector string for the symbol."""
        return 'getCallsTo("%s")' % (sinkSymbol)

    def _createForSelector(self):
        """Generate content for ``self.selector``, cluster sources and
        invocations, and create one model per invocation cluster."""
        self.contentProvider.generate(self.selector)

        # Cluster source API symbols using jaro distance.
        self.sourceClusterer = SourceClusterer(self.contentProvider)
        self.sourceClusterer.setMaxDistInCluster(self.sourceDistInCluster)
        sourceClusters = self.sourceClusterer.cluster()

        # Cluster invocations based on source-argument mappings.
        self.invocClusterTool = InvocationClusterer(self.contentProvider)
        self.invocClusterTool.setMaxDistInCluster(self.invocDistInCluster)
        self.invocClusters = self.invocClusterTool.cluster(sourceClusters)

        self._createModelsFromInvocClusters(self.invocClusters)

    def _createModelsFromInvocClusters(self, invocClusters):
        """Fill ``self.models`` with one DataFlowModel per invocation
        cluster, sorted by member count in descending order.

        NOTE(review): assumes ``invocClusters.clusterIdToDatapoint`` is a
        dict-like mapping of cluster id -> invocation ids -- confirm.
        """
        X = invocClusters.dataMatrix
        self.models = []

        # .items() works on Python 2 and 3; the former .iteritems() call
        # raises AttributeError on a Python 3 dict.
        for (clusterId, invocIds) in invocClusters.clusterIdToDatapoint.items():
            newModel = DataFlowModel()
            newModel.clusterId = clusterId
            newModel.members = invocIds
            # We need to add 1 here for the 'other' group
            newModel.setNumberOfArguments(
                invocClusters.getNumberOfArguments() + 1)
            newModel.callSiteIds = [
                invocClusters.callSiteIds[x] for x in invocIds]

            # Row indices of X whose sum over the member columns exceeds
            # half the number of members.
            rowSums = np.sum(X[:, tuple(newModel.members)], axis=1)
            newModel.sharedSourceClusters = np.nonzero(
                rowSums > 0.5 * len(invocIds))[0]

            newModel.selector = self.selector
            self.models.append(newModel)

        # Sort models by number of members
        self.models.sort(key=lambda x: len(x.members), reverse=True)

    def _filter(self, onlyForCallSite):
        """Replace ``self.models`` with the subset a new ModelSelector picks
        for ``onlyForCallSite`` under the configured thresholds."""
        self.modelSelector = ModelSelector()
        self.modelSelector.setCallSite(onlyForCallSite)
        self.modelSelector.setMinNumMembers(self.minNumMembersInInvocCluster)
        self.modelSelector.setTopN(self.topnInvocClusters)
        self.models = self.modelSelector.select(self.models)
class DataFlowModelCreator:
    """Creates DataFlowModel objects for the calls to a sink symbol.

    Data comes from the ``contentProvider`` passed to the constructor.
    The clustering distances and the filter thresholds start at
    module-level defaults and can be overridden through the setters.
    """

    def __init__(self, contentProvider):
        """Store the content provider and load the default settings."""
        self.contentProvider = contentProvider
        self.sourceDistInCluster = DEFAULT_SOURCE_DIST_IN_CLUSTER
        self.invocDistInCluster = DEFAULT_INVOC_DIST_IN_CLUSTER
        self.minNumMembersInInvocCluster = \
            DEFAULT_MIN_NUM_MEMBERS_IN_INVOC_CLUSTER
        self.topnInvocClusters = DEFAULT_TOPN_INVOC_CLUSTERS

    def setSourceDistInCluster(self, val):
        """Set the max in-cluster distance given to the SourceClusterer."""
        self.sourceDistInCluster = val

    def setInvocDistInCluster(self, val):
        """Set the max in-cluster distance given to the InvocationClusterer."""
        self.invocDistInCluster = val

    def setMinNumMemersInInvocCluster(self, val):
        """Set the minimum member count handed to the ModelSelector.

        The misspelled name is kept for existing callers; new code should
        use setMinNumMembersInInvocCluster.
        """
        self.minNumMembersInInvocCluster = val

    def setMinNumMembersInInvocCluster(self, val):
        """Correctly spelled alias of setMinNumMemersInInvocCluster."""
        # Delegate so that an override of the legacy name is still honoured.
        self.setMinNumMemersInInvocCluster(val)

    def setTopnInvocClusters(self, val):
        """Set the top-N value handed to the ModelSelector."""
        self.topnInvocClusters = val

    def createDataFlowModels(self, sinkSymbol, onlyForCallSite=None):
        """Build the models for calls to ``sinkSymbol``.

        Models are created from invocation clusters, filtered via
        ``_filter(onlyForCallSite)``, and each remaining model then has
        ``calculateSourcesPerArg`` called on it. Nothing is returned; read
        the result with getModels().
        """
        self.selector = self._sinkSymbolToSelector(sinkSymbol)
        self._createForSelector()
        self._filter(onlyForCallSite)

        for model in self.models:
            model.calculateSourcesPerArg(self.invocClusterTool)

    def getModels(self):
        """Return the current list of models."""
        return self.models

    def _sinkSymbolToSelector(self, sinkSymbol):
        """Return the ``getCallsTo("...")`` selector string for the symbol."""
        return 'getCallsTo("%s")' % (sinkSymbol)

    def _createForSelector(self):
        """Generate content for ``self.selector``, cluster sources and
        invocations, and create one model per invocation cluster."""
        self.contentProvider.generate(self.selector)

        # Cluster source API symbols using jaro distance.
        self.sourceClusterer = SourceClusterer(self.contentProvider)
        self.sourceClusterer.setMaxDistInCluster(self.sourceDistInCluster)
        sourceClusters = self.sourceClusterer.cluster()

        # Cluster invocations based on source-argument mappings.
        self.invocClusterTool = InvocationClusterer(self.contentProvider)
        self.invocClusterTool.setMaxDistInCluster(self.invocDistInCluster)
        self.invocClusters = self.invocClusterTool.cluster(sourceClusters)

        self._createModelsFromInvocClusters(self.invocClusters)

    def _createModelsFromInvocClusters(self, invocClusters):
        """Fill ``self.models`` with one DataFlowModel per invocation
        cluster, sorted by member count in descending order.

        NOTE(review): assumes ``invocClusters.clusterIdToDatapoint`` is a
        dict-like mapping of cluster id -> invocation ids -- confirm.
        """
        X = invocClusters.dataMatrix
        self.models = []

        # .items() works on Python 2 and 3; the former .iteritems() call
        # raises AttributeError on a Python 3 dict.
        for (clusterId, invocIds) in invocClusters.clusterIdToDatapoint.items():
            newModel = DataFlowModel()
            newModel.clusterId = clusterId
            newModel.members = invocIds
            # We need to add 1 here for the 'other' group
            newModel.setNumberOfArguments(
                invocClusters.getNumberOfArguments() + 1)
            newModel.callSiteIds = [
                invocClusters.callSiteIds[x] for x in invocIds]

            # Row indices of X whose sum over the member columns exceeds
            # half the number of members.
            rowSums = np.sum(X[:, tuple(newModel.members)], axis=1)
            newModel.sharedSourceClusters = np.nonzero(
                rowSums > 0.5 * len(invocIds))[0]

            newModel.selector = self.selector
            self.models.append(newModel)

        # Sort models by number of members
        self.models.sort(key=lambda x: len(x.members), reverse=True)

    def _filter(self, onlyForCallSite):
        """Replace ``self.models`` with the subset a new ModelSelector picks
        for ``onlyForCallSite`` under the configured thresholds."""
        self.modelSelector = ModelSelector()
        self.modelSelector.setCallSite(onlyForCallSite)
        self.modelSelector.setMinNumMembers(self.minNumMembersInInvocCluster)
        self.modelSelector.setTopN(self.topnInvocClusters)
        self.models = self.modelSelector.select(self.models)