コード例 #1
0
    def mergeClustersWithSameFormat(self):
        """Merge clusters of this collection whose inferred formats are compatible.

        The comparison is explicit and pairwise (no Needleman-Wunsch alignment):
        the first remaining cluster is compared with every other remaining
        cluster, all compatible ones are folded into a freshly created cluster,
        and the participants are removed from the working copy. This repeats
        until the working copy is empty. Afterwards the collection's own
        cluster list is replaced by the newly built clusters, even for
        clusters that had no merge partner.

        Returns:
            True if the number of clusters decreased, False otherwise
            (including when the collection holds fewer than two clusters or
            merging is disabled via ``mergeSimilarClusters`` in the config).
        """
        if len(self.__cluster) <= 1:
            return False  # Nothing to merge with zero or one cluster

        if not Globals.getConfig().mergeSimilarClusters:
            logging.info("Cluster merging disabled via configuration")
            return False

        # Work on a shallow copy so the original list stays intact until the
        # merged result is complete.
        copiedCollection = self.__cluster[:]
        ori_len = len(copiedCollection)
        tempCollection = ClusterCollection()

        while len(copiedCollection) > 0:
            cluster1 = copiedCollection[0]
            # Collect every later cluster that may be folded into cluster1.
            mergeCandidates = [
                cluster2 for cluster2 in copiedCollection[1:]
                if self._clustersAreMergeable(cluster1, cluster2)
            ]

            newCluster = Cluster(cluster1.get_representation(), "mergeDestination")
            newCluster.set_semantics(cluster1.get_semantics())
            newCluster.add_messages(cluster1.get_messages())
            # Only the merged-in clusters contribute to the split point string;
            # it starts empty, so it carries a leading ", " once anything was merged.
            splitpoint = ""
            for cluster in mergeCandidates:
                newCluster.add_messages(cluster.get_messages())
                copiedCollection.remove(cluster)
                splitpoint = "{0}, {1}".format(splitpoint, cluster.getSplitpoint())
            newCluster.setSplitpoint(splitpoint)
            # Re-infer the format because the message set has changed.
            discoverer.formatinference.perform_format_inference_for_cluster(newCluster)
            # TODO: Build up new semantic information in newCluster
            copiedCollection.remove(cluster1)
            tempCollection.add_cluster(newCluster)

        # Replace our own clusters by the merged ones.
        self.__cluster = []
        self.add_clusters(tempCollection.get_all_cluster())
        if ori_len == len(self.__cluster):
            logging.info("No mergable clusters within collection identified")
            return False
        logging.info("Cluster collection shrunk from {0} to {1} by merging".format(ori_len, len(self.__cluster)))
        return True

    @staticmethod
    def _clustersAreMergeable(cluster1, cluster2):
        """Decide whether cluster2 may be merged into cluster1.

        Every format token position is compared (including the last one).
        Each token is indexed as (representation, format inference, semantics).
        Two clusters are mergeable when, for every position:

        * the representations are equal, and
        * the semantics are equal, and
        * if the semantics are empty, the observed values are compatible:
          - const/const: the value of the first message must be identical,
          - same non-const type: the value sets must overlap,
          - differing types: the value of the 'const' side (taken from its
            first message) must occur at least once in the other cluster.

        Non-empty, equal semantics are accepted without a value check.

        Returns:
            True if all positions are compatible, False otherwise (also when
            the two format lists differ in length).
        """
        format1 = cluster1.get_formats()
        format2 = cluster2.get_formats()
        if not len(format1) == len(format2):
            return False  # Different length [should not happen within subclusters]

        for idx in range(len(format1)):
            token1 = cluster1.get_format(idx)
            token2 = cluster2.get_format(idx)
            representation = token1[0]
            fmt_infer = token1[1]
            semantics = token1[2]

            if not representation == token2[0]:
                return False  # Token mismatch
            if not semantics == token2[2]:
                return False  # Semantics mismatch
            if not len(semantics) == 0:
                continue  # Matching, non-empty semantics: no value check needed

            # Semantics only match because both are empty -> compare values.
            type1 = fmt_infer.getType()
            if type1 == token2[1].getType():
                if type1 == 'const':
                    # Each cluster must have at least 1 message!
                    value1 = cluster1.get_messages()[0].get_tokenAt(idx).get_token()
                    value2 = cluster2.get_messages()[0].get_tokenAt(idx).get_token()
                    if not value1 == value2:
                        return False  # Const value mismatch
                else:
                    allvalues1 = cluster1.get_values_for_token(idx)
                    allvalues2 = cluster2.get_values_for_token(idx)
                    if set(allvalues1).isdisjoint(allvalues2):
                        return False  # No overlap in variable values
            else:
                # Variable/constant pairing: the variable side has to take the
                # constant side's value at least once.
                if type1 == 'const':
                    constValue = cluster1.get_messages()[0].get_tokenAt(idx).get_token()
                    hits = cluster2.get_messages_with_value_at(idx, constValue)
                else:
                    constValue = cluster2.get_messages()[0].get_tokenAt(idx).get_token()
                    hits = cluster1.get_messages_with_value_at(idx, constValue)
                if len(hits) == 0:
                    return False  # Constant value never seen on the variable side

        return True