def mergeClustersWithSameFormat(self):
    """Merge clusters of this collection whose formats are compatible.

    Every cluster is compared explicitly, token by token, against all
    clusters that follow it (no Needleman-Wunsch alignment is used here).
    Two clusters are merged when they have the same number of format
    tokens and every token pair passes these checks:

    * the token representations are equal;
    * the token semantics are equal; if both are non-empty that is
      sufficient, if both are empty the values are inspected as well:

      - same inferred type, 'const': the constant values must be equal;
      - same inferred type, otherwise: the two value sets must overlap;
      - different inferred types: the constant value must occur at least
        once among the values of the other cluster.

    Each group of mergeable clusters is replaced by one new cluster that
    holds all their messages and for which format inference is re-run.
    Clusters without a merge partner are rebuilt the same way.

    Returns:
        True if the collection shrank because clusters were merged,
        False otherwise (fewer than two clusters, merging disabled via
        configuration, or nothing mergeable found).
    """

    def first_value(cluster, token_idx):
        # NOTE(review): relies on every cluster holding at least one
        # message; an empty cluster would raise IndexError here.
        return cluster.get_messages()[0].get_tokenAt(token_idx).get_token()

    def tokens_compatible(cluster1, cluster2, token_idx):
        # Decide whether the tokens at token_idx allow merging both clusters.
        token1 = cluster1.get_format(token_idx)
        token2 = cluster2.get_format(token_idx)
        semantics1 = token1[2]

        if not token1[0] == token2[0]:
            return False  # Representation mismatch
        if not semantics1 == token2[2]:
            return False  # Semantics mismatch
        if not len(semantics1) == 0:
            return True  # Identical, non-empty semantics: accepted as is

        # Both tokens carry no semantics, so equality above is meaningless:
        # fall back to comparing the observed values.
        type1 = token1[1].getType()
        type2 = token2[1].getType()

        if type1 == type2:
            if type1 == 'const':
                # Const/const: both constants must have the same value.
                return (first_value(cluster1, token_idx)
                        == first_value(cluster2, token_idx))
            # Variable/variable: require at least one shared value.
            values1 = set(cluster1.get_values_for_token(token_idx))
            values2 = cluster2.get_values_for_token(token_idx)
            return not values1.isdisjoint(values2)

        # Variable/constant: the variable token has to take the value of
        # the constant one at least once.
        if type1 == 'const':
            hits = cluster2.get_messages_with_value_at(
                token_idx, first_value(cluster1, token_idx))
        else:
            hits = cluster1.get_messages_with_value_at(
                token_idx, first_value(cluster2, token_idx))
        return len(hits) > 0

    def clusters_mergeable(cluster1, token_count, cluster2):
        # Clusters of different length are never merged
        # [should not happen within subclusters].
        if not token_count == len(cluster2.get_formats()):
            return False
        # Compare ALL tokens. range() already excludes its end value, so
        # the former "len - 1" bound silently skipped the last token.
        for token_idx in range(token_count):
            if not tokens_compatible(cluster1, cluster2, token_idx):
                return False
        return True

    if len(self.__cluster) < 2:
        return False  # Nothing to merge with zero or one cluster
    if not Globals.getConfig().mergeSimilarClusters:
        logging.info("Cluster merging disabled via configuration")
        return False

    pending = self.__cluster[:]
    ori_len = len(pending)
    # Receives the merged clusters as well as the unmergeable ones.
    tempCollection = ClusterCollection()

    while pending:
        reference = pending[0]
        token_count = len(reference.get_formats())

        # Split the remaining clusters into those merged into `reference`
        # and those kept for a later round.
        mergeCandidates = []
        remaining = []
        for candidate in pending[1:]:
            if clusters_mergeable(reference, token_count, candidate):
                mergeCandidates.append(candidate)
            else:
                remaining.append(candidate)
        pending = remaining

        newCluster = Cluster(reference.get_representation(), "mergeDestination")
        newCluster.set_semantics(reference.get_semantics())
        newCluster.add_messages(reference.get_messages())
        for candidate in mergeCandidates:
            newCluster.add_messages(candidate.get_messages())
        # Every merged split point is prefixed with ", " (this includes a
        # leading separator, kept as-is for backward compatibility).
        newCluster.setSplitpoint("".join(
            ", {0}".format(candidate.getSplitpoint())
            for candidate in mergeCandidates))
        discoverer.formatinference.perform_format_inference_for_cluster(newCluster)
        # TODO: Build up new semantic information in newCluster
        tempCollection.add_cluster(newCluster)

    # Replace our own clusters with the merged result.
    self.__cluster = []
    self.add_clusters(tempCollection.get_all_cluster())

    if ori_len == len(self.__cluster):
        logging.info("No mergable clusters within collection identified")
        return False
    logging.info("Cluster collection shrunk from {0} to {1} by merging".format(ori_len, len(self.__cluster)))
    return True