def computeDistances(clustername):
    """Plot the e2.eqn2 value between `clustername` and every cluster file.

    A yellow circle is drawn at the origin.  Then, for each entry of
    ``<cwd>/Clusters`` (numbered from 1 in listing order), the value
    ``e2.eqn2(cluster_features[entry], cluster_features[clustername])`` is
    drawn as a circle: red at ``(counter * 10, value * 100)`` for even
    counters, green at ``(counter, -value * 100)`` for odd counters.  The
    figure is refreshed after each circle and shown when the loop ends.

    Args:
        clustername: key into the module-level ``cluster_features`` mapping;
            it is also concatenated into the y-axis label, so it must be a
            string.

    Raises:
        KeyError: if `clustername` or a listed file name is missing from
            ``cluster_features``.
    """
    path = os.getcwd() + "/Clusters"
    print("Distances for  %s" % clustername)

    # Marker for the reference point at the origin.
    plt.gca().add_patch(plt.Circle((0, 0), radius=3, fc='y'))

    # Axis limits and labels never change, so they are set once up front
    # instead of being re-applied for every cluster.
    plt.axis([-10, 400, -100, 100])
    plt.ylabel('Distances from ' + clustername)
    plt.xlabel("cluster number")
    plt.title("Intercluster distances")

    # The cluster files themselves are not opened: the previous line count
    # was never used and its file handle was never closed.
    for counter, files in enumerate(os.listdir(path), 1):
        distance = e2.eqn2(cluster_features[files],
                           cluster_features[clustername])
        if counter % 2 == 0:
            # NOTE(review): even entries are spread out by a factor of 10 on
            # the x axis while odd entries are not -- confirm this is intended.
            circle = plt.Circle((counter * 10, distance * 100),
                                radius=4, fc='r')
        else:
            circle = plt.Circle((counter, -distance * 100),
                                radius=4, fc='g')
        plt.gca().add_patch(circle)
        # Short pause so the figure is redrawn as each circle is added.
        plt.pause(0.1)
    plt.show()
def refine():
    """Merge every one-line cluster file into the best-scoring cluster.

    Each file in the Clusters directory that contains exactly one line is
    treated as a singleton.  Its entry is scored against every centre in
    ``cluster_centres`` with
    ``2.5 * eqn2(message names) + eqn2(message types) + 0.5 * eqn2(types used)``,
    appended to ``Clusters/cluster<j>.txt`` where ``j`` is the index of the
    first highest score, and the singleton file is then deleted.

    Only reads the module-level ``cluster_centres`` and
    ``cluster_centre_*`` mappings; none of them is modified.

    Raises:
        KeyError: if the singleton entry has no features recorded in the
            ``cluster_centre_*`` mappings.
        ValueError: if ``cluster_centres`` is empty.
        OSError: if the singleton file cannot be removed.
    """
    path = '/home/kaushik/Desktop/web services project/'

    # NOTE(review): the listing uses an absolute path while the files are
    # opened relative to the working directory, so this only lines up when
    # run from the project directory -- confirm.
    for fl in os.listdir(
            "/home/kaushik/Desktop/web services project/Clusters"):
        with open("Clusters/" + fl, "r") as f:
            x = f.readlines()
        if len(x) != 1:
            continue

        # Strip only a trailing newline; blindly dropping the last character
        # truncated the name when the file did not end with one.
        filename = x[0].rstrip("\n")
        key = path + "wsdl dataset/" + filename
        mn = cluster_centre_message_names[key]   # message names
        mt = cluster_centre_message_types[key]   # message types
        tu = cluster_centre_types_used[key]      # types used

        # NOTE(review): cluster_centres is not filtered, so if this entry is
        # itself listed as a centre it is scored against itself -- confirm
        # that is intended.
        scores = [
            2.5 * e2.eqn2(cluster_centre_message_names[centre], mn)
            + e2.eqn2(cluster_centre_message_types[centre], mt)
            + 0.5 * e2.eqn2(cluster_centre_types_used[centre], tu)
            for centre in cluster_centres
        ]
        j = scores.index(max(scores))

        with open("Clusters/cluster" + str(j) + ".txt", "a") as target:
            target.write(filename + "\n")

        print("Removing : %s" % fl)
        # os.remove avoids passing the file name through a shell, which broke
        # on names with spaces and silently ignored failures.
        os.remove("Clusters/" + fl)
def refine():
    """Dissolve singleton cluster files into the most similar cluster.

    A cluster file with exactly one line is a singleton.  Its single entry
    is compared with every centre in ``cluster_centres``; the score is
    ``2.5 * eqn2(message names) + eqn2(message types) + 0.5 * eqn2(types used)``.
    The entry is appended to ``Clusters/cluster<j>.txt`` (``j`` = index of
    the first maximal score) and the singleton file is removed.

    The module-level ``cluster_centres`` list and ``cluster_centre_*``
    dictionaries are read but never changed.

    Raises:
        KeyError: if no features are stored for the singleton entry.
        ValueError: if ``cluster_centres`` is empty.
        OSError: if removing the singleton file fails.
    """
    path = '/home/kaushik/Desktop/web services project/'

    # NOTE(review): directory listing is absolute, file access is relative to
    # the current working directory; both must point at the same folder.
    for fl in os.listdir("/home/kaushik/Desktop/web services project/Clusters"):
        singleton_path = "Clusters/" + fl
        with open(singleton_path, "r") as f:
            x = f.readlines()
        if len(x) == 1:
            # Remove the line terminator only (the old slice always cut the
            # final character, even when it was part of the name).
            filename = x[0].rstrip("\n")
            mn = cluster_centre_message_names[path + "wsdl dataset/" + filename]
            mt = cluster_centre_message_types[path + "wsdl dataset/" + filename]
            tu = cluster_centre_types_used[path + "wsdl dataset/" + filename]

            # NOTE(review): the entry's own centre, if present in
            # cluster_centres, takes part in the comparison -- verify.
            scores = []
            for centre in cluster_centres:
                bmn = cluster_centre_message_names[centre]
                bmt = cluster_centre_message_types[centre]
                btu = cluster_centre_types_used[centre]
                scores.append(2.5 * e2.eqn2(bmn, mn)
                              + e2.eqn2(bmt, mt)
                              + 0.5 * e2.eqn2(btu, tu))
            j = scores.index(max(scores))

            # Context manager guarantees the appended line is flushed and the
            # handle released before the singleton file is deleted.
            with open("Clusters/cluster" + str(j) + ".txt", "a") as merged:
                merged.write(filename + "\n")

            print("Removing : %s" % fl)
            # Delete directly instead of building an "rm ..." shell command.
            os.remove(singleton_path)
def hierarchical_clustering():
    """Group cluster files under numbered hierarchy nodes and export as JSON.

    Builds a directed graph rooted at "Entire Dataset".  Cluster files from
    ``<cwd>/Clusters`` are taken in listing order; each file still unplaced
    starts a new hierarchy node ("0", "1", ...) and every other unplaced
    file whose ``e2.eqn2`` value against it exceeds 0.3 is attached to the
    same hierarchy node.  Clusters appear in the graph under
    ``cluster_names[file]``.

    The graph is drawn, then its BFS tree is written to ``graph.json`` with
    the ``id`` key renamed to ``name``, and the same JSON text is printed.

    Side effects: fills the module-level ``node_sizes`` mapping, opens a
    plot window, writes ``graph.json`` in the working directory.
    """
    import json  # local import so the block does not rely on file-level imports

    root = "Entire Dataset"
    g = nx.DiGraph()
    g.add_node(root, size=2000)
    node_sizes[root] = 2000

    def attach(parent, cluster_file):
        # Add one cluster (by display name) beneath a hierarchy node.
        label = cluster_names[cluster_file]
        g.add_node(label, size=2000)
        node_sizes[label] = 2000
        g.add_edge(parent, label)

    def renamed(tree):
        # Copy of a tree_data dict with every "id" key turned into "name".
        out = {}
        for key, value in tree.items():
            if key == "children":
                out[key] = [renamed(child) for child in value]
            elif key == "id":
                out["name"] = value
            else:
                out[key] = value
        return out

    # `pending` holds the files not yet placed.  The earlier version removed
    # entries from the list it was iterating, which made the loops skip
    # clusters, so some never reached the graph.
    pending = os.listdir(os.getcwd() + "/Clusters")
    no_of_hier_nodes = 0
    while pending:
        items = pending.pop(0)
        parent = str(no_of_hier_nodes)
        g.add_node(parent, size=2000)
        g.add_edge(root, parent)
        attach(parent, items)

        unmatched = []
        for node in pending:
            score = e2.eqn2(cluster_features[items], cluster_features[node])
            if score > 0.3:
                attach(parent, node)
            else:
                unmatched.append(node)
        pending = unmatched
        no_of_hier_nodes += 1

    T = nx.bfs_tree(g, source=root)
    nx.draw(g)
    pylab.show()

    # The BFS tree carries no node attributes, so sizes are re-applied to it.
    nx.set_node_attributes(T, "size", node_sizes)
    t = json_graph.tree_data(T, root=root)

    # Serialise with json instead of regex-patching str(dict), which produced
    # invalid JSON for names containing quotes or unicode reprs.
    x = json.dumps(renamed(t))
    with open("graph.json", "w") as fl:
        fl.write(x)
    print(x)
def hierarchical_clustering():
    """Build a two-level cluster hierarchy, draw it and dump it to graph.json.

    The root node is "Entire Dataset".  Files listed in ``<cwd>/Clusters``
    are consumed in order: the first unplaced file seeds a hierarchy node
    named after a running counter ("0", "1", ...), and every remaining
    unplaced file with ``e2.eqn2(seed features, file features) > 0.3`` joins
    that node.  Graph labels for clusters come from ``cluster_names``.

    After drawing the graph, the BFS tree from the root is converted with
    ``json_graph.tree_data``, its ``id`` keys are renamed to ``name``, and
    the JSON text is written to ``graph.json`` and printed.

    Side effects: updates the module-level ``node_sizes`` mapping, shows a
    plot, writes ``graph.json`` in the working directory.
    """
    import json  # imported here so the function is self-contained

    def to_named(tree):
        # Recursively rename "id" -> "name" in a tree_data dictionary.
        node = dict((key, value) for key, value in tree.items()
                    if key not in ("id", "children"))
        if "id" in tree:
            node["name"] = tree["id"]
        if "children" in tree:
            node["children"] = [to_named(child) for child in tree["children"]]
        return node

    g = nx.DiGraph()
    g.add_node("Entire Dataset", size=2000)
    node_sizes["Entire Dataset"] = 2000

    remaining = os.listdir(os.getcwd() + "/Clusters")
    no_of_hier_nodes = 0
    # Work on an explicit "remaining" list rebuilt each round.  Removing items
    # from a list while a for-loop walks it skipped entries, so clusters
    # could be left out of the hierarchy.
    while remaining:
        seed, candidates = remaining[0], remaining[1:]
        hier_node = str(no_of_hier_nodes)
        g.add_node(hier_node, size=2000)
        g.add_edge("Entire Dataset", hier_node)

        similar = [node for node in candidates
                   if e2.eqn2(cluster_features[seed],
                              cluster_features[node]) > 0.3]
        for member in [seed] + similar:
            label = cluster_names[member]
            g.add_node(label, size=2000)
            node_sizes[label] = 2000
            g.add_edge(hier_node, label)

        placed = set(similar)
        remaining = [node for node in candidates if node not in placed]
        no_of_hier_nodes = no_of_hier_nodes + 1

    T = nx.bfs_tree(g, source="Entire Dataset")
    nx.draw(g)
    pylab.show()

    # bfs_tree returns a bare tree; copy the recorded sizes onto its nodes.
    nx.set_node_attributes(T, "size", node_sizes)
    t = json_graph.tree_data(T, root="Entire Dataset")

    # json.dumps gives well-formed JSON; the former quote-swapping regex over
    # str(t) broke on apostrophes and on Python 2 unicode reprs.
    x = json.dumps(to_named(t))
    with open("graph.json", "w") as fl:
        fl.write(x)
    print(x)
def hierarchical_clustering():
    """Group similar cluster files, log the groups to CSV and draw the graph.

    Cluster files are taken in listing order.  The first ungrouped file
    opens a new group; every other ungrouped file whose
    ``e2.eqn2(cluster_features[first], cluster_features[other])`` exceeds
    0.3 joins it.  Each group gets an integer node (1, 2, ...) linked to its
    members and to the "Entire Dataset" node.

    For every group one line ``<group number>,<member> <member> ...`` (each
    member followed by a space) is written to ``Hierarchies.csv`` in the
    working directory, and the member list is printed.  Finally the graph is
    drawn and shown.
    """
    g = nx.Graph()
    g.add_node("Entire Dataset")

    # Mode "w" truncates any earlier file, which is what the former
    # "rm Hierarchies.csv" followed by an append-mode open amounted to; the
    # with-block also makes sure the file is flushed and closed.
    with open("Hierarchies.csv", "w") as hierarchies:
        pending = os.listdir(
            '/home/kaushik/Desktop/web services project/Clusters')
        no_of_hier = 0

        # Consume `pending` explicitly.  Calling remove() on the list being
        # iterated made both loops skip elements, so some clusters were
        # never compared or grouped.
        while pending:
            x = pending.pop(0)
            similar_clusters = [x]
            leftover = []
            for y in pending:
                if e2.eqn2(cluster_features[x], cluster_features[y]) > 0.3:
                    similar_clusters.append(y)
                else:
                    leftover.append(y)
            pending = leftover

            no_of_hier += 1
            g.add_nodes_from(similar_clusters)
            g.add_node(no_of_hier)
            for node in similar_clusters:
                g.add_edge(no_of_hier, node)
            g.add_edge(no_of_hier, "Entire Dataset")

            # Keep the existing line format: every member name is followed
            # by a single space, including the last one.
            all_clusters = "".join(node + " " for node in similar_clusters)
            hierarchies.write(str(no_of_hier) + "," + all_clusters + "\n")
            print(similar_clusters)

    nx.draw(g)
    pylab.show()
def cluster():
    """Assign every WSDL file in the dataset directory to a cluster file.

    Files are processed in listing order.  For each file its message names,
    message types and used types are extracted through ``wa`` and scored
    against every existing centre with
    ``2.5 * eqn2(names) + eqn2(message types) + 0.5 * eqn2(types used)``.

    * If the best score is above 1.1 the file name is appended to
      ``Clusters/cluster<i>.txt`` where ``i`` is the index of the first
      best-scoring centre.
    * Otherwise (also for the first listed file, when there are no scores,
      or when the file is already listed as a centre) the file becomes a new
      centre, its features are stored in the ``cluster_centre_*`` mappings
      and its name is appended to ``Clusters/cluster<n>.txt`` with ``n`` the
      number of centres after adding it.

    Progress (``counter  :  number of centres``) is printed per file and the
    final number of centres is printed at the end.

    Side effects: rebinds the module-level ``cluster_centres`` list, updates
    the ``cluster_centre_*`` dictionaries, appends to files under
    ``Clusters/`` relative to the working directory.
    """
    global cluster_centres
    global cluster_centre_message_names
    global cluster_centre_message_types
    global cluster_centre_types_used

    path = '/home/kaushik/Desktop/web services project/wsdl dataset'
    for counter, fl in enumerate(os.listdir(path), 1):
        filename = path + "/" + fl
        print("%s  :  %s" % (counter, len(cluster_centres)))

        # Extract the features once; they are reused both for scoring and,
        # if needed, for registering the file as a new centre.
        mn = wa.findMessageNames(filename)   # message names
        mt = wa.findMessageTypes(filename)   # message types
        tu = wa.getTypes(filename)           # types used

        # The first listed file always founds a cluster, and a file that is
        # already a centre is not scored (both end up with no scores).
        scores = []
        if counter != 1 and filename not in cluster_centres:
            for centre in cluster_centres:
                bmn = cluster_centre_message_names[centre]
                bmt = cluster_centre_message_types[centre]
                btu = cluster_centre_types_used[centre]
                scores.append(2.5 * e2.eqn2(bmn, mn)
                              + e2.eqn2(bmt, mt)
                              + 0.5 * e2.eqn2(btu, tu))

        if scores and max(scores) > 1.1:
            # NOTE(review): members go to cluster<i> (0-based centre index)
            # while a new centre below is written to cluster<len> (1-based),
            # so a centre and its members land in different files -- looks
            # like an off-by-one; left as is because refine() uses the same
            # naming.  Confirm before changing.
            i = scores.index(max(scores))
            target = "Clusters/cluster" + str(i) + ".txt"
        else:
            # NOTE(review): a file already present in cluster_centres is
            # appended again here -- confirm that is intended.
            cluster_centres = cluster_centres + [filename]
            cluster_centre_message_names[filename] = mn
            cluster_centre_message_types[filename] = mt
            cluster_centre_types_used[filename] = tu
            target = "Clusters/cluster" + str(len(cluster_centres)) + ".txt"

        # with-block closes the handle; it used to be left open for every file.
        with open(target, "a") as f:
            f.write(fl + "\n")

    print(len(cluster_centres))
def cluster():
    """Cluster the WSDL files whose name lookup yields a ``str``-typed value.

    Files are processed in listing order; a file is skipped unless the text
    ``str`` occurs in ``str(type(wa.findName(file)))``.  The remaining files
    are scored against every existing centre with
    ``2.5 * eqn2(names) + eqn2(message types) + 0.5 * eqn2(types used)``.

    * Best score above 0.9: the file name is appended to
      ``Clusters/cluster<i>.txt`` (``i`` = index of the first best centre).
    * Otherwise the file becomes a new centre: its features are stored in
      the ``cluster_centre_*`` mappings and its name is appended to
      ``Clusters/cluster<n>.txt`` (``n`` = number of centres afterwards).
      This also covers the first listed file, the case where no centre
      exists yet, and a file that is already listed as a centre.

    Progress is printed for every processed file and the final number of
    centres is printed at the end.

    Side effects: rebinds the module-level ``cluster_centres`` list, updates
    the ``cluster_centre_*`` dictionaries, appends to files under
    ``Clusters/`` relative to the working directory.
    """
    global cluster_centres
    global cluster_centre_message_names
    global cluster_centre_message_types
    global cluster_centre_types_used

    path = '/home/kaushik/Desktop/web services project/wsdl dataset'
    # The counter covers every listed file, including the skipped ones.
    for counter, fl in enumerate(os.listdir(path), 1):
        filename = path + "/" + fl

        # Same substring test as before, without the regex machinery.
        if 'str' not in str(type(wa.findName(filename))):
            continue
        print("%s  :  %s" % (counter, len(cluster_centres)))

        # Features are computed once and reused when the file turns out to
        # be a new centre.
        mn = wa.findMessageNames(filename)   # message names
        mt = wa.findMessageTypes(filename)   # message types
        tu = wa.getTypes(filename)           # types used

        scores = []
        if counter != 1 and filename not in cluster_centres:
            for centre in cluster_centres:
                bmn = cluster_centre_message_names[centre]
                bmt = cluster_centre_message_types[centre]
                btu = cluster_centre_types_used[centre]
                scores.append(2.5 * e2.eqn2(bmn, mn)
                              + e2.eqn2(bmt, mt)
                              + 0.5 * e2.eqn2(btu, tu))

        # `scores` can be empty (no centres yet because the first listed file
        # was skipped, or the file is already a centre); max() on an empty
        # list raised ValueError, so that case now founds a new cluster.
        if scores and max(scores) > 0.9:
            # NOTE(review): members are written to cluster<i> (0-based) but
            # new centres to cluster<len> (1-based) -- probable off-by-one,
            # kept because other code relies on the same file naming.
            i = scores.index(max(scores))
            target = "Clusters/cluster" + str(i) + ".txt"
        else:
            cluster_centres = cluster_centres + [filename]
            cluster_centre_message_names[filename] = mn
            cluster_centre_message_types[filename] = mt
            cluster_centre_types_used[filename] = tu
            target = "Clusters/cluster" + str(len(cluster_centres)) + ".txt"

        # Close the handle deterministically instead of leaking one per file.
        with open(target, "a") as f:
            f.write(fl + "\n")

    print(len(cluster_centres))