Example #1
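These snippets are excerpted from a larger module and rely on names defined elsewhere in the project. The preamble below is only a sketch of what they appear to assume: e2 and wa are project-specific modules (e2.eqn2 is a similarity measure, wa parses WSDL files), and the module-level dictionaries are inferred purely from how the functions use them.

import os
import re

import matplotlib.pyplot as plt
import networkx as nx
import pylab
from networkx.readwrite import json_graph

import e2  # project module: e2.eqn2(a, b) -> similarity score (assumed interface)
import wa  # project module: WSDL helpers such as findMessageNames, findMessageTypes, getTypes

# Shared module-level state, inferred from usage in the functions below.
cluster_centres = []               # paths of WSDL files chosen as cluster centres
cluster_centre_message_names = {}  # centre path -> message names
cluster_centre_message_types = {}  # centre path -> message types
cluster_centre_types_used = {}     # centre path -> types used
cluster_features = {}              # cluster file name -> feature tokens (filled elsewhere)
cluster_names = {}                 # cluster file name -> display name (filled elsewhere)
node_sizes = {}                    # graph node -> drawing size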
def computeDistances(clustername):
    # Plots the distance from every cluster to the given cluster as coloured
    # circles (even-numbered clusters above the x-axis in red, odd-numbered
    # below in green), updating the figure as it goes.
    path = os.getcwd() + "/Clusters"
    print 'Distances for ', clustername
    counter = 0
    circle = plt.Circle((0, 0), radius=3, fc='y')  # the reference cluster
    plt.gca().add_patch(circle)
    for files in os.listdir(path):
        counter = counter + 1
        f = open(path + "/" + files, "r")
        size = len(f.readlines())  # number of WSDLs in this cluster
        f.close()
        distance = e2.eqn2(cluster_features[files], cluster_features[clustername])
        if counter % 2 == 0:
            circle = plt.Circle((counter * 10, distance * 100), radius=4, fc='r')
        else:
            circle = plt.Circle((counter, -distance * 100), radius=4, fc='g')
        plt.gca().add_patch(circle)
        #plt.annotate(files, (counter, -distance * 100))
        plt.axis([-10, 400, -100, 100])
        plt.ylabel('Distances from ' + clustername)
        plt.xlabel("cluster number")
        plt.title("Intercluster distances")
        plt.pause(0.1)
    plt.show()
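e2.eqn2 itself is not shown in these examples. From the call sites it takes two token lists and returns a similarity score, which the code compares against thresholds (0.3 here, 0.9 and 1.1 in the cluster() variants further down, after weighting). A Jaccard-style stand-in with that interface, offered only as an assumption about its shape, could look like:

def eqn2(a, b):
    # Hypothetical placeholder for e2.eqn2: overlap between two token lists,
    # normalised to [0, 1]. The project's real formula may well differ.
    sa, sb = set(a), set(b)
    if not sa or not sb:
        return 0.0
    return float(len(sa & sb)) / len(sa | sb)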
def refine():
    # Re-assigns every WSDL that sits alone in a cluster file to the centre
    # with the highest weighted score, then deletes the singleton cluster file.
    global cluster_centres
    global cluster_centre_message_names
    global cluster_centre_message_types
    global cluster_centre_types_used
    path = '/home/kaushik/Desktop/web services project/'
    for fl in os.listdir(
            "/home/kaushik/Desktop/web services project/Clusters"):
        f = open("Clusters/" + fl, "r")
        x = f.readlines()
        f.close()
        if len(x) == 1:
            filename = x[0]
            filename = filename[0:len(filename) - 1]  # drop trailing newline
            mn = cluster_centre_message_names[path + "wsdl dataset/" + filename]  # message names
            mt = cluster_centre_message_types[path + "wsdl dataset/" + filename]  # message types
            tu = cluster_centre_types_used[path + "wsdl dataset/" + filename]     # types used
            scores = []
            for centre in cluster_centres:
                bmn = cluster_centre_message_names[centre]  # base message names
                bmt = cluster_centre_message_types[centre]  # base message types
                btu = cluster_centre_types_used[centre]     # base types used
                score = 2.5 * e2.eqn2(bmn, mn) + e2.eqn2(
                    bmt, mt) + 0.5 * e2.eqn2(btu, tu)
                scores = scores + [score]
            j = scores.index(max(scores))
            f = open("Clusters/cluster" + str(j) + ".txt", "a")
            f.write(filename + "\n")
            f.close()
            print "Removing :", fl
            os.system("rm " + "Clusters/" + fl)
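refine() above and the cluster() variants shown later score a WSDL against a centre with the same weighted sum: 2.5 * eqn2(message names) + 1.0 * eqn2(message types) + 0.5 * eqn2(types used). A small helper like the sketch below (the name match_score is ours, not the project's) would keep the call sites in sync:

def match_score(centre, mn, mt, tu):
    # Weighted similarity between one WSDL (mn, mt, tu) and a stored centre.
    bmn = cluster_centre_message_names[centre]
    bmt = cluster_centre_message_types[centre]
    btu = cluster_centre_types_used[centre]
    return 2.5 * e2.eqn2(bmn, mn) + e2.eqn2(bmt, mt) + 0.5 * e2.eqn2(btu, tu)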
def hierarchical_clustering():  # builds a two-level hierarchy over the clusters
    g = nx.DiGraph()
    list_of_nodes = os.listdir(os.getcwd() + "/Clusters")
    no_of_hier_nodes = 0
    g.add_node("Entire Dataset", size=2000)
    node_sizes["Entire Dataset"] = 2000
    placed = set()  # cluster files already attached to a hierarchy node
    for items in list_of_nodes:
        if items in placed:
            continue
        placed.add(items)
        # Intermediate node that groups clusters similar to this one.
        g.add_node(str(no_of_hier_nodes), size=2000)
        g.add_edge("Entire Dataset", str(no_of_hier_nodes))
        g.add_node(cluster_names[items], size=2000)
        node_sizes[cluster_names[items]] = 2000
        g.add_edge(str(no_of_hier_nodes), cluster_names[items])

        for node in list_of_nodes:
            if node not in placed:
                score = e2.eqn2(cluster_features[items],
                                cluster_features[node])
                if score > 0.3:
                    g.add_node(cluster_names[node], size=2000)
                    node_sizes[cluster_names[node]] = 2000
                    g.add_edge(str(no_of_hier_nodes), cluster_names[node])
                    placed.add(node)

        no_of_hier_nodes = no_of_hier_nodes + 1
    T = nx.bfs_tree(g, source="Entire Dataset")
    nx.draw(g)
    pylab.show()
    g = T
    nx.set_node_attributes(g, "size", node_sizes)
    t = json_graph.tree_data(T, root="Entire Dataset")
    # tree_data returns a Python dict; patch its repr into JSON for graph.json.
    x = str(t)
    x = re.sub("\'id\'", "\'name\'", x)
    x = re.sub("\'", "\"", x)
    fl = open("graph.json", "w")
    fl.write(x)
    fl.close()
    print x
Example #6
def hierarchical_clustering():  # does hierarchical clustering
    g = nx.Graph()
    g.add_node("Entire Dataset")
    if "Hierarchies.csv" in os.listdir('.'):
        os.system("rm Hierarchies.csv")
    hierarchies = open("Hierarchies.csv", "a")
    list_of_clusters = os.listdir(
        '/home/kaushik/Desktop/web services project/Clusters')
    grouped = set()  # clusters already assigned to a hierarchy node
    no_of_hier = 0
    for x in list_of_clusters:
        if x in grouped:
            continue
        grouped.add(x)
        similar_clusters = [x]
        # Pull every still-ungrouped cluster that is similar enough to x
        # into the same hierarchy node.
        for y in list_of_clusters:
            if y not in grouped and \
                    e2.eqn2(cluster_features[x], cluster_features[y]) > 0.3:
                similar_clusters = similar_clusters + [y]
                grouped.add(y)
        no_of_hier = no_of_hier + 1
        g.add_nodes_from(similar_clusters)
        g.add_node(no_of_hier)
        all_clusters = ''
        for node in similar_clusters:
            g.add_edge(no_of_hier, node)
            all_clusters = all_clusters + node + " "
        hierarchies.write(str(no_of_hier) + "," + all_clusters + "\n")
        g.add_edge(no_of_hier, "Entire Dataset")
        print similar_clusters
    hierarchies.close()
    nx.draw(g)
    pylab.show()
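This variant also records each group in Hierarchies.csv as "<group number>,<space-separated cluster files>". A small reader for that layout (a sketch, assuming exactly the format written above) would be:

def read_hierarchies(path="Hierarchies.csv"):
    # Returns {group number: [cluster file names]} from the CSV written above.
    groups = {}
    f = open(path, "r")
    for line in f.readlines():
        num, _, members = line.partition(",")
        groups[int(num)] = members.split()
    f.close()
    return groups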
def cluster():
    # Single-pass clustering: the first WSDL becomes a centre; every later
    # WSDL is scored against all centres and either joins the best match
    # (score > 1.1) or becomes a new centre itself.
    global cluster_centres
    global cluster_centre_message_names
    global cluster_centre_message_types
    global cluster_centre_types_used
    counter = 0
    path = '/home/kaushik/Desktop/web services project/wsdl dataset'
    for fl in os.listdir(path):
        filename = path + "/" + fl
        counter = counter + 1
        #if re.search('str', str(type(wa.findName(filename)))):
        print counter, " : ", len(cluster_centres)
        if counter == 1:
            cluster_centres = cluster_centres + [filename]
            cluster_centre_message_names[filename] = wa.findMessageNames(filename)
            cluster_centre_message_types[filename] = wa.findMessageTypes(filename)
            cluster_centre_types_used[filename] = wa.getTypes(filename)
            f = open("Clusters/cluster" + str(len(cluster_centres)) + ".txt", "a")
            f.write(fl + "\n")
            f.close()
        else:
            mn = wa.findMessageNames(filename)  # message names
            mt = wa.findMessageTypes(filename)  # message types
            tu = wa.getTypes(filename)          # types used
            #ns = wa.breakName(wa.findName(filename))
            scores = []
            for centre in cluster_centres:
                if filename in cluster_centres:
                    break
                bmn = cluster_centre_message_names[centre]  # base message names
                bmt = cluster_centre_message_types[centre]  # base message types
                btu = cluster_centre_types_used[centre]     # base types used
                #bns = wa.breakName(wa.findName(centre))
                score = 2.5 * e2.eqn2(bmn, mn) + e2.eqn2(bmt, mt) + 0.5 * e2.eqn2(btu, tu)
                scores = scores + [score]
            if len(scores) > 0 and max(scores) > 1.1:
                i = scores.index(max(scores))
                f = open("Clusters/cluster" + str(i) + ".txt", "a")
                f.write(fl + "\n")
                f.close()
            else:
                cluster_centres = cluster_centres + [filename]
                cluster_centre_message_names[filename] = wa.findMessageNames(filename)
                cluster_centre_message_types[filename] = wa.findMessageTypes(filename)
                cluster_centre_types_used[filename] = wa.getTypes(filename)
                f = open("Clusters/cluster" + str(len(cluster_centres)) + ".txt", "a")
                f.write(fl + "\n")
                f.close()

    print len(cluster_centres)
def cluster():
    # Variant that only clusters WSDLs whose service name can be extracted
    # (wa.findName returns a string); the assignment threshold here is 0.9.
    global cluster_centres
    global cluster_centre_message_names
    global cluster_centre_message_types
    global cluster_centre_types_used
    counter = 0
    path = '/home/kaushik/Desktop/web services project/wsdl dataset'
    for fl in os.listdir(path):
        filename = path + "/" + fl
        counter = counter + 1
        if re.search('str', str(type(wa.findName(filename)))):
            print counter, " : ", len(cluster_centres)
            if counter == 1:
                cluster_centres = cluster_centres + [filename]
                cluster_centre_message_names[filename] = wa.findMessageNames(filename)
                cluster_centre_message_types[filename] = wa.findMessageTypes(filename)
                cluster_centre_types_used[filename] = wa.getTypes(filename)
                f = open("Clusters/cluster" + str(len(cluster_centres)) + ".txt", "a")
                f.write(fl + "\n")
                f.close()
            else:
                mn = wa.findMessageNames(filename)  # message names
                mt = wa.findMessageTypes(filename)  # message types
                tu = wa.getTypes(filename)          # types used
                ns = wa.breakName(wa.findName(filename))
                scores = []
                for centre in cluster_centres:
                    if filename in cluster_centres:
                        break
                    bmn = cluster_centre_message_names[centre]  # base message names
                    bmt = cluster_centre_message_types[centre]  # base message types
                    btu = cluster_centre_types_used[centre]     # base types used
                    #bns = wa.breakName(wa.findName(centre))
                    score = 2.5 * e2.eqn2(bmn, mn) + e2.eqn2(bmt, mt) + 0.5 * e2.eqn2(btu, tu)
                    scores = scores + [score]
                if len(scores) > 0 and max(scores) > 0.9:
                    i = scores.index(max(scores))
                    f = open("Clusters/cluster" + str(i) + ".txt", "a")
                    f.write(fl + "\n")
                    f.close()
                else:
                    cluster_centres = cluster_centres + [filename]
                    cluster_centre_message_names[filename] = wa.findMessageNames(filename)
                    cluster_centre_message_types[filename] = wa.findMessageTypes(filename)
                    cluster_centre_types_used[filename] = wa.getTypes(filename)
                    f = open("Clusters/cluster" + str(len(cluster_centres)) + ".txt", "a")
                    f.write(fl + "\n")
                    f.close()

    print len(cluster_centres)
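Taken together, the examples suggest a pipeline: cluster() assigns every WSDL in the dataset to a centre, refine() folds single-file clusters back into the best remaining centre, and hierarchical_clustering() groups the resulting clusters under "Entire Dataset". A minimal driver under that reading (the ordering is inferred, and cluster_features / cluster_names must be populated elsewhere before the plotting and hierarchy steps) might be:

if __name__ == "__main__":
    cluster()                  # first pass: assign each WSDL to a centre
    refine()                   # merge singleton clusters into better centres
    # cluster_features and cluster_names are assumed to be filled in elsewhere
    # before the next two calls.
    hierarchical_clustering()  # build, draw, and export the cluster hierarchy
    computeDistances("cluster1.txt")  # hypothetical example argument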