def create_occurence_file(self, filter=""): print("Construction de la matrice d'occurence par cluster\n") code = "" rc = self.getOccurenceCluster(self.models, filter) for r in range(len(rc)): tools.progress(r,len(rc)-1) code = code + "\n<h1>Cluster présent dans " + str( round(100 * rc["Occurence"][r])) + "% des algos</h1>" c = rc["Cluster"][r] code = code + c.print(self.ref_model.data, self.col_name) + "\n" code = code + "\n présent dans " + ",".join(rc["Model"][r]) + "\n" #print(tools.create_html("occurences", code, "http://f80.fr/cnrs")) dfOccurences = pd.DataFrame( data={"Cluster": rc["Cluster"], "Composition":rc["Composition"],"Model": rc["Model"], "Algos": rc["Algos"], "Occurence": rc["Occurence"]}) l_items = list(set(self.ref_model.data[self.col_name].get_values())) for item in l_items: dfOccurences[item] = [0] * len(rc) print("\nTraitement de la mesure "+item) for i in range(len(rc)): tools.progress(i, len(rc)) c = dfOccurences["Cluster"][i] dfOccurences[item][i] = c.labels.count(item) return dfOccurences
def pca_totrace(mod, ref_cluster, pca_offset=0):
    labels = mod.names()
    mesures = tools.normalize(mod.mesures())
    if mod.dimensions != 3:
        pca: decomp.pca.PCA = decomp.pca.PCA(n_components=3 + pca_offset)
        pca.fit(mesures)
        newdata = pca.transform(mesures)
    else:
        newdata = mesures.values

    li_data: list = []
    facets = []
    i = 0
    for c in mod.clusters:
        i = i + 1
        tools.progress(i, len(mod.clusters), "Préparation des clusters pour rendu 3d")
        if len(c.clusters_distances) > 0:
            distances: pd.DataFrame = pd.DataFrame.from_dict(
                c.clusters_distances, orient="index", columns=["distance", "p1", "p2"])
            distances = distances.sort_values("distance")
            distances = distances[0:10]
            distances = distances.transpose()
            ss = distances.to_json()
        else:
            ss = "{}"
        # distances.sort_index(ascending=False)
        facets.append(c.get_3dhull(newdata, pca_offset))
        for k in range(len(c.index)):
            ind = c.index[k]
            x = newdata[ind, pca_offset]
            y = newdata[ind, pca_offset + 1]
            z = newdata[ind, pca_offset + 2]
            sp = {'index': ind, 'x': x, 'y': y, 'z': z,
                  'style': c.color,
                  'label': labels[ind], 'name': labels[ind],
                  'size': 1, 'form': 'sphere',
                  'cluster': c.name,
                  'ref_cluster': ref_cluster[ind],
                  'cluster_distance': ss}
            li_data.append(sp)
    return li_data, facets
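# Illustrative sketch (not from the project): the projection step used by
# pca_totrace, assuming `decomp` aliases sklearn.decomposition and that
# tools.normalize() performs a simple min-max scaling. Variable names mirror the
# function above; the data is random and only meant to show the shapes.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

mesures = pd.DataFrame(np.random.rand(50, 6))                     # 50 points, 6 measures
normalized = (mesures - mesures.min()) / (mesures.max() - mesures.min())

pca = PCA(n_components=3)                                         # 3 components -> x, y, z
newdata = pca.fit_transform(normalized)

x, y, z = newdata[0, 0], newdata[0, 1], newdata[0, 2]             # coordinates of point 0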
def create_trace(self, url="http://f80.fr/cnrs", name="best_",limit=10000,withPerf=False,): print("\nTracés 3D et 2D des résultats.") name = name.replace(" ", "_") code = "Calcul du " + str(datetime.datetime.now()) + "\n\n" for i in range(0, min(limit,len(self.models))): tools.progress(i,min(limit,len(self.models))) code = code + "\nPosition " + str(i + 1) + "<br>" code = code + self.models[i].trace("./saved", name + str(i), url) if withPerf:code = code + self.models[i].print_perfs() tools.create_html("index_" + name, code, url)
def init_distance_cluster(self):
    m = self.mesures().values
    i = 0
    for c1 in self.clusters:
        tools.progress(i, len(self.clusters))
        i = i + 1
        for c2 in self.clusters:
            if c1 != c2:
                if c1.clusters_distances.get(c2.name) is None:
                    d = list(c1.distance_min(c2, m))
                    c1.clusters_distances[c2.name] = d
def initByDistance(self, seuil=1):
    self.init_distances()
    l_edges = []
    for i in range(0, len(self.distances)):
        tools.progress(i, len(self.distances), "Construction du graphe")
        for j in range(0, len(self.distances)):
            if self.distances[i, j] < seuil:
                l_edges.append([i, j])
    self.graph.add_edges_from(l_edges)
    d: dict = dict(zip(range(0, len(self.data)), self.data[self.name_col]))
    nx.set_node_attributes(self.graph, d, "label")
    nx.set_node_attributes(self.graph, self.data[self.measures_col], self.measures_col)
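# Illustrative sketch (not from the project): building a graph by thresholding a
# distance matrix, as initByDistance does with self.distances and `seuil`.
# The matrix and labels below are toy values.
import numpy as np
import networkx as nx

distances = np.random.rand(6, 6)                                  # toy distance matrix
seuil = 0.3

g = nx.Graph()
g.add_nodes_from(range(len(distances)))
edges = [(i, j) for i in range(len(distances))
         for j in range(len(distances)) if distances[i, j] < seuil]
g.add_edges_from(edges)
nx.set_node_attributes(g, {i: "node_" + str(i) for i in g.nodes}, "label")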
def trace_artefact_GL(mod, id="", title="", ref_model=None, pca_offset=0, autorotate=False, add_property=[]): properties_dict: dict = create_dict_for_properties(mod.data, add_property) li_data, facets = pca_totrace(mod, mod.data['ref_cluster'], pca_offset=pca_offset) if len(add_property) > 0: for i in range(0, len(li_data)): tools.progress(i, len(li_data), "Ajout des propriétés") row = li_data[i]["index"] d: dict = properties_dict[row] li_data[i] = ({**li_data[i], **d}) if ref_model is None or ref_model.clusters == mod.clusters: facets_ref = [] else: tmp_li_data, facets_ref = pca_totrace(ref_model, ref_model.data['ref_cluster'], pca_offset=pca_offset) d = pd.concat([mod.data.ix[:, 0], mod.mesures()], axis=1, sort=False) toList = [] for line in d.values: toList.append(list(line)) code = render_template( "modele.html", title=title, name_zone="zone" + id, datas=li_data, components=list(mod.mesures().columns), autorotate=str(autorotate).lower(), data_source=toList, facets_ref=facets_ref, facets=facets, edges=[], url_to_render="/static/rendering/render.html?offset=" + str(pca_offset)) return code
def __init__(self,data=None,url:str="",remote_addr:str="",algo_loc:str=""): self.clusters.clear() if draw.colors is None or len(draw.colors) < 2: draw.colors = draw.init_colors(200) if len(url)>0: tools.progress(0,100,"Chargement du graphe") url=tools.getUrlForFile(url,remote_addr) if not self.load(url,algo_loc): if not self.graph is None: self.save() self.graph = nx.convert_node_labels_to_integers(self.graph, label_attribute="name") if not self.graph is None: tools.progress(90,100,"Préparation") self.data: pd.DataFrame = pd.DataFrame(index=list(range(0,len(self.graph.nodes)))) self.data["name"]=list(self.graph.nodes.keys()) self.dimensions = 3 self.name_col = "name" if not data is None: super().__init__(data=data) self.graph=nx.Graph()
def init_metrics(self, showProgress=False):
    rc = ""
    self.metrics: pd.DataFrame = pd.DataFrame()
    print("Calcul des métriques")
    print("\nPremière passe")
    true_labels = self.ref_model.cluster_toarray()
    for i in range(len(self.models)):
        if showProgress:
            tools.progress(i, len(self.models))
        m: algo.model = self.models[i]
        m.init_metrics(true_labels)
    print("Tri des " + str(len(self.models)) + " modeles")
    self.models.sort(key=lambda x: x.score, reverse=True)
    print("\n2eme passe")
    for i in range(len(self.models)):
        if showProgress:
            tools.progress(i, len(self.models))
        m = self.models[i]
        self.metrics = self.metrics.append(m.toDataframe(true_labels))
        rc = rc + m.print_perfs()
    return rc
def findClusters(self,prefixe="cl_",method="gn",k=5,iter=15): if not self.load_cluster(self.url+"_"+method+str(k)+str(iter)): tools.progress(0, 100, "Recherche des communautés avec "+method) #Initialisation a un cluster unique comm=[set(range(0,len(self.graph.nodes)))] if method.startswith("gn"): tmp=nx.algorithms.community.girvan_newman(self.graph) comm=tuple(sorted(c) for c in next(tmp)) if method.startswith("lab"): comm=nx.algorithms.community.label_propagation_communities(self.graph) if method.startswith("mod"): comm=nx.algorithms.community.greedy_modularity_communities(self.graph) if method.startswith("async"): try: comm = nx.algorithms.community.asyn_fluidc(self.graph,k=k,max_iter=iter) except: tools.progress(100,100,"Impossible d'exécuter async_fluid") i=0 for c in comm: cl=cluster(prefixe+str(i),index=list(c),color=draw.colors[i % len(draw.colors)]) i=i+1 tools.progress(i, 100, "Fabrication des clusters") self.clusters.append(cl) tools.progress(100, 100, "Clustering terminé") self.save_cluster(self.url+"_"+method) else: tools.progress(100,100,"Chargement des clusters")
def clusters_from_labels(self, labels: np.ndarray, colors, name="cl_"):
    # offset = min(old_labels) + 10000
    # labels = [x + offset for x in old_labels]
    d = dict()
    i = 0
    for l in labels:
        if l not in d.keys():
            d[l] = []
        d[l].append(i)
        i = i + 1
    i = 0
    for k in d.keys():
        i = i + 1
        tools.progress(i, len(d), "Construction des clusters")
        color = colors[i % len(colors)]
        if k != -1:
            c: cluster = cluster(name + str(i), index=d[k], color=color, pos=i)
            c.findBestName(self.data[self.name_col], "cl" + str(i) + "_")
            self.clusters.append(c)
def load(self,url,algo_loc=""): self.url=bytes(base64.encodebytes(bytes(url+algo_loc,encoding="utf-8"))).hex() if os.path.exists("./clustering/"+self.url+".gpickle"): tools.progress(50,100,"Chargement depuis le cache") self.graph = nx.read_gpickle("./clustering/"+self.url+".gpickle") return True else: url=tools.dezip(url) if ".gml" in url or ".graphml" in url : tools.progress(50, 100, "Chargement du fichier au format GML") try: self.graph =nx.read_gml(url) except: try: self.graph=nx.read_graphml(url) except: return False if ".gexf" in url or ".gephi" in url: try: self.graph = nx.read_gexf(url) except: print("Impossible de lire "+url) if self.graph is None: tools.progress(50, 100, "Chargement depuis la matrice de distance") try: self.data: pd.DataFrame = tools.get_data_from_url(url,"") except: pass if not self.data is None: self.create_graph_from_dataframe() return True return False
def __init__(self, data: pd.DataFrame, no_metric=False, format: dict = None):
    # Avoid a shared mutable default argument
    if format is None:
        format = dict()
    if draw.colors is None or len(draw.colors) < 2:
        draw.colors = draw.init_colors(200)
    self.data = data

    # Parameter setup
    if "name" not in format:
        format["name"] = [data.columns[0]]  # measure labels are taken from the first column
    if "measures" not in format:
        format["measures"] = data.columns[range(1, len(data.columns.values))]
    if "properties" not in format:
        format["properties"] = []
    # if "properties" not in format:
    #     # By default the properties sit between the measures and the index
    #     if int(format["name"]) + 1 < min(list(format["measures"])) - 1:
    #         format["properties"] = data.columns[list(range(format["name"] + 1, min(format["measures"]) - 1))]
    #     else:
    #         format["properties"] = []

    self.col_name = format["name"][0]
    self.col_measures = format["measures"]
    self.col_properties = format["properties"]

    i = 0
    for c in data[format["measures"]]:
        tools.progress(i, len(format["measures"]), "Conversion des chaines de caractères de " + c)
        i = i + 1
        if data[c].dtype == object and len(data[c]) > 0:
            l_values = set(data[c])
            items = dict(zip(l_values, [0] * len(l_values)))
            ref = data[c][1]
            if tools.getComplexity(data[c]) > 90:
                data[c] = tools.tokenize(data[data.columns[i]])
            else:
                # Encode each distinct string by its Levenshtein distance to the reference value
                for item in items.keys():
                    d = stringdist.levenshtein(item, ref)
                    items[item] = d
                if len(items) < 100:
                    data[c] = data[c].replace(list(items.keys()), list(items.values()))
                else:
                    l = []
                    for k in data[c]:
                        l.append(items[k])
                    data[c] = l

    if "cluster" not in list(self.data.columns):
        if "cluster" not in format:
            format["cluster"] = [self.col_name]
        self.data["ref_cluster"] = self.create_ref_cluster_from_name(self.data, format["cluster"][0])

    self.dimensions = len(format["measures"])  # the components are the remaining columns
    self.ref_model: algo.model = self.init_reference_model()
    if not no_metric:
        self.ref_model.init_metrics(self.ref_model.cluster_toarray())
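# Illustrative sketch (not from the project): the Levenshtein encoding applied by
# the constructor to text columns with few distinct values. It assumes the same
# `stringdist` package (stringdist.levenshtein(a, b) returning an integer
# distance); the column and reference value are toy data.
import pandas as pd
import stringdist

col = pd.Series(["red", "rede", "blue", "red", "green"])
ref = col[1]                                                      # reference value, as in the constructor
mapping = {v: stringdist.levenshtein(v, ref) for v in set(col)}
encoded = col.replace(mapping)                                    # numeric column usable as a measure
print(encoded.tolist())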
def save(self,path=""): tools.progress(0,100,"Enregistrement du fichier dans le cache") if len(path)==0:path="./clustering/"+self.url+".gpickle" if not path.endswith(".gpickle"):path=path+".gpickle" nx.write_gpickle(self.graph,path)
def node_treatments(self):
    G = self.graph
    tools.progress(0, 100, "Degree de centralité")
    if len(nx.get_node_attributes(G, "centrality")) == 0:
        nx.set_node_attributes(G, nx.degree_centrality(G), "centrality")
    tools.progress(20, 100, "Degree de betweeness")
    if len(nx.get_node_attributes(G, "betweenness")) == 0:
        nx.set_node_attributes(G, nx.betweenness_centrality(G), "betweenness")
    tools.progress(40, 100, "Degree de closeness")
    if len(nx.get_node_attributes(G, "closeness")) == 0:
        nx.set_node_attributes(G, nx.closeness_centrality(G), "closeness")
    tools.progress(60, 100, "Page rank")
    try:
        if len(nx.get_node_attributes(G, "pagerank")) == 0:
            nx.set_node_attributes(G, nx.pagerank(G), "pagerank")
    except:
        pass
    tools.progress(80, 100, "Hub and autorities")
    try:
        if len(nx.get_node_attributes(G, "hub")) == 0:
            hub, aut = nx.hits(G)
            nx.set_node_attributes(G, hub, "hub")
            nx.set_node_attributes(G, aut, "autority")
    except:
        pass
    # tools.progress(90, 100, "Excentricity")
    # nx.set_node_attributes(G, nx.eccentricity(G), "eccentricity")
    self.node_treatment = True
    tools.progress(100, 100, "Fin des traitements")
def init_metrics(self, labels_true):
    mes = self.mesures()
    i = 0
    for c in self.clusters:
        tools.progress(i, len(self.clusters), "Initialisation des metrics des clusters")
        c.init_metrics(mes)
        i = i + 1

    if len(self.clusters) > 2:
        labels = self.cluster_toarray()
        tools.progress(10, 100, "Score de silhouete")
        self.silhouette_score = metrics.silhouette_score(self.mesures(), labels)
        tools.progress(40, 100, "Rand Index")
        self.rand_index = metrics.adjusted_rand_score(labels_true, labels)
        # self.adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(labels_true, labels)
        tools.progress(50, 100, "Homogénéité")
        self.homogeneity_score = metrics.homogeneity_score(labels_true, labels)
        tools.progress(60, 100, "Completeness")
        self.completeness_score = metrics.completeness_score(labels_true, labels)
        tools.progress(70, 100, "V-mesure")
        self.v_measure_score = metrics.v_measure_score(labels_true, labels)
        self.score = ((self.silhouette_score + 1) + (self.rand_index + 1) * 1.5 + self.v_measure_score) / 6
        self.score = round(self.score * 20 * 100) / 100
        tools.progress(100, 100, "Calcul metriques terminé")
        if len(self.clusters) < 3:
            self.init_distance_cluster()
    else:
        self.silhouette_score = 0
        self.score = 0
        self.rand_index = 0
        self.homogeneity_score = 0
        self.completeness_score = 0
        self.v_measure_score = 0
    return self.print_perfs()
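# Illustrative sketch (not from the project): the composite score computed by
# init_metrics, reproduced with sklearn.metrics on toy labels:
#   score = ((silhouette + 1) + (rand_index + 1) * 1.5 + v_measure) / 6, then scaled to /20.
import numpy as np
from sklearn import metrics

X = np.random.rand(30, 4)                                         # toy measures
labels_true = np.repeat([0, 1, 2], 10)                            # reference clustering
labels = np.repeat([0, 1, 2], 10)                                 # clustering to evaluate

silhouette = metrics.silhouette_score(X, labels)
rand_index = metrics.adjusted_rand_score(labels_true, labels)
v_measure = metrics.v_measure_score(labels_true, labels)

score = ((silhouette + 1) + (rand_index + 1) * 1.5 + v_measure) / 6
score = round(score * 20 * 100) / 100                             # mark out of 20, two decimals
print(score)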