def evaluate(datos, ponderaciones, silencios, carga_computacional=1,
             tipo_clustering="KMEANS", ncluster=3, metrica="SIL"):
    '''
    Given a data frame, a vector of feature weights and the silenced columns,
    segments the data and returns a clustering-quality score for that
    segmentation (Hopkins statistic, silhouette or entropy, depending on the
    configuration).
    '''
    global light_segs
    fecha = False
    if len(ponderaciones) == 0:
        return 0.0
    # A negative computational load selects the light segmentation path;
    # a fractional one evaluates on a random subsample of the rows.
    ligero = np.sign(carga_computacional) < 0
    carga_computacional = abs(carga_computacional)
    if carga_computacional < 1:
        datos = datos.sample(int(datos.shape[0] * carga_computacional))
        datos = datos.sort_index()
        carga_computacional = 1
        fecha = True
    if carga_computacional >= 1:
        if not ligero:
            segmentos, _ = sg.segmentate_data_frame(df=datos, montecarlo=1, min_size=4,
                                                    silence=silencios,
                                                    vector_importancias=ponderaciones,
                                                    verbose=False)
            mean = len(sg.ultra_light_segmentation(datos, fecha=fecha)) / 2
        else:
            # Reuse the cached light segmentation if it has already been computed.
            if len(light_segs) == 0:
                light_segs = sg.ultra_light_segmentation(datos, fecha=fecha)
            segmentos = light_segs
            mean = len(segmentos) / 2
        #mean = int(datos.shape[0]/100)
        if carga_computacional == 1:
            segmentados = cl.apply_segmentation(datos, segmentos, silencios, ponderaciones, fecha)
            if segmentados.shape[0] <= 6:
                return 0.0
            else:
                std = np.sqrt(mean)
                # Re-join the segment representatives and score the result with the
                # Hopkins statistic, weighted by normal_correction on the segment count.
                nsegs = []
                for i in range(segmentados.shape[0]):
                    nsegs.append([i, i + 1])
                segmentos = sg.join_segments(data=segmentados, o_segments=nsegs,
                                             distance=sg.interpretable_distance,
                                             threshold=0.5, minimum_size=1,
                                             silence=silencios,
                                             vector_importancias=ponderaciones)[0]
                segmentados = cl.apply_segmentation(segmentados, segmentos, silencios,
                                                    ponderaciones, fecha)
                return (cl.hopkins_statistic(cl.filter_numerical(segmentados),
                                             m=int(segmentados.shape[0] * 0.5))
                        * normal_correction(mean, std, len(segmentados)))
        elif carga_computacional == 2:
            if tipo_clustering == "DTW":
                segmentados = sg.get_segments(datos, segmentos)
                asignaciones = pam.kmedoids(segmentados, n_clus=ncluster)
                segments_df = cl.apply_clustering(datos, segmentos, asignaciones[1], asignaciones[2])
            elif tipo_clustering == "KMEANS":
                segments_df = cl.full_clustering(datos, segmentos, n_clus=ncluster,
                                                 silencio=silencios, pesos=ponderaciones,
                                                 normalizar=False)
            if metrica == "SIL":
                return cl.sil_metric(segments_df[0])
            elif metrica == "ENTROPY":
                return entropy_metric(segments_df)
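# Hedged usage sketch (not part of the original code): evaluate() returns a score
# for a candidate weight vector and is typically driven as a fitness function.
# The DataFrame `df`, the weight values and the empty light_segs cache below are
# illustrative assumptions only.
#
#   light_segs = []                                    # global cache used by the light path
#   pesos = [1.0] * len(df._get_numeric_data().columns)
#   score = evaluate(df, ponderaciones=pesos, silencios=[],
#                    carga_computacional=1)            # full segmentation + Hopkins score
#   score_light = evaluate(df, ponderaciones=pesos, silencios=[],
#                          carga_computacional=-0.5)   # negative sign -> light path on a 50% sample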
def clara(data, k):
    '''
    CLARA implementation
    1. For i = 1 to 5, repeat the following steps:
    2. Draw a sample of 40 + 2k objects randomly from the entire data set,
       and call Algorithm PAM to find k medoids of the sample.
    3. For each object Oj in the entire data set, determine which of the
       k medoids is the most similar to Oj.
    4. Calculate the average dissimilarity of the clustering obtained in the
       previous step. If this value is less than the current minimum, use this
       value as the current minimum, and retain the k medoids found in Step 2
       as the best set of medoids obtained so far.
    5. Return to Step 1 to start the next iteration.
    '''
    size = len(data)
    min_avg_cost = float('inf')
    best_choice = []
    best_res = {}
    for i in range(claraLoopNum):
        # Construct the sampling subset
        sampling_idx = random.sample([i for i in range(size)], (40 + k * 2))
        sampling_data = []
        for idx in sampling_idx:
            sampling_data.append(data[idx])

        # Run kmedoids for the sampling
        pre_cost, pre_choice, pre_medoids = kmedoids(sampling_data, k)
        if debugEnabled == True:
            print('pre_cost: ', pre_cost)
            print('pre_choice: ', pre_choice)
            print('pre_medoids: ', pre_medoids)

        # Convert the pre_choice from sampling_data to the whole data
        pre_choice2 = []
        for idx in pre_choice:
            idx2 = data.index(sampling_data[idx])
            pre_choice2.append(idx2)
        if debugEnabled == True:
            print('pre_choice2: ', pre_choice2)

        # Clustering for all data set
        tmp_avg_cost, tmp_medoids = averageCost(data, 0, pre_choice2)
        if debugEnabled == True:
            print('tmp_avg_cost: ', tmp_avg_cost)
            print('tmp_medoids: ', tmp_medoids)

        # Update the best
        if tmp_avg_cost <= min_avg_cost:
            min_avg_cost = tmp_avg_cost
            best_choice = list(pre_choice2)
            best_res = dict(tmp_medoids)
    return (min_avg_cost, best_choice, best_res)
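# A minimal driver sketch, assuming the kmedoids() and averageCost() helpers
# referenced above accept a plain list of points with a numeric dissimilarity,
# and that claraLoopNum / debugEnabled are defined at module level. It only
# illustrates how clara() is called; it is not part of the algorithm itself.
if __name__ == '__main__':
    random.seed(42)
    # Two loose 2-D blobs of 60 points each; 40 + 2k = 44 points are sampled per pass.
    toy = ([(random.gauss(0.0, 1.0), random.gauss(0.0, 1.0)) for _ in range(60)]
           + [(random.gauss(5.0, 1.0), random.gauss(5.0, 1.0)) for _ in range(60)])
    best_cost, best_medoid_idx, best_clusters = clara(toy, k=2)
    print('best average cost:', best_cost)
    print('best medoid indices:', best_medoid_idx)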
def full_clustering(X, segmentos, n_clus=3, mode="K-Means", silencio=[], pesos=None, normalizar=True):
    '''
    Given a data frame and its segmentation (segment indexes), applies
    clustering to the segments and returns a data frame with one representative
    row per segment, together with the fitted clustering object.
    '''
    X.drop('cluster', axis=1, errors='ignore', inplace=True)
    if pesos is None:
        pesos = [1] * (len(list(X._get_numeric_data())) - len(silencio) * 2)
    if mode == "K-Means":
        # Represent each segment, optionally normalise, then cluster with K-Means.
        segments_df = apply_segmentation(X, segmentos, silencio, pesos)
        if normalizar:
            segments_df = minmax_norm(segments_df)
        fit = clustering(segments_df, n_clus)
        segments_df['cluster'] = fit.labels_
        segments_df['cluster'] = segments_df['cluster'].astype(str)
    else:
        # Medoid-based mode: cluster the raw segment arrays with k-medoids (PAM).
        X_num = filter_numerical(X)
        if normalizar:
            X_num = minmax_norm(X_num)
        X_num = filter_silence(X_num, silencio)
        X_np = np.array(X_num)
        X_segments = get_segments_nparray(X_np, segmentos)
        _, best_choice, best_res = pam.kmedoids(X_segments, n_clus)
        fit = FTWFit(best_choice, filter_numerical(X))
        segments_df = apply_segmentation(X, segmentos, silencio, pesos)
        segments_df = add_clustering_segments(segments_df, best_choice, best_res)
        if normalizar:
            segments_df = minmax_norm(segments_df)
    return segments_df, fit
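# Hedged usage sketch (illustrative only): segments are assumed to be
# [start, end] row-index pairs, matching the nsegs built inside evaluate()
# above; the DataFrame, column names and segment size are made-up values.
#
#   df = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
#   segmentos = [[i, i + 10] for i in range(0, 100, 10)]   # ten equal segments
#   segments_df, fit = full_clustering(df, segmentos, n_clus=3,
#                                      mode="K-Means", silencio=[])
#   print(segments_df['cluster'].value_counts())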
def clara(data, k, COST=0, distDictClara={}, simDictClara={}, affinities={},
          bagSize=BagSize, namedPoints=True, degreeSelection=degreeSelection,
          claraLoopNum=claraLoopNum, noIsolates=True, saveAllResults=False,
          acceleration=0, take_all_nodes=False):
    '''
    CLARA implementation
    1. For i = 1 to 5, repeat the following steps:
    2. Draw a sample of 40 + 2k objects randomly from the entire data set,
       and call Algorithm PAM to find k medoids of the sample.
    3. For each object Oj in the entire data set, determine which of the
       k medoids is the most similar to Oj.
    4. Calculate the average dissimilarity of the clustering obtained in the
       previous step. If this value is less than the current minimum, use this
       value as the current minimum, and retain the k medoids found in Step 2
       as the best set of medoids obtained so far.
    5. Return to Step 1 to start the next iteration.
    '''
    size = len(data)
    min_cost = float('inf')
    best_choice = []
    best_res = {}
    sampling_idx = []
    cost_list = []
    isolates = []
    print "clara COST: ", COST
    print "take all nodes: ", take_all_nodes
    if take_all_nodes:
        bagSize = len(affinities)
    if saveAllResults:
        allResults = []

    def IDtoname(data, best_med):
        best_med_names = {}
        best_choice_names = []
        for medID in best_med.keys():
            #best_choice_names.append(data[medID])
            best_med_names[data[medID]] = []
            for pointID in best_med[medID]:
                best_med_names[data[medID]].append(data[pointID])
        #best_choice = best_choice_names
        best_med = best_med_names
        return best_med

    # If degreeSelection == True, then the 4*k nodes with the highest degree are
    # always included in the subsample (the rest of the bag, up to bagSize,
    # is sampled as usual).
    if degreeSelection:
        degree = {}
        sampling_data_permanent = []
        sampling_data = []

        def getDegree(item):
            return item[1]

        # Compute sorted list of node degrees
        for i in list(data):  # iterate over a copy so removals don't skip items
            # remove singletons (treat them as isolates)
            # first compute weighted degree
            # TODO: find out the reasons for it
            try:
                degree[i] = sum(affinities[i].values())
            except KeyError:
                print "not in affinities: ", i
                degree[i] = 0
            #print "degree of %s is %f" % (i, degree[i])
            if degree[i] == 0.0 and noIsolates == True:
                isolates.append(i)
                data.remove(i)

        # Then remove paired nodes the same way as singletons.
        # As the algorithm doesn't require absolute connectivity,
        # we deliberately leave components with 3+ nodes, hoping
        # that they will appear to be separate clusters after
        # averaging multiple randomized clara results with kopt.py.
        for i in list(data):
            try:
                if len(affinities[i]) == 1 and len(affinities[affinities[i].keys()[0]]) == 1:
                    isolates.append(i)
                    data.remove(i)
            except:
                print "LOOK OUT: ", len(affinities[i]), affinities[i]

        # list rather than dict because dict cannot be ordered
        degree = sorted(degree.items(), key=getDegree, reverse=True)
        print "degrees obtained"

        # Obtain the bag of most prominent nodes for Clara clustering
        limit = k * 4
        if k * 4 > len(data):
            print "used up all data points for degree selection: %d points instead of 4k = %d" % (len(data), 4 * k)
            limit = len(data)
        for point in degree:
            if len(sampling_data_permanent) >= limit:
                break
            sampling_data_permanent.append(point[0])
        therest = [point for point in data if point not in sampling_data]
        print "len(therest): ", len(therest)
        print "len(data): ", len(data)
        print "bagSize , bagSize - k*4: ", bagSize, bagSize - k * 4

    iterspot = 0
    for i in range(claraLoopNum):
        iterspot += 1
        print "\n\nRUN No.", iterspot
        # Construct the sampling subset
        if degreeSelection == False:
            sampling_data = []
            sampling_idx = random.sample([i for i in range(size)], bagSize)
            for idx in sampling_idx:
                sampling_data.append(data[idx])
        else:
            sampling_data = list(sampling_data_permanent)
            sampling_idx = random.sample([i for i in range(len(therest))], bagSize - k * 4)
            for idx in sampling_idx:
                sampling_data.append(therest[idx])
        print "all nodes/points: ", len(sampling_data)
        print "permanently selected nodes/points: ", len(sampling_data_permanent)

        # Run kmedoids for the sampling
        pre_cost, pre_choice, pre_medoids = kmedoids(sampling_data, k, COST,
                                                     distDictKM=distDictClara,
                                                     simDictKM=simDictClara,
                                                     namedPoints=False,
                                                     acceleration=acceleration)
        if debugEnabled == True:
            print('pre_cost: ', pre_cost)
            print('pre_choice: ', pre_choice)
            print('pre_medoids: ', pre_medoids)  # pre_medoids are not too long to display

        # Convert the pre_choice from sampling_data to the whole data
        pre_choice2 = []
        for idx in pre_choice:
            #print sampling_data[idx]
            idx2 = data.index(sampling_data[idx])
            pre_choice2.append(idx2)
        if debugEnabled == True:
            print('pre_choice2: ', pre_choice2)

        # Clustering for all data set
        tmp_cost, tmp_medoids = targetFunction(data, COST, pre_choice2,
                                               distDict=distDictClara,
                                               simDict=simDictClara,
                                               affinities=affinities)
        cost_list.append(tmp_cost)
        if debugEnabled == True:
            print 'tmp_cost: ', tmp_cost
            print 'tmp_medoids: ', 'OK'  #tmp_medoids

        # If the points are named, display the names (refactor it)
        if namedPoints:
            tmp_medoids = IDtoname(data, tmp_medoids)
            pre_choice2 = tmp_medoids.keys()

        # Update the best
        if tmp_cost <= min_cost:
            min_cost = tmp_cost
            best_choice = list(pre_choice2)
            best_res = dict(tmp_medoids)
        if saveAllResults:
            allResults.append(tmp_medoids)

    if saveAllResults:
        return (min_cost, best_choice, best_res, cost_list, isolates, allResults)
    else:
        return (min_cost, best_choice, best_res, cost_list, isolates)
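# Hedged calling sketch (illustrative only): `data` is assumed to be a list of
# node names and `affinities` a nested dict of edge weights, which is what the
# degree/isolate handling above implies. The node names, weights, bagSize and
# claraLoopNum values are made up; degreeSelection is disabled so the whole
# toy graph fits in a single bag.
#
#   nodes = ['a', 'b', 'c', 'd', 'e', 'f']
#   affinities = {'a': {'b': 1.0, 'c': 0.5}, 'b': {'a': 1.0}, 'c': {'a': 0.5},
#                 'd': {'e': 2.0}, 'e': {'d': 2.0, 'f': 0.3}, 'f': {'e': 0.3}}
#   cost, choice, medoids, costs, isolates = clara(nodes, k=2, COST=0,
#                                                  affinities=affinities, bagSize=6,
#                                                  degreeSelection=False, claraLoopNum=3)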