import csv

from sklearn_extra.cluster import KMedoids


def test_kmedoids(dtw_value, cluster_num, seed):
    # metric="precomputed" tells KMedoids to treat dtw_value as a ready-made
    # pairwise distance matrix (here, DTW distances between series).
    km = KMedoids(n_clusters=cluster_num, random_state=seed,
                  metric="precomputed", init='k-medoids++', max_iter=30000)
    dists = dtw_value
    y_pred = km.fit_predict(dists)
    with open(r".//res//grid_pred_d" + str(cluster_num) + ".csv", "w",
              encoding='UTF-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in y_pred:
            writer.writerow([row])
    with open(r".//res//grid_centroids_d" + str(cluster_num) + ".csv", "w",
              encoding='UTF-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for yi in range(cluster_num):
            writer.writerow([km.medoid_indices_[yi]])
    print('finish')
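
# Usage sketch (an assumption, not from the original source): one way to build
# the precomputed DTW matrix is tslearn's cdist_dtw; `series` is toy data here.
if __name__ == "__main__":
    import os

    import numpy as np
    from tslearn.metrics import cdist_dtw

    os.makedirs("res", exist_ok=True)               # test_kmedoids writes into ./res
    series = np.random.RandomState(0).rand(30, 24)  # 30 toy profiles, 24 values each
    dtw_matrix = cdist_dtw(series)                  # symmetric (30, 30) DTW distances
    test_kmedoids(dtw_matrix, cluster_num=3, seed=42)
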
import pandas as pd
from sklearn_extra.cluster import KMedoids

# `get_distance`, `Day`, and `plan_itinerary_LP` are project-local helpers
# assumed to be importable alongside this class.


class ClusterPlanner:
    def __init__(self, items, n_poi):
        self.n_poi = n_poi
        self.n_clusters = len(items) // n_poi
        self.clusterer = None
        self.food_types = ['restaurants', 'fast_food', 'food']
        self.items = [item for item in items
                      if item.item_type not in self.food_types]
        self.restaurants = [item for item in items
                            if item.item_type in self.food_types]
        print('items', len(self.items))
        self.data = pd.DataFrame({
            'index': [i for i in range(len(self.items))],
            'lat': [item.coordinate['lat'] for item in self.items],
            'lon': [item.coordinate['lon'] for item in self.items],
            'type': [item.item_type for item in self.items]
        })

    def cluster_data(self):
        # self.clusterer = DBSCAN(eps=0.1, min_samples=self.n_poi, metric='haversine')
        # Note: scikit-learn's 'haversine' metric expects [lat, lon] in radians.
        self.clusterer = KMedoids(n_clusters=self.n_poi, metric='haversine')
        self.data['label'] = self.clusterer.fit_predict(
            self.data[['lat', 'lon']].values)

    def insert_restaurant(self, poi, idx, day):
        """Insert the restaurant closest to `poi` into `day` at position `idx`."""
        if self.restaurants:
            distances = [get_distance(poi, rest) for rest in self.restaurants]
            index = distances.index(min(distances))
            day.insert_item(self.restaurants[index], idx)
            self.restaurants = [self.restaurants[l]
                                for l in range(len(self.restaurants))
                                if l != index]

    def plan_days(self, i=0, days=None, pois=None):
        """Plan every cluster in the city, then return the full plan."""
        # Mutable default arguments persist across calls; use None and create
        # fresh lists instead.
        if days is None:
            days = []
        if pois is None:
            pois = []
        if i == self.n_clusters:
            return days
        places = self.data[self.data['label'] == i]
        p_items = [self.items[int(p['index'])] for l, p in places.iterrows()]
        # Run the itinerary optimizer only when the cluster holds more than 3 items.
        if len(p_items) > 3:
            itinerary = plan_itinerary_LP(p_items)[:-1]
            days.append(Day(i, itinerary))
        elif p_items:
            days.append(Day(i, p_items))
            for k in range(0, len(p_items)):
                if 'hotel' == p_items[k].item_type and k != 0:
                    days[i].swap_items(0, k)
        return self.plan_days(i + 1, days, pois)
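
# Usage sketch (an assumption, not from the original source): ClusterPlanner
# only needs objects exposing `item_type` and `coordinate`; `Item` here is a
# hypothetical stand-in for the project's real item class.
if __name__ == "__main__":
    from collections import namedtuple

    Item = namedtuple('Item', ['item_type', 'coordinate'])
    items = [
        Item('museum', {'lat': 48.8606, 'lon': 2.3376}),
        Item('hotel', {'lat': 48.8530, 'lon': 2.3499}),
        Item('restaurants', {'lat': 48.8566, 'lon': 2.3522}),
        Item('park', {'lat': 48.8462, 'lon': 2.3372}),
    ]
    planner = ClusterPlanner(items, n_poi=2)
    planner.cluster_data()
    print(planner.data[['type', 'label']])
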
import numpy as np
from numpy.testing import assert_array_equal
from scipy.sparse import csc_matrix
from sklearn_extra.cluster import KMedoids

# Module-level test fixtures assumed by these tests (values chosen so the
# 100-sample assertions below hold).
seed = 0
X = np.random.RandomState(seed).rand(100, 5)


def test_kmedoids_on_sparse_input():
    rng = np.random.RandomState(seed)
    model = KMedoids(n_clusters=2, random_state=rng)
    row = np.array([1, 0])
    col = np.array([0, 4])
    data = np.array([1, 1])
    X_sparse = csc_matrix((data, (row, col)), shape=(2, 5))
    labels = model.fit_predict(X_sparse)
    assert len(labels) == 2
    assert_array_equal(labels, model.labels_)


def test_callable_distance_metric():
    rng = np.random.RandomState(seed)

    def my_metric(a, b):
        # Plain Euclidean distance, supplied as a Python callable.
        return np.sqrt(np.sum(np.power(a - b, 2)))

    model = KMedoids(random_state=rng, metric=my_metric)
    labels1 = model.fit_predict(X)
    assert len(labels1) == 100
    assert_array_equal(labels1, model.labels_)


def test_kmedoids_fit_predict_transform():
    rng = np.random.RandomState(seed)
    model = KMedoids(random_state=rng)
    labels1 = model.fit_predict(X)
    assert len(labels1) == 100
    assert_array_equal(labels1, model.labels_)
    labels2 = model.predict(X)
    assert_array_equal(labels1, labels2)
    Xt1 = model.fit_transform(X)
    assert_array_equal(Xt1.shape, (100, model.n_clusters))
    Xt2 = model.transform(X)
    assert_array_equal(Xt1, Xt2)
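
# Small illustration (an assumption, not part of the test suite): transform
# returns each sample's distance to every medoid, and predict returns the
# index of the nearest one.
if __name__ == "__main__":
    m = KMedoids(n_clusters=3, random_state=0).fit(X)
    print(m.transform(X[:2]))  # shape (2, 3): distances to the 3 medoids
    print(m.predict(X[:2]))    # nearest-medoid index for each sample
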
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn_extra.cluster import KMedoids


def run_knn(df, y):
    # Despite the name, this runs k-medoids over a range of cluster counts;
    # calculate_silhouette, purity_score, plot_purity, and plot_elbow are
    # project-local helpers.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(df)
    range_n_clusters = np.array([2, 3, 4, 5, 6])
    inertias = []
    purity = []
    for n_clusters in range_n_clusters:
        clusterer = KMedoids(n_clusters=n_clusters, metric="manhattan",
                             init='k-medoids++', max_iter=100,
                             random_state=1234)
        cluster_labels = clusterer.fit_predict(X)
        inertias.append(clusterer.inertia_)
        calculate_silhouette(X, cluster_labels, n_clusters)
        purity.append(purity_score(y, cluster_labels))
    plot_purity(range_n_clusters, purity)
    plot_elbow(range_n_clusters, inertias)
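
# A minimal sketch (an assumption; the project's own purity_score may differ):
# purity is the fraction of samples assigned to the majority true class of
# their cluster, computed here from the contingency matrix.
def purity_score_sketch(y_true, y_pred):
    from sklearn.metrics.cluster import contingency_matrix

    matrix = contingency_matrix(y_true, y_pred)   # rows: classes, cols: clusters
    return matrix.max(axis=0).sum() / matrix.sum()
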
def k_medoids_draw(self, self_fig):
    # Compute k-medoids and draw the clustering result.
    # self.k_model_draw(KMeans)
    class_num = 3 if self.ui.class_num_lineEdit.text() == "" else int(
        self.ui.class_num_lineEdit.text())
    max_iter_num = 300 if self.ui.lineEdit_max_iter.text() == "" else int(
        self.ui.lineEdit_max_iter.text())
    random_state = 170
    x_varied = self.data.x_varied
    if x_varied.shape[1] == 2:  # 2-D data
        fig, ax1 = plt.subplots(figsize=(8, 5))
    elif x_varied.shape[1] == 3:  # 3-D data
        fig = plt.figure()
        ax1 = fig.add_subplot(111, projection='3d')
    # Pick the init strategy from the radio buttons.
    if self.ui.radioButton_kmedoids_random.isChecked():
        init_string = 'random'
    elif self.ui.radioButton_kmedoids_heuristic.isChecked():
        init_string = 'heuristic'
    else:
        init_string = 'k-medoids++'
    # y_pred = KMedoids(n_clusters=class_num, random_state=random_state).fit_predict(x_varied)
    metric_string = self.ui.comboBox_kMedoids.currentText()
    print(metric_string)
    KMedoids_model = KMedoids(init=init_string, n_clusters=class_num,
                              random_state=random_state,
                              max_iter=max_iter_num, metric=metric_string)
    y_pred = KMedoids_model.fit_predict(self.data.x_varied)
    ax1.scatter(x_varied[:, 0], x_varied[:, 1], c=y_pred)
    global centroids
    centroids = None
    if self.ui.checkBox_Voronoi.isChecked():
        Vor_Dia(x_varied, KMedoids_model)
    self_fig(fig)
    self.evaluate(KMedoids_model, y_pred)
    self.add_datay_table(y_pred)
optimalClusterClasses(pcaXCD4DataSet_transformed)  # Optimal number: 5 clusters

# k-means
km = KMeans(n_clusters=3)
yKM = km.fit_predict(pcaXCD4DataSet_transformed)
clusterVisualize(yKM, pcaXCD4DataSet_transformed, km)
print(homogeneity_completeness_v_measure(YCD4DataSet, yKM))
print(homogeneity_score(YCD4DataSet, yKM))
print(completeness_score(YCD4DataSet, yKM))
print(v_measure_score(YCD4DataSet, yKM))

# KMedoids: a non-hierarchical, iterative method
kMedoids = KMedoids(n_clusters=3, metric='euclidean')
yminiKM = kMedoids.fit_predict(X=pcaXCD4DataSet_transformed)
clusterVisualize(yminiKM, pcaXCD4DataSet_transformed, kMedoids)
print(homogeneity_completeness_v_measure(YCD4DataSet, yminiKM))
print(homogeneity_score(YCD4DataSet, yminiKM))
print(completeness_score(YCD4DataSet, yminiKM))
print(v_measure_score(YCD4DataSet, yminiKM))

kMedoids = KMedoids(n_clusters=3, metric='manhattan')
yminiKM = kMedoids.fit_predict(X=pcaXCD4DataSet_transformed)
clusterVisualize(yminiKM, pcaXCD4DataSet_transformed, kMedoids)
print(homogeneity_completeness_v_measure(YCD4DataSet, yminiKM))
print(homogeneity_score(YCD4DataSet, yminiKM))
print(completeness_score(YCD4DataSet, yminiKM))
print(v_measure_score(YCD4DataSet, yminiKM))

# AgglomerativeClustering: a hierarchical agglomerative method
def kmedoids(self, filtered_df):
    cluster = KMedoids(n_clusters=self.clust_num,
                       random_state=self.random_state)
    res_cluster = cluster.fit_predict(filtered_df)
    return res_cluster
# Imports implied by the body of kmedoid_clusters (nested_dict is the PyPI
# `nested_dict` package).
import os
import sys
from collections import defaultdict
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nested_dict import nested_dict
from scipy.interpolate import interp1d
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids


def kmedoid_clusters():
    editable_data_path = os.path.join(sys.path[0], 'editable_values.csv')
    editable_data = pd.read_csv(editable_data_path, header=None, index_col=0,
                                squeeze=True).to_dict()[1]
    city = editable_data['city']
    save_path = os.path.join(sys.path[0], 'Scenario Generation', city)
    representative_days_path = os.path.join(save_path, 'Representative days')
    if not os.path.exists(representative_days_path):
        os.makedirs(representative_days_path)
    folder_path = os.path.join(sys.path[0], str(city))
    GTI_distribution = pd.read_csv(os.path.join(folder_path, 'best_fit_GTI.csv'))
    wind_speed_distribution = pd.read_csv(
        os.path.join(folder_path, 'best_fit_wind_speed.csv'))
    range_data = ['low', 'medium', 'high']
    scenario_generated = {}
    scenario_probability = defaultdict(list)
    solar_probability = defaultdict(list)
    wind_probability = defaultdict(list)
    for i in range(8760):
        if GTI_distribution['Mean'][i] == 0:
            solar_probability['low'].append(1 / 3)
            solar_probability['medium'].append(1 / 3)
            solar_probability['high'].append(1 / 3)
        # If solar GTI is normal: from Rice & Miller, low = 0.112702 = (x - loc) / scale
        elif GTI_distribution['Best fit'][i] == 'norm':
            solar_probability['low'].append(0.166667)
            solar_probability['medium'].append(0.666667)
            solar_probability['high'].append(0.166667)
        # If solar GTI is uniform: from Rice & Miller, low = 0.112702, (i - loc) / scale
        elif GTI_distribution['Best fit'][i] == 'uniform':
            solar_probability['low'].append(0.277778)
            solar_probability['medium'].append(0.444444)
            solar_probability['high'].append(0.277778)
        # If solar GTI is expon: from Rice & Miller, low = 0.415775, (i - loc) / scale
        elif GTI_distribution['Best fit'][i] == 'expon':
            solar_probability['low'].append(0.711093)
            solar_probability['medium'].append(0.278518)
            solar_probability['high'].append(0.010389)
        if wind_speed_distribution['Mean'][i] == 0:
            wind_probability['low'].append(1 / 3)
            wind_probability['medium'].append(1 / 3)
            wind_probability['high'].append(1 / 3)
        # Same Rice & Miller weights for the wind-speed distributions.
        elif wind_speed_distribution['Best fit'][i] == 'norm':
            wind_probability['low'].append(0.166667)
            wind_probability['medium'].append(0.666667)
            wind_probability['high'].append(0.166667)
        elif wind_speed_distribution['Best fit'][i] == 'uniform':
            wind_probability['low'].append(0.277778)
            wind_probability['medium'].append(0.444444)
            wind_probability['high'].append(0.277778)
        elif wind_speed_distribution['Best fit'][i] == 'expon':
            wind_probability['low'].append(0.711093)
            wind_probability['medium'].append(0.278518)
            wind_probability['high'].append(0.010389)
    p_solar = nested_dict()
    p_wind = nested_dict()
    scenario_number = {}
    num_scenario = 0
    # Load the energy demand, solar, wind, and electricity emissions from the
    # scenario generation files.
    for i_demand in range_data:
        for i_solar in range_data:
            for i_wind in range_data:
                for i_emission in range_data:
                    if i_demand == 'low':
                        p_demand = 0.277778
                    elif i_demand == 'medium':
                        p_demand = 0.444444
                    elif i_demand == 'high':
                        p_demand = 0.277778
                    if i_emission == 'low':
                        p_emission = 0.166667
                    elif i_emission == 'medium':
                        p_emission = 0.666667
                    elif i_emission == 'high':
                        p_emission = 0.166667
                    name = ('D:' + i_demand + '/S:' + i_solar + '/W:' + i_wind
                            + '/C:' + i_emission)
                    for day in range(365):
                        p_solar[i_solar][day] = sum(
                            solar_probability[i_solar][day * 24:(day + 1) * 24]
                        ) / (sum(solar_probability[range_data[0]][day * 24:(day + 1) * 24])
                             + sum(solar_probability[range_data[1]][day * 24:(day + 1) * 24])
                             + sum(solar_probability[range_data[2]][day * 24:(day + 1) * 24]))
                        p_wind[i_wind][day] = sum(
                            wind_probability[i_wind][day * 24:(day + 1) * 24]
                        ) / (sum(wind_probability[range_data[0]][day * 24:(day + 1) * 24])
                             + sum(wind_probability[range_data[1]][day * 24:(day + 1) * 24])
                             + sum(wind_probability[range_data[2]][day * 24:(day + 1) * 24]))
                        scenario_probability[name].append(
                            p_demand * p_solar[i_solar][day]
                            * p_wind[i_wind][day] * p_emission)
                    scenario_number[name] = num_scenario
                    num_scenario = num_scenario + 1
                    scenario_generated[name] = pd.read_csv(
                        os.path.join(save_path,
                                     'D_' + i_demand + '_S_' + i_solar + '_W_'
                                     + i_wind + '_C_' + i_emission + '.csv'),
                        header=None)
    features_scenarios = defaultdict(list)
    features_scenarios_list = []
    features_probability_list = []
    features_scenarios_nested = nested_dict()
    k = 0
    days = 365
    for scenario in scenario_generated.keys():
        for i in range(days):
            if i == 0:
                data = scenario_generated[scenario][1:25]
            else:
                data = scenario_generated[scenario][25 + (i - 1) * 24:25 + i * 24]
            # Total electricity, heating, solar, wind, EF.
            daily_list = list(chain(data[0].astype('float', copy=False),
                                    data[1].astype('float', copy=False),
                                    data[2].astype('float', copy=False),
                                    data[3].astype('float', copy=False),
                                    data[6].astype('float', copy=False)))
            features_scenarios[k * days + i] = daily_list
            features_scenarios_nested[scenario][i] = features_scenarios[k * days + i]
            features_scenarios_list.append(features_scenarios[k * days + i])
            features_probability_list.append(scenario_probability[scenario][i])
        k = k + 1
    A = np.asarray(features_scenarios_list)
    # Standardize the feature matrix, then reduce it with PCA.
    standardization_data = StandardScaler()
    A_scaled = standardization_data.fit_transform(A)
    pca = PCA(n_components=int(editable_data['PCA numbers']))
    principalComponents = pca.fit(A_scaled)
    scores_pca = pca.transform(A_scaled)
    #print('Score of features', scores_pca)
    #print('Explained variance ratio', pca.explained_variance_ratio_)
    # Plot the explained variances and save components to a DataFrame.
    features = range(pca.n_components_)
    search_optimum_feature = editable_data['Search optimum PCA']
    SMALL_SIZE = 10
    MEDIUM_SIZE = 12
    BIGGER_SIZE = 14
    plt.rcParams['axes.facecolor'] = 'white'
    plt.rcParams['axes.grid'] = False
    plt.rcParams['axes.edgecolor'] = 'black'
    if search_optimum_feature == 'yes':
        print('Defining the optimum number of features in the PCA method: ')
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(features, pca.explained_variance_ratio_.cumsum(),
               color='tab:blue')
        ax.set_xlabel('PCA features', fontsize=BIGGER_SIZE)
        ax.set_ylabel('Cumulative explained variance', fontsize=BIGGER_SIZE)
        ax.set_xticks(features)
        ax.set_title('The user should set a limit on the explained variance '
                     'value and then select the optimum number of PCA features',
                     fontsize=BIGGER_SIZE)
        plt.savefig(os.path.join(sys.path[0],
                                 'Explained variance vs PCA features.png'),
                    dpi=300, facecolor='w')
        plt.close()
        print('"Explained variance vs PCA features" figure is saved in the directory folder')
        print('You can use the figure to select the optimum number of features')
        print('You should enter the new optimum number of features in EditableFile.csv file and re-run this part')
        plt.close()
    PCA_components = pd.DataFrame(scores_pca)
    inertia_list = []
    # 'yes' to search for the optimum number of clusters via the elbow method.
    search_optimum_cluster = editable_data['Search optimum clusters']
    cluster_range = range(2, 20, 1)
    if search_optimum_cluster == 'yes':
        print('Defining the optimum number of clusters: ')
        fig, ax = plt.subplots(figsize=(12, 6))
        for cluster_numbers in cluster_range:
            kmedoids = KMedoids(n_clusters=cluster_numbers, init="random",
                                max_iter=1000, random_state=0).fit(scores_pca)
            inertia_list.append(kmedoids.inertia_)
            plt.scatter(cluster_numbers, kmedoids.inertia_)
            print('Cluster number:', cluster_numbers,
                  ' Inertia of the cluster:', int(kmedoids.inertia_))
        ax.set_xlabel('Number of clusters', fontsize=BIGGER_SIZE)
        ax.set_ylabel('Inertia', fontsize=BIGGER_SIZE)
        ax.set_title('The user should use the "elbow method" to select the '
                     'optimum number of clusters', fontsize=BIGGER_SIZE)
        ax.plot(list(cluster_range), inertia_list)
        ax.set_xticks(np.arange(2, 20, 1))
        plt.savefig(os.path.join(sys.path[0], 'Inertia vs Clusters.png'),
                    dpi=300, facecolor='w')
        plt.close()
        print('"Inertia vs Clusters" figure is saved in the directory folder')
        print('You can use the figure to select the optimum number of clusters')
        print('You should enter the new optimum number of clusters in EditableFile.csv file and re-run this part')
    cluster_numbers = int(editable_data['Cluster numbers'])
    kmedoids_org = KMedoids(n_clusters=cluster_numbers, init="random",
                            max_iter=1000, random_state=4).fit(A)
    kmedoids = KMedoids(n_clusters=cluster_numbers, init="random",
                        max_iter=1000, random_state=4).fit(scores_pca)
    label = kmedoids.fit_predict(scores_pca)
    # Filter rows of the original data by cluster label.
    probability_label = defaultdict(list)
    index_label = defaultdict(list)
    index_label_all = []
    filtered_label = {}
    for i in range(cluster_numbers):
        filtered_label[i] = scores_pca[label == i]
        index_cluster = np.where(label == i)
        if len(filtered_label[i]) != 0:
            index_cluster = index_cluster[0]
            for j in index_cluster:
                probability_label[i].append(features_probability_list[j])
                index_label[i].append(j)
                index_label_all.append(j)
        else:
            probability_label[i].append(0)
    sum_probability = []
    for key in probability_label.keys():
        sum_probability.append(sum(probability_label[key]))
    plt.scatter(filtered_label[i][:, 0], filtered_label[i][:, 1])
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    #plt.show()
    plt.close()
    plt.scatter(PCA_components[0], PCA_components[1], alpha=.1, color='black')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    #plt.show()
    plt.close()
    #print(kmedoids.predict([[0, 0, 0], [4, 4, 4]]))
    #print(kmedoids.cluster_centers_, kmedoids.cluster_centers_[0], len(kmedoids.cluster_centers_))
    scores_pca_list = {}
    clusters = {}
    clusters_list = []
    data_labels = {}
    for center in range(len(kmedoids.cluster_centers_)):
        clusters['cluster centers ' + str(center)] = kmedoids.cluster_centers_[center]
        clusters_list.append(kmedoids.cluster_centers_[center].tolist())
    for scenario in range(len(scores_pca)):
        scores_pca_list[scenario] = scores_pca[scenario].tolist()
        scores_pca_list[scenario].insert(0, kmedoids.labels_[scenario])
        data_labels['labels ' + str(scenario)] = scores_pca_list[scenario]
    df_clusters = pd.DataFrame(clusters)
    df_labels = pd.DataFrame(data_labels)
    df_clusters.to_csv(os.path.join(
        representative_days_path,
        'cluster_centers_C_' + str(len(kmedoids.cluster_centers_))
        + '_L_' + str(len(kmedoids.labels_)) + '.csv'), index=False)
    df_labels.to_csv(os.path.join(
        representative_days_path,
        'labels_C_' + str(len(kmedoids.cluster_centers_))
        + '_L_' + str(len(kmedoids.labels_)) + '.csv'), index=False)
    # Reversing PCA using two methods (their results are the same):
    # method 1 reverses the cluster centers directly...
    clusters_reverse = pca.inverse_transform(kmedoids.cluster_centers_)
    cluster_reverse_new = []
    # ...method 2 reverses all scores and picks out the centers.
    scores_pca_reverse = pca.inverse_transform(scores_pca)
    for cluster_iterate in range(len(clusters_list)):
        for pca_days in range(len(scores_pca)):
            results_comparison = np.array_equal(
                np.array(clusters_list[cluster_iterate]),
                np.array(scores_pca[pca_days]))
            if results_comparison:
                cluster_reverse_new.append(scores_pca_reverse[pca_days])
    Scenario_generated_new = standardization_data.inverse_transform(clusters_reverse)
    #print('15 representative days', clusters_reverse[0][0], Scenario_generated_new[0][0], standardization_data.mean_[0], standardization_data.var_[0])
    representative_day_all = {}
    total_labels = []
    represent_gaps = {}
    scenario_data = {}
    for key in filtered_label.keys():
        total_labels.append(len(filtered_label[key]))
    #print(len(probability_label[0]))  # 1990
    #print(len(filtered_label[0]))  # 1990
    for representative_day in range(len(Scenario_generated_new)):
        represent_gaps = {}
        scenario_data = {}
        for i in range(120):  # 24 * 5 = 120 features in each day
            if Scenario_generated_new[representative_day][i] < 0:
                Scenario_generated_new[representative_day][i] = 0
        for k in range(5):  # 5 uncertain inputs
            scenario_data[k] = Scenario_generated_new[representative_day][
                24 * k:24 * (k + 1)].copy()
            min_non_z = np.min(np.nonzero(scenario_data[k]))
            max_non_z = np.max(np.nonzero(scenario_data[k]))
            represent_gaps[k] = [
                i for i, x in enumerate(scenario_data[k][min_non_z:max_non_z + 1])
                if x == 0]
            ranges = sum((list(t) for t in
                          zip(represent_gaps[k], represent_gaps[k][1:])
                          if t[0] + 1 != t[1]), [])
            iranges = iter(represent_gaps[k][0:1] + ranges + represent_gaps[k][-1:])
            #print('Present gaps are: ', representative_day, k, 'gaps',
            #      ', '.join([str(n) + '-' + str(next(iranges)) for n in iranges]))
            iranges = iter(represent_gaps[k][0:1] + ranges + represent_gaps[k][-1:])
            for n in iranges:
                next_n = next(iranges)
                if (next_n - n) == 0:
                    # For data gaps of 1 hour, take the average of the neighbors.
                    scenario_data[k][n + min_non_z] = (
                        scenario_data[k][min_non_z + n + 1]
                        + scenario_data[k][min_non_z + n - 1]) / 2
                elif (next_n - n) > 0 and (next_n - n) <= 6:
                    # For data gaps of up to 6 hours, interpolate between the
                    # nearest non-zero neighbors.
                    f_interpol_short = interp1d(
                        [n - 1, next_n + 1],
                        [scenario_data[k][min_non_z + n - 1],
                         scenario_data[k][min_non_z + next_n + 1]])
                    for m in range(n, next_n + 1):
                        scenario_data[k][m + min_non_z] = f_interpol_short(m)
        data_represent_days_modified = {
            'Electricity total (kWh)': scenario_data[0],
            'Heating (kWh)': scenario_data[1],
            'GTI (Wh/m^2)': scenario_data[2],
            'Wind Speed (m/s)': scenario_data[3],
            'Electricity EF (kg/kWh)': scenario_data[4],
            'Labels': len(filtered_label[representative_day]),
            'Percent %': round(sum_probability[representative_day] * 100
                               / sum(sum_probability), 4)
        }
        #print(np.mean(Scenario_generated_new[representative_day][0:24]))
        df_represent_days_modified = pd.DataFrame(data_represent_days_modified)
        df_represent_days_modified.to_csv(
            os.path.join(representative_days_path,
                         'Represent_days_modified_' + str(representative_day) + '.csv'),
            index=False)
    print('cluster evaluation starts')
    max_heating_scenarios_nested = nested_dict()
    max_electricity_scenarios_nested = nested_dict()
    total_heating_scenarios = []
    total_electricity_scenarios = []
    max_electricity_scenarios_nested_list = defaultdict(list)
    max_heating_scenarios_nested_list = defaultdict(list)
    accuracy_design_day = 0.99
    design_day_heating = []
    design_day_electricity = []
    representative_day_max = {}
    electricity_design_day = {}
    heating_design_day = {}
    i_demand = range_data[2]
    i_solar = range_data[1]
    i_wind = range_data[1]
    i_emission = range_data[1]
    scenario = ('D:' + i_demand + '/S:' + i_solar + '/W:' + i_wind
                + '/C:' + i_emission)
    for day in range(365):
        for i in range(24):
            k_elect = 0
            list_k_electricity = []
            k_heat = 0
            list_k_heating = []
            for represent in range(cluster_numbers):
                representative_day_max[represent] = pd.read_csv(
                    os.path.join(representative_days_path,
                                 'Represent_days_modified_' + str(represent) + '.csv'))
                electricity_demand = representative_day_max[represent][
                    'Electricity total (kWh)']  # kWh
                heating_demand = representative_day_max[represent][
                    'Heating (kWh)']  # kWh
                if features_scenarios_nested[scenario][day][0:24][i] > electricity_demand[i]:
                    k_elect = 1
                list_k_electricity.append(k_elect)
                k_elect = 0
                if features_scenarios_nested[scenario][day][24:48][i] > heating_demand[i]:
                    k_heat = 1
                list_k_heating.append(k_heat)
                k_heat = 0
            if sum(list_k_electricity) == cluster_numbers:
                # This hour is not met by any of the representative days.
                max_electricity_scenarios_nested_list[i].append(
                    features_scenarios_nested[scenario][day][0:24][i])
                total_electricity_scenarios.append(
                    features_scenarios_nested[scenario][day][0:24][i])
            if sum(list_k_heating) == cluster_numbers:
                # This hour is not met by any of the representative days.
                max_heating_scenarios_nested_list[i].append(
                    features_scenarios_nested[scenario][day][24:48][i])
                total_heating_scenarios.append(
                    features_scenarios_nested[scenario][day][24:48][i])
    total_electricity_scenarios.sort(reverse=True)
    total_heating_scenarios.sort(reverse=True)
    max_electricity_hour = total_electricity_scenarios[35]
    max_heating_hour = total_heating_scenarios[35]
    i_demand = range_data[2]
    i_solar = range_data[1]
    i_wind = range_data[1]
    i_emission = range_data[1]
    scenario = ('D:' + i_demand + '/S:' + i_solar + '/W:' + i_wind
                + '/C:' + i_emission)
    design_day_heating = []
    design_day_electricity = []
    for i in range(24):
        design_day_electricity.append(np.max(
            [j for j in max_electricity_scenarios_nested_list[i]
             if j < max_electricity_hour]))
        design_day_heating.append(np.max(
            [j for j in max_heating_scenarios_nested_list[i]
             if j < max_heating_hour]))
    representative_day_max = {}
    electricity_demand_total = defaultdict(list)
    heating_demand_total = defaultdict(list)
    heating_demand_max = {}
    electricity_demand_max = {}
    for represent in range(cluster_numbers):
        representative_day_max[represent] = pd.read_csv(
            os.path.join(representative_days_path,
                         'Represent_days_modified_' + str(represent) + '.csv'))
        electricity_demand = representative_day_max[represent][
            'Electricity total (kWh)']  # kWh
        heating_demand = representative_day_max[represent]['Heating (kWh)']  # kWh
        #hours_representative_day = round(sum_probability[representative_day] / sum(sum_probability), 4) * 8760
        heating_demand_max[represent] = np.mean(heating_demand)
        electricity_demand_max[represent] = np.mean(electricity_demand)
    high_electricity_index = []
    high_heating_index = []
    high_electricity_value = []
    high_heating_value = []
    key_max_electricity = max(electricity_demand_max, key=electricity_demand_max.get)
    key_max_heating = max(heating_demand_max, key=heating_demand_max.get)
    for key, value in max_electricity_scenarios_nested.items():
        for inner_key, inner_value in max_electricity_scenarios_nested[key].items():
            if inner_value > electricity_demand_max[key_max_electricity]:
                high_electricity_index.append(scenario_number[key] * 365 + inner_key)
                high_electricity_value.append(inner_value)
    for key, value in max_heating_scenarios_nested.items():
        for inner_key, inner_value in max_heating_scenarios_nested[key].items():
            if inner_value > heating_demand_max[key_max_heating]:
                high_heating_index.append(scenario_number[key] * 365 + inner_key)
                high_heating_value.append(inner_value)
    sum_probability.append(0.5 * len(total_electricity_scenarios)
                           / len(index_label_all) * 365)
    sum_probability.append(len(total_heating_scenarios)
                           / len(index_label_all) * 365)
    filtered_label[cluster_numbers] = len(total_electricity_scenarios)
    filtered_label[cluster_numbers + 1] = len(total_heating_scenarios)
    representative_day = cluster_numbers
    data_represent_days_modified = {
        'Electricity total (kWh)': design_day_electricity,
        'Heating (kWh)': representative_day_max[key_max_electricity]['Heating (kWh)'],
        'GTI (Wh/m^2)': representative_day_max[key_max_electricity]['GTI (Wh/m^2)'],
        'Wind Speed (m/s)': representative_day_max[key_max_electricity]['Wind Speed (m/s)'],
        'Electricity EF (kg/kWh)': representative_day_max[key_max_electricity]['Electricity EF (kg/kWh)'],
        'Labels': filtered_label[cluster_numbers],
        'Percent %': round(sum_probability[representative_day] * 100
                           / sum(sum_probability), 4)
    }
    df_represent_days_modified = pd.DataFrame(data_represent_days_modified)
    df_represent_days_modified.to_csv(
        os.path.join(representative_days_path,
                     'Represent_days_modified_' + str(representative_day) + '.csv'),
        index=False)
    representative_day = cluster_numbers + 1
    data_represent_days_modified = {
        'Electricity total (kWh)': representative_day_max[key_max_heating]['Electricity total (kWh)'],
        'Heating (kWh)': design_day_heating,
        'GTI (Wh/m^2)': representative_day_max[key_max_heating]['GTI (Wh/m^2)'],
        'Wind Speed (m/s)': representative_day_max[key_max_heating]['Wind Speed (m/s)'],
        'Electricity EF (kg/kWh)': representative_day_max[key_max_heating]['Electricity EF (kg/kWh)'],
        'Labels': filtered_label[cluster_numbers + 1],
        'Percent %': round(sum_probability[representative_day] * 100
                           / sum(sum_probability), 4)
    }
    df_represent_days_modified = pd.DataFrame(data_represent_days_modified)
    df_represent_days_modified.to_csv(
        os.path.join(representative_days_path,
                     'Represent_days_modified_' + str(representative_day) + '.csv'),
        index=False)
    for representative_day in range(len(Scenario_generated_new)):
        represent_gaps = {}
        scenario_data = {}
        for i in range(120):  # 24 * 5 = 120 features in each day
            if Scenario_generated_new[representative_day][i] < 0:
                Scenario_generated_new[representative_day][i] = 0
        for k in range(5):  # 5 uncertain inputs
            scenario_data[k] = Scenario_generated_new[representative_day][
                24 * k:24 * (k + 1)].copy()
            min_non_z = np.min(np.nonzero(scenario_data[k]))
            max_non_z = np.max(np.nonzero(scenario_data[k]))
            represent_gaps[k] = [
                i for i, x in enumerate(scenario_data[k][min_non_z:max_non_z + 1])
                if x == 0]
            ranges = sum((list(t) for t in
                          zip(represent_gaps[k], represent_gaps[k][1:])
                          if t[0] + 1 != t[1]), [])
            iranges = iter(represent_gaps[k][0:1] + ranges + represent_gaps[k][-1:])
            #print('Present gaps are: ', representative_day, k, 'gaps',
            #      ', '.join([str(n) + '-' + str(next(iranges)) for n in iranges]))
            iranges = iter(represent_gaps[k][0:1] + ranges + represent_gaps[k][-1:])
            for n in iranges:
                next_n = next(iranges)
                if (next_n - n) == 0:
                    # For data gaps of 1 hour, take the average of the neighbors.
                    scenario_data[k][n + min_non_z] = (
                        scenario_data[k][min_non_z + n + 1]
                        + scenario_data[k][min_non_z + n - 1]) / 2
                elif (next_n - n) > 0 and (next_n - n) <= 6:
                    # For data gaps of up to 6 hours, interpolate between the
                    # nearest non-zero neighbors.
                    f_interpol_short = interp1d(
                        [n - 1, next_n + 1],
                        [scenario_data[k][min_non_z + n - 1],
                         scenario_data[k][min_non_z + next_n + 1]])
                    for m in range(n, next_n + 1):
                        scenario_data[k][m + min_non_z] = f_interpol_short(m)
        data_represent_days_modified = {
            'Electricity total (kWh)': scenario_data[0],
            'Heating (kWh)': scenario_data[1],
            'GTI (Wh/m^2)': scenario_data[2],
            'Wind Speed (m/s)': scenario_data[3],
            'Electricity EF (kg/kWh)': scenario_data[4],
            'Labels': len(filtered_label[representative_day]),
            'Percent %': round(sum_probability[representative_day] * 100
                               / sum(sum_probability), 4)
        }
        #print(np.mean(Scenario_generated_new[representative_day][0:24]))
        df_represent_days_modified = pd.DataFrame(data_represent_days_modified)
        df_represent_days_modified.to_csv(
            os.path.join(representative_days_path,
                         'Represent_days_modified_' + str(representative_day) + '.csv'),
            index=False)
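
# If the PyPI `nested_dict` package is unavailable, an equivalent stand-in
# (an assumption, not part of the original script) is an autovivifying
# defaultdict: every missing key materializes another nested mapping.
from collections import defaultdict


def nested_dict_sketch():
    return defaultdict(nested_dict_sketch)

# e.g. d = nested_dict_sketch(); d['solar'][3] = 0.25
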
import pandas as pd
from sklearn_extra.cluster import KMedoids


def Kmedoids_clustering(df, tweets_column, embeddings_df, n_clusters,
                        max_iter=1000):
    # Cosine distance suits normalized text embeddings.
    kmedoids = KMedoids(n_clusters=n_clusters, init='k-medoids++',
                        metric='cosine', max_iter=max_iter)
    cluster_ids = kmedoids.fit_predict(embeddings_df)
    result_df = pd.DataFrame({'tweets': df[tweets_column],
                              'topic_cluster': cluster_ids})
    return result_df
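
# Usage sketch (an assumption, not from the original source): any dense
# embedding matrix works; random vectors stand in here for real tweet embeddings.
if __name__ == "__main__":
    import numpy as np

    tweets = pd.DataFrame({'text': ['tweet %d' % i for i in range(20)]})
    fake_embeddings = pd.DataFrame(np.random.RandomState(0).rand(20, 8))
    clustered = Kmedoids_clustering(tweets, 'text', fake_embeddings,
                                    n_clusters=3)
    print(clustered['topic_cluster'].value_counts())
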
import math

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn_extra.cluster import KMedoids

dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values
plt.scatter(X[:, 0], X[:, 1], s=100, color="blue")
plt.grid()
plt.show()

kMedoids = KMedoids(n_clusters=5,   # number of clusters
                    init='random',
                    max_iter=300)   # maximum number of iterations
labels = kMedoids.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], s=100, c=labels)  # points on the x and y axes, colored by cluster
plt.grid()  # draw the grid
plt.scatter(kMedoids.cluster_centers_[:, 0], kMedoids.cluster_centers_[:, 1],
            s=300, c='red')  # position of each medoid on the plot
plt.title("K-medoid")
plt.show()

# Using the silhouette coefficient: compute the coefficient for every cluster
# count and pick the one closest to 1.
css_vector = []
for i in range(2, 11):
    kMedoids = KMedoids(n_clusters=i, init='random')
    # Assumed loop body (the original snippet is truncated here): score each
    # k by its silhouette coefficient, following the comment above.
    labels_i = kMedoids.fit_predict(X)
    css_vector.append(metrics.silhouette_score(X, labels_i))
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn_extra.cluster import KMedoids

# dataset_questions_pca and scaled_data are defined earlier in the original script.
# Elbow method: sum of squared errors (inertia) for k = 1..6.
wcss = []
for i in range(1, 7):
    kmedoids = KMedoids(n_clusters=i, random_state=0)
    kmedoids.fit(dataset_questions_pca)
    sse = kmedoids.inertia_
    print("Clusters", i, "SSE", sse)
    wcss.append(sse)
plt.plot(range(1, 7), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Apply k-medoids to the dataset produced by PCA.
kmedoids = KMedoids(n_clusters=3, random_state=0)
y_kmedoids = kmedoids.fit_predict(dataset_questions_pca)
initial_centroids = kmedoids.cluster_centers_
print("Initial centroids")
for instance in initial_centroids:
    print(instance)
silhouette_avg = metrics.silhouette_score(scaled_data, y_kmedoids)
print('The silhouette coefficient of the clustering is =', silhouette_avg)
plt.scatter(dataset_questions_pca[y_kmedoids == 0, 0],
            dataset_questions_pca[y_kmedoids == 0, 1],
            c='red', label='Cluster 1')
plt.scatter(dataset_questions_pca[y_kmedoids == 1, 0],
            dataset_questions_pca[y_kmedoids == 1, 1],
            c='blue', label='Cluster 2')  # color/label assumed; the original snippet is truncated here
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn_extra.cluster import KMedoids


def test_elbow(X, dtw_value, seed):
    print(len(X))
    distortions = []
    silhouette_value = []
    dists = dtw_value
    print(dists)
    if seed == -1:
        # Sweep seeds 0..20 and cluster counts 2..14 on the precomputed matrix.
        for seed in range(0, 21):
            cur_silhouette = [seed]
            cur_distortions = [seed]
            for i in range(2, 15):
                print(i)
                km = KMedoids(n_clusters=i, random_state=seed,
                              metric="precomputed", init='k-medoids++',
                              max_iter=30000)
                km.fit(dists)
                # Record the sum of errors (inertia).
                cur_distortions.append(km.inertia_)
                y_pred = km.fit_predict(dists)
                # The silhouette score requires a zero diagonal for a
                # precomputed distance matrix.
                np.fill_diagonal(dists, 0)
                score = silhouette_score(dists, y_pred, metric="precomputed")
                cur_silhouette.append(score)
            distortions.append(cur_distortions)
            silhouette_value.append(cur_silhouette)
        with open(r".//res//grid_distortions_destination.csv", "w",
                  encoding='UTF-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in distortions:
                writer.writerow(row)
                print(row)
        with open(r".//res//grid_silhouette_destination.csv", "w",
                  encoding='UTF-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in silhouette_value:
                writer.writerow(row)
                print(row)
    else:
        # Reload the previously saved sweep results.
        csv_reader = csv.reader(
            open(".//res//grid_distortions_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            distortions.append([float(item) for item in row])
        csv_reader = csv.reader(
            open(".//res//grid_silhouette_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            silhouette_value.append([float(item) for item in row])
    # Plot distortion and silhouette curves for the chosen seed (the leading
    # element of each row is the seed itself, hence the [1:] slice).
    chosen_distortions = distortions[seed][1:]
    chosen_silhouette = silhouette_value[seed][1:]
    plt.figure(1)
    plt.plot(range(2, 15), chosen_distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.savefig(r'.//res//grid_distortions_destination.png')
    plt.close()
    plt.figure(1)
    plt.bar(range(2, 15), chosen_silhouette, color='grey')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.savefig(r'.//res//grid_silhouette_destination.png')
# Elbow method over the scaled data.
wcss = []
for i in range(1, 7):
    kmedoids = KMedoids(n_clusters=i, random_state=0)
    kmedoids.fit(scaled_data)
    sse = kmedoids.inertia_
    #print("Clusters", i, "SSE", sse)
    wcss.append(sse)
plt.plot(range(1, 7), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Apply k-medoids to the scaled data.
kmedoids = KMedoids(n_clusters=3, random_state=0)
y_kmedoids = kmedoids.fit_predict(scaled_data)
initial_centroids = kmedoids.cluster_centers_
etiquetas = kmedoids.labels_
print("Initial centroids")
# De-normalize each centroid back to the original units using the per-column
# mean and std recorded in `descripcion` (e.g. from DataFrame.describe()).
initial_centroids_desnormalize = []
for instance in initial_centroids:
    #print(instance)
    temp = []
    for i in range(len(instance)):
        media = descripcion[columnas[i]]['mean']
        std = descripcion[columnas[i]]['std']
        temp.append(round(instance[i] * std + media))
    initial_centroids_desnormalize.append(temp)
    #print(temp)
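
# Design note (an assumption, not from the original script): if the fitted
# StandardScaler that produced `scaled_data` is still in scope, its
# inverse_transform undoes the normalization in one call, matching the
# per-column mean/std arithmetic above (apart from the rounding).
def desnormalize_with_scaler(scaler, centroids):
    # scaler: a fitted sklearn.preprocessing.StandardScaler (hypothetical here)
    return scaler.inverse_transform(centroids)
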