def cluster(inputs, number_clusters=12, norm=2, time_limit=300, mip_gap=0.0):
    """Cluster daily demand profiles into typical days via k-medoids.

    Parameters
    ----------
    inputs : 2-dimensional array
        First dimension: number of different input types.
        Second dimension: values for each time step of one year (365 days).
    number_clusters : integer, optional
        How many clusters shall be computed?
    norm : integer, optional
        Compute the distance according to this norm. 2 is the standard
        Euclidean norm.
    time_limit : integer, optional
        Time limit for the optimization in seconds.
    mip_gap : float, optional
        Optimality tolerance (0: proven global optimum).

    Returns
    -------
    scaled_typ_days : list of arrays
        Scaled typical demand days. The scaling preserves annual demands.
    nc : array_like
        Weighting factors (day counts) of each cluster.
    z : 2-dimensional array
        Mapping of each day to the clusters.
    """
    # Determine time steps per day (assumes inputs cover exactly 365 days)
    len_day = int(inputs.shape[1] / 365)

    # Scale each input to [0, 1] so that all inputs have the same weight
    # and are clustered equally in terms of quality, then reshape into
    # (len_day, 365) matrices with one column per day.
    inputsTransformed = []
    inputsScaledTransformed = []
    for i in range(inputs.shape[0]):
        vals = inputs[i, :]
        # NOTE(review): a constant series gives max == min -> division by
        # zero; callers must supply non-constant inputs.
        temp = (vals - np.min(vals)) / (np.max(vals) - np.min(vals))
        inputsScaledTransformed.append(temp.reshape((len_day, 365), order="F"))
        inputsTransformed.append(vals.reshape((len_day, 365), order="F"))

    # Stack the scaled day matrices of all inputs on top of each other
    L = np.concatenate(tuple(inputsScaledTransformed))

    # Compute pairwise day distances and solve the k-medoids problem
    d = _distances(L, norm)
    (y, z, obj) = k_medoids.k_medoids(d, number_clusters, time_limit, mip_gap)

    # nc[k]: how many days belong to cluster k; typicalDays: the unscaled
    # day profiles of each selected medoid day. Only medoids with at least
    # one assigned day are retained, so len(nc) may be < number_clusters.
    nc = []
    typicalDays = []
    for i in range(len(y)):
        temp = np.sum(z[i, :])
        if temp > 0:
            nc.append(temp)
            typicalDays.append([ins[:, i] for ins in inputsTransformed])
    typicalDays = np.array(typicalDays)
    nc = np.array(nc, dtype="int")
    nc_cumsum = np.cumsum(nc) * len_day

    # Construct (yearly) load curves by tiling each typical day
    # ub = upper bound, lb = lower bound of each cluster's segment
    clustered = np.zeros_like(inputs)
    for i in range(len(nc)):
        lb = 0 if i == 0 else nc_cumsum[i - 1]
        ub = nc_cumsum[i]
        for j in range(len(inputsTransformed)):
            clustered[j, lb:ub] = np.tile(typicalDays[i][j], nc[i])

    # Scale the typical days so annual sums match the original demands.
    # Iterate over len(nc), not number_clusters: the optimizer may select
    # fewer medoids than requested, and number_clusters would then index
    # past the end of typicalDays.
    sums_inputs = [np.sum(inputs[j, :]) for j in range(inputs.shape[0])]
    scaled = np.array(
        [nc[day] * typicalDays[day, :, :] for day in range(len(nc))])
    sums_scaled = [np.sum(scaled[:, j, :]) for j in range(inputs.shape[0])]
    scaling_factors = [
        sums_inputs[j] / sums_scaled[j] for j in range(inputs.shape[0])
    ]
    scaled_typ_days = [
        scaling_factors[j] * typicalDays[:, j, :]
        for j in range(inputs.shape[0])
    ]

    return (scaled_typ_days, nc, z)
def cluster(inputs, number_clusters=12, norm=2, time_limit=300, mip_gap=0.0):
    """
    Cluster a set of inputs into clusters by solving a k-medoid problem.

    Parameters
    ----------
    inputs : 2-dimensional array
        First dimension: Number of different input types.
        Second dimension: Values for each time step of interest.
    number_clusters : integer, optional
        How many clusters shall be computed?
    norm : integer, optional
        Compute the distance according to this norm. 2 is the standard
        Euklidean-norm.
    time_limit : integer, optional
        Time limit for the optimization in seconds
    mip_gap : float, optional
        Optimality tolerance (0: proven global optimum)

    Returns
    -------
    scaled_typ_days :
        Scaled typical demand days. The scaling is based on the annual
        demands.
    nc : array_like
        Weighting factors of each cluster
    z : 2-dimensional array
        Mapping of each day to the clusters
    """
    # Determine time steps per day (assumes inputs cover exactly 365 days)
    len_day = int(inputs.shape[1] / 365)

    # Scaling to values between 0 and 1, thus all inputs shall have the
    # same weight and will be clustered equally in terms of quality.
    # Reshape each series into a (len_day, 365) matrix, one column per day.
    inputsTransformed = []
    inputsScaledTransformed = []
    for i in range(inputs.shape[0]):
        vals = inputs[i, :]
        # NOTE(review): constant series give max == min -> division by zero
        temp = (vals - np.min(vals)) / (np.max(vals) - np.min(vals))
        inputsScaledTransformed.append(temp.reshape((len_day, 365), order="F"))
        inputsTransformed.append(vals.reshape((len_day, 365), order="F"))

    # Put the scaled and reshaped inputs together
    L = np.concatenate(tuple(inputsScaledTransformed))

    # Compute distances
    d = _distances(L, norm)

    # Execute optimization model
    (y, z, obj) = k_medoids.k_medoids(d, number_clusters, time_limit, mip_gap)

    # Section 2.3 and retain typical days.
    # nc contains how many days are there in each cluster; only medoids
    # with at least one assigned day are kept, so len(nc) may be less
    # than number_clusters.
    nc = []
    typicalDays = []
    for i in range(len(y)):
        temp = np.sum(z[i, :])
        if temp > 0:
            nc.append(temp)
            typicalDays.append([ins[:, i] for ins in inputsTransformed])
    typicalDays = np.array(typicalDays)
    nc = np.array(nc, dtype="int")
    nc_cumsum = np.cumsum(nc) * len_day

    # Construct (yearly) load curves
    # ub = upper bound, lb = lower bound
    clustered = np.zeros_like(inputs)
    for i in range(len(nc)):
        lb = 0 if i == 0 else nc_cumsum[i - 1]
        ub = nc_cumsum[i]
        for j in range(len(inputsTransformed)):
            clustered[j, lb:ub] = np.tile(typicalDays[i][j], nc[i])

    # Scaling to preserve original demands. Use len(nc) rather than
    # number_clusters: indexing typicalDays with number_clusters would
    # raise IndexError whenever the optimizer selects fewer medoids.
    sums_inputs = [np.sum(inputs[j, :]) for j in range(inputs.shape[0])]
    scaled = np.array(
        [nc[day] * typicalDays[day, :, :] for day in range(len(nc))])
    sums_scaled = [np.sum(scaled[:, j, :]) for j in range(inputs.shape[0])]
    scaling_factors = [
        sums_inputs[j] / sums_scaled[j] for j in range(inputs.shape[0])
    ]
    scaled_typ_days = [
        scaling_factors[j] * typicalDays[:, j, :]
        for j in range(inputs.shape[0])
    ]

    return (scaled_typ_days, nc, z)
import k_medoids as km

# Example invocation:
#   python this_script.py Data/user_similarity_matrix 200 Data/user_cluster_set

if __name__ == '__main__':
    # Command-line arguments: similarity-matrix path, cluster count,
    # output path for the resulting cluster assignment.
    finput_user_similarity = sys.argv[1]
    finput_cluster_number = int(sys.argv[2])
    foutput_user_cluster_set = sys.argv[3]

    # Read the user similarity matrix (object with a .values array —
    # presumably a pandas DataFrame; confirm against rw.readffile).
    user_similarity_matrix = rw.readffile(finput_user_similarity)

    # k-medoids clustering on the raw similarity values
    user_cluster_set = km.k_medoids(user_similarity_matrix.values,
                                    K=finput_cluster_number,
                                    max_iterations=20)
    print("\ndone!")

    # Persist the cluster assignment
    rw.write2file(user_cluster_set, foutput_user_cluster_set)
    print("file saved done!")

    # Report the sizes of the largest 20% of clusters
    print("top 20% of user cluster:")
    length = sorted((len(lst) for lst in user_cluster_set), reverse=True)
    print(length[0:int(len(length) * 0.2)])