def fill_up(self, num_bins, iterations=10, fill_up_plots=False, point_plots=False, RO=True, t=1):
    """Generate synthetic "fill-up" points per class label and append the
    high-confidence ones to ``self.added_points`` / ``self.added_labels``.

    For each label, the training data is (optionally) cleaned of outliers,
    transformed, and each dimension is binned by a density estimator
    (``self.DE``).  A distribution curve is fitted per dimension
    (``self.density_func.fit``); the gap between that curve and the histogram
    defines how many points to add and where.  Random candidate sets are drawn
    ``iterations`` times and the set with the highest kNN-based confidence is
    kept (points below the confidence threshold are dropped by
    ``Confidence.confidence_kNN_rnd``).

    Parameters
    ----------
    num_bins : int
        Number of bins per dimension for the density estimator.
    iterations : int
        Number of random candidate sets to draw; the best one (by
        confidence) wins.  NOTE(review): the inline comment says
        "best out of 10" but this is configurable.
    fill_up_plots : bool
        If True, plot per-dimension histogram / fitted-curve / fill-up bars.
    point_plots : bool
        If True, scatter-plot each candidate set against the data
        (first two dims, and dims 0 vs 2 when more than 2 dims exist).
    RO : bool
        If True, remove outliers via ``Transformations.remove_outliers_lof``
        before transforming.
    t : float
        Multiplier on the kNN std used as the confidence tolerance
        (passed as ``t * kNN_rnd_std``).

    Returns
    -------
    list
        One confidence value per label (0 when nothing was added).

    Side effects: mutates ``self.added_points`` and ``self.added_labels``;
    may open matplotlib figures.  Non-deterministic (uses ``np.random`` and
    ``random`` without seeding).
    """
    # consider every label seperately
    label_confidence = []
    for label in self.D.labels:
        label_idx = self.D.labels.index(label)
        '''collect training data'''
        # Boolean-mask rows of the training set belonging to this label.
        data = self.D.X_b_train[self.D.Y_b_train == label]
        '''remove outliers, rotate data'''
        if RO:
            data = Transformations.remove_outliers_lof(data)
        # trafo is kept so leftover points can be mapped back at the end.
        trafo = self.trafo()
        data = trafo.transform(data)
        # Per-dimension arrays: row `line` holds that dimension's curves.
        cdfs_scaled = np.empty((len(data[0]), num_bins))
        fitted_cdf = np.empty((len(data[0]), num_bins))
        fitted_ = np.empty((len(data[0]), num_bins))
        num_fill_up = 0
        data_range = []
        DE_list = []
        if fill_up_plots:
            f, ax = plt.subplots(nrows=1, ncols=len(data[0]), figsize=(6, 2.5))
        # consider every dimension
        for line in range(len(data[0])):
            '''project onto line, determine borders'''
            d = data[:, line]
            d_min = min(d)
            d_max = max(d)
            data_range.append([d_min, d_max])
            '''define Density Estimator here!'''
            DE_list.append(self.DE(num_bins))
            DE_list[line].estimate(d, d_min, d_max)
            '''estimate distribution'''
            # Fit a curve to the bin mids/values; assumes DE exposes
            # .mids, .values, .grid -- TODO confirm against self.DE.
            fitted = self.density_func.fit(DE_list[line].mids, DE_list[line].values, d)
            fitted_[line] = copy.deepcopy(fitted)
            fitted_cdf[line] = np.cumsum(fitted)
            fitted_cdf[line] = fitted_cdf[line] / fitted_cdf[line][-1]
            '''to be filled up: the differences between the distribution curve and the histogram'''
            diff = fitted - DE_list[line].values
            '''number of points to add'''
            # Scale the bin-count gap into a point count for this dimension.
            num_points_line = (len(d) / sum(DE_list[line].values)) * sum(diff)
            num_fill_up = max(num_fill_up, num_points_line)
            '''probability distribution for the fill-up'''
            if sum(diff) == 0:
                cdfs_scaled[line] = [0] * num_bins
            else:
                # Normalize first, then clamp negatives (bins where the
                # histogram already exceeds the curve get no fill-up mass).
                diff = diff / sum(diff)
                diff = [max(diff[i], 0) for i in range(len(diff))]
                cdfs_scaled[line] = np.cumsum(diff)
                # cdf scaled so its last entry equals this dimension's
                # desired point count (used to top up with fitted_cdf later).
                cdfs_scaled[line] = (
                    cdfs_scaled[line] / cdfs_scaled[line][-1]) * num_points_line
            if fill_up_plots:
                barWidth = DE_list[line].mids[1] - DE_list[line].mids[0]
                fill = fitted_[line] - DE_list[line].values
                ax[line].bar(DE_list[line].mids, DE_list[line].values,
                             label='data', color='teal', width=barWidth)
                ax[line].bar(DE_list[line].mids,
                             [max(fill[i], 0) for i in range(len(fill))],
                             bottom=DE_list[line].values, label='fill up',
                             color='goldenrod', width=barWidth, hatch="...",
                             edgecolor="white")
                ax[line].plot(DE_list[line].mids, fitted_[line],
                              label='fitted', c='mediumvioletred', linewidth=2)
                ax[line].get_xaxis().set_ticks([])
                ax[line].get_yaxis().set_ticks([])
        if fill_up_plots:
            ax[-1].legend()
            plt.show()
            # f.savefig('Results/Example_cluster_distr.pdf', format='pdf', dpi=1200, bbox_inches='tight')
        # determine the number of added points in total: max over dimensions
        num_fill_up = int(num_fill_up)
        if num_fill_up == 0:
            # Nothing to fill up for this label; record 0 confidence.
            label_confidence.append(0)
            continue
        # best out of 10: go for the result with the highest confidence
        best_conf = 0
        leftover_points = []
        # kNN_rnd_dist, kNN_rnd_std = confidence_kNN_rnd_coeff(data_range, num_fill_up)
        kNN_rnd_dist, kNN_rnd_std = Confidence.confidence_kNN_train_sized_coeff(
            data, num_fill_up)
        for it in range(iterations):
            # Start with num_fill_up rows and zero columns; one coordinate
            # column is concatenated per dimension below.
            points = np.empty((num_fill_up, 0))
            # generate points
            for line in range(len(data[0])):
                '''adjust cdf (in case there have to be more points added because of other lines)'''
                # Top up this dimension's fill-up cdf with mass from the
                # fitted cdf so all dimensions yield num_fill_up points.
                distr_scaled = fitted_cdf[line] * max(
                    (num_fill_up - cdfs_scaled[line][-1]), 0)
                cdf = cdfs_scaled[line] + distr_scaled
                cdf = cdf / cdf[-1]  # normalize
                '''generate random values according to the cdf'''
                # Inverse-transform sampling: pick a bin via searchsorted,
                # then draw uniformly within that bin's grid edges.
                values = np.random.rand(num_fill_up)
                value_bins = np.searchsorted(cdf, values)
                coords = np.array([
                    random.uniform(DE_list[line].grid[value_bins[i]],
                                   DE_list[line].grid[value_bins[i] + 1])
                    for i in range(num_fill_up)
                ]).reshape(num_fill_up, 1)
                points = np.concatenate((points, coords), axis=1)
            '''compute the confidence of the result'''
            if len(points) < 20:
                # Too few points for a meaningful kNN confidence estimate.
                conf_b, conf_a, l_p = (0, 0, [[]])
            else:
                conf_b, conf_a, l_p = Confidence.confidence_kNN_rnd(
                    points, kNN_rnd_dist, t * kNN_rnd_std)
            # add the points to the data set
            if conf_a > best_conf:
                best_conf = conf_a
                # Deep-copy: l_p may be reused/mutated in later iterations.
                leftover_points = copy.deepcopy(l_p)
                # leftover_points = points
            if point_plots:
                plt.figure(it)
                plt.scatter(data[:, 0], data[:, 1],
                            c=self.colors[label_idx], alpha=0.2, s=3)
                plt.scatter(points[:, 0], points[:, 1],
                            c='red', alpha=0.8, s=8)
                if len(l_p) > 0 and len(l_p[0]) > 0:
                    plt.scatter(l_p[:, 0], l_p[:, 1],
                                c=self.colors[label_idx], alpha=0.8, s=8)
                plt.show()
                if len(data[0]) > 2:
                    # Second projection (dims 0 vs 2) for >2-dimensional data.
                    plt.figure(it * 100)
                    plt.scatter(data[:, 0], data[:, 2],
                                c=self.colors[label_idx], alpha=0.2, s=3)
                    plt.scatter(points[:, 0], points[:, 2],
                                c='red', alpha=0.8, s=8)
                    if len(l_p) > 0 and len(l_p[0]) > 0:
                        plt.scatter(l_p[:, 0], l_p[:, 2],
                                    c=self.colors[label_idx], alpha=0.8, s=8)
                    plt.show()
        '''remove the points with low confidence, discard the result entirely
        if the confidence is too low. Transform back the leftover points'''
        if len(leftover_points
               ) > 0:  # and 1 / best_conf <= kNN_rnd_dist + t*kNN_rnd_std:
            # Map surviving points back to the original space and record them.
            add_me = trafo.transform_back(leftover_points)
            self.added_points = np.concatenate((self.added_points, add_me))
            self.added_labels = np.append(self.added_labels,
                                          [label] * len(add_me))
        label_confidence.append(best_conf)
    if point_plots:
        plt.show()
    return label_confidence