# Example no. 1
0
    def fill_up(self,
                num_bins,
                iterations=10,
                fill_up_plots=False,
                point_plots=False,
                RO=True,
                t=1):
        """Generate synthetic points that fill each label's histogram up to a fitted density.

        For every label in ``self.D.labels`` the training rows of that label
        are (optionally) cleaned of outliers, transformed with a fresh
        ``self.trafo()`` instance and examined one dimension at a time: a
        density estimator ``self.DE(num_bins)`` is computed, a curve is fitted
        with ``self.density_func.fit`` and the gap between curve and estimate
        defines how many points to add and where.  Candidate points are drawn
        ``iterations`` times; the draw with the highest confidence is kept,
        transformed back and appended to ``self.added_points`` /
        ``self.added_labels``.

        Args:
            num_bins: Number of bins handed to ``self.DE`` and used as the
                width of the per-dimension CDF tables.
            iterations: Number of random draws per label; the best one wins.
            fill_up_plots: If true, show one bar plot per dimension with the
                estimated values, the fill-up amount and the fitted curve.
            point_plots: If true, show scatter plots of the data, the drawn
                points and the retained points for every draw.
            RO: If true, run ``Transformations.remove_outliers_lof`` on the
                label's data before transforming it.
            t: Factor applied to ``kNN_rnd_std`` when it is passed to
                ``Confidence.confidence_kNN_rnd``.

        Returns:
            A list with one entry per label (in the order of
            ``self.D.labels``): the best confidence reached, or ``0`` when no
            points were added for that label.
        """
        label_confidence = []

        # Every label is handled independently of the others.
        for label_idx, label in enumerate(self.D.labels):
            # Collect the training data of this label.
            data = self.D.X_b_train[self.D.Y_b_train == label]
            if RO and len(data) > 0:
                data = Transformations.remove_outliers_lof(data)
            if len(data) == 0:
                # Nothing to estimate a distribution from; treat it like the
                # "no fill-up needed" case instead of failing on data[0].
                label_confidence.append(0)
                continue
            trafo = self.trafo()
            data = trafo.transform(data)
            num_dims = len(data[0])

            cdfs_scaled = np.empty((num_dims, num_bins))
            fitted_cdf = np.empty((num_dims, num_bins))
            num_fill_up = 0
            estimators = []

            if fill_up_plots:
                # squeeze=False keeps a 2-D axes array even for one column,
                # so indexing by dimension works for one-dimensional data too.
                _, ax = plt.subplots(nrows=1,
                                     ncols=num_dims,
                                     figsize=(6, 2.5),
                                     squeeze=False)
                ax = ax[0]

            # Consider every dimension on its own.
            for line in range(num_dims):
                # Project onto the dimension and determine its borders.
                d = data[:, line]
                d_min, d_max = d.min(), d.max()

                # Density estimate for this dimension.
                estimator = self.DE(num_bins)
                estimator.estimate(d, d_min, d_max)
                estimators.append(estimator)

                # Fitted distribution and its normalized cumulative sum.
                fitted = self.density_func.fit(estimator.mids,
                                               estimator.values, d)
                fitted_cdf[line] = np.cumsum(fitted)
                fitted_cdf[line] /= fitted_cdf[line][-1]

                # To be filled up: the differences between the fitted curve
                # and the estimated values.
                diff = fitted - estimator.values
                diff_total = sum(diff)

                # Number of points this dimension asks for; the overall
                # number is the maximum over all dimensions.
                num_points_line = (len(d) /
                                   sum(estimator.values)) * diff_total
                num_fill_up = max(num_fill_up, num_points_line)

                # Cumulative fill-up distribution, scaled to num_points_line.
                if diff_total == 0:
                    cdfs_scaled[line] = 0
                else:
                    # NOTE(review): for a negative diff_total the division
                    # flips the signs before clamping, so the surplus bins
                    # are kept instead of the deficit bins -- confirm that
                    # this is intended.
                    deficit_cdf = np.cumsum(np.maximum(diff / diff_total, 0))
                    cdfs_scaled[line] = (
                        deficit_cdf / deficit_cdf[-1]) * num_points_line

                if fill_up_plots:
                    bar_width = estimator.mids[1] - estimator.mids[0]
                    ax[line].bar(estimator.mids,
                                 estimator.values,
                                 label='data',
                                 color='teal',
                                 width=bar_width)
                    ax[line].bar(estimator.mids,
                                 np.maximum(diff, 0),
                                 bottom=estimator.values,
                                 label='fill up',
                                 color='goldenrod',
                                 width=bar_width,
                                 hatch="...",
                                 edgecolor="white")
                    ax[line].plot(estimator.mids,
                                  fitted,
                                  label='fitted',
                                  c='mediumvioletred',
                                  linewidth=2)
                    ax[line].get_xaxis().set_ticks([])
                    ax[line].get_yaxis().set_ticks([])

            if fill_up_plots:
                ax[-1].legend()
                plt.show()

            # Total number of added points: maximum over the dimensions.
            num_fill_up = int(num_fill_up)
            if num_fill_up == 0:
                label_confidence.append(0)
                continue

            # Best of `iterations` draws: keep the one with the highest
            # confidence.
            best_conf = 0
            leftover_points = []
            kNN_rnd_dist, kNN_rnd_std = Confidence.confidence_kNN_train_sized_coeff(
                data, num_fill_up)
            for it in range(iterations):
                # Generate the coordinates dimension by dimension.
                columns = []
                for line in range(num_dims):
                    # Adjust the cdf: points that are only needed because of
                    # other dimensions follow the fitted distribution.
                    distr_scaled = fitted_cdf[line] * max(
                        num_fill_up - cdfs_scaled[line][-1], 0)
                    cdf = cdfs_scaled[line] + distr_scaled
                    cdf = cdf / cdf[-1]  # normalize

                    # Pick a bin per point according to the cdf, then a
                    # uniform position between that bin's grid borders.
                    values = np.random.rand(num_fill_up)
                    value_bins = np.searchsorted(cdf, values)
                    grid = estimators[line].grid
                    columns.append([
                        random.uniform(grid[b], grid[b + 1])
                        for b in value_bins
                    ])
                points = np.column_stack(columns)

                # Confidence of this draw; draws with fewer than 20 points
                # are not rated and keep no points.
                if len(points) < 20:
                    conf_a, l_p = 0, [[]]
                else:
                    _, conf_a, l_p = Confidence.confidence_kNN_rnd(
                        points, kNN_rnd_dist, t * kNN_rnd_std)

                # Remember the retained points of the best draw so far.
                if conf_a > best_conf:
                    best_conf = conf_a
                    leftover_points = copy.deepcopy(l_p)

                if point_plots:
                    has_leftover = len(l_p) > 0 and len(l_p[0]) > 0
                    # Dimension 0 against 1 (figure `it`) and, when present,
                    # dimension 0 against 2 (figure `it * 100`).
                    for fig_num, other in ((it, 1), (it * 100, 2)):
                        if other >= num_dims:
                            continue
                        plt.figure(fig_num)
                        plt.scatter(data[:, 0],
                                    data[:, other],
                                    c=self.colors[label_idx],
                                    alpha=0.2,
                                    s=3)
                        plt.scatter(points[:, 0],
                                    points[:, other],
                                    c='red',
                                    alpha=0.8,
                                    s=8)
                        if has_leftover:
                            plt.scatter(l_p[:, 0],
                                        l_p[:, other],
                                        c=self.colors[label_idx],
                                        alpha=0.8,
                                        s=8)
                        plt.show()

            # Transform the retained points back and add them to the
            # collected fill-up points together with their label.
            if len(leftover_points) > 0:
                add_me = trafo.transform_back(leftover_points)
                self.added_points = np.concatenate((self.added_points, add_me))
                self.added_labels = np.append(self.added_labels,
                                              [label] * len(add_me))

            label_confidence.append(best_conf)
        if point_plots:
            plt.show()

        return label_confidence