Example #1
import pandas as pd


def main():
    # Load the combined dataset of measurements
    data = pd.read_csv("../data/data.csv")

    # Split out the age and height series per gender (helpers sketched below)
    male_age, male_height = get_male_data(data)
    female_age, female_height = get_female_data(data)

    # Overlaid histograms comparing the two groups
    plt = create_histogram(male_age, female_age, 'Age')
    plt.show()

    plt = create_histogram(male_height, female_height, 'Height')
    plt.show()
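
The helpers get_male_data, get_female_data, and create_histogram are not part of the snippet. A minimal sketch of what they might look like, assuming the CSV has Gender, Age, and Height columns (the column names and the "Male"/"Female" values are assumptions, not from the original):

import matplotlib.pyplot as plt


def get_male_data(data):
    # Assumed schema: a "Gender" column with "Male"/"Female" values
    males = data[data["Gender"] == "Male"]
    return males["Age"], males["Height"]


def get_female_data(data):
    females = data[data["Gender"] == "Female"]
    return females["Age"], females["Height"]


def create_histogram(series_a, series_b, label):
    # Overlay two translucent histograms for a side-by-side comparison
    plt.figure()
    plt.hist(series_a, bins=30, alpha=0.5, label="Male")
    plt.hist(series_b, bins=30, alpha=0.5, label="Female")
    plt.xlabel(label)
    plt.ylabel("Count")
    plt.legend()
    return plt

Returning the plt module keeps the caller's plt.show() pattern from the snippet intact.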
Example #2
    # Assumes numpy as np and scipy.stats.iqr are imported, plus the local
    # helpers create_histogram, htb, and fetch_threshold
    def set_optimal_threshold(self, x, add_syn=True, num_samples=100, n_dim=5):
        errors = self.errors(x)

        # Optionally augment the real reconstruction errors with errors from
        # synthesized samples so the tail of the distribution is populated
        if add_syn:
            syn_errors = self.errors(
                self.synthesize(x, num_samples=num_samples, n_dim=n_dim))

            errors = np.concatenate((errors, syn_errors), axis=0)

        # Calculate the number of bins according to the Freedman-Diaconis rule
        bin_width = 2 * iqr(errors) / np.power(len(errors), 1 / 3)
        num_bins = int(np.ceil((np.max(errors) - np.min(errors)) / bin_width))

        hist, bins = create_histogram(errors,
                                      num_bins=num_bins,
                                      step=bin_width)
        occurrences = [float(o) for o in hist.tolist()]

        # Head/tail breaks clustering over the histogram counts
        breaks = htb(hist)

        possible_thresholds = []

        for b in breaks:
            t = fetch_threshold(bins, hist, b)
            possible_thresholds.append(t)

        # The largest candidate break becomes the anomaly threshold
        self.optimal_threshold = max(possible_thresholds)

        return self.optimal_threshold
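
The Freedman-Diaconis step is easy to check in isolation. A self-contained sketch with synthetic stand-in errors (the exponential distribution here is only for illustration, not from the original):

import numpy as np
from scipy.stats import iqr

rng = np.random.default_rng(0)
errors = rng.exponential(scale=0.1, size=1000)  # stand-in reconstruction errors

# Freedman-Diaconis: bin width from the interquartile range and sample size
bin_width = 2 * iqr(errors) / np.power(len(errors), 1 / 3)
num_bins = int(np.ceil((errors.max() - errors.min()) / bin_width))

hist, bins = np.histogram(errors, bins=num_bins)
print(num_bins, bin_width)

Note that the bin count must be an integer, which is why the snippet above wraps the division in int(np.ceil(...)).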
Example #3
  # Same helpers as Example #2 (np, iqr, create_histogram, htb, fetch_threshold)
  def set_optimal_threshold(self, x):
    errors = self.errors(x)

    # Calculate the number of bins according to the Freedman-Diaconis rule
    bin_width = 2 * iqr(errors) / np.power(len(errors), 1 / 3)
    num_bins = int(np.ceil((np.max(errors) - np.min(errors)) / bin_width))

    hist, bins = create_histogram(errors, num_bins=num_bins, step=bin_width)
    occurrences = [float(o) for o in hist.tolist()]

    # Head/tail breaks clustering over the histogram counts
    breaks = htb(hist)

    possible_thresholds = []

    for b in breaks:
      t = fetch_threshold(bins, hist, b)
      possible_thresholds.append(t)

    # The largest candidate break becomes the anomaly threshold
    self.optimal_threshold = max(possible_thresholds)

    return self.optimal_threshold
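
Neither htb nor fetch_threshold appears in these snippets. htb matches head/tail breaks clustering (Jiang, 2013), which recursively splits off the values above the mean; fetch_threshold presumably maps a break in count space back to a bin edge. A hedged sketch of both, where the 40% head ratio and the bin-edge lookup are assumptions:

import numpy as np


def htb(values, breaks=None):
    # Head/tail breaks: keep splitting off the "head" above the mean while
    # the head remains a minority (< 40%) of the current values
    if breaks is None:
        breaks = []
    values = np.asarray(values, dtype=float)
    mean = values.mean()
    breaks.append(mean)
    head = values[values > mean]
    if 0 < len(head) < 0.4 * len(values):
        htb(head, breaks)
    return breaks


def fetch_threshold(bins, hist, break_value):
    # Map a break (in count space) to a threshold (in bin-edge space):
    # the left edge of the first bin whose count drops to or below the break.
    # A plausible reading of the snippet, not the original implementation.
    for edge, count in zip(bins, hist):
        if count <= break_value:
            return edge
    return bins[-1]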
Example #4
    # Assumes numpy as np, pandas as pd, random, and scipy.stats.iqr are
    # imported, plus the local helpers create_histogram, htb, fetch_threshold
    def build(self, data):
        # Encode the data and reconstruct it through the autoencoder
        self.original_data = data
        self.encoded_data = self.autoencoder.encode(data)
        self.reconstructed_data = self.autoencoder.decode(self.encoded_data)

        # Per-dimension mean and std of the latent representation
        self.df_encoded_data = pd.DataFrame(data=self.encoded_data)
        self.df_encoded_data_mean = self.df_encoded_data.mean(axis=0)
        self.df_encoded_data_std = self.df_encoded_data.std(axis=0)

        # Randomly choose which latent dimensions to perturb and how many
        # synthetic rows to generate
        self.stochastic_dimensions = random.sample(
            range(len(self.df_encoded_data.columns)), self.s_dimension_count)
        self.num_to_synthesize = round(len(data) * self.relative_frequency)

        if self.num_to_synthesize > 0:
            self.synthetic_latent_data = ((self.df_encoded_data.sample(
                self.num_to_synthesize)).reset_index(drop=True)).copy()
        else:
            self.synthetic_latent_data = self.df_encoded_data

        if self.n_samples > 0:
            # Push the chosen latent dimensions into the tails of their
            # distributions so the decoded rows behave like outliers
            for index, row in self.synthetic_latent_data.iterrows():
                for d in self.stochastic_dimensions:
                    tail_values = self.sample_tails(
                        self.df_encoded_data_mean.values[d],
                        self.df_encoded_data_std.values[d], self.n_samples)

                    if len(tail_values) == 0:
                        outlier_v = self.synthetic_latent_data.at[index, d]
                    else:
                        outlier_v = random.choice(tail_values)

                    self.synthetic_latent_data.at[index, d] = outlier_v

        # Reconstruct using frozen weights
        self.synthetic_data = self.autoencoder.decode(
            self.synthetic_latent_data.values)
        self.df_synthetic_data = pd.DataFrame(data=self.synthetic_data)

        # Label synthetic rows with 1; reconstructed real rows get 0 below
        self.synthetic_data_with_labels = np.append(
            self.synthetic_data,
            np.ones((len(self.synthetic_data), 1)),
            axis=1)
        self.reconstructed_data_with_labels = np.append(
            self.reconstructed_data,
            np.zeros((len(self.reconstructed_data), 1)),
            axis=1)

        # Reconstructed synthetic data
        self.reconstructed_synthetic = self.autoencoder.predict(
            self.synthetic_data)

        # Stack real and synthetic inputs against their reconstructions and
        # take the per-row mean squared reconstruction error
        self.X = np.concatenate((self.original_data, self.synthetic_data))
        self.Y = np.concatenate(
            (self.reconstructed_data, self.reconstructed_synthetic))
        self.errors = np.power(self.X - self.Y, 2)
        self.mean_sq_errors = np.mean(self.errors, axis=1)

        # Calculate the number of bins according to the Freedman-Diaconis rule
        bin_width = 2 * iqr(self.mean_sq_errors) / np.power(
            len(self.mean_sq_errors), 1 / 3)
        num_bins = int(
            np.ceil((np.max(self.mean_sq_errors) -
                     np.min(self.mean_sq_errors)) / bin_width))

        self.hist, self.bins = create_histogram(self.mean_sq_errors,
                                                num_bins=num_bins,
                                                step=bin_width)
        # Histogram counts converted to plain floats
        self.occurrences = [float(x) for x in self.hist.tolist()]

        breaks = htb(self.hist)
        self.possible_thresholds = []

        for b in breaks:
            t = fetch_threshold(self.bins, self.hist, b)
            self.possible_thresholds.append(t)

        self.optimal_threshold = max(self.possible_thresholds)

        # Human-readable bin-range labels for histogram rendering
        self.labels = []
        for i in range(len(self.bins) - 1):
            self.labels.append(
                f"{round(self.bins[i], 4)}-{round(self.bins[i + 1], 4)}")
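
The sample_tails method used in build is not shown. One plausible sketch as a standalone function, assuming it draws Gaussian samples around the latent mean and keeps only values beyond a fixed number of standard deviations (the 3-sigma cutoff is an assumption, not from the original):

import numpy as np


def sample_tails(mean, std, n_samples, cutoff=3.0):
    # Draw Gaussian samples and keep only tail values far from the mean;
    # the 3-sigma cutoff is a guess at what "tails" means here
    samples = np.random.normal(loc=mean, scale=std, size=n_samples)
    return [s for s in samples if abs(s - mean) > cutoff * std]

An empty result is possible for small n_samples, which is why build falls back to the existing latent value when len(tail_values) == 0.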