def __init__(self, test=False, timit_version=None): co = Constants() self.u = Utilities() if timit_version: timit_version = timit_version.lower() if timit_version not in ['timit', 'ffmtimit']: raise ValueError('Bad timit version') if timit_version == 'timit': if test: self.SET_ROOT = co.TIMIT_TEST_ROOT else: self.SET_ROOT = co.TIMIT_TRAIN_ROOT elif timit_version == 'ffmtimit': if test: self.SET_ROOT = co.FFMTIMIT_TEST_ROOT else: self.SET_ROOT = co.FFMTIMIT_TRAIN_ROOT
from helper import Utilities, PerformanceEvaluation import pandas as pd import numpy as np from metric_learning import MetricLearning, Subsampling from user_feedback import Similarity import pickle from scipy.misc import comb import time """ In the demo, we will showcase an example of special purpose publication. The data user wants the published energy database to maximally retain the information about peak-time energy usage """ # Initialization of some useful classes util = Utilities() pe = PerformanceEvaluation() mel = MetricLearning() # step 1: get the database to be published day_profile = pd.read_pickle('../dataset/dataframe_all_energy.pkl') day_profile = day_profile.iloc[ 0:90, 0:: 4] # subsample the database to improve the speed for demonstration purpose day_profile.index = range(len(day_profile.index)) rep_mode = 'mean' anonymity_level = 2 # desired anonymity level # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the # starting and ending time of the peak usage time interest = 'window-usage'
def evaluation_total_usage(n): """ In the demo, we will showcase an example of special purpose publication. The data user wants the published energy database to maximally retain the information about peak-time energy usage """ # Initialization of some useful classes util = Utilities() pe = PerformanceEvaluation() # step 1: get the database to be published day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl') day_profile = day_profile.fillna(0) day_profile = day_profile.iloc[ 0:90, 0:: 4] # subsample the database to improve the speed for demonstration purpose day_profile.index = range(len(day_profile.index)) rep_mode = 'mean' anonymity_level = n # desired anonymity level # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the # starting and ending time of the peak usage time interest = 'window-usage' window = [17, 21] sanitized_profile_best = util.sanitize_data( day_profile, distance_metric='self-defined', anonymity_level=anonymity_level, rep_mode=rep_mode, mode=interest, window=window) # step 3: pre-sanitize the database sanitized_profile_baseline = util.sanitize_data( day_profile, distance_metric='euclidean', anonymity_level=anonymity_level, rep_mode=rep_mode) loss_best_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_best, mode=interest, window=window) loss_generic_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_baseline, mode=interest, window=window) # print("information loss with learned metric %s" % loss_generic_metric) df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample( frac=1) subsample_size_max = int(comb(len(df_subsampled_from), 2)) print('total number of pairs is %s' % subsample_size_max) # step 4: sample a subset of pre-sanitized database and form the data points into pairs subsample_size = int(round(subsample_size_max)) sp = Subsampling(data=df_subsampled_from) data_pair, data_pair_all_index = sp.uniform_sampling( subsample_size=subsample_size, seed=None) # User receives the data pairs and label the similarity sim = Similarity(data=data_pair) sim.extract_interested_attribute(interest='statistics', stat_type=interest, window=window) similarity_label, data_subsample = sim.label_via_silhouette_analysis( range_n_clusters=range(2, 8)) # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs lm = Linear_Metric() lm.train(data_pair, similarity_label) dm = Deep_Metric() dm.train(data_pair, similarity_label) # step 6: the original database is privatized using the learned metric sanitized_profile_deep = util.sanitize_data( day_profile, distance_metric="deep", anonymity_level=anonymity_level, rep_mode=rep_mode, deep_model=dm, window=window) sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep", anonymity_level=anonymity_level, rep_mode=rep_mode, deep_model=lm, window=window) # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database loss_learned_metric_deep = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_deep.round(), mode=interest, window=window) loss_learned_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile, mode=interest, window=window) print('anonymity level %s' % anonymity_level) print("sampled size %s" % subsample_size) print("information loss with best metric %s" % loss_best_metric) print("information loss with generic metric %s" % loss_generic_metric) print("information loss with learned metric %s" % loss_learned_metric) print("information loss with learned metric deep %s" % (loss_learned_metric_deep)) return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size
def __init__(self, set_name): self.SET_NAME = set_name self.co = Constants() self.io = DataFiles() self.u = Utilities()
def __init__(self): self.co = Constants() self.u = Utilities()