from helper import Utilities, PerformanceEvaluation
import pandas as pd
import numpy as np
from metric_learning import MetricLearning, Subsampling
from user_feedback import Similarity
import pickle
from scipy.misc import comb
import time
"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published energy database to maximally retain the information about peak-time energy usage
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()
mel = MetricLearning()

# step 1: get the database to be published
day_profile = pd.read_pickle('../dataset/dataframe_all_energy.pkl')
day_profile = day_profile.iloc[
    0:90, 0::
    4]  # subsample the database to improve the speed for demonstration purpose
day_profile.index = range(len(day_profile.index))
rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
# information of the cumulative energy use during peak time. In this case, he/she would also need to specify the
# starting and ending time of the peak usage time
interest = 'window-usage'
Esempio n. 2
0
import pandas as pd
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
from subsampling import Subsampling
"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()


def evaluation_occupancy_statistics(n, mode="arrival"):
    day_profile1 = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
    day_profile1 = day_profile1.fillna(0)
    day_profile1[day_profile1 > 0] = 1

    res = 15

    day_profile = day_profile1.iloc[:90, 0::
                                    res]  # subsample the database to improve the speed for demonstration purpose
    day_profile2 = day_profile1.iloc[
        90:-1, 0::
        res]  # subsample the database to improve the speed for demonstration purpose
Esempio n. 3
0
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
from subsampling import Subsampling

"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()

def evaluation_occupancy_statistics(n, df_subsampled_from,day_profile):

    anonymity_level = n
    mode = "arrival"
    rep_mode = 'mean'
    subsample_size_max = int(comb(len(df_subsampled_from),2))

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(subsample_size=subsample_size, seed = None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
Esempio n. 4
0
def evaluation_total_usage(n):
    """
    In the demo, we will showcase an example of special purpose publication.
    The data user wants the published energy database to maximally retain the information about peak-time energy usage
    """

    # Initialization of some useful classes
    util = Utilities()
    pe = PerformanceEvaluation()

    # step 1: get the database to be published
    day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl')
    day_profile = day_profile.fillna(0)
    day_profile = day_profile.iloc[
        0:90, 0::
        4]  # subsample the database to improve the speed for demonstration purpose
    day_profile.index = range(len(day_profile.index))
    rep_mode = 'mean'
    anonymity_level = n  # desired anonymity level

    # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
    # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the
    # starting and ending time of the peak usage time
    interest = 'window-usage'
    window = [17, 21]

    sanitized_profile_best = util.sanitize_data(
        day_profile,
        distance_metric='self-defined',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        mode=interest,
        window=window)

    # step 3: pre-sanitize the database
    sanitized_profile_baseline = util.sanitize_data(
        day_profile,
        distance_metric='euclidean',
        anonymity_level=anonymity_level,
        rep_mode=rep_mode)

    loss_best_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_best,
        mode=interest,
        window=window)

    loss_generic_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_baseline,
        mode=interest,
        window=window)
    # print("information loss with learned metric %s" % loss_generic_metric)

    df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample(
        frac=1)
    subsample_size_max = int(comb(len(df_subsampled_from), 2))

    print('total number of pairs is %s' % subsample_size_max)

    # step 4: sample a subset of pre-sanitized database and form the data points into pairs
    subsample_size = int(round(subsample_size_max))
    sp = Subsampling(data=df_subsampled_from)
    data_pair, data_pair_all_index = sp.uniform_sampling(
        subsample_size=subsample_size, seed=None)

    # User receives the data pairs and label the similarity
    sim = Similarity(data=data_pair)
    sim.extract_interested_attribute(interest='statistics',
                                     stat_type=interest,
                                     window=window)
    similarity_label, data_subsample = sim.label_via_silhouette_analysis(
        range_n_clusters=range(2, 8))

    # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs
    lm = Linear_Metric()
    lm.train(data_pair, similarity_label)

    dm = Deep_Metric()
    dm.train(data_pair, similarity_label)

    # step 6: the original database is privatized using the learned metric
    sanitized_profile_deep = util.sanitize_data(
        day_profile,
        distance_metric="deep",
        anonymity_level=anonymity_level,
        rep_mode=rep_mode,
        deep_model=dm,
        window=window)

    sanitized_profile = util.sanitize_data(day_profile,
                                           distance_metric="deep",
                                           anonymity_level=anonymity_level,
                                           rep_mode=rep_mode,
                                           deep_model=lm,
                                           window=window)

    # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database
    loss_learned_metric_deep = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile_deep.round(),
        mode=interest,
        window=window)

    loss_learned_metric = pe.get_statistics_loss(
        data_gt=day_profile,
        data_sanitized=sanitized_profile,
        mode=interest,
        window=window)

    print('anonymity level %s' % anonymity_level)
    print("sampled size %s" % subsample_size)
    print("information loss with best metric %s" % loss_best_metric)
    print("information loss with generic metric %s" % loss_generic_metric)
    print("information loss with learned metric %s" % loss_learned_metric)
    print("information loss with learned metric deep  %s" %
          (loss_learned_metric_deep))
    return (sanitized_profile_best, sanitized_profile_baseline,
            sanitized_profile,
            sanitized_profile_deep), (loss_best_metric, loss_generic_metric,
                                      loss_learned_metric,
                                      loss_learned_metric_deep), subsample_size
Esempio n. 5
0
from helper import Utilities, PerformanceEvaluation
import pandas as pd
from metric_learning import Subsampling, MetricLearning
from user_feedback import Similarity
from scipy.misc import comb
"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()
mel = MetricLearning()

# step 1: get the database to be published
day_profile = pd.read_pickle('../dataset/dataframe_all_binary.pkl')
day_profile = day_profile.iloc[0::4, 0::60]
rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
# information of a segment of entire time series. In this case, he/she would also need to specify the starting and
# ending time of the time series segment of interest.
interest = 'segment'
window = [
    11, 15
]  # window specifies the starting and ending time of the period that the data user is interested in

# step 3: pre-sanitize the database
sanitized_profile_baseline = util.sanitize_data(
Esempio n. 6
0
import pandas as pd
from subsampling import Subsampling
from user_feedback import Similarity
from scipy.misc import comb
from deep_metric_learning import Deep_Metric
import numpy as np
import pickle
from linear_metric_learning import Linear_Metric
"""
In the demo, we will showcase an example of special purpose publication.
The data user wants the published database to maximally retain the information about lunch time.
"""

# Initialization of some useful classes
util = Utilities()
pe = PerformanceEvaluation()

day_profile_all = pd.read_pickle('./dataset/dataframe_all_binary.pkl')
day_profile = day_profile_all.iloc[0:90, 0::60]  # the database to be published
day_profile_metric_learn = day_profile_all.iloc[
    90:-1, 0::60]  # the database for learning distance metric
day_profile.dropna()
day_profile_metric_learn.dropna()

rep_mode = 'mean'
anonymity_level = 2  # desired anonymity level

# step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the
# information of a segment of entire time series. In this case, he/she would also need to specify the starting and
# ending time of the time series segment of interest.
interest = 'segment'