Example #1
    def __init__(self,
                 first: pd.DataFrame,
                 second: pd.DataFrame,
                 categories=None):
        """
        BiFrame holds two data sets and currently provides analysis methods
        covering distribution, correlation, and some machine learning tasks.
        In particular, if the input data sets are a source dataset and a
        synthesized dataset, this class can be used to evaluate the utility
        and privacy of the synthesized data set.

        Parameters
        ----------
        first : {pandas.DataFrame}
            first data set (i.e. original dataset)

        second : {pandas.DataFrame}
            second data set (i.e. synthesized dataset)

        categories : list of columns
            Column names whose values are categorical.
        """
        # To compare two data sets, make sure that they have the same columns.
        # If not, compare them on their common columns.
        cols = set(first.columns) & set(second.columns)
        if len(cols) != len(first.columns) or len(cols) != len(second.columns):
            warnings.warn("Evaluate on partial columns of the datasets",
                          stacklevel=2)

        categories = [] if categories is None else categories
        self.fst = DataSet(first[cols], categories=categories)
        self.snd = DataSet(second[cols], categories=categories)

        # Make sure that the two datasets have the same domain for categorical
        # attributes, and the same min/max values for numerical attributes.
        for col in cols.copy():
            # Skip the column unless it is categorical in both datasets.
            if not self.fst[col].categorical or not self.snd[col].categorical:
                continue
            fst_domain, snd_domain = self.fst[col].domain, self.snd[col].domain
            if not np.array_equal(fst_domain, snd_domain):
                # If the two domains do not intersect, there may be no
                # relationship between the columns, so drop them.
                if len(np.intersect1d(fst_domain, snd_domain)) == 0:
                    self.fst = self.fst.drop(col, axis=1)
                    self.snd = self.snd.drop(col, axis=1)
                    cols.remove(col)
                    continue
                if self.fst[col].categorical:
                    domain = np.unique(np.concatenate(
                        (fst_domain, snd_domain)))
                else:
                    domain = [
                        min(fst_domain[0], snd_domain[0]),
                        max(fst_domain[1], snd_domain[1])
                    ]
                self.fst[col].domain = domain
                self.snd[col].domain = domain
        self._columns = sorted(cols)
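
The constructor above aligns the domains of shared categorical columns by taking the union of the values seen in either frame, and drops columns whose domains do not intersect at all. Below is a minimal sketch of that behavior; the ds4ml.evaluator import path and the toy frames are illustrative assumptions, not taken from the examples:

# Illustrative sketch only: the import path and toy data are assumptions.
import pandas as pd
from ds4ml.evaluator import BiFrame  # assumed module path

first = pd.DataFrame({'Sex': ['Female', 'Female', 'Female'],
                      'Age': [30, 40, 50]})
second = pd.DataFrame({'Sex': ['Female', 'Male', 'Male'],
                       'Age': [25, 45, 60]})
bf = BiFrame(first, second, categories=['Sex'])
print(bf.columns)  # common columns, sorted: ['Age', 'Sex']
# Both frames now share the unioned 'Sex' domain: ['Female', 'Male']
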
def test_encode_empty_column():
    from numpy import array_equal
    data = [[1001, 'A', 'Female'], [1002, 'B', 'Male'], [1003, 'C', 'Male'],
            [1004, 'D', 'Female'], [1005, 'E', 'Female']]
    ds = DataSet(data, columns=['ID', 'Name', 'Sex'])
    x = DataFrame(data[-2:], columns=['ID', 'Name', 'Sex'])
    x_tf = ds.encode(data=x)
    # Name is not categorical, because it has unique values
    assert x_tf.shape == (2, 3)
    assert array_equal(x_tf.columns, ['ID', 'Sex_Female', 'Sex_Male'])
def test_encode_partly():
    from .testdata import adults01
    from sklearn.model_selection import train_test_split
    dataset = DataSet(adults01)
    train, test = train_test_split(adults01, test_size=0.2)
    frame = dataset.encode(data=train)
    assert 'salary_<=50K' in frame.columns
    assert 'salary_>50K' in frame.columns
    assert ((0 == frame['salary_<=50K']) | (frame['salary_<=50K'] == 1)).all()
    assert ((0.0 <= frame['age']) & (frame['age'] <= 1.0)).all()
Example #4
    def __init__(self,
                 first: pd.DataFrame,
                 second: pd.DataFrame,
                 categories=None):
        """
        BiFrame holds two data sets and currently provides analysis methods
        covering distribution, correlation, and some machine learning tasks.
        In particular, if the input data sets are a source dataset and a
        synthesized dataset, this class can be used to evaluate the utility
        and privacy of the synthesized data set.

        Parameters
        ----------
        first : {pandas.DataFrame}
            first data set (i.e. original dataset)

        second : {pandas.DataFrame}
            second data set (i.e. synthesized dataset)

        categories : list of columns
            Column names whose values are categorical.
        """
        # distribution
        self._dt = {}

        # To compare two data sets, make sure that they have the same columns.
        # If not, compare only the common part.
        common = set(first.columns) & set(second.columns)
        if len(common) != len(first.columns) or len(common) != len(
                second.columns):
            logger.info(f"BiFrame constructed on attributes: {common}.")

        # left and right data set (ds)
        self.first = DataSet(first[common], categories=categories)
        self.second = DataSet(second[common], categories=categories)
        self._columns = self.first.columns.sort_values().to_list()

        # Make sure that the two datasets have the same domain for categorical
        # attributes, and the same min/max values for numerical attributes.
        for col in self._columns:
            # Skip the column unless it is categorical in both datasets.
            if not self.first[col].categorical or not self.second[
                    col].categorical:
                continue
            d1, d2 = self.first[col].domain, self.second[col].domain
            if not np.array_equal(d1, d2):
                if self.first[col].categorical:
                    domain = np.unique(np.concatenate((d1, d2)))
                else:
                    domain = [min(d1[0], d2[0]), max(d1[1], d2[1])]
                self.first[col].domain = domain
                self.second[col].domain = domain
def test_encode():
    from .testdata import adults01
    from numpy import array_equal
    dataset = DataSet(adults01)
    frame = dataset.encode()
    for col in ['education', 'relationship', 'salary']:
        assert col not in frame.columns
    for col in ['age', 'birth']:
        assert col in frame.columns

    assert 'salary_<=50K' in frame.columns
    assert 'salary_>50K' in frame.columns

    for attr, val in [('salary', '<=50K'), ('relationship', 'Wife'),
                      ('relationship', 'Husband')]:
        trans_col = frame[f'{attr}_{val}'].apply(lambda v: v == 1)
        origin_col = adults01[attr] == val
        assert array_equal(trans_col, origin_col)
def test_split_feature_class():
    frame = DataSet(adults01[['age', 'relationship',
                              'salary']].head(10)).encode()
    features1, class1 = split_feature_class('birth', frame)
    assert features1.equals(frame)
    assert class1 is None

    features2, class2 = split_feature_class('age', frame)
    assert features2.equals(frame)
    assert class2 is None

    features3, class3 = split_feature_class('salary', frame)
    assert len(features3.columns) == 4
    assert class3.name == 'salary_>50K'

    features4, class4 = split_feature_class('relationship', frame)
    assert len(features4.columns) == 3
    assert class4.min() == 0
    assert class4.max() == 2
def test_svm_task():
    from sklearn.svm import SVC
    from sklearn.model_selection import train_test_split
    from .testdata import adults01
    c_df = DataFrame(adults01)
    c_tf = DataSet(c_df).encode()
    train, test = train_test_split(c_tf, test_size=0.2)

    def make_train_x_y(df):
        x_ = df.drop(['salary_<=50K', 'salary_>50K'], axis=1)
        # <=50K and >50K are binary, complementary
        _, ym_ = df['salary_<=50K'], df['salary_>50K']
        return x_, ym_

    tr_x, tr_y = make_train_x_y(train)
    te_x, te_y = make_train_x_y(test)
    clf = SVC(gamma='scale')
    clf.fit(tr_x, tr_y)
    pr_y = clf.predict(te_x)
    from sklearn.metrics import confusion_matrix, classification_report
    print(confusion_matrix(te_y, pr_y))
    print(classification_report(te_y, pr_y))
def test_synthesize_for_privacy():
    # Verify the differential-privacy probability bound after synthesis. (This
    # test case may occasionally fail because of the limited number of runs.)
    from numpy.random import randint
    from numpy import exp
    epsilon = 0.1
    runs = 200
    data = randint(65, 90, size=(199, 2))
    set1 = DataSet(data.tolist() + [[65, 65]], columns=['ColA', 'ColB'])
    set2 = DataSet(data.tolist() + [[65, 66]], columns=['ColA', 'ColB'])
    counts = [0, 0]
    for i in range(runs):
        df1 = set1.synthesize(epsilon=epsilon)
        df2 = set2.synthesize(epsilon=epsilon)
        counts[0] += ((df1['ColA'] == 65) & (df1['ColB'] == 65)).sum()
        counts[1] += ((df2['ColA'] == 65) & (df2['ColB'] == 66)).sum()
    assert counts[0] / (runs * 200) <= exp(epsilon) * counts[1] / (runs * 200)
def test_synthesize_with_retains():
    dataset = DataSet(adults01)
    df = dataset.synthesize(retains=['age'])
    assert df.size == dataset.size
    assert array_equal(dataset['age'], df['age'])
def test_synthesize_with_pseudonyms():
    dataset = DataSet(adults01)
    df = dataset.synthesize(pseudonyms=['salary'])
    assert df.size == dataset.size
    assert array_equal(dataset['salary'].value_counts().values,
                       df['salary'].value_counts().values)
def test_synthesize():
    dataset = DataSet(adults01)
    df = dataset.synthesize()
    assert df.size == dataset.size
Example #12
class BiFrame(object):
    def __init__(self,
                 first: pd.DataFrame,
                 second: pd.DataFrame,
                 categories=None):
        """
        BiFrame holds two data sets and currently provides analysis methods
        covering distribution, correlation, and some machine learning tasks.
        In particular, if the input data sets are a source dataset and a
        synthesized dataset, this class can be used to evaluate the utility
        and privacy of the synthesized data set.

        Parameters
        ----------
        first : {pandas.DataFrame}
            first data set (i.e. original dataset)

        second : {pandas.DataFrame}
            second data set (i.e. synthesized dataset)

        categories : list of columns
            Column names whose values are categorical.
        """
        # distribution
        self._dt = {}

        # To compare two data sets, make sure that they have the same columns.
        # If not, compare only the common part.
        common = set(first.columns) & set(second.columns)
        if len(common) != len(first.columns) or len(common) != len(
                second.columns):
            logger.info(f"BiFrame constructed on attributes: {common}.")

        # left and right data set (ds)
        self.first = DataSet(first[common], categories=categories)
        self.second = DataSet(second[common], categories=categories)
        self._columns = self.first.columns.sort_values().to_list()

        # Make sure that the two datasets have the same domain for categorical
        # attributes, and the same min/max values for numerical attributes.
        for col in self._columns:
            # Skip the column unless it is categorical in both datasets.
            if not self.first[col].categorical or not self.second[
                    col].categorical:
                continue
            d1, d2 = self.first[col].domain, self.second[col].domain
            if not np.array_equal(d1, d2):
                if self.first[col].categorical:
                    domain = np.unique(np.concatenate((d1, d2)))
                else:
                    domain = [min(d1[0], d2[0]), max(d1[1], d2[1])]
                self.first[col].domain = domain
                self.second[col].domain = domain

    @property
    def columns(self):
        return self._columns

    def err(self):
        """
        Return pairwise err (relative error) of columns' distribution.
        """
        # merge two frequency counts, and calculate relative difference
        df = pd.DataFrame(columns=self._columns, index=['err'])
        df = df.fillna(0)
        for col in self._columns:
            df.at['err', col] = relative_error(self.first[col].counts(),
                                               self.second[col].counts())
        return df

    def jsd(self):
        """
        Return pairwise JSD (Jensen-Shannon divergence) of columns' distribution.
        """
        df = pd.DataFrame(columns=self._columns, index=['jsd'])
        df = df.fillna(0)
        for col in self._columns:
            df.at['jsd',
                  col] = jensen_shannon_divergence(self.first[col].counts(),
                                                   self.second[col].counts())
        return df

    def corr(self):
        """
        Return pairwise correlation and dependence measured by mi (mutual
        information).
        """
        return self.first.mi(), self.second.mi()

    def dist(self, column):
        """
        Return frequency distribution of one column.

        Parameters
        ----------
        column : str
            column name whose distribution will be returned
        """
        if len(self._dt) == 0:
            for c in self._columns:
                self._dt[c] = {}
                if self.first[c].categorical:
                    bins = self.first[c].domain
                    counts1 = self.first[c].counts(bins=bins)
                    counts2 = self.second[c].counts(bins=bins)
                else:
                    min_, max_ = self.first[c].domain
                    # the domains of the two data sets are the same;
                    # extend the domain to a human-readable range
                    bins = normalize_range(min_, max_ + 1)
                    counts1 = self.first[c].counts(bins=bins)
                    counts2 = self.second[c].counts(bins=bins)
                    # Note: bins and counts from np.histogram have different lengths
                    bins = bins[:-1]
                self._dt[c]['bins'] = bins
                # stack arrays vertically
                self._dt[c]['counts'] = np.vstack((counts1, counts2))
        return self._dt[column]['bins'], self._dt[column]['counts']

    def describe(self):
        """
        Give the descriptive difference between the two data sets, including
        relative error and Jensen-Shannon divergence (jsd).
        Return a pandas.DataFrame whose columns are the datasets' columns and
        whose index is a list of metrics, e.g. ['err', 'jsd'].
        """
        df1 = self.err()
        df2 = self.jsd()
        return pd.concat([df1, df2])

    def classify(self, label: str, test: pd.DataFrame = None):
        """
        Train two SVM classifiers, one per data set, and predict class labels
        for the test data. Return the resulting confusion matrices.

        Parameters
        ----------
        label : str
            the class label column; must be one column of the first data frame.
            Both two-class and multi-class labels are supported.

        test : {pandas.DataFrame}
            test data for the classifiers. If not provided, 20% of the first
            data frame is split off as test data.

        Returns
        -------
        a DataFrame, e.g.
                         target                         source     target
                      male female                    male female male female
        source male   1    3        or actual male   1    3      1    2
               female 2    4                  female 2    4      3    4
        """
        if (not self.first[label].categorical
                or not self.second[label].categorical):
            raise ValueError(f'Classifier can not run on non-categorical '
                             f'column: {label}')
        from sklearn.metrics import confusion_matrix

        def split_feature_label(df: pd.DataFrame):
            # TODO need improve sub_cols
            sub_cols = [attr for attr in df.columns if attr.startswith(label)]
            if len(sub_cols) == 0:
                return df, None
            is_one_class = len(sub_cols) == 2
            if is_one_class:
                # For one class, there are two sorted values.
                # e.g. ['Yes', 'No'] => [[0, 1],
                #                        [1, 0]]
                # Choose second column to represent this attribute.
                label_ = sub_cols[1]
                return df.drop(sub_cols, axis=1), df[label_]
            else:
                try:
                    # merge multiple columns into one column:
                    # [Name_A, Name_B, ..] => Name
                    _y = df[sub_cols].apply(lambda x: Index(x).get_loc(1),
                                            axis=1)
                    return df.drop(sub_cols, axis=1), _y
                except KeyError as e:
                    print(e)
                    print(sub_cols)
                    print(df[sub_cols])

        # If test dataset is not provided, then split 20% of original dataset
        # for testing.
        if test is None:
            fst_train, test = train_test_split(self.first, test_size=0.2)
            snd_train, _ = train_test_split(self.second, test_size=0.2)
        else:
            fst_train = self.first
            snd_train = self.second
        # ts = self.first.encode(data=fst_train)
        fst_train_x, fst_train_y = split_feature_label(
            self.first.encode(data=fst_train))
        test_x, test_y = split_feature_label(self.first.encode(data=test))
        snd_train_x, snd_train_y = split_feature_label(
            self.first.encode(data=snd_train))

        # construct svm classifier, and predict on the same test dataset
        fst_predict_y = train_and_predict(fst_train_x, fst_train_y, test_x)
        snd_predict_y = train_and_predict(snd_train_x, snd_train_y, test_x)

        columns = self.first[label].bins
        labels = range(len(columns))
        # If the test dataset contains the class label column, return two
        # confusion matrices: one for the original dataset (self.first) and one
        # for the anonymized dataset (self.second).
        if label in test:
            fst_matrix = confusion_matrix(test_y, fst_predict_y, labels=labels)
            snd_matrix = confusion_matrix(test_y, snd_predict_y, labels=labels)
            # normalize the confusion matrix
            # fst_matrix = fst_matrix.astype('float') / fst_matrix.sum(axis=1)
            # snd_matrix = snd_matrix.astype('float') / snd_matrix.sum(axis=1)
            return (pd.DataFrame(fst_matrix, columns=columns, index=columns),
                    pd.DataFrame(snd_matrix, columns=columns, index=columns))
        # If the test dataset does not contain the class label, compare the two
        # sets of predictions with each other.
        else:
            matrix = confusion_matrix(fst_predict_y,
                                      snd_predict_y,
                                      labels=labels)
            return pd.DataFrame(matrix, columns=columns, index=columns)

    def to_html(self,
                buf=None,
                title='Evaluation Report',
                info=True,
                distribute=True,
                correlate=True,
                classifier=None,
                labels=None,
                test=None):
        """
        Render the evaluation result of the two data sets as an HTML file.

        Parameters
        ----------
        buf : optional
            buffer to write to

        title : str
            title of evaluation report

        info : bool, default true
            show basic information of the two data sets, including relative
            error and Jensen-Shannon divergence (jsd).

        distribute : bool, default true
            show distribution of each attribute.

        correlate : bool, default true
            show correlation of pair-wise attributes.

        classifier : str
            use a classifier to train on one or more columns (defined by the
            'labels' parameter) and show the prediction results in the
            evaluation report. Supported classifier: SVM.

        labels : list of column names
            a list of column names used for the classification task.

        test : pd.DataFrame
            test data for classification, and other machine learning tasks.
        """
        from ds4ml.utils import (plot_histogram, plot_heatmap,
                                 plot_confusion_matrix)
        from mako.template import Template
        import os
        old_cwd = os.getcwd()
        os.chdir(os.path.dirname(__file__))
        template = Template(filename='template/report.html')
        os.chdir(old_cwd)

        topics = []
        content = {}
        # format different kinds of evaluation result to unified style
        if info:
            topics.append('basic')
            content['basic'] = [self.describe().to_dict('split')]

        if distribute:
            topics.append('dist')
            content['dist'] = []
            for col in self.columns:
                bins, counts = self.dist(col)
                svg = plot_histogram(bins, counts)
                content['dist'].append({
                    'name': col,
                    'columns': bins,
                    'data': counts,
                    'path': svg
                })

        if correlate:
            topics.append('corr')
            content['corr'] = []
            source_mi, target_mi = self.corr()
            source_svg = plot_heatmap(source_mi)
            target_svg = plot_heatmap(target_mi)
            content['corr'].append({
                'matrix': source_mi.to_dict('split'),
                'path': source_svg
            })
            content['corr'].append({
                'matrix': target_mi.to_dict('split'),
                'path': target_svg
            })

        if labels is not None:
            topics.append('svm')
            content['svm'] = []
            for col in labels:
                in_test = (test is not None and col in test) or (test is None)
                if in_test:
                    # When the class label is present in the test data, compare
                    # both sets of predictions with the actual values; this
                    # produces two confusion matrix diagrams.
                    try:
                        source_cm, target_cm = self.classify(col, test=test)
                        vrange = (min(source_cm.values.min(),
                                      target_cm.values.min()),
                                  max(source_cm.values.max(),
                                      target_cm.values.max()))
                        path = (plot_confusion_matrix(source_cm,
                                                      vrange=vrange,
                                                      xlabel='raw',
                                                      ylabel='actual'),
                                plot_confusion_matrix(target_cm,
                                                      vrange=vrange,
                                                      xlabel='synth',
                                                      ylabel='actual'))
                        content['svm'].append({'column': col, 'path': path})
                    except ValueError as e:
                        print(e)
                else:
                    # Otherwise, compare the two sets of predictions with each other.
                    try:
                        cm = self.classify(col, test=test)
                        # make path's type: 1-tuple
                        path = (plot_confusion_matrix(cm,
                                                      xlabel='synth',
                                                      ylabel='raw'), )
                        content['svm'].append({'column': col, 'path': path})
                    except ValueError as e:
                        print(e)

        svms = content['svm'] if 'svm' in content else []
        if buf:
            with open(buf, 'w+', encoding='utf-8') as file:
                file.write(
                    template.render(title=title,
                                    basics=content['basic'],
                                    dists=content['dist'],
                                    corrs=content['corr'],
                                    svms=svms))
Example #13
class BiFrame:
    def __init__(self,
                 first: pd.DataFrame,
                 second: pd.DataFrame,
                 categories=None):
        """
        BiFrame holds two data sets and currently provides analysis methods
        covering distribution, correlation, and some machine learning tasks.
        In particular, if the input data sets are a source dataset and a
        synthesized dataset, this class can be used to evaluate the utility
        and privacy of the synthesized data set.

        Parameters
        ----------
        first : {pandas.DataFrame}
            first data set (i.e. original dataset)

        second : {pandas.DataFrame}
            second data set (i.e. synthesized dataset)

        categories : list of columns
            Column names whose values are categorical.
        """
        # To compare two data sets, make sure that they have the same columns.
        # If not, compare them on their common columns.
        cols = set(first.columns) & set(second.columns)
        if len(cols) != len(first.columns) or len(cols) != len(second.columns):
            warnings.warn("Evaluate on partial columns of the datasets",
                          stacklevel=2)

        categories = [] if categories is None else categories
        self.fst = DataSet(first[cols], categories=categories)
        self.snd = DataSet(second[cols], categories=categories)

        # Make sure that the two datasets have the same domain for categorical
        # attributes, and the same min/max values for numerical attributes.
        for col in cols.copy():
            # Skip the column unless it is categorical in both datasets.
            if not self.fst[col].categorical or not self.snd[col].categorical:
                continue
            fst_domain, snd_domain = self.fst[col].domain, self.snd[col].domain
            if not np.array_equal(fst_domain, snd_domain):
                # If the two domains do not intersect, there may be no
                # relationship between the columns, so drop them.
                if len(np.intersect1d(fst_domain, snd_domain)) == 0:
                    self.fst = self.fst.drop(col, axis=1)
                    self.snd = self.snd.drop(col, axis=1)
                    cols.remove(col)
                    continue
                if self.fst[col].categorical:
                    domain = np.unique(np.concatenate(
                        (fst_domain, snd_domain)))
                else:
                    domain = [
                        min(fst_domain[0], snd_domain[0]),
                        max(fst_domain[1], snd_domain[1])
                    ]
                self.fst[col].domain = domain
                self.snd[col].domain = domain
        self._columns = sorted(cols)

    @property
    def columns(self):
        """ Return the common columns of two datasets. """
        return self._columns

    def err(self):
        """
        Return pairwise err (relative error) of columns' distribution.
        """
        # merge two frequency counts, and calculate relative difference
        frame = pd.DataFrame(columns=self.columns, index=['err'])
        frame = frame.fillna(0)
        for col in self.columns:
            frame.at['err', col] = relative_error(self.fst[col].counts(),
                                                  self.snd[col].counts())
        return frame

    def jsd(self):
        """
        Return pairwise JSD (Jensen-Shannon divergence) of columns' distribution.
        """
        frame = pd.DataFrame(columns=self.columns, index=['jsd'])
        frame = frame.fillna(0)
        for col in self.columns:
            frame.at['jsd',
                     col] = jensen_shannon_divergence(self.fst[col].counts(),
                                                      self.snd[col].counts())
        return frame

    def corr(self):
        """
        Return pairwise correlation and dependence measured by mi (mutual
        information).
        """
        return self.fst.mi(), self.snd.mi()

    def dist(self, column):
        """
        Return frequency distribution of one column.

        Parameters
        ----------
        column : str
            column name whose distribution will be returned
        """
        if column not in self.columns:
            raise ValueError(f"{column} is not in current dataset.")
        if self.fst[column].categorical:
            bins = self.fst[column].domain
            fst_counts = self.fst[column].counts(bins=bins)
            snd_counts = self.snd[column].counts(bins=bins)
        else:
            min_, max_ = self.fst[column].domain
            # the domains of the two data sets are the same;
            # extend the domain to a human-readable range
            bins = normalize_range(min_, max_ + 1)
            fst_counts = self.fst[column].counts(bins=bins)
            snd_counts = self.snd[column].counts(bins=bins)
            # Note: bins and counts from np.histogram have different lengths
            bins = bins[:-1]
        # stack arrays vertically
        return bins, np.vstack((fst_counts, snd_counts))

    def describe(self):
        """
        Give the descriptive difference between the two data sets, including
        relative error and Jensen-Shannon divergence (jsd).
        Return a pandas.DataFrame whose columns are the datasets' columns and
        whose index is a list of metrics, e.g. ['err', 'jsd'].
        """
        err_frame = self.err()
        jsd_frame = self.jsd()
        return pd.concat([err_frame, jsd_frame])

    def classify(self, label: str, test: pd.DataFrame = None):
        """
        Train two SVM classifiers, one per data set, and predict class labels
        for the test data. Return the resulting confusion matrices.

        Parameters
        ----------
        label : str
            the class label column; must be one column of the first data frame.
            Both two-class and multi-class labels are supported.

        test : {pandas.DataFrame}
            test data for the classifiers. If not provided, 20% of the first
            data frame is split off as test data.

        Returns
        -------
        a DataFrame, e.g.
                         target                         source     target
                      male female                    male female male female
        source male   1    3        or actual male   1    3      1    2
               female 2    4                  female 2    4      3    4
        """
        if not self.fst[label].categorical or not self.snd[label].categorical:
            raise ValueError(f'Cannot classify on non-categorical column: '
                             f'{label}')

        # If test dataset is not provided, then split 20% of original dataset
        # for testing.
        if test is None:
            fst_train, test = train_test_split(self.fst, test_size=0.2)
            snd_train, _ = train_test_split(self.snd, test_size=0.2)
        else:
            fst_train = self.fst
            snd_train = self.snd

        fst_train_x, fst_train_y = split_feature_class(
            label, self.fst.encode(data=fst_train))
        snd_train_x, snd_train_y = split_feature_class(
            label, self.fst.encode(data=snd_train))
        test_x, test_y = split_feature_class(label, self.fst.encode(data=test))

        # construct svm classifier, and predict on the same test dataset
        fst_predict_y = train_and_predict(fst_train_x, fst_train_y, test_x)
        snd_predict_y = train_and_predict(snd_train_x, snd_train_y, test_x)

        columns = self.fst[label].bins
        labels = range(len(columns))
        # If the test dataset contains the class label column, return two
        # confusion matrices: one for the original dataset (self.fst) and one
        # for the synthesized dataset (self.snd).
        if label in test:
            fst_matrix = confusion_matrix(test_y, fst_predict_y, labels=labels)
            snd_matrix = confusion_matrix(test_y, snd_predict_y, labels=labels)
            return (pd.DataFrame(fst_matrix, columns=columns, index=columns),
                    pd.DataFrame(snd_matrix, columns=columns, index=columns))

        # If the test dataset does not contain the class label, compare the two
        # sets of predictions with each other.
        matrix = confusion_matrix(fst_predict_y, snd_predict_y, labels=labels)
        return pd.DataFrame(matrix, columns=columns, index=columns)

    def to_html(self,
                buffer,
                title='Evaluation Report',
                labels=None,
                test=None):
        """
        Render the evaluation result of two datasets to an HTML file.

        The result contains:
        + basic information of the two data sets (relative error and
            Jensen-Shannon divergence (jsd));
        + distribution of each attribute;
        + correlation of pair-wise attributes;
        + classification results from an SVM trained on one or more columns
            (defined by the 'labels' parameter and the 'test' dataset).

        Parameters
        ----------
        buffer
            buffer to write to

        title : str
            title of evaluation report

        labels : list of column names
            a list of column names used for the classification task.

        test : pd.DataFrame
            test data for classification, and other machine learning tasks.
        """
        basics = [self.describe().to_dict('split')]
        svms = self._get_svm_classifier(labels=labels, test=test)

        template = BiFrame._construct_template()
        with open(buffer, 'w+', encoding='utf-8') as file:
            file.write(
                template.render(title=title,
                                basics=basics,
                                dists=self._get_dist(),
                                corrs=self._get_corr(),
                                svms=svms))

    def _get_svm_classifier(self, labels=None, test=None):
        if labels is None:
            return []

        from ds4ml.utils import plot_confusion_matrix
        svms = []
        for col in labels:
            in_test = (test is not None and col in test) or (test is None)
            if in_test:
                # When the class label is present in the test data, compare
                # both sets of predictions with the actual values; this
                # produces two confusion matrix diagrams.
                src_matrix, tgt_matrix = self.classify(col, test=test)
                vrange = (min(src_matrix.values.min(),
                              tgt_matrix.values.min()),
                          max(src_matrix.values.max(),
                              tgt_matrix.values.max()))
                path = (plot_confusion_matrix(src_matrix,
                                              vrange=vrange,
                                              xlabel='raw',
                                              ylabel='actual'),
                        plot_confusion_matrix(tgt_matrix,
                                              vrange=vrange,
                                              xlabel='synth',
                                              ylabel='actual'))
                svms.append({'column': col, 'path': path})
            else:
                # Otherwise, compare the two sets of predictions with each other.
                matrix = self.classify(col, test=test)
                # make path a 1-tuple
                path = (plot_confusion_matrix(matrix,
                                              xlabel='synth',
                                              ylabel='raw'), )
                svms.append({'column': col, 'path': path})
        return svms

    @staticmethod
    def _construct_template():
        """ construct template from a html """
        from mako.template import Template
        import os
        old_cwd = os.getcwd()
        os.chdir(os.path.dirname(__file__))
        template = Template(filename='template/report.html')
        os.chdir(old_cwd)
        return template

    def _get_dist(self):
        """ return the distribution information """
        from ds4ml.utils import plot_histogram
        dists = []
        for col in self.columns:
            bins, counts = self.dist(col)
            svg = plot_histogram(bins, counts)
            dists.append({
                'name': col,
                'columns': bins,
                'data': counts,
                'path': svg
            })
        return dists

    def _get_corr(self):
        """ return the pair-wise correlation """
        from ds4ml.utils import plot_heatmap
        corrs = []
        fst_mi, snd_mi = self.corr()
        fst_svg = plot_heatmap(fst_mi)
        snd_svg = plot_heatmap(snd_mi)
        corrs.append({'matrix': fst_mi.to_dict('split'), 'path': fst_svg})
        corrs.append({'matrix': snd_mi.to_dict('split'), 'path': snd_svg})
        return corrs
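
For orientation, here is a minimal end-to-end usage sketch of the class above; the ds4ml.evaluator import path and the CSV file names are illustrative assumptions:

# Illustrative sketch only: the import path and file names are assumptions.
import pandas as pd
from ds4ml.evaluator import BiFrame  # assumed module path

source = pd.read_csv('adult.csv')          # original dataset
synthesized = pd.read_csv('adult-a.csv')   # synthesized dataset
frame = BiFrame(source, synthesized, categories=['salary'])
print(frame.describe())                    # per-column 'err' and 'jsd'
frame.to_html('report.html', title='Evaluation Report',
              labels=['salary'])           # SVM comparison on 'salary'
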
def main():
    parser = argparse.ArgumentParser(
        description='Serialize patterns of a dataset anonymously',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of a CSV file whose patterns '
                                     'will be serialized anonymously')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, which '
                            'will be replaced with a pseudonym. It only works '
                            'on the string column.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN; '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of anonymous patterns (default "
                            "is input file name with a suffix '-pattern.json')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in a CSV file, and '
                            'will take [#0, #1, #2, ...] as header. (default: '
                            'the tool will try to detect and take actions)')
    group.add_argument('--sep', metavar='STRING',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       help='set epsilon for differential privacy (default 0.1)',
                       default=0.1)
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')

    args = parser.parse_args()
    start = time.time()

    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    header = None if args.no_header else 'infer'
    sep = ',' if args.sep is None else args.sep

    data = read_data_from_csv(args.file, na_values=na_values, header=header,
                              sep=sep)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(message=f'--pseudonym columns: {comp} are not in csv file.')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(message=f'--delete columns: {comp} are not in csv file.')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(message=f'--category columns: {comp} are not in csv file.')

    dataset = DataSet(data, categories=categories)

    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-pattern.json'
    dataset.to_pattern(path=args.output, epsilon=args.epsilon, deletes=deletes,
                       pseudonyms=pseudonyms, retains=[])

    duration = time.time() - start
    print(f'Analyze and serialize the patterns of {args.file} at {args.output} '
          f'in {round(duration, 2)} seconds.')
Example #15
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by differential privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of a csv file to be synthesized '
                                     'or path of a pattern file to be generated')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, which '
                            'will be replaced with a pseudonym. It only works '
                            'on the string column.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted during synthesis.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN; '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of output synthesized dataset ("
                            "default is input file name with suffix '-a.csv')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in a CSV file, and '
                            'will take [#0, #1, #2, ...] as header. (default: '
                            'the tool will try to detect and take actions)')
    group.add_argument('--records', metavar='INT', type=int,
                       help='specify the number of records to generate; the '
                            'default is the same number as the original dataset')
    group.add_argument('--sep', metavar='STRING', default=',',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       help='set epsilon for differential privacy (default 0.1)',
                       default=0.1)
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument('--retain', metavar='LIST',
                       help='set columns to retain the values')

    args = parser.parse_args()
    start = time.time()

    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    # check the file type from its extension
    is_pattern = ends_with_json(args.file)
    if is_pattern:
        if retains is not None and len(retains) != 0:
            parser.exit(message='Do not support --retain option when '
                                'synthesize from pattern file.')
        # construct DataSet from pattern file
        dataset = DataSet.from_pattern(args.file)
    else:
        data = read_data_from_csv(args.file, na_values=na_values, header=header,
                                  sep=args.sep)

        def complement(attrs, full):
            return set(attrs or []) - set(full)

        # check parameters: pseudonyms, deletes, categories
        comp = complement(pseudonyms, data.columns)
        if comp:
            parser.exit(
                message=f'--pseudonym columns: {comp} are not in csv file.')
        comp = complement(deletes, data.columns)
        if comp:
            parser.exit(
                message=f'--delete columns: {comp} are not in csv file.')
        comp = complement(categories, data.columns)
        if comp:
            parser.exit(
                message=f'--category columns: {comp} are not in csv file.')

        dataset = DataSet(data, categories=categories)

    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms, deletes=deletes,
                                     retains=retains, records=args.records)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-a.csv'
    synthesized.to_csv(args.output, index=False, sep=args.sep)

    duration = time.time() - start
    print(f'Synthesize from {args.file} to file {args.output} in '
          f'{round(duration, 2)} seconds.')
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by Differential Privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of the CSV to be synthesized')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h",
                       "--help",
                       action="help",
                       help="show this help message and exit")
    group.add_argument(
        '--pseudonym',
        metavar='LIST',
        help='set candidate columns separated by a comma, which will be '
        'replaced with a pseudonym. '
        'It only works on the string column.')
    group.add_argument('--delete',
                       metavar='LIST',
                       help='set columns separated by a comma, which will be '
                       'deleted during synthesis.')
    group.add_argument('--na-values',
                       metavar='LIST',
                       help='set additional values to recognize as NA/NaN; '
                       '(default null values are from pandas.read_csv)')
    group.add_argument('-o',
                       '--output',
                       metavar='FILE',
                       help="set the file name of output synthesized dataset ("
                       "default is input file name with suffix '_a')")
    group.add_argument('--no-header',
                       action='store_true',
                       help='indicate there is no header in a CSV file, and '
                       'will take [#0, #1, #2, ...] as header. (default: '
                       'the tool will try to detect and take actions)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument(
        '-e',
        '--epsilon',
        metavar='FLOAT',
        type=float,
        help='set epsilon for differential privacy (default 0.1)',
        default=0.1)
    group.add_argument('--category',
                       metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument('--retain',
                       metavar='LIST',
                       help='set columns to retain the values')

    args = parser.parse_args()
    start = time.time()

    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    data = read_data_from_csv(args.file, na_values=na_values, header=header)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(
            message=f'--pseudonym columns: {comp} are not in csv file.')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(message=f'--delete columns: {comp} are not in csv file.')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(message=f'--category columns: {comp} are not in csv file.')

    dataset = DataSet(data, categories=categories)
    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms,
                                     deletes=deletes,
                                     retains=retains)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}_a.csv'
    synthesized.to_csv(args.output, index=False)

    duration = time.time() - start
    print(f'Synthesized data {args.output} in {round(duration, 2)} seconds.')