class BiFrame:
    def __init__(self, first: pd.DataFrame, second: pd.DataFrame,
                 categories=None):
        """
        A BiFrame contains two data sets and provides analysis methods
        covering their distributions, correlations, and several machine
        learning tasks. In particular, if the inputs are a source data set
        and a synthesized data set, this class can be used to evaluate the
        utility and privacy of the synthesized data set.

        Parameters
        ----------
        first : {pandas.DataFrame}
            first data set (i.e. the original data set)
        second : {pandas.DataFrame}
            second data set (i.e. the synthesized data set)
        categories : list of columns
            column names whose values are categorical
        """
        # To compare the two data sets, make sure that they have the same
        # columns. If not, compare them on their common columns only.
        # (A sorted list is used instead of a set, because pandas does not
        # accept a set as a column indexer.)
        cols = sorted(set(first.columns) & set(second.columns))
        if len(cols) != len(first.columns) or len(cols) != len(second.columns):
            warnings.warn("Evaluating only on the common columns of the two "
                          "data sets", stacklevel=2)
        categories = [] if categories is None else categories
        self.fst = DataSet(first[cols], categories=categories)
        self.snd = DataSet(second[cols], categories=categories)

        # Make sure that the two data sets have the same domain for
        # categorical attributes, and the same min, max values for numerical
        # attributes.
        for col in cols.copy():
            # If the current column is not categorical in both data sets,
            # ignore it.
            if not self.fst[col].categorical or not self.snd[col].categorical:
                continue
            fst_domain, snd_domain = self.fst[col].domain, self.snd[col].domain
            if not np.array_equal(fst_domain, snd_domain):
                # If the two domains do not intersect at all, there can be
                # no relationship between the two columns; drop them.
                if len(np.intersect1d(fst_domain, snd_domain)) == 0:
                    self.fst = self.fst.drop(col, axis=1)
                    self.snd = self.snd.drop(col, axis=1)
                    cols.remove(col)
                    continue
                if self.fst[col].categorical:
                    domain = np.unique(np.concatenate((fst_domain,
                                                       snd_domain)))
                else:
                    domain = [min(fst_domain[0], snd_domain[0]),
                              max(fst_domain[1], snd_domain[1])]
                self.fst[col].domain = domain
                self.snd[col].domain = domain
        self._columns = cols

    @property
    def columns(self):
        """ Return the common columns of the two data sets. """
        return self._columns

    def err(self):
        """
        Return the pairwise err (relative error) of the columns'
        distributions.
        """
        # merge the two frequency counts, and calculate their relative
        # difference
        frame = pd.DataFrame(columns=self.columns, index=['err'])
        for col in self.columns:
            frame.at['err', col] = relative_error(self.fst[col].counts(),
                                                  self.snd[col].counts())
        return frame

    def jsd(self):
        """
        Return the pairwise JSD (Jensen-Shannon divergence) of the columns'
        distributions.
        """
        frame = pd.DataFrame(columns=self.columns, index=['jsd'])
        for col in self.columns:
            frame.at['jsd', col] = jensen_shannon_divergence(
                self.fst[col].counts(), self.snd[col].counts())
        return frame

    def corr(self):
        """
        Return pairwise correlation and dependence measured by mi (mutual
        information).
        """
        return self.fst.mi(), self.snd.mi()

    def dist(self, column):
        """
        Return the frequency distribution of one column.

        Parameters
        ----------
        column : str
            column name, whose distribution will be returned
        """
        if column not in self.columns:
            raise ValueError(f"{column} is not in the current data sets.")
        if self.fst[column].categorical:
            bins = self.fst[column].domain
            fst_counts = self.fst[column].counts(bins=bins)
            snd_counts = self.snd[column].counts(bins=bins)
        else:
            min_, max_ = self.fst[column].domain
            # the domains of the two data sets are the same; extend the
            # domain to a human-readable range
            bins = normalize_range(min_, max_ + 1)
            fst_counts = self.fst[column].counts(bins=bins)
            snd_counts = self.snd[column].counts(bins=bins)
            # Note: the bin edges and values of np.histogram have different
            # lengths, so drop the last edge.
            bins = bins[:-1]
        # stack the two count arrays vertically
        return bins, np.vstack((fst_counts, snd_counts))

    def describe(self):
        """
        Give the descriptive differences between the two data sets,
        including relative error and JSD divergence. Return a
        pandas.DataFrame whose columns are the common columns of the two
        data sets, and whose index is the list of metrics, e.g.
        ['err', 'jsd'].
        """
        err_frame = self.err()
        jsd_frame = self.jsd()
        return pd.concat([err_frame, jsd_frame])

    def classify(self, label: str, test: pd.DataFrame = None):
        """
        Train two SVM classifiers, one on each data set, and predict class
        labels for the test data. Return the resulting confusion matrices.

        Parameters
        ----------
        label : str
            classifier feature; one column of the first data frame.
            Two-class and multi-class labels are supported.
        test : {pandas.DataFrame}
            test data for the classifiers. If it is not provided, 20% of
            the first data frame is split off as test data.

        Returns
        -------
        one confusion matrix comparing the two classifiers' predictions
        with each other, or a pair of confusion matrices comparing each
        classifier's predictions with the actual labels, e.g.

                     target                         source        target
                     male  female                   male  female  male  female
        source male     1       3   or actual male     1       3     1      2
              female    2       4          female      2       4     3      4
        """
        if not self.fst[label].categorical or not self.snd[label].categorical:
            raise ValueError(f'Cannot classify on a non-categorical column: '
                             f'{label}')
        # If a test data set is not provided, split 20% of the original data
        # set off for testing.
        if test is None:
            fst_train, test = train_test_split(self.fst, test_size=0.2)
            snd_train, _ = train_test_split(self.snd, test_size=0.2)
        else:
            fst_train = self.fst
            snd_train = self.snd

        fst_train_x, fst_train_y = split_feature_class(
            label, self.fst.encode(data=fst_train))
        snd_train_x, snd_train_y = split_feature_class(
            label, self.fst.encode(data=snd_train))
        test_x, test_y = split_feature_class(label, self.fst.encode(data=test))

        # construct the SVM classifiers, and predict on the same test data
        fst_predict_y = train_and_predict(fst_train_x, fst_train_y, test_x)
        snd_predict_y = train_and_predict(snd_train_x, snd_train_y, test_x)

        columns = self.fst[label].bins
        labels = range(len(columns))
        # If the test data set contains the class label, return two confusion
        # matrices: one for the original data set (self.fst), and one for the
        # synthesized data set (self.snd), each against the actual labels.
        if label in test:
            fst_matrix = confusion_matrix(test_y, fst_predict_y, labels=labels)
            snd_matrix = confusion_matrix(test_y, snd_predict_y, labels=labels)
            return (pd.DataFrame(fst_matrix, columns=columns, index=columns),
                    pd.DataFrame(snd_matrix, columns=columns, index=columns))
        # If the test data set does not contain the class label, compare the
        # two sets of predicted values with each other.
        matrix = confusion_matrix(fst_predict_y, snd_predict_y, labels=labels)
        return pd.DataFrame(matrix, columns=columns, index=columns)

    def to_html(self, buffer, title='Evaluation Report', labels=None,
                test=None):
        """
        Render the evaluation result of the two data sets to an HTML file.

        The result contains:
        + basic information of the two data sets (relative error, and
          Jensen-Shannon divergence (JSD));
        + the distribution of each attribute;
        + the correlation of pair-wise attributes;
        + classification results from SVMs trained on one or more columns
          (controlled by the 'labels' parameter and the 'test' data set).

        Parameters
        ----------
        buffer
            buffer to write to
        title : str
            title of the evaluation report
        labels : list of column names
            one or more column names to use for the classification task
        test : pd.DataFrame
            test data for classification, and other machine learning tasks
        """
        basics = [self.describe().to_dict('split')]
        svms = self._get_svm_classifier(labels=labels, test=test)
        template = BiFrame._construct_template()
        with open(buffer, 'w+', encoding='utf-8') as file:
            file.write(template.render(title=title,
                                       basics=basics,
                                       dists=self._get_dist(),
                                       corrs=self._get_corr(),
                                       svms=svms))

    def _get_svm_classifier(self, labels=None, test=None):
        if labels is None:
            return []
        from ds4ml.utils import plot_confusion_matrix
        svms = []
        for col in labels:
            # the actual class labels are available when no test data set is
            # given (20% of the first data set is split off), or when the
            # given test data set contains the label column
            in_test = test is None or col in test
            if in_test:
                # When the class label is in the test data, match the two
                # predicted results against the actual data, so there will
                # be two confusion matrix diagrams.
                src_matrix, tgt_matrix = self.classify(col, test=test)
                vrange = (min(src_matrix.values.min(),
                              tgt_matrix.values.min()),
                          max(src_matrix.values.max(),
                              tgt_matrix.values.max()))
                path = (plot_confusion_matrix(src_matrix, vrange=vrange,
                                              xlabel='raw', ylabel='actual'),
                        plot_confusion_matrix(tgt_matrix, vrange=vrange,
                                              xlabel='synth',
                                              ylabel='actual'))
                svms.append({'column': col, 'path': path})
            else:
                # If not, compare the two predicted results with each other.
                matrix = self.classify(col, test=test)
                # make path a 1-tuple (the trailing comma is required)
                path = (plot_confusion_matrix(matrix, xlabel='synth',
                                              ylabel='raw'),)
                svms.append({'column': col, 'path': path})
        return svms

    @staticmethod
    def _construct_template():
        """ Construct the report template from an HTML file. """
        from mako.template import Template
        import os
        # The template path is relative to this module, so temporarily switch
        # the working directory to resolve it.
        old_cwd = os.getcwd()
        os.chdir(os.path.dirname(__file__))
        template = Template(filename='template/report.html')
        os.chdir(old_cwd)
        return template

    def _get_dist(self):
        """ Return the distribution information of each column. """
        from ds4ml.utils import plot_histogram
        dists = []
        for col in self.columns:
            bins, counts = self.dist(col)
            svg = plot_histogram(bins, counts)
            dists.append({'name': col,
                          'columns': bins,
                          'data': counts,
                          'path': svg})
        return dists

    def _get_corr(self):
        """ Return the pair-wise correlations of the two data sets. """
        from ds4ml.utils import plot_heatmap
        corrs = []
        fst_mi, snd_mi = self.corr()
        fst_svg = plot_heatmap(fst_mi)
        snd_svg = plot_heatmap(snd_mi)
        corrs.append({'matrix': fst_mi.to_dict('split'), 'path': fst_svg})
        corrs.append({'matrix': snd_mi.to_dict('split'), 'path': snd_svg})
        return corrs
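
# A minimal usage sketch, assuming two CSV files that share columns and a
# categorical column named 'education' (the file names and the column name
# here are hypothetical):
#
#     import pandas as pd
#     frame = BiFrame(pd.read_csv('adult.csv'),
#                     pd.read_csv('adult_synth.csv'),
#                     categories=['education'])
#     print(frame.describe())  # per-column relative error and JSD
#     frame.to_html('report.html', labels=['education'])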