コード例 #1
0
 def test_valid_anonymization_by_label(self):
     """anonymize_replace must raise ValueError for this argument combination.

     The call passes ``anonymize_eval=False`` and no ``anonymize_columns``.
     """
     self.assertRaises(
         ValueError,
         anonymization.anonymize_replace,
         df_values,
         eval_column='values',
         evaluator=lambda x: x in ['one', 'two', 'three'],
         anonymize_eval=False)
コード例 #2
0
    def test_replace_based_on_label_list(self):
        """Check replacement when the evaluator tests membership in a list.

        Runs string, int and float eval columns against ``df_values``, then
        a mixed-type object column combined with ``anonymize_columns``.
        """
        # Column contents the expected frames carry when nothing is replaced.
        untouched = {
            'values': ['one', 'two', 'three', 'four', 'five'],
            'ints': [1, 2, 3, 4, 5],
            'floats': [1.0, 2.0, 3.0, 4.0, 5.0],
        }
        # (eval_column, evaluator, expected content of the evaluated column)
        scenarios = [
            ('values', lambda x: x in ['two'],
             ['one', '*', 'three', 'four', 'five']),
            ('values', lambda x: x in ['two', 'four'],
             ['one', '*', 'three', '*', 'five']),
            ('ints', lambda x: x in [2, 4],
             [1, "*", 3, "*", 5]),
            ('floats', lambda x: x in [2, 4],
             [1.0, "*", 3.0, "*", 5.0]),
        ]
        for column, is_match, masked in scenarios:
            # Overriding an existing key keeps the original column order.
            wanted = pd.DataFrame(data={**untouched, column: masked})
            got = anonymization.anonymize_replace(
                df_values, eval_column=column, evaluator=is_match)
            self.assertTrue(got.equals(wanted))

        # Mixed-type labels: 1.0 == 1, so the third row also satisfies the
        # membership test and is expected to be replaced.
        mixed = pd.DataFrame({
            'labels': ['one', 1, 1.0, 33],
            'ints': [1, 2, 3, 4]
        })
        wanted_mixed = pd.DataFrame({
            'labels': ['*', '*', '*', 33],
            'ints': ['*', '*', '*', 4]
        })
        got_mixed = anonymization.anonymize_replace(
            mixed,
            eval_column='labels',
            anonymize_columns=['ints'],
            evaluator=lambda x: x in ['one', 1])
        self.assertTrue(got_mixed.equals(wanted_mixed))
コード例 #3
0
    def test_column_names(self):
        """anonymize_replace must raise AttributeError for these column names."""
        # (eval_column, anonymize_columns) pairs; 'object' and 'objects' are
        # presumably not columns of df_values -- verify against the fixture.
        rejected_pairs = [('ints', 'object'), ('objects', 'ints')]
        for eval_name, target_name in rejected_pairs:
            with self.assertRaises(AttributeError):
                anonymization.anonymize_replace(
                    df_values,
                    eval_column=eval_name,
                    anonymize_columns=target_name)
コード例 #4
0
    def test_no_change_in_df_in(self):
        """The input frame ``df_in`` must be left unmodified by the call."""
        snapshot = df_in.copy()

        # First with the default evaluator, then with an explicit one.
        for extra in ({}, {'evaluator': lambda x: x in [1, 3]}):
            anonymization.anonymize_replace(df_in, eval_column='col1', **extra)
            self.assertTrue(df_in.equals(snapshot))
コード例 #5
0
 def test_replace_based_on_label_NoneType(self):
     """A None entry in the 'floats' column is replaced by 0."""
     source = pd.DataFrame({'floats': [None, 1.0], 'ints': [1, 1]})
     wanted = pd.DataFrame({'floats': [0.0, 1.0], 'ints': [1, 1]})

     def is_missing(x):
         # Kept as a plain Python wrapper around np.isnan, as in the lambda
         # it replaces.
         return np.isnan(x)

     result = anonymization.anonymize_replace(
         source, eval_column='floats', evaluator=is_missing, replace_by=0)
     self.assertTrue(result.equals(wanted))
コード例 #6
0
    def test_anonymize_column_str_name(self):
        """``anonymize_columns`` may be a single column name given as a str."""
        wanted = pd.DataFrame(
            data={
                'values': ['one', 'two', 'three', 'four', 'five'],
                'ints': ["*", "*", "*", 4, 5],
                'floats': ["*", "*", "*", 4.0, 5.0]
            })
        shared = {'eval_column': 'ints', 'anonymize_columns': 'floats'}

        # Default evaluator first ...
        with_default = anonymization.anonymize_replace(df_values, **shared)
        self.assertTrue(with_default.equals(wanted))

        # ... then an explicit evaluator expected to give the same frame.
        with_explicit = anonymization.anonymize_replace(
            df_values, evaluator=lambda x: x in [1, 2, 3], **shared)
        self.assertTrue(with_explicit.equals(wanted))
コード例 #7
0
    def test_replace_based_on_single_label(self):
        """Check replacement when the evaluator matches exactly one label."""

        def expected_with(column, masked):
            # Build the expected frame: unreplaced content everywhere except
            # in ``column``, which carries the replaced values.
            content = {
                'values': ['one', 'two', 'three', 'four', 'five'],
                'ints': [1, 2, 3, 4, 5],
                'floats': [1.0, 2.0, 3.0, 4.0, 5.0],
            }
            content[column] = masked
            return pd.DataFrame(data=content)

        wanted_str = expected_with('values',
                                   ['one', '*', 'three', 'four', 'five'])
        got_str = anonymization.anonymize_replace(
            df_values, eval_column='values', evaluator=lambda x: x == 'two')
        self.assertTrue(got_str.equals(wanted_str))

        wanted_int = expected_with('ints', [1, "*", 3, 4, 5])
        got_int = anonymization.anonymize_replace(
            df_values, eval_column='ints', evaluator=lambda x: x == 2)
        self.assertTrue(got_int.equals(wanted_int))
コード例 #8
0
 def test_replace_based_on_lower_limit(self):
     """Rows whose 'col1' value is below 5 are replaced in col1 and col2."""
     masked = ["*"] * 4
     wanted = pd.DataFrame(
         data={
             'col1': masked + [5, 6],
             'col2': masked + [765, 1111]
         })
     result = anonymization.anonymize_replace(
         df_in,
         eval_column='col1',
         anonymize_columns=['col2'],
         evaluator=lambda x: x < 5)
     self.assertTrue(result.equals(wanted))
コード例 #9
0
    def test_replace_by_list(self):
        """``replace_by`` given as a list supplies one value per column."""
        # Per the expected frame: None goes to 'col2', 0 goes to 'col1'.
        replacements = [None, 0]
        wanted = pd.DataFrame(data={
            'col1': [0, 0, 0, 4, 5, 6],
            'col2': [None, None, None, 67, 765, 1111]
        })

        result = anonymization.anonymize_replace(
            df_in,
            eval_column='col1',
            anonymize_columns=['col2'],
            replace_by=replacements)
        self.assertTrue(result.equals(wanted))
コード例 #10
0
    def test_replace_w_defaults(self):
        """With only ``eval_column`` given, only that column is replaced."""
        second_column = [3, 4, 5, 6, 7, 8]
        source = pd.DataFrame(data={
            'col1': [1, 2, 3, 4, 5, 6],
            'col2': second_column
        })
        wanted = pd.DataFrame(data={
            'col1': ["*", "*", "*", 4, 5, 6],
            'col2': second_column
        })

        result = anonymization.anonymize_replace(source, eval_column='col1')
        self.assertTrue(result.equals(wanted))
コード例 #11
0
 def test_replace_by_single_value(self):
     """A scalar ``replace_by`` is used in every anonymized column."""
     for filler in [None, 1.5, 5, 'n/a']:
         replaced_rows = [filler] * 3
         wanted = pd.DataFrame(
             data={
                 'col1': replaced_rows + [4, 5, 6],
                 'col2': replaced_rows + [67, 765, 1111]
             })
         result = anonymization.anonymize_replace(
             df_in,
             eval_column='col1',
             anonymize_columns=['col2'],
             replace_by=filler)
         self.assertTrue(result.equals(wanted))
コード例 #12
0
    def test_not_anonymize_eval_col(self):
        """With ``anonymize_eval=False`` the eval column keeps its values.

        The same frame is expected whether or not the eval column is also
        listed in ``anonymize_columns``.
        """
        wanted = pd.DataFrame(
            data={
                'values': ['one', 'two', 'three', 'four', 'five'],
                'ints': ['*', 2, '*', 4, '*'],
                'floats': [1.0, 2.0, 3.0, 4.0, 5.0]
            })
        for targets in (['ints'], ['values', 'ints']):
            result = anonymization.anonymize_replace(
                df_values,
                eval_column='values',
                anonymize_columns=targets,
                evaluator=lambda x: x in ['one', 'three', 'five'],
                anonymize_eval=False)
            self.assertTrue(result.equals(wanted))
コード例 #13
0
ファイル: dataverk.py プロジェクト: navikt/dataverk
    def anonymize(
        self,
        df,
        eval_column,
        anonymize_columns=None,
        evaluator=lambda x: x < 4,
        replace_by="*",
        anonymize_eval=True,
    ):
        """Replace column values in rows where ``evaluator`` holds for ``eval_column``.

        All arguments are forwarded unchanged, by keyword, to
        ``anonymize_replace``.

        :param df: pandas DataFrame
        :param eval_column: name of the column to evaluate for anonymization
        :param anonymize_columns: optional, column name or list of column(s)
            to anonymize when the value in eval_column satisfies the
            condition given in evaluator, default=None
        :param evaluator: lambda function, condition for anonymization based
            on values in eval_column, default=lambda x: x < 4
        :param replace_by: value, or list or dict of values, to replace by.
            A list or dict must have the same length as the number of columns
            to anonymize. Elements of a list must in addition follow the
            order of the columns in

            a) anonymize_columns + eval_column if anonymize_eval=True and
               eval_column is _not_ given in anonymize_columns
            b) anonymize_columns if anonymize_eval=True and eval_column is
               given in anonymize_columns, or if anonymize_eval=False
            c) eval_column if anonymize_eval=True and anonymize_columns is
               None or anonymize_columns=[]

            The order of the values in a dict does not matter.
        :param anonymize_eval: bool, whether to anonymize eval_column,
            default=True

        :return: anonymized pandas DataFrame
        """
        forwarded = {
            "df": df,
            "eval_column": eval_column,
            "anonymize_columns": anonymize_columns,
            "evaluator": evaluator,
            "replace_by": replace_by,
            "anonymize_eval": anonymize_eval,
        }
        return anonymize_replace(**forwarded)
コード例 #14
0
 def test_invalid_key_names_in_replace_by_dict(self):
     """A ``replace_by`` dict keyed by 'ins' must raise AttributeError."""
     self.assertRaises(
         AttributeError,
         anonymization.anonymize_replace,
         df_values,
         eval_column='ints',
         anonymize_columns='floats',
         replace_by={'ins': 0})
コード例 #15
0
 def test_anonymize_column_num_name(self):
     """An integer column label can be passed as ``eval_column``."""
     source = pd.DataFrame(data={0: [1, 2, 4], 1: [2, 3, 5]})
     wanted = pd.DataFrame(data={0: [1, 2, 4], 1: ['*', '*', 5]})
     result = anonymization.anonymize_replace(source, eval_column=1)
     self.assertTrue(result.equals(wanted))