Exemple #1
0
    def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(
            self):
        """Each PII-bearing cell maps to a list holding its analyzer result.

        Feeds a two-column frame to ``analyze_data_frame`` and compares the
        ``"analyzer_results"`` entry of its output with the expected frame of
        per-cell result lists.
        """
        # NOTE(review): the e-mail literals look masked; offsets 45-60 imply
        # a 15-character address -- confirm against the real fixture.
        summary_cells = [
            "First President of Singapore NRIC was S0000001I",
            "A typical email id would look something like [email protected]",
        ]
        phone_cells = [
            "Some examples of phone numbers are +65 62345678",
            "Some examples of phone numbers are +65 62345678",
        ]
        test_data_frame = pd.DataFrame({
            "summary": summary_cells,
            "phone number": phone_cells,
        })

        actual = self.pii_detector.analyze_data_frame(test_data_frame)

        expected_summary = [
            [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
            [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
        ]
        expected_phone = [
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
            [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
        ]
        expected_data_frame = pd.DataFrame({
            "summary": expected_summary,
            "phone number": expected_phone,
        })

        analyzer_results = actual["analyzer_results"]
        pd.testing.assert_frame_equal(expected_data_frame, analyzer_results)
Exemple #2
0
    def test_analyze_data_frame_runs_analyze_only_on_cells_with_a_PII_value(
            self):
        """Cells without PII get an empty result list and are left unchanged.

        ``analyze_data_frame`` is unpacked into a report frame and a redacted
        frame; both are compared with their expected counterparts.
        """
        # NOTE(review): the e-mail literals look masked; offsets 45-60 imply
        # a 15-character address -- confirm against the real fixture.
        untouched_remarks = ["No sensitive data", "No sensitive data"]
        test_data_frame = pd.DataFrame({
            "summary": [
                "First President of Singapore NRIC was S0000001I",
                "A typical email id would look something like [email protected]",
            ],
            "remarks": untouched_remarks,
        })

        analysis_output = self.pii_detector.analyze_data_frame(test_data_frame)
        actual_report, actual_result = analysis_output

        nric_cell = [AnalyzerResult("S0000001I", "NRIC", 38, 47)]
        email_cell = [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)]
        expected_report = pd.DataFrame({
            "summary": [nric_cell, email_cell],
            "remarks": [[], []],
        })

        expected_result = pd.DataFrame({
            "summary": [
                "First President of Singapore NRIC was ",
                "A typical email id would look something like ",
            ],
            "remarks": ["No sensitive data", "No sensitive data"],
        })

        pd.testing.assert_frame_equal(expected_report, actual_report)
        pd.testing.assert_frame_equal(expected_result, actual_result)
 def test_redact_for_multiple_analyzer_results(self):
     """Redacting two analyzer results drops both PII words from the text.

     The spans are end-exclusive, matching the ``match.start()`` /
     ``match.end()`` convention the detectors use: ``"pii1"`` occupies
     ``text[16:20]`` and ``"pii2"`` occupies ``text[25:29]``.
     """
     text = "text containing pii1 and pii2"
     analyzer_results = [
         # Spans fixed so each one covers all four characters of its word.
         AnalyzerResult("pii1", "PII_DETECTOR", 16, 20),
         AnalyzerResult("pii2", "PII_DETECTOR", 25, 29),
     ]
     result = DropAnonymizer.redact(text, analyzer_results)
     self.assertEqual(result, "text containing  and ")
Exemple #4
0
 def test_execute_returns_all_matches_when_more_than_one(self):
     """``execute`` reports both NRIC values found in one sentence."""
     sentence = "First President of Singapore NRIC was S0000001I and the second president's was T0000001R"
     results = self.test_class.execute(sentence)

     self.assertEqual(len(results), 2)

     first_nric = AnalyzerResult("S0000001I", "NRIC", 38, 47)
     second_nric = AnalyzerResult("T0000001R", "NRIC", 79, 88)
     # Order-insensitive comparison of the two expected hits.
     self.assertCountEqual([first_nric, second_nric], results)
 def test_calculate_detector_stats_returns_detector_counts_and_percentages(
         self):
     """Detector stats for a column holding one NRIC and two EMAIL results.

     NOTE(review): ``assertCountEqual`` iterates its arguments, and
     iterating the expected dict yields only its keys, so the
     ``(count, percentage)`` tuples are not compared here -- confirm
     whether a stricter assertion is wanted.
     """
     nric_cell = [AnalyzerResult("S0000001I", "NRIC", 38, 47)]
     first_email_cell = [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)]
     second_email_cell = [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)]
     result_column_values = pd.Series(
         [nric_cell, first_email_cell, second_email_cell])

     generator = self.report_generator_medium_level
     actual_result = generator.calculate_detector_stats_for_each_column(
         result_column_values)

     expected_result = {"NRIC": (1, "33.33%"), "EMAIL": (2, "66.67%")}
     self.assertCountEqual(expected_result, actual_result)
Exemple #6
0
 def test_get_pii_list_returns_list_of_pii_words_given_row_of_list_of_analyzer_results(
         self):
     """``_get_pii_list`` returns the PII texts of a row, in field order.

     The row has two NRIC results under ``summary`` and one phone result
     under ``phone_number``; the expected list contains their three texts.
     """
     summary_results = [
         AnalyzerResult("S0000001I", "NRIC", 38, 47),
         AnalyzerResult("S0000002I", "NRIC", 38, 47),
     ]
     phone_results = [
         AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47),
     ]
     test_row = Row(summary=summary_results, phone_number=phone_results)

     actual = self.pii_detector._get_pii_list(test_row)

     self.assertEqual(actual, ["S0000001I", "S0000002I", "+65 62345678"])
Exemple #7
0
    def test_should_detect_and_redact_all_pii_fields_in_text(self):
        """``analyze_and_redact`` handles a two-line text with NRIC and e-mail.

        The expected ``AnonymizerResult`` carries the text with both values
        removed plus the e-mail and NRIC analyzer results.
        """
        # The continuation lines of both literals keep their leading
        # whitespace: it is part of the text and fixes the e-mail offset.
        # NOTE(review): the e-mail literals look masked; offsets 135-150
        # imply a 15-character address -- confirm against the real fixture.
        source_text = """First President of Singapore NRIC was S0000001I.
                                         A typical email id would look something like [email protected]"""
        actual = self.pii_detector.analyze_and_redact(source_text)

        expected_redacted_text = """First President of Singapore NRIC was .
                                         A typical email id would look something like """
        email_hit = AnalyzerResult("*****@*****.**", "EMAIL", 135, 150)
        nric_hit = AnalyzerResult("S0000001I", "NRIC", 38, 47)
        expected = AnonymizerResult(expected_redacted_text,
                                    [email_hit, nric_hit])

        self.assertEqual(actual, expected)
 def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(
         self):
     """High-level report content for a frame with PII in both columns.

     NOTE(review): ``assertCountEqual`` iterates its arguments, and
     iterating a DataFrame yields its column labels, so the expected
     frame contributes only ``"Columns with PII values"`` to the
     comparison -- confirm whether the cell values should be checked too.
     """
     summary_results = [
         [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
         [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
     ]
     phone_results = [
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
     ]
     result_data_frame = pd.DataFrame({
         "summary": summary_results,
         "phone number": phone_results,
     })
     flagged_columns = ["summary", "phone number"]
     expected_data_frame = pd.DataFrame(
         {"Columns with PII values": flagged_columns})

     report_content = self.report_generator_high_level.generate_report_content(
         result_data_frame)
     self.assertCountEqual(expected_data_frame, report_content)
Exemple #9
0
 def test_should_detect_and_redact_email_in_text(self):
     """``analyze_and_redact`` removes an e-mail address and reports it."""
     # NOTE(review): the e-mail literals look masked; offsets 45-60 imply
     # a 15-character address -- confirm against the real fixture.
     source_text = "A typical email id would look something like [email protected]"
     actual = self.pii_detector.analyze_and_redact(source_text)

     email_hit = AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)
     expected = AnonymizerResult(
         "A typical email id would look something like ", [email_hit])
     self.assertEqual(actual, expected)
Exemple #10
0
 def test_should_detect_and_redact_nric_in_text(self):
     """``analyze_and_redact`` removes an NRIC value and reports it."""
     source_text = "First President of Singapore NRIC was S0000001I"
     actual = self.pii_detector.analyze_and_redact(source_text)

     nric_hit = AnalyzerResult("S0000001I", "NRIC", 38, 47)
     expected = AnonymizerResult("First President of Singapore NRIC was ",
                                 [nric_hit])
     self.assertEqual(actual, expected)
 def test_generate_report_calls_content_generate_report_content_and_logs_it(
         self, mock_generate_content, mock_logging):
     """``generate`` is compared with the stubbed report content.

     ``mock_generate_content`` and ``mock_logging`` are injected by
     decorators that are not visible in this chunk -- presumably
     ``mock.patch``; confirm against the full file.

     NOTE(review): ``assertCountEqual`` on DataFrames compares column
     labels only, not the cell values.
     """
     summary_results = [
         [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
         [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
     ]
     phone_results = [
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
     ]
     result_data_frame = pd.DataFrame({
         "summary": summary_results,
         "phone number": phone_results,
     })

     # Stub both collaborators before exercising ``generate``.
     stubbed_content = pd.DataFrame(
         {"Columns with PII values": ["summary", "phone number"]})
     mock_generate_content.return_value = stubbed_content
     mock_logging.return_value = None

     generated_report = self.report_generator_high_level.generate(
         result_data_frame)

     self.assertCountEqual(generated_report,
                           mock_generate_content.return_value)
Exemple #12
0
 def test_should_detect_and_redact_phone_in_text(self):
     """``analyze_and_redact`` removes a phone number and reports it."""
     source_text = "Some examples of phone numbers are +65 62345678"
     actual = self.pii_detector.analyze_and_redact(source_text)

     phone_hit = AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)
     expected = AnonymizerResult("Some examples of phone numbers are ",
                                 [phone_hit])
     self.assertEqual(actual, expected)
Exemple #13
0
 def test_inequality(self):
     """Results differing in any single constructor argument are unequal."""
     baseline_args = ("text", "type", 0, 10)
     # Each variant changes exactly one positional argument of the baseline.
     variants = (
         ("different_text", "type", 0, 10),
         ("text", "different_type", 0, 10),
         ("text", "type", 1, 10),
         ("text", "type", 0, 11),
     )
     for variant_args in variants:
         self.assertNotEqual(AnalyzerResult(*baseline_args),
                             AnalyzerResult(*variant_args))
 def execute(self, text):
     """Return an ``AnalyzerResult`` for every validated pattern match.

     Scans ``text`` with the regex from ``self.get_pattern()`` and keeps
     only the matches whose matched string passes ``self.validate``.
     Each result records the matched string, ``self.get_name()`` and the
     match's start and end offsets.
     """
     return [
         AnalyzerResult(hit.group(), self.get_name(), hit.start(), hit.end())
         for hit in re.finditer(self.get_pattern(), text)
         if self.validate(hit.group())
     ]
Exemple #15
0
    def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(
            self):
        """``get_analyzer_results`` yields one result list per PII cell.

        Compares both the schema and the collected rows of the returned
        frame with a frame built from the expected results and
        ``self.schema``.
        """
        # NOTE(review): the e-mail literals look masked; offsets 45-60 imply
        # a 15-character address -- confirm against the real fixture.
        input_rows = [
            ("First President of Singapore NRIC was S0000001I",
             "Some examples of phone numbers are +65 62345678"),
            ("A typical email id would look something like [email protected]",
             "Some examples of phone numbers are +65 62345678"),
        ]
        test_data_frame = self.SPARK.createDataFrame(
            input_rows, ["summary", "phone number"])

        actual = self.pii_detector.get_analyzer_results(test_data_frame)

        expected_rows = [
            ([AnalyzerResult("S0000001I", "NRIC", 38, 47)],
             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
            ([AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
        ]
        expected_data_frame = self.SPARK.createDataFrame(
            expected_rows, self.schema)

        self.assertEqual(actual.schema, expected_data_frame.schema)
        self.assertEqual(actual.collect(), expected_data_frame.collect())
Exemple #16
0
    def test_get_redacted_text_returns_redacted_data_frame(self):
        """``get_redacted_text`` strips the reported PII from each input cell.

        Given the input frame and a matching report frame of analyzer
        results, the returned frame must equal the expected redacted frame
        in both schema and collected rows.
        """
        column_names = ["summary", "phone number"]

        report_rows = [
            ([AnalyzerResult("S0000001I", "NRIC", 38, 47)],
             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
            ([AnalyzerResult("*****@*****.**", "EMAIL", 6, 21),
              AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)],
             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22),
              AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)]),
        ]
        test_report_data_frame = self.SPARK.createDataFrame(
            report_rows, self.schema)

        # NOTE(review): the e-mail literals look masked; offsets 6-21 imply
        # a 15-character address -- confirm against the real fixture.
        input_rows = [
            ("First President of Singapore NRIC was S0000001I",
             "Some examples of phone numbers are +65 62345678"),
            ("email [email protected] and phone +65 62345678",
             "Phone one +65 62345678 Phone two +65 62345678"),
        ]
        test_input_data_frame = self.SPARK.createDataFrame(
            input_rows, column_names)

        actual = self.pii_detector.get_redacted_text(test_input_data_frame,
                                                     test_report_data_frame)

        redacted_rows = [
            ("First President of Singapore NRIC was ",
             "Some examples of phone numbers are "),
            ("email  and phone ", "Phone one  Phone two "),
        ]
        expected = self.SPARK.createDataFrame(redacted_rows,
                                              ["summary", "phone number"])

        self.assertEqual(actual.schema, expected.schema)
        self.assertEqual(actual.collect(), expected.collect())
 def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(
         self):
     """Medium-level report content for a frame with PII in both columns.

     NOTE(review): ``list(expected_data_frame)`` yields only the column
     labels, so the per-detector ``(count, percentage)`` tuples in the
     expected frame do not take part in the comparison -- confirm whether
     they should.
     """
     summary_results = [
         [AnalyzerResult("S0000001I", "NRIC", 38, 47)],
         [AnalyzerResult("*****@*****.**", "EMAIL", 45, 60)],
     ]
     phone_results = [
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
         [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
     ]
     result_data_frame = pd.DataFrame({
         "summary": summary_results,
         "phone number": phone_results,
     })

     summary_stats = pd.Series({"NRIC": (1, "50%"), "EMAIL": (1, "50%")})
     phone_stats = pd.Series({"PHONE_NUMBER": (2, "100%")})
     expected_data_frame = pd.DataFrame({
         "summary": summary_stats,
         "phone number": phone_stats,
     })

     expected_labels = list(expected_data_frame)
     report_content = self.report_generator_medium_level.generate_report_content(
         result_data_frame)
     self.assertCountEqual(expected_labels, report_content)
Exemple #18
0
    def test_analyze_data_frame_runs_analyze_against_cell_with_multiple_PII_values(
            self):
        """``get_analyzer_results`` lists every PII value found in a cell.

        The second input row holds two PII values per cell; the expected
        frame therefore has two analyzer results in each of those cells.
        Schema and collected rows are both compared.
        """
        # NOTE(review): the e-mail literals look masked; offsets 6-21 imply
        # a 15-character address -- confirm against the real fixture.
        input_rows = [
            ("First President of Singapore NRIC was S0000001I",
             "Some examples of phone numbers are +65 62345678"),
            ("email [email protected] and phone +65 62345678",
             "Phone one +65 62345678 Phone two +65 62345678"),
        ]
        test_data_frame = self.SPARK.createDataFrame(
            input_rows, ["summary", "phone number"])

        actual = self.pii_detector.get_analyzer_results(test_data_frame)

        expected_rows = [
            ([AnalyzerResult("S0000001I", "NRIC", 38, 47)],
             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
            ([AnalyzerResult("*****@*****.**", "EMAIL", 6, 21),
              AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)],
             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22),
              AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)]),
        ]
        expected_data_frame = self.SPARK.createDataFrame(
            expected_rows, self.schema)

        self.assertEqual(actual.schema, expected_data_frame.schema)
        self.assertEqual(actual.collect(), expected_data_frame.collect())
Exemple #19
0
 def __assert_single_result(self, text_to_be_tested, start, end):
     """Assert the phone detector returns exactly one expected result.

     The expected result uses the whole of ``text_to_be_tested`` as its
     text, ``"PHONE_NUMBER"`` as its type, and the given ``start`` and
     ``end`` offsets.
     """
     detected = self.phone_number_detector.execute(text_to_be_tested)
     wanted = AnalyzerResult(text_to_be_tested, "PHONE_NUMBER", start, end)

     self.assertEqual(len(detected), 1)
     only_hit = detected[0]
     self.assertEqual(wanted, only_hit)
 def test_redact_for_single_analyzer_result(self):
     """Redacting a single analyzer result drops that PII word from the text.

     The span is end-exclusive, matching the ``match.start()`` /
     ``match.end()`` convention the detectors use: ``"pii"`` occupies
     ``text[16:19]``.
     """
     text = "text containing pii"
     # End offset fixed so the span covers all three characters of "pii".
     analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 19)]
     result = DropAnonymizer.redact(text, analyzer_results)
     self.assertEqual(result, "text containing ")
Exemple #21
0
 def test_str(self):
     """``str()`` of a result gives its text, position and type."""
     sample = AnalyzerResult("sample_data", "type", 0, 10)
     rendered = str(sample)
     self.assertEqual(
         rendered,
         "Text sample_data at position (0,10) was identified as type")
Exemple #22
0
 def test_get_detector_fetches_detector_type_correctly(self):
     """``detector()`` returns the type the result was constructed with."""
     email_result = AnalyzerResult("text", "EMAIL", 0, 10)
     detector_name = email_result.detector()
     self.assertEqual(detector_name, "EMAIL")
Exemple #23
0
 def test_execute_calls_match_and_validate(self):
     """``execute`` returns a single NRIC result for a one-NRIC sentence."""
     sentence = "First President of Singapore NRIC was S0000001I"
     results = self.test_class.execute(sentence)

     self.assertEqual(len(results), 1)
     expected_hit = AnalyzerResult("S0000001I", "NRIC", 38, 47)
     self.assertEqual(expected_hit, results[0])
Exemple #24
0
 def test_equality(self):
     """Two results built from identical arguments compare equal."""
     shared_args = ("text", "type", 0, 10)
     expected = AnalyzerResult(*shared_args)
     actual = AnalyzerResult(*shared_args)
     self.assertEqual(expected, actual)