コード例 #1
0
    def create_flair_corpus(self, train_samples_path, test_samples_path,
                            val_samples_path):
        """
        Produce the flair train, test and validation files from JSON datasets.

        Each split is converted with InputSample.create_conll_dataset and
        handed to self.to_flair together with its output file name
        (flair_train.txt, flair_test.txt, flair_val.txt). A split is skipped
        when its output file already exists in the working directory.
        Train samples that carry no spans are dropped before conversion;
        test and validation samples are used as-is.

        :param train_samples_path: Path to train samples
        :param test_samples_path: Path to test samples
        :param val_samples_path: Path to validation samples
        :return: None
        """
        if not path.exists("flair_train.txt"):
            all_train = InputSample.read_dataset_json(train_samples_path)
            # Only the training split is filtered down to tagged samples.
            train_tagged = [
                candidate for candidate in all_train
                if len(candidate.spans) > 0
            ]
            print(
                f"Kept {len(train_tagged)} train samples after removal of non-tagged samples"
            )
            self.to_flair(
                InputSample.create_conll_dataset(train_tagged),
                outfile="flair_train.txt",
            )

        # Test and validation splits go through the same unfiltered pipeline.
        untouched_splits = (
            ("flair_test.txt", test_samples_path),
            ("flair_val.txt", val_samples_path),
        )
        for target_file, samples_path in untouched_splits:
            if path.exists(target_file):
                continue
            split_samples = InputSample.read_dataset_json(samples_path)
            split_data = InputSample.create_conll_dataset(split_samples)
            self.to_flair(split_data, outfile=target_file)
コード例 #2
0
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Evaluate the Presidio analyzer wrapper on a generated dataset file.

    The samples' entity types are aligned using
    PresidioAnalyzerWrapper.presidio_entities_map before evaluation.

    :param test_input: dataset path template; it is formatted with the
     directory that contains this file
    :param acceptance_threshold: lower bound that both the PII precision
     and the PII recall must reach for the test to pass
    """
    import os

    # Resolve the dataset location relative to this file.
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = test_input.format(here)
    raw_samples = InputSample.read_dataset_json(dataset_path)

    aligned_samples = Evaluator.align_entity_types(
        input_samples=raw_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    evaluator = Evaluator(model=PresidioAnalyzerWrapper())
    results = evaluator.evaluate_all(aligned_samples)
    scores = evaluator.calculate_score(evaluation_results=results)

    assert acceptance_threshold <= scores.pii_precision
    assert acceptance_threshold <= scores.pii_recall
コード例 #3
0
def test_to_conll():
    """
    Check the CoNLL conversion of the small generated dataset.

    The number of unique values in the "sentence" column of the converted
    data must equal the number of input samples.
    """
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = os.path.join(here, "data/generated_small.json")
    samples = InputSample.read_dataset_json(dataset_path)

    conll_data = InputSample.create_conll_dataset(samples)
    unique_sentences = conll_data["sentence"].unique()

    assert len(unique_sentences) == len(samples)
コード例 #4
0
def test_dataset_to_metric_identity_model():
    """
    Score IdentityTokensMockModel on the small generated dataset.

    The dataset is read with length=10, and both PII precision and PII
    recall are expected to be exactly 1.
    """
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = "{}/data/generated_small.json".format(here)
    samples = InputSample.read_dataset_json(dataset_path, length=10)

    evaluator = Evaluator(model=IdentityTokensMockModel())
    results = evaluator.evaluate_all(samples)
    metrics = evaluator.calculate_score(results)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall == 1
コード例 #5
0
def test_spacy_recognizer_with_generated_text(test_input, acceptance_threshold):
    """
    Score the spaCy recognizer on a generated dataset file.

    Only the "PERSON" entity is kept, and the recognizer is run with
    with_nlp_artifacts=True.

    :param test_input: dataset path template; it is formatted with the
     directory that contains this file
    :param acceptance_threshold: lower bound that the resulting pii_f score
     must reach for the test to pass
    """
    import os

    # Resolve the dataset location relative to this file.
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = test_input.format(here)
    samples = InputSample.read_dataset_json(dataset_path)

    recognizer = SpacyRecognizer()
    kept_entities = ["PERSON"]
    scores = score_presidio_recognizer(
        recognizer, kept_entities, samples, with_nlp_artifacts=True
    )

    assert acceptance_threshold <= scores.pii_f
コード例 #6
0
def test_spacy_simple():
    """
    Evaluate a SpacyModel restricted to "PERSON" on the small dataset.

    With a single kept entity, the overall PII precision/recall must be
    almost equal to the per-entity "PERSON" values, and both must be
    greater than zero.
    """
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = os.path.join(here, "data/generated_small.json")
    samples = InputSample.read_dataset_json(dataset_path)

    person_only_model = SpacyModel(
        model_name="en_core_web_sm",
        entities_to_keep=["PERSON"],
    )
    evaluator = Evaluator(model=person_only_model)
    results = evaluator.evaluate_all(samples)
    scores = evaluator.calculate_score(results)

    # Overall metrics should coincide with the PERSON-specific ones.
    person_precision = scores.entity_precision_dict["PERSON"]
    np.testing.assert_almost_equal(scores.pii_precision, person_precision)
    person_recall = scores.entity_recall_dict["PERSON"]
    np.testing.assert_almost_equal(scores.pii_recall, person_recall)

    assert scores.pii_recall > 0
    assert scores.pii_precision > 0
コード例 #7
0
def test_credit_card_recognizer_with_generated_text(test_input,
                                                    acceptance_threshold):
    """
    Score the credit card recognizer on a generated dataset file.

    Only the "CREDIT_CARD" entity is kept when scoring.

    :param test_input: dataset path template; it is formatted with the
     directory that contains this file
    :param acceptance_threshold: lower bound that the resulting pii_f score
     must reach for the test to pass
    """
    import os

    # Resolve the dataset location relative to this file.
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = test_input.format(here)
    samples = InputSample.read_dataset_json(dataset_path)

    scores = score_presidio_recognizer(
        recognizer=CreditCardRecognizer(),
        entities_to_keep=["CREDIT_CARD"],
        input_samples=samples,
    )

    assert acceptance_threshold <= scores.pii_f
コード例 #8
0
def test_dataset_to_metric_50_50_model():
    """
    Score FiftyFiftyIdentityTokensMockModel on the small generated dataset.

    The dataset is read with length=100 and only "PERSON" is kept. The
    test prints precision, recall and F, then expects precision to be
    exactly 1 and recall to lie strictly between 0.25 and 0.75.
    """
    import os

    here = os.path.dirname(os.path.realpath(__file__))
    dataset_path = "{}/data/generated_small.json".format(here)
    samples = InputSample.read_dataset_json(dataset_path, length=100)

    # Replace 50% of the predictions with a list of "O"
    half_blank_model = FiftyFiftyIdentityTokensMockModel()
    evaluator = Evaluator(model=half_blank_model, entities_to_keep=["PERSON"])
    results = evaluator.evaluate_all(samples)
    metrics = evaluator.calculate_score(results)

    print(metrics.pii_precision)
    print(metrics.pii_recall)
    print(metrics.pii_f)

    assert metrics.pii_precision == 1
    assert metrics.pii_recall < 0.75
    assert metrics.pii_recall > 0.25
コード例 #9
0
def score_presidio_recognizer(
    recognizer: EntityRecognizer,
    entities_to_keep: List[str],
    input_samples: Optional[List[InputSample]] = None,
    labeling_scheme: str = "BILUO",
    with_nlp_artifacts: bool = False,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Score a single EntityRecognizer on a dataset.

    The recognizer is wrapped in a PresidioRecognizerWrapper (backed by a
    new SpacyNlpEngine) and the wrapped model is passed to score_model.

    :param recognizer: recognizer to evaluate
    :param entities_to_keep: entity names forwarded to both the wrapper and
     score_model
    :param input_samples: samples to evaluate; when falsy (None or empty),
     "../../data/synth_dataset_v2.json" is read instead
    :param labeling_scheme: labeling scheme forwarded to the wrapper
    :param with_nlp_artifacts: forwarded to the wrapper
    :param verbose: forwarded to score_model
    :return: whatever score_model returns for the wrapped recognizer
    """
    if input_samples:
        # Work on a list copy of the caller's samples.
        input_samples = list(input_samples)
    else:
        print("Reading dataset")
        input_samples = InputSample.read_dataset_json(
            "../../data/synth_dataset_v2.json")

    print(
        "Preparing dataset by aligning entity names to Presidio's entity names"
    )
    aligned_samples = Evaluator.align_entity_types(
        input_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    wrapped_recognizer = PresidioRecognizerWrapper(
        recognizer=recognizer,
        entities_to_keep=entities_to_keep,
        labeling_scheme=labeling_scheme,
        nlp_engine=SpacyNlpEngine(),
        with_nlp_artifacts=with_nlp_artifacts,
    )

    evaluation = score_model(
        model=wrapped_recognizer,
        entities_to_keep=entities_to_keep,
        input_samples=aligned_samples,
        verbose=verbose,
    )
    return evaluation
コード例 #10
0
def small_dataset():
    """Read data/generated_small.json, located next to this file, as input samples."""
    dataset_file = Path(__file__).parent / "data" / "generated_small.json"
    return InputSample.read_dataset_json(dataset_file)
コード例 #11
0
def small_dataset() -> List[InputSample]:
    """Read data/generated_small.json, located next to this file, as input samples."""
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_file = os.path.join(here, "data/generated_small.json")
    return InputSample.read_dataset_json(dataset_file)