def test_credit_card_recognizer_with_template(pii_csv, utterances,
                                              num_of_examples,
                                              acceptance_threshold):
    """
    Score the credit card recognizer on samples generated from templates.

    Fake samples are produced by ``PresidioDataGenerator``, converted to
    ``InputSample`` objects and scored against ``CreditCardRecognizer``
    for the ``CREDIT_CARD`` entity only.
    :param pii_csv: input csv file location; part of the test's
     parameters but not read by the body of this function
    :param utterances: template value; formatted with the directory of
     this test file before being handed to the generator
    :param num_of_examples: number of samples to generate for the test
    :param acceptance_threshold: minimum ``pii_f`` score
     allowed for the test to pass
    """
    import os

    # Directory holding this test module; substituted into ``utterances``.
    here = os.path.dirname(os.path.realpath(__file__))

    # generate fake records and wrap each one as an InputSample
    fake_records = PresidioDataGenerator().generate_fake_data(
        templates=utterances.format(here),
        n_samples=num_of_examples,
    )
    input_samples = list(
        map(InputSample.from_faker_spans_result, fake_records)
    )

    recognizer = CreditCardRecognizer()
    scores = score_presidio_recognizer(
        recognizer=recognizer,
        entities_to_keep=["CREDIT_CARD"],
        input_samples=input_samples,
    )

    # The threshold is only enforced when the F-score is not NaN.
    if np.isnan(scores.pii_f):
        return
    assert acceptance_threshold <= scores.pii_f


def test_faker_spans_result_to_input_sample(faker_span_result):
    """
    Check that a faker spans result converts into an ``InputSample``
    when tag creation is disabled.
    :param faker_span_result: faker spans result to convert; the
     assertions below expect the text "Dan is my name." with a single
     leading "name" span covering "Dan"
    """

    input_sample = InputSample.from_faker_spans_result(
        faker_span_result, create_tags_from_span=False)

    assert input_sample.full_text == "Dan is my name."
    assert input_sample.masked == "{{name}} is my name."
    # First span: entity "name", value "Dan", positions 0 to 3.
    assert input_sample.spans[0] == Span("name", "Dan", 0, 3)


def test_faker_spans_to_input_sample_with_tags(faker_span_result):
    """
    Check that converting a faker spans result with tag creation enabled
    (scheme "BILUO") yields tokens and tags.
    :param faker_span_result: faker spans result to convert; at least
     one of the produced tags is expected to contain "U-name"
    """
    input_sample = InputSample.from_faker_spans_result(
        faker_span_result, create_tags_from_span=True, scheme="BILUO")
    assert input_sample.tags
    assert input_sample.tokens
    # Generator expression lets any() stop at the first matching tag
    # instead of building an intermediate list.
    assert any("U-name" in tag for tag in input_sample.tags)


def test_pattern_recognizer(
    pii_csv,
    ext_csv,
    utterances,
    entity_name,
    pattern,
    score,
    num_of_examples,
    acceptance_threshold,
    max_mistakes_number,
):
    """
    Score a generic pattern recognizer on samples generated from templates.

    Two CSV files are loaded (common entities and a custom entity), the
    first column of the custom file is appended to the common frame, and a
    ``PatternRecognizer`` built from ``pattern`` and ``score`` is scored on
    generated samples for ``entity_name`` only.
    :param pii_csv: input csv file location with the common entities;
     formatted with the directory of this test file
    :param ext_csv: input csv file location with custom entities;
     formatted with the directory of this test file
    :param utterances: template value; formatted with the directory of
     this test file before being handed to the generator
    :param entity_name: custom entity name
    :param pattern: recognizer pattern
    :param score: score passed to the ``Pattern`` wrapping ``pattern``
    :param num_of_examples: number of samples to generate for the test
    :param acceptance_threshold: minimum ``pii_f`` score
     allowed for the test to pass
    :param max_mistakes_number: maximum number of entries allowed in
     ``scores.model_errors``
    """

    import os

    # Directory holding this test module; substituted into the locations.
    here = os.path.dirname(os.path.realpath(__file__))

    pii_frame = pd.read_csv(pii_csv.format(here), encoding="utf-8")
    ext_frame = pd.read_csv(ext_csv.format(here), encoding="utf-8")
    ext_column = ext_frame.columns[0]

    # Extend the pii frame with the ext values, wrapping around the ext
    # rows so that every pii row receives a value.
    # NOTE(review): the extended frame is not referenced again in this
    # function - confirm whether the generator was meant to receive it.
    pii_frame[ext_column] = [
        ext_frame.iat[row % ext_frame.shape[0], 0]
        for row in range(pii_frame.shape[0])
    ]

    # generate fake records and wrap each one as an InputSample
    fake_records = PresidioDataGenerator().generate_fake_data(
        templates=utterances.format(here),
        n_samples=num_of_examples,
    )
    input_samples = list(
        map(InputSample.from_faker_spans_result, fake_records)
    )

    test_pattern = Pattern("test pattern", pattern, score)
    recognizer = PatternRecognizer(
        entity_name,
        name="test recognizer",
        patterns=[test_pattern],
    )

    scores = score_presidio_recognizer(
        recognizer=recognizer,
        entities_to_keep=[entity_name],
        input_samples=input_samples,
    )

    # The threshold is only enforced when the F-score is not NaN; the
    # mistakes limit is always enforced.
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
    mistakes = len(scores.model_errors)
    assert max_mistakes_number >= mistakes