Example #1
def test_split_dataset_two_sets():
    sample1 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 1})
    sample2 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 2})
    sample3 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 3})
    sample4 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 4})
    train, test = split_dataset([sample1, sample2, sample3, sample4],
                                [0.5, 0.5])
    assert len(train) == 2
    assert len(test) == 2
Example #2
def test_split_dataset_test_with_smallish_ratio():
    sample1 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 1})
    sample2 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 2})
    sample3 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 3})
    sample4 = InputSample("Hi there",
                          masked=None,
                          spans=None,
                          create_tags_from_span=False,
                          metadata={"Template#": 4})
    dataset = [sample1, sample2, sample3, sample4]

    train, test, zero = split_dataset(dataset, [0.5, 0.4999995, 0.0000005])
    assert len(train) == 2
    assert len(test) == 2
    assert len(zero) == 0
Example #3
    def align_entity_types(self, sample: InputSample) -> None:
        """
        Translates the sample's tags to the ones requested by the model.

        :param sample: Input sample
        :return: None
        """
        if self.entity_mapping:
            sample.translate_input_sample_tags(dictionary=self.entity_mapping)
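A hedged, self-contained sketch of how align_entity_types behaves. SimpleWrapper is a stand-in for whichever model wrapper defines the method, and the "PER" -> "PERSON" mapping and import path are illustrative assumptions; only translate_input_sample_tags(dictionary=...) comes from the snippet itself.

from presidio_evaluator import InputSample, Span  # import path assumed


class SimpleWrapper:
    """Stand-in for a model wrapper exposing an entity_mapping attribute."""

    def __init__(self, entity_mapping=None):
        self.entity_mapping = entity_mapping

    # Same logic as the method above
    def align_entity_types(self, sample: InputSample) -> None:
        if self.entity_mapping:
            sample.translate_input_sample_tags(dictionary=self.entity_mapping)


wrapper = SimpleWrapper(entity_mapping={"PER": "PERSON"})  # illustrative mapping
sample = InputSample(full_text="Dan is my name.", spans=[Span("PER", "Dan", 0, 3)])
wrapper.align_entity_types(sample=sample)
assert sample.spans[0].entity_type == "PERSON"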
Example #4
    def predict(self, sample: InputSample) -> List[str]:
        if self.translate_to_spacy_entities:
            sample.translate_input_sample_tags()
        sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
        self.model.predict(sentence)

        tags = self.get_tags_from_sentence(sentence)
        if len(tags) != len(sample.tokens):
            print("mismatch between previous tokens and new tokens")
        return tags
Example #5
    def predict(self, sample: InputSample) -> List[str]:
        if self.translate_to_spacy_entities:
            sample.translate_input_sample_tags()

        doc = self.model(sample.full_text)
        tags = self.get_tags_from_doc(doc)
        if len(doc) != len(sample.tokens):
            print("mismatch between input tokens and new tokens")

        return tags
Example #6
def test_to_conll():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(
        os.path.join(dir_path, "data/generated_small.json"))

    conll = InputSample.create_conll_dataset(input_samples)

    sentences = conll["sentence"].unique()
    assert len(sentences) == len(input_samples)
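The create_conll_dataset call above evidently returns a pandas DataFrame (the test indexes a "sentence" column), so a hedged follow-up for persisting the CoNLL output might look like this; the path and separator are illustrative:

conll.to_csv("generated_small_conll.tsv", sep="\t", index=False)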
Example #7
def test_evaluate_sample_wrong_entities_to_keep_correct_statistics():
    prediction = ["O", "O", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction,
                            entities_to_keep=['SPACESHIP'])

    sample = InputSample(full_text="I am the walrus",
                         masked="I am the [ANIMAL]",
                         spans=None)
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluated = model.evaluate_sample(sample)
    assert evaluated.results[("O", "O")] == 4
Example #8
def mock_8_samples():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample5 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=2
    )
    sample6 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=3
    )
    sample7 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )
    sample8 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )

    return [sample1, sample2, sample3, sample4, sample5, sample6, sample7, sample8]
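A hedged sketch of consuming the fixture above with split_dataset. The repeated template_id values suggest that samples sharing a template are kept in the same split, but that behaviour is an assumption here, so only the overall size is asserted:

samples = mock_8_samples()
train, test = split_dataset(samples, [0.75, 0.25])  # ratios are illustrative
assert len(train) + len(test) == len(samples)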
Example #9
def test_evaluate_same_entity_correct_statistics():
    prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"])
    sample = InputSample(full_text="I dog the walrus",
                         masked="I [ANIMAL] the [ANIMAL]",
                         spans=None)
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluation_result = evaluator.evaluate_sample(sample, prediction)
    assert evaluation_result.results[("O", "O")] == 2
    assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1
    assert evaluation_result.results[("O", "ANIMAL")] == 1
Example #10
def test_evaluate_multiple_tokens_correct_statistics():
    prediction = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"])
    sample = InputSample("I am the walrus amaericanus magnifico",
                         masked=None,
                         spans=None)
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    evaluated = evaluator.evaluate_sample(sample, prediction)
    evaluation = evaluator.calculate_score([evaluated])

    assert evaluation.pii_precision == 1
    assert evaluation.pii_recall == 1
Example #11
def test_evaluate_multiple_examples_correct_statistics():
    prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])
    input_sample = InputSample("My name is Raphael or David",
                               masked=None,
                               spans=None)
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    evaluated = evaluator.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample])
    scores = evaluator.calculate_score(evaluated)
    assert scores.pii_precision == 0.5
    assert scores.pii_recall == 0.5
Example #12
def test_evaluator_simple():
    prediction = ["O", "O", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction, entities_to_keep=['ANIMAL'])

    sample = InputSample(full_text="I am the walrus",
                         masked="I am the [ANIMAL]",
                         spans=None)
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluated = model.evaluate_sample(sample)
    final_evaluation = model.calculate_score([evaluated])

    assert final_evaluation.pii_precision == 1
    assert final_evaluation.pii_recall == 1
Example #13
def test_evaluate_multiple_entities_to_keep_correct_statistics():
    prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction,
                            labeling_scheme='BIO',
                            entities_to_keep=['ANIMAL', 'PLANT', 'SPACESHIP'])
    sample = InputSample(full_text="I dog the walrus",
                         masked="I [ANIMAL] the [ANIMAL]",
                         spans=None)
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluation_result = model.evaluate_sample(sample)
    assert evaluation_result.results[("O", "O")] == 2
    assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1
    assert evaluation_result.results[("O", "ANIMAL")] == 1
Example #14
def test_evaluate_multiple_tokens_no_match_match_correct_statistics():
    prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"]
    model = MockTokensModel(prediction=prediction, entities_to_keep=['ANIMAL'])

    sample = InputSample("I am the walrus amaericanus magnifico",
                         masked=None,
                         spans=None)
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    evaluated = model.evaluate_sample(sample)
    evaluation = model.calculate_score([evaluated])

    assert np.isnan(evaluation.pii_precision)
    assert evaluation.pii_recall == 0
Example #15
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Test analyzer with a generated dataset text file
    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall
     allowed for tests to pass
    """
    # read test input from generated file

    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(test_input.format(dir_path))

    updated_samples = Evaluator.align_entity_types(
        input_samples=input_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    analyzer = PresidioAnalyzerWrapper()
    evaluator = Evaluator(model=analyzer)
    evaluated_samples = evaluator.evaluate_all(updated_samples)
    scores = evaluator.calculate_score(evaluation_results=evaluated_samples)

    assert acceptance_threshold <= scores.pii_precision
    assert acceptance_threshold <= scores.pii_recall
Example #16
    def predict(self, sample: InputSample) -> List[str]:
        nlp_artifacts = None
        if self.withNlpArtifacts:
            nlp_artifacts = self.__make_nlp_artifacts(sample.full_text)
        results = self.recognizer.analyze(sample.full_text, self.entities, nlp_artifacts)
        starts = []
        ends = []
        tags = []
        scores = []
        for res in results:
            if not res.start:
                res.start = 0
            starts.append(res.start)
            ends.append(res.end)
            tags.append(res.entity_type)
            scores.append(res.score)
        response_tags = span_to_tag(
            scheme=self.labeling_scheme,
            text=sample.full_text,
            start=starts,
            end=ends,
            tag=tags,
            tokens=sample.tokens,
            scores=scores,
            io_tags_only=self.compare_by_io,
        )
        if len(sample.tags) == 0:
            sample.tags = ["O" for _ in response_tags]
        return response_tags
Example #17
    def predict(self, sample: InputSample) -> List[str]:
        """
        Predict the tags using a stanza model.

        :param sample: InputSample with text
        :return: list of tags
        """

        doc = self.model(sample.full_text)
        if doc.ents:
            tags, texts, start, end = zip(*[(s.label_, s.text, s.start_char,
                                             s.end_char) for s in doc.ents])

            # Stanza tokens might not be consistent with spaCy's tokens.
            # Use spacy tokenization and not stanza
            # to maintain consistency with other models:
            if not sample.tokens:
                sample.tokens = tokenize(sample.full_text)

            # Create tags (label per token) based on stanza spans and spacy tokens
            tags = span_to_tag(
                scheme=self.labeling_scheme,
                text=sample.full_text,
                starts=start,
                ends=end,
                tags=tags,
                tokens=sample.tokens,
            )
        else:
            tags = ["O" for _ in range(len(sample.tokens))]

        if len(tags) != len(sample.tokens):
            print("mismatch between input tokens and new tokens")

        return tags
Example #18
def test_from_spacy_doc():
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Nice to meet you Mr. Perkins.")

    sample = InputSample.from_spacy_doc(doc)
    assert sample.spans[0].entity_type == "PERSON"
    assert sample.tags == ["O", "O", "O", "O", "O", "U-PERSON", "O"]
Example #19
def test_split_dataset_test_with_0_ratio():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=2
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=3
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )
    dataset = [sample1, sample2, sample3, sample4]
    with pytest.raises(ValueError):
        train, test, zero = split_dataset(dataset, [0.5, 0.5, 0])
Example #20
def test_evaluate_multiple_examples_ignore_entity_correct_statistics():
    prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"]
    model = MockTokensModel(prediction=prediction,
                            labeling_scheme='BILOU',
                            entities_to_keep=['PERSON', 'TENNIS_PLAYER'])
    input_sample = InputSample("My name is Raphael or David",
                               masked=None,
                               spans=None)
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    evaluated = model.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample])
    scores = model.calculate_score(evaluated)
    assert scores.pii_precision == 1
    assert scores.pii_recall == 1
Example #21
def test_split_dataset_two_sets():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=2
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=3
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )
    train, test = split_dataset([sample1, sample2, sample3, sample4], [0.5, 0.5])
    assert len(train) == 2
    assert len(test) == 2
Example #22
    def to_input_samples(self,
                         folder: Optional[str] = None) -> List[InputSample]:
        input_samples = []
        if folder:
            self.files_path = folder
        print(f"Parsing files in {self.files_path}")

        for root, dirs, files in os.walk(self.files_path):
            for file in files:
                spans = []
                filename = os.path.join(root, file)
                with open(filename, "r") as xml_file:
                    xml_content = xml_file.read()

                ordered_dict = xmltodict.parse(xml_content)
                data = dict(ordered_dict['deIdi2b2'])
                text = data['TEXT']
                tags = data['TAGS']
                for _, tag_value in tags.items():
                    if isinstance(tag_value, collections.OrderedDict):
                        spans.append(self._create_span(tag_value))
                    else:
                        for sub in tag_value:
                            spans.append(self._create_span(sub))
                input_samples.append(
                    InputSample(full_text=text,
                                spans=spans,
                                create_tags_from_span=True))
        return input_samples
Example #23
    def predict(self, sample: InputSample) -> List[str]:

        sentence = Sentence(text=sample.full_text,
                            use_tokenizer=self.spacy_tokenizer)
        self.model.predict(sentence)

        ents = sentence.get_spans("ner")
        if ents:
            tags, texts, start, end = zip(*[(ent.tag, ent.text, ent.start_pos,
                                             ent.end_pos) for ent in ents])

            tags = [tag if tag != "PER" else "PERSON"
                    for tag in tags]  # Flair's tag for PERSON is PER

            # Flair tokens might not be consistent with spaCy's tokens
            # (even when using the spaCy tokenizer).
            # Use spaCy tokenization rather than Flair's
            # to maintain consistency with other models:
            if not sample.tokens:
                sample.tokens = tokenize(sample.full_text)

            # Create tags (label per token) based on Flair spans and spaCy tokens
            tags = span_to_tag(
                scheme="IO",
                text=sample.full_text,
                starts=start,
                ends=end,
                tags=tags,
                tokens=sample.tokens,
            )
        else:
            tags = ["O" for _ in range(len(sample.tokens))]

        if len(tags) != len(sample.tokens):
            print("mismatch between input tokens and new tokens")

        return tags
Example #24
def test_credit_card_recognizer_with_template(pii_csv, utterances,
                                              num_of_examples,
                                              acceptance_threshold):
    """
    Test credit card recognizer with a dataset generated from
    template and a CSV values file
    :param pii_csv: input csv file location
    :param utterances: template file location
    :param num_of_examples: number of samples to be used from dataset
    to test
    :param acceptance_threshold: minimum precision/recall
     allowed for tests to pass
    """

    # read template and CSV files
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))

    # generate examples
    generator = PresidioDataGenerator()
    templates = utterances.format(dir_path)
    examples = generator.generate_fake_data(templates=templates,
                                            n_samples=num_of_examples)
    input_samples = [
        InputSample.from_faker_spans_result(example) for example in examples
    ]

    scores = score_presidio_recognizer(
        recognizer=CreditCardRecognizer(),
        entities_to_keep=["CREDIT_CARD"],
        input_samples=input_samples,
    )
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f
Example #25
def test_align_entity_types(mock_model):
    input_sample = InputSample(full_text="Dan is my name.",
                               spans=[Span("name", "Dan", 0, 3)])

    mock_model.align_entity_types(sample=input_sample)

    assert input_sample.spans[0].entity_type == "new_name"
Example #26
    def to_input_samples(self,
                         fold: Optional[str] = None) -> List[InputSample]:
        files_found = False
        input_samples = []
        for i, file_path in enumerate(self.files_path.glob(self.glob_pattern)):
            if fold and fold not in file_path.name:
                continue

            files_found = True
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.readlines()

            text = "".join(text)

            output_docs = conll_ner_to_docs(input_data=text,
                                            n_sents=None,
                                            no_print=True)
            for doc in tqdm(output_docs,
                            f"Processing doc for file {file_path.name}"):
                input_samples.append(InputSample.from_spacy_doc(doc=doc))

        if not files_found:
            raise FileNotFoundError(
                f"No files found for pattern {self.glob_pattern} and fold {fold}"
            )

        return input_samples
Example #27
    def create_input_sample(self, original_sentence, values):
        """
        Creates an InputSample out of a template sentence
        and a dict of entity names and values
        :param original_sentence: template (e.g. My name is {FIRST_NAME})
        :param values: Key = entity name, value = entity value
        (e.g. {"TITLE": "Mr."})
        :return: an InputSample
        """
        sentence = original_sentence
        spans = []

        to_lower = random.random() < self.lower_case_ratio

        i = 0
        # Replace placeholders with values and record their span indices
        while i < len(sentence):
            entity_start = re.search("{", sentence, flags=0)
            if entity_start:
                entity_start = entity_start.start()
            else:
                break
            entity_end = re.search("}", sentence[entity_start:],
                                   flags=0).start() + entity_start
            entity = sentence[entity_start + 1:entity_end]
            entity_value = values[entity]
            entity_value = entity_value.strip()
            # Strip the numeric suffix from the entity name (e.g. PERSON2 -> PERSON):
            entity = ''.join(ch for ch in entity if not ch.isdigit())

            entity_value_len = len(entity_value)
            sentence = sentence[:entity_start] + entity_value + sentence[
                entity_end + 1:]
            # Replace "a" with "an" if the inserted entity value starts with a vowel
            if ((sentence[entity_start - 2: entity_start].lower() == "a " and entity_start == 2)
                or (sentence[entity_start - 3: entity_start].lower() == " a ")) \
                    and entity_value[0].lower() in ['a', 'e', 'i', 'o', 'u']:
                sentence = sentence[:entity_start -
                                    1] + "n " + sentence[entity_start:]
                entity_start = entity_start + 1

            if to_lower:
                entity_value = entity_value.lower()

            spans.append(
                Span(entity_type=entity,
                     entity_value=entity_value,
                     start_position=entity_start,
                     end_position=entity_start + entity_value_len))
            i = entity_start + entity_value_len

        if to_lower:
            sentence = sentence.lower()

        # Not creating tokens here since we're consolidating names afterwards
        return InputSample(sentence,
                           original_sentence,
                           spans,
                           create_tags_from_span=False)
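An illustrative call to create_input_sample, assuming a generator instance built elsewhere; the template string and values dict are hypothetical and follow the curly-brace placeholder syntax handled in the loop above:

sample = generator.create_input_sample(
    "My name is {FIRST_NAME} {LAST_NAME}",
    {"FIRST_NAME": "Dan", "LAST_NAME": "Perkins"},
)
# If lower-casing is not triggered, the expected spans are
# FIRST_NAME at (11, 14) and LAST_NAME at (15, 22) in "My name is Dan Perkins".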
Example #28
def test_to_spacy_all_entities():
    import os
    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(os.path.join(dir_path, "data/generated_small.txt"))

    spacy_ver = InputSample.create_spacy_dataset(input_samples)

    assert len(spacy_ver) == len(input_samples)
Example #29
    def create_flair_corpus(self, train_samples_path, test_samples_path, val_samples_path):
        if not path.exists("flair_train.txt"):
            train_samples = read_synth_dataset(train_samples_path)
            train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]
            print("Kept {} train samples after removal of non-tagged samples".format(len(train_tagged)))
            train_data = InputSample.create_conll_dataset(train_tagged)
            self.to_flair(train_data, outfile="flair_train.txt")

        if not path.exists("flair_test.txt"):
            test_samples = read_synth_dataset(test_samples_path)
            test_data = InputSample.create_conll_dataset(test_samples)
            self.to_flair(test_data, outfile="flair_test.txt")

        if not path.exists("flair_val.txt"):
            val_samples = read_synth_dataset(val_samples_path)
            val_data = InputSample.create_conll_dataset(val_samples)
            self.to_flair(val_data, outfile="flair_val.txt")
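The method above writes three CoNLL-style files to disk. A hedged sketch of loading them into Flair afterwards; the column layout is an assumption about what to_flair emits:

from flair.datasets import ColumnCorpus

corpus = ColumnCorpus(
    ".",
    column_format={0: "text", 1: "ner"},  # assumed layout; adjust to to_flair's actual output
    train_file="flair_train.txt",
    test_file="flair_test.txt",
    dev_file="flair_val.txt",
)
print(corpus)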
Example #30
def test_faker_spans_result_to_input_sample(faker_span_result):

    input_sample = InputSample.from_faker_spans_result(
        faker_span_result, create_tags_from_span=False)

    assert input_sample.full_text == "Dan is my name."
    assert input_sample.masked == "{{name}} is my name."
    assert input_sample.spans[0] == Span("name", "Dan", 0, 3)