def test_split_dataset_two_sets():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 1},
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 2},
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 3},
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 4},
    )

    train, test = split_dataset([sample1, sample2, sample3, sample4], [0.5, 0.5])
    assert len(train) == 2
    assert len(test) == 2

def test_split_dataset_test_with_smallish_ratio():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 1},
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 2},
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 3},
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False,
        metadata={"Template#": 4},
    )
    dataset = [sample1, sample2, sample3, sample4]

    train, test, zero = split_dataset(dataset, [0.5, 0.4999995, 0.0000005])
    assert len(train) == 2
    assert len(test) == 2
    assert len(zero) == 0

def align_entity_types(self, sample: InputSample) -> None:
    """
    Translates the sample's tags to the ones requested by the model.

    :param sample: Input sample
    :return: None
    """
    if self.entity_mapping:
        sample.translate_input_sample_tags(dictionary=self.entity_mapping)

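# Minimal usage sketch for align_entity_types. The mapping and the way the wrapper is
# constructed below are illustrative assumptions, not taken from the source:
# if the wrapper holds an entity_mapping such as {"PER": "PERSON", "LOC": "LOCATION"},
# calling model.align_entity_types(sample) rewrites the sample's tags/spans in place
# via InputSample.translate_input_sample_tags, so the gold labels use the same entity
# names the model emits.
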
def predict(self, sample: InputSample) -> List[str]:
    if self.translate_to_spacy_entities:
        sample.translate_input_sample_tags()

    sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
    self.model.predict(sentence)
    tags = self.get_tags_from_sentence(sentence)
    if len(tags) != len(sample.tokens):
        print("mismatch between previous tokens and new tokens")
    return tags

def predict(self, sample: InputSample) -> List[str]:
    if self.translate_to_spacy_entities:
        sample.translate_input_sample_tags()

    doc = self.model(sample.full_text)
    tags = self.get_tags_from_doc(doc)
    if len(doc) != len(sample.tokens):
        print("mismatch between input tokens and new tokens")
    return tags

def test_to_conll():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(
        os.path.join(dir_path, "data/generated_small.json")
    )

    conll = InputSample.create_conll_dataset(input_samples)

    sentences = conll["sentence"].unique()
    assert len(sentences) == len(input_samples)

def test_evaluate_sample_wrong_entities_to_keep_correct_statistics():
    prediction = ["O", "O", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction, entities_to_keep=['SPACESHIP'])

    sample = InputSample(
        full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluated = model.evaluate_sample(sample)
    assert evaluated.results[("O", "O")] == 4

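# Why ("O", "O") == 4 in the test above: the model is configured to keep only SPACESHIP,
# so both the annotated U-ANIMAL tag and the predicted U-ANIMAL tag fall outside the
# kept entities and are treated as "O". All four tokens therefore land in the
# (annotated="O", predicted="O") cell of the confusion counts.
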
def mock_8_samples():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample5 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=2
    )
    sample6 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=3
    )
    sample7 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )
    sample8 = InputSample(
        "Bye there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )

    return [sample1, sample2, sample3, sample4, sample5, sample6, sample7, sample8]

def test_evaluate_same_entity_correct_statistics():
    prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"])

    sample = InputSample(
        full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluation_result = evaluator.evaluate_sample(sample, prediction)
    assert evaluation_result.results[("O", "O")] == 2
    assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1
    assert evaluation_result.results[("O", "ANIMAL")] == 1

def test_evaluate_multiple_tokens_correct_statistics():
    prediction = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["ANIMAL"])

    sample = InputSample(
        "I am the walrus americanus magnifico", masked=None, spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    evaluated = evaluator.evaluate_sample(sample, prediction)
    evaluation = evaluator.calculate_score([evaluated])

    assert evaluation.pii_precision == 1
    assert evaluation.pii_recall == 1

def test_evaluate_multiple_examples_correct_statistics():
    prediction = ["U-PERSON", "O", "O", "U-PERSON", "O", "O"]
    model = MockTokensModel(prediction=prediction)
    evaluator = Evaluator(model=model, entities_to_keep=["PERSON"])

    input_sample = InputSample("My name is Raphael or David", masked=None, spans=None)
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    evaluated = evaluator.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample]
    )
    scores = evaluator.calculate_score(evaluated)

    assert scores.pii_precision == 0.5
    assert scores.pii_recall == 0.5

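# Worked numbers for the assertions above: per sample, the gold PERSON tokens are
# "Raphael" and "David", while the model predicts PERSON on "My" and "Raphael".
# That gives 1 true positive ("Raphael"), 1 false positive ("My") and 1 false negative
# ("David"), so pii_precision = 1 / (1 + 1) = 0.5 and pii_recall = 1 / (1 + 1) = 0.5.
# Repeating the same sample four times does not change the ratios.
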
def test_evaluator_simple():
    prediction = ["O", "O", "O", "U-ANIMAL"]
    model = MockTokensModel(prediction=prediction, entities_to_keep=['ANIMAL'])

    sample = InputSample(
        full_text="I am the walrus", masked="I am the [ANIMAL]", spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluated = model.evaluate_sample(sample)
    final_evaluation = model.calculate_score([evaluated])

    assert final_evaluation.pii_precision == 1
    assert final_evaluation.pii_recall == 1

def test_evaluate_multiple_entities_to_keep_correct_statistics():
    prediction = ["O", "U-ANIMAL", "O", "U-ANIMAL"]
    model = MockTokensModel(
        prediction=prediction,
        labeling_scheme='BIO',
        entities_to_keep=['ANIMAL', 'PLANT', 'SPACESHIP'],
    )

    sample = InputSample(
        full_text="I dog the walrus", masked="I [ANIMAL] the [ANIMAL]", spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus"]
    sample.tags = ["O", "O", "O", "U-ANIMAL"]

    evaluation_result = model.evaluate_sample(sample)
    assert evaluation_result.results[("O", "O")] == 2
    assert evaluation_result.results[("ANIMAL", "ANIMAL")] == 1
    assert evaluation_result.results[("O", "ANIMAL")] == 1

def test_evaluate_multiple_tokens_no_match_match_correct_statistics():
    prediction = ["O", "O", "O", "B-SPACESHIP", "L-SPACESHIP", "O"]
    model = MockTokensModel(prediction=prediction, entities_to_keep=['ANIMAL'])

    sample = InputSample(
        "I am the walrus americanus magnifico", masked=None, spans=None
    )
    sample.tokens = ["I", "am", "the", "walrus", "americanus", "magnifico"]
    sample.tags = ["O", "O", "O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL"]

    evaluated = model.evaluate_sample(sample)
    evaluation = model.calculate_score([evaluated])

    assert np.isnan(evaluation.pii_precision)
    assert evaluation.pii_recall == 0

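# Why precision is NaN and recall is 0 above: only ANIMAL is kept, so the predicted
# SPACESHIP span is filtered out and the model ends up predicting no PII tokens at all
# (precision has a zero denominator, hence NaN), while the three annotated ANIMAL tokens
# are all missed (recall = 0).
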
def test_analyzer_with_generated_text(test_input, acceptance_threshold):
    """
    Test analyzer with a generated dataset text file.

    :param test_input: input text file location
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # read test input from generated file
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = InputSample.read_dataset_json(test_input.format(dir_path))

    updated_samples = Evaluator.align_entity_types(
        input_samples=input_samples,
        entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map,
    )

    analyzer = PresidioAnalyzerWrapper()
    evaluator = Evaluator(model=analyzer)
    evaluated_samples = evaluator.evaluate_all(updated_samples)
    scores = evaluator.calculate_score(evaluation_results=evaluated_samples)

    assert acceptance_threshold <= scores.pii_precision
    assert acceptance_threshold <= scores.pii_recall

def predict(self, sample: InputSample) -> List[str]:
    nlpArtifacts = None
    if self.withNlpArtifacts:
        nlpArtifacts = self.__make_nlp_artifacts(sample.full_text)

    results = self.recognizer.analyze(sample.full_text, self.entities, nlpArtifacts)
    starts = []
    ends = []
    tags = []
    scores = []
    for res in results:
        if not res.start:
            res.start = 0
        starts.append(res.start)
        ends.append(res.end)
        tags.append(res.entity_type)
        scores.append(res.score)

    response_tags = span_to_tag(
        scheme=self.labeling_scheme,
        text=sample.full_text,
        start=starts,
        end=ends,
        tag=tags,
        tokens=sample.tokens,
        scores=scores,
        io_tags_only=self.compare_by_io,
    )
    if len(sample.tags) == 0:
        # "O" marks a token with no entity
        sample.tags = ["O" for _ in response_tags]
    return response_tags

def predict(self, sample: InputSample) -> List[str]: """ Predict the tags using a stanza model. :param sample: InputSample with text :return: list of tags """ doc = self.model(sample.full_text) if doc.ents: tags, texts, start, end = zip(*[(s.label_, s.text, s.start_char, s.end_char) for s in doc.ents]) # Stanza tokens might not be consistent with spaCy's tokens. # Use spacy tokenization and not stanza # to maintain consistency with other models: if not sample.tokens: sample.tokens = tokenize(sample.full_text) # Create tags (label per token) based on stanza spans and spacy tokens tags = span_to_tag( scheme=self.labeling_scheme, text=sample.full_text, starts=start, ends=end, tags=tags, tokens=sample.tokens, ) else: tags = ["O" for _ in range(len(sample.tokens))] if len(tags) != len(sample.tokens): print("mismatch between input tokens and new tokens") return tags
def test_from_spacy_doc():
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Nice to meet you Mr. Perkins.")

    sample = InputSample.from_spacy_doc(doc)

    assert sample.spans[0].entity_type == "PERSON"
    assert sample.tags == ["O", "O", "O", "O", "O", "U-PERSON", "O"]

def test_split_dataset_test_with_0_ratio():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=2
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=3
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )
    dataset = [sample1, sample2, sample3, sample4]

    with pytest.raises(ValueError):
        train, test, zero = split_dataset(dataset, [0.5, 0.5, 0])

def test_evaluate_multiple_examples_ignore_entity_correct_statistics():
    prediction = ["O", "O", "O", "U-PERSON", "O", "U-TENNIS_PLAYER"]
    model = MockTokensModel(
        prediction=prediction,
        labeling_scheme='BILOU',
        entities_to_keep=['PERSON', 'TENNIS_PLAYER'],
    )

    input_sample = InputSample("My name is Raphael or David", masked=None, spans=None)
    input_sample.tokens = ["My", "name", "is", "Raphael", "or", "David"]
    input_sample.tags = ["O", "O", "O", "U-PERSON", "O", "U-PERSON"]

    evaluated = model.evaluate_all(
        [input_sample, input_sample, input_sample, input_sample]
    )
    scores = model.calculate_score(evaluated)

    assert scores.pii_precision == 1
    assert scores.pii_recall == 1

def test_split_dataset_two_sets():
    sample1 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=1
    )
    sample2 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=2
    )
    sample3 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=3
    )
    sample4 = InputSample(
        "Hi there", masked=None, spans=None, create_tags_from_span=False, template_id=4
    )

    train, test = split_dataset([sample1, sample2, sample3, sample4], [0.5, 0.5])
    assert len(train) == 2
    assert len(test) == 2

def to_input_samples(self, folder: Optional[str] = None) -> List[InputSample]:
    input_samples = []
    if folder:
        self.files_path = folder
    print(f"Parsing files in {self.files_path}")

    for root, dirs, files in os.walk(self.files_path):
        for file in files:
            spans = []
            filename = os.path.join(root, file)
            with open(filename, "r") as f:
                xml_content = f.read()
            ordered_dict = xmltodict.parse(xml_content)
            data = dict(ordered_dict['deIdi2b2'])
            text = data['TEXT']
            tags = data['TAGS']
            for item in tags.items():
                if type(item[1]) is collections.OrderedDict:
                    spans.append(self._create_span(item[1]))
                else:
                    for sub in item[1]:
                        spans.append(self._create_span(sub))
            input_samples.append(
                InputSample(full_text=text, spans=spans, create_tags_from_span=True)
            )

    return input_samples

def predict(self, sample: InputSample) -> List[str]:
    sentence = Sentence(text=sample.full_text, use_tokenizer=self.spacy_tokenizer)
    self.model.predict(sentence)
    ents = sentence.get_spans("ner")
    if ents:
        tags, texts, start, end = zip(
            *[(ent.tag, ent.text, ent.start_pos, ent.end_pos) for ent in ents]
        )
        # Flair's tag for PERSON is PER
        tags = [tag if tag != "PER" else "PERSON" for tag in tags]

        # Flair tokens might not be consistent with spaCy's tokens
        # (even when using the spaCy tokenizer).
        # Use spaCy tokenization to maintain consistency with other models:
        if not sample.tokens:
            sample.tokens = tokenize(sample.full_text)

        # Create tags (label per token) based on Flair spans and spaCy tokens
        tags = span_to_tag(
            scheme="IO",
            text=sample.full_text,
            starts=start,
            ends=end,
            tags=tags,
            tokens=sample.tokens,
        )
    else:
        tags = ["O" for _ in range(len(sample.tokens))]

    if len(tags) != len(sample.tokens):
        print("mismatch between input tokens and new tokens")
    return tags

def test_credit_card_recognizer_with_template(
    pii_csv, utterances, num_of_examples, acceptance_threshold
):
    """
    Test credit card recognizer with a dataset generated from template and a CSV values file.

    :param pii_csv: input csv file location
    :param utterances: template file location
    :param num_of_examples: number of samples to be used from dataset to test
    :param acceptance_threshold: minimum precision/recall allowed for tests to pass
    """
    # read template and CSV files
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))

    # generate examples
    generator = PresidioDataGenerator()
    templates = utterances.format(dir_path)
    examples = generator.generate_fake_data(
        templates=templates, n_samples=num_of_examples
    )

    input_samples = [
        InputSample.from_faker_spans_result(example) for example in examples
    ]

    scores = score_presidio_recognizer(
        recognizer=CreditCardRecognizer(),
        entities_to_keep=["CREDIT_CARD"],
        input_samples=input_samples,
    )
    if not np.isnan(scores.pii_f):
        assert acceptance_threshold <= scores.pii_f

def test_align_entity_types(mock_model):
    input_sample = InputSample(
        full_text="Dan is my name.", spans=[Span("name", "Dan", 0, 3)]
    )

    mock_model.align_entity_types(sample=input_sample)

    assert input_sample.spans[0].entity_type == "new_name"

def to_input_samples(self, fold: Optional[str] = None) -> List[InputSample]:
    files_found = False
    input_samples = []
    for i, file_path in enumerate(self.files_path.glob(self.glob_pattern)):
        if fold and fold not in file_path.name:
            continue
        files_found = True
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.readlines()
            text = "".join(text)

        output_docs = conll_ner_to_docs(input_data=text, n_sents=None, no_print=True)
        for doc in tqdm(output_docs, f"Processing doc for file {file_path.name}"):
            input_samples.append(InputSample.from_spacy_doc(doc=doc))

    if not files_found:
        raise FileNotFoundError(
            f"No files found for pattern {self.glob_pattern} and fold {fold}"
        )
    return input_samples

def create_input_sample(self, original_sentence, values):
    """
    Creates an InputSample out of a template sentence
    and a dict of entity names and values.

    :param original_sentence: template (e.g. "My name is {FIRST_NAME}")
    :param values: Key = entity name, value = entity value (e.g. {"TITLE": "Mr."})
    :return: an InputSample
    """
    sentence = original_sentence
    spans = []

    to_lower = random.random() < self.lower_case_ratio

    i = 0
    # Replace placeholders with values and record the resulting span indices
    while i < len(sentence):
        entity_start = re.search("{", sentence, flags=0)
        if entity_start:
            entity_start = entity_start.start()
        else:
            break
        entity_end = (
            re.search("}", sentence[entity_start:], flags=0).start() + entity_start
        )
        entity = sentence[entity_start + 1:entity_end]
        entity_value = values[entity]
        entity_value = entity_value.strip()

        # Remove duplicate entity indices:
        entity = ''.join(char for char in entity if not char.isdigit())

        entity_value_len = len(entity_value)

        sentence = sentence[:entity_start] + entity_value + sentence[entity_end + 1:]

        # Replace "a" with "an" if the entity value starts with a vowel
        if (
            (
                sentence[entity_start - 2:entity_start].lower() == "a "
                and entity_start == 2
            )
            or (sentence[entity_start - 3:entity_start].lower() == " a ")
        ) and entity_value[0].lower() in ['a', 'e', 'i', 'o', 'u']:
            sentence = sentence[:entity_start - 1] + "n " + sentence[entity_start:]
            entity_start = entity_start + 1

        if to_lower:
            entity_value = entity_value.lower()

        spans.append(
            Span(
                entity_type=entity,
                entity_value=entity_value,
                start_position=entity_start,
                end_position=entity_start + entity_value_len,
            )
        )
        i = entity_start + entity_value_len

    if to_lower:
        sentence = sentence.lower()

    # Not creating tokens here since we're consolidating names afterwards
    return InputSample(sentence, original_sentence, spans, create_tags_from_span=False)

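# A hypothetical call, to illustrate the placeholder logic above (the template and
# values below are made up, not from the source): given
# original_sentence = "My name is {FIRST_NAME}" and values = {"FIRST_NAME": "Dan"},
# the loop finds the "{...}" placeholder, substitutes "Dan", strips any digits from the
# entity name, and records
# Span(entity_type="FIRST_NAME", entity_value="Dan", start_position=11, end_position=14)
# before returning the InputSample built from the filled-in sentence "My name is Dan".
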
def test_to_spacy_all_entities():
    import os

    dir_path = os.path.dirname(os.path.realpath(__file__))
    input_samples = read_synth_dataset(
        os.path.join(dir_path, "data/generated_small.txt")
    )

    spacy_ver = InputSample.create_spacy_dataset(input_samples)

    assert len(spacy_ver) == len(input_samples)

def create_flair_corpus(self, train_samples_path, test_samples_path, val_samples_path):
    if not path.exists("flair_train.txt"):
        train_samples = read_synth_dataset(train_samples_path)
        train_tagged = [sample for sample in train_samples if len(sample.spans) > 0]
        print(
            "Kept {} train samples after removal of non-tagged samples".format(
                len(train_tagged)
            )
        )
        train_data = InputSample.create_conll_dataset(train_tagged)
        self.to_flair(train_data, outfile="flair_train.txt")

    if not path.exists("flair_test.txt"):
        test_samples = read_synth_dataset(test_samples_path)
        test_data = InputSample.create_conll_dataset(test_samples)
        self.to_flair(test_data, outfile="flair_test.txt")

    if not path.exists("flair_val.txt"):
        val_samples = read_synth_dataset(val_samples_path)
        val_data = InputSample.create_conll_dataset(val_samples)
        self.to_flair(val_data, outfile="flair_val.txt")

def test_faker_spans_result_to_input_sample(faker_span_result):
    input_sample = InputSample.from_faker_spans_result(
        faker_span_result, create_tags_from_span=False
    )

    assert input_sample.full_text == "Dan is my name."
    assert input_sample.masked == "{{name}} is my name."
    assert input_sample.spans[0] == Span("name", "Dan", 0, 3)
