コード例 #1
0
class SgFinRecognizer(PatternRecognizer):
    """
    Detect Singapore FIN/NRIC identifiers with regular expressions.

    Two default patterns are provided: a weak one that accepts any leading
    letter, and a medium one restricted to the prefixes S, T, F and G.

    :param patterns: List of patterns to be used by this recognizer;
        the class-level ``PATTERNS`` are used when omitted or empty
    :param context: List of context words to increase confidence in detection;
        the class-level ``CONTEXT`` is used when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Nric (weak)",
            r"(?i)(\b[A-Z][0-9]{7}[A-Z]\b)",
            0.3,
        ),
        Pattern(
            "Nric (medium)",
            r"(?i)(\b[STFG][0-9]{7}[A-Z]\b)",
            0.5,
        ),
    ]

    CONTEXT = ["fin", "fin#", "nric", "nric#"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "SG_NRIC_FIN",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
class SgFinRecognizer(PatternRecognizer):
    """
    Detect Singapore FIN/NRIC identifiers with regular expressions.

    The weak pattern accepts any leading letter; the medium pattern only
    accepts the prefixes S, T, F and G.
    """

    PATTERNS = [
        Pattern(
            "Nric (weak)",
            r"(?i)(\b[A-Z][0-9]{7}[A-Z]\b)",
            0.3,
        ),
        Pattern(
            "Nric (medium)",
            r"(?i)(\b[STFG][0-9]{7}[A-Z]\b)",
            0.5,
        ),
    ]

    CONTEXT = ["fin", "fin#", "nric", "nric#"]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="SG_NRIC_FIN",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
コード例 #3
0
class IpRecognizer(PatternRecognizer):
    """
    Recognizes IPv4 and IPv6 addresses using regex.

    :param patterns: Patterns to use; defaults to the class-level ``PATTERNS``
        when omitted or empty
    :param context: Context words; defaults to the class-level ``CONTEXT``
        when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "IPv4",
            r"\b(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",  # noqa: E501
            0.6,
        ),
        # The dotted-quad suffix of the IPv6 pattern accepts octets up to 255
        # (25[0-5]); the previous 25[0-4] cut off addresses containing 255.
        Pattern(
            "IPv6",
            r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",  # noqa: E501
            0.6,
        ),
    ]

    CONTEXT = ["ip", "ipv4", "ipv6"]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="IP_ADDRESS",
    ):
        # A missing or empty argument falls back to the class defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
class UsPhoneRecognizer(PatternRecognizer):
    """
    Recognizes US Phone numbers using regex.

    :param patterns: Patterns to use; defaults to the class-level ``PATTERNS``
        when omitted or empty
    :param context: Context words; defaults to the class-level ``CONTEXT``
        when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        # Strong: "(ddd) ddd-dddd" or a fully separated "ddd-ddd-dddd".
        # The second alternative previously read `d{3}...\\d{4}`, i.e. the
        # literal letters "ddd" and a literal backslash, so it never matched
        # a real phone number.
        Pattern(
            "Phone (strong)",
            r"(\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]\d{3}[-\.\s]\d{4})",
            0.7,
        ),
        # Medium: the second separator is optional.
        Pattern("Phone (medium)", r"\b(\d{3}[-\.\s]\d{3}[-\.\s]??\d{4})\b",
                0.5),
        # Weak: any standalone run of ten digits.
        Pattern("Phone (weak)", r"(\b\d{10}\b)", 0.05),
    ]

    # pylint: disable=line-too-long,abstract-method
    CONTEXT = ["phone", "number", "telephone", "cell", "mobile", "call"]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="PHONE_NUMBER",
    ):
        # A missing or empty argument falls back to the class defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
コード例 #5
0
 def __init__(self):
     """Register the IPv4 and IPv6 patterns for the IP_ADDRESS entity."""
     named_regexes = (('IPv4', IP_V4_REGEX), ('IPv6', IP_V6_REGEX))
     # Both patterns carry the same score of 0.6.
     ip_patterns = [Pattern(label, regex, 0.6)
                    for label, regex in named_regexes]
     super().__init__(
         supported_entity="IP_ADDRESS",
         patterns=ip_patterns,
         context=IP_CONTEXT,
     )
コード例 #6
0
 def __init__(self):
     """Register the weak and medium NRIC patterns for SG_NRIC_FIN."""
     weak = Pattern('Nric (weak) ', WEAK_REGEX, 0.3)
     medium = Pattern('Nric (medium) ', MEDIUM_REGEX, 0.5)
     super().__init__(
         supported_entity="SG_NRIC_FIN",
         patterns=[weak, medium],
         context=CONTEXT,
     )
コード例 #7
0
 def __init__(self):
     """Register the three driver-license patterns for US_DRIVER_LICENSE."""
     wa_weak = Pattern('Driver License - WA (weak) ', WA_WEAK_REGEX, 0.4)
     alphanumeric_weak = Pattern(
         'Driver License - Alphanumeric (weak) ', ALPHANUMERIC_REGEX, 0.3)
     digits_very_weak = Pattern(
         'Driver License - Digits (very weak)', DIGITS_REGEX, 0.01)
     super().__init__(
         supported_entity="US_DRIVER_LICENSE",
         patterns=[wa_weak, alphanumeric_weak, digits_very_weak],
         context=LICENSE_CONTEXT,
     )
コード例 #8
0
 def __init__(self):
     """Register the ITIN patterns, ordered from weakest to strongest."""
     very_weak = Pattern('Itin (very weak)', VERY_WEAK_REGEX, 0.05)
     weak = Pattern('Itin (weak)', WEAK_REGEX, 0.3)
     medium = Pattern('Itin (medium)', MEDIUM_REGEX, 0.5)
     super().__init__(
         supported_entity="US_ITIN",
         patterns=[very_weak, weak, medium],
         context=CONTEXT,
     )
コード例 #9
0
class UsLicenseRecognizer(PatternRecognizer):
    """
    Recognizes US driver license using regex.

    :param patterns: List of patterns to be used by this recognizer;
        the class-level ``PATTERNS`` are used when omitted or empty
    :param context: List of context words to increase confidence in detection;
        the class-level ``CONTEXT`` is used when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Driver License - WA (weak)",
            r"\b((?=.*\d)([A-Z][A-Z0-9*]{11})|(?=.*\*)([A-Z][A-Z0-9*]{11}))\b",  # noqa: E501
            0.4,
        ),
        # The alternative `[A-Z]{2}[0-9]{2,5}` previously lacked its opening
        # bracket (`A-Z]{2}...`), which made it match the literal text "A-Z]]"
        # instead of two letters.
        Pattern(
            "Driver License - Alphanumeric (weak)",
            r"\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|[A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\b",  # noqa: E501
            0.3,
        ),
        Pattern(
            "Driver License - Digits (very weak)",
            r"\b([0-9]{6,14}|[0-9]{16})\b",  # noqa: E501
            0.01,
        ),
    ]

    CONTEXT = [
        "driver",
        "license",
        "permit",
        "lic",
        "identification",
        "dl",
        "dls",
        "cdls",
        "id",
        "lic#",
        "driving",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "US_DRIVER_LICENSE",
    ):
        # A missing or empty argument falls back to the class defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            supported_language=supported_language,
            patterns=patterns,
            context=context,
        )
コード例 #10
0
def test_no_entity_for_pattern_recognizer():
    """Constructing a recognizer with an empty entity list raises ValueError."""
    with pytest.raises(ValueError):
        # Same name and regex for both patterns; only the score differs.
        same_name_patterns = [
            Pattern("p1", "someregex", score) for score in (1.0, 0.5)
        ]
        MockRecognizer(
            entity=[],
            patterns=same_name_patterns,
            black_list=[],
            name=None,
            context=None,
        )
コード例 #11
0
 def __init__(self):
     """Register the strong, medium and weak phone patterns for PHONE_NUMBER."""
     strong = Pattern('Phone (strong)',
                      UsPhoneRecognizer.STRONG_REGEX,
                      UsPhoneRecognizer.STRONG_REGEX_SCORE)
     medium = Pattern('Phone (medium)',
                      UsPhoneRecognizer.MEDIUM_REGEX,
                      UsPhoneRecognizer.MEDIUM_REGEX_SCORE)
     weak = Pattern('Phone (weak)',
                    UsPhoneRecognizer.WEAK_REGEX,
                    UsPhoneRecognizer.WEAK_REGEX_SCORE)
     super().__init__(
         supported_entity="PHONE_NUMBER",
         patterns=[strong, medium, weak],
         context=CONTEXT,
     )
コード例 #12
0
class UsLicenseRecognizer(PatternRecognizer):
    """
    Recognizes US driver license using regex.

    :param patterns: Patterns to use; defaults to the class-level ``PATTERNS``
        when omitted or empty
    :param context: Context words; defaults to the class-level ``CONTEXT``
        when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Driver License - WA (weak)",
            r"\b((?=.*\d)([A-Z][A-Z0-9*]{11})|(?=.*\*)([A-Z][A-Z0-9*]{11}))\b",  # noqa: E501
            0.4,
        ),
        # The alternative `[A-Z]{2}[0-9]{2,5}` previously lacked its opening
        # bracket (`A-Z]{2}...`), which made it match the literal text "A-Z]]"
        # instead of two letters.
        Pattern(
            "Driver License - Alphanumeric (weak)",
            r"\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|[A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\b",  # noqa: E501
            0.3,
        ),
        Pattern(
            "Driver License - Digits (very weak)",
            r"\b([0-9]{6,14}|[0-9]{16})\b",  # noqa: E501
            0.01,
        ),
    ]

    CONTEXT = [
        "driver",
        "license",
        "permit",
        "lic",
        "identification",
        "dl",
        "dls",
        "cdls",
        "id",
        "lic#",
        "driving",
    ]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="US_DRIVER_LICENSE",
    ):
        # A missing or empty argument falls back to the class defaults.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            supported_language=supported_language,
            patterns=patterns,
            context=context,
        )
コード例 #13
0
    def test_remove_pattern_recognizer(self):
        """Adding then removing a custom recognizer leaves the registry empty."""
        spaceship_recognizer = PatternRecognizer(
            "SPACESHIP",
            name="Spaceship recognizer",
            patterns=[
                Pattern("spaceship pattern", r'\W*(spaceship)\W*', 0.8),
            ],
        )
        store_mock = RecognizerStoreApiMock()
        registry = RecognizerRegistry(store_mock)

        # Nothing has been stored yet, so no custom recognizers are expected.
        assert len(registry.get_custom_recognizers()) == 0

        # Store the recognizer for the word "spaceship".
        store_mock.add_custom_pattern_recognizer(spaceship_recognizer)

        # The registry now reports exactly one custom recognizer.
        assert len(registry.get_custom_recognizers()) == 1

        # Remove it again by name.
        store_mock.remove_recognizer("Spaceship recognizer")

        # The registry is back to zero custom recognizers.
        assert len(registry.get_custom_recognizers()) == 0
コード例 #14
0
class DomainRecognizer(PatternRecognizer):
    """
    Detect domain names with a regular expression.

    Every regex match is additionally checked with ``tldextract`` in
    ``validate_result``.
    """

    # pylint: disable=line-too-long
    PATTERNS = [
        Pattern(
            "Domain ()",
            r"\b(((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,86}[a-zA-Z0-9]))\.(([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,73}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25})))|((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,162}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25}))))\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = ["domain", "ip"]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="DOMAIN_NAME",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text):
        """Return True when tldextract reports a non-empty fqdn for the text."""
        return tldextract.extract(pattern_text).fqdn != ""
コード例 #15
0
ファイル: iban_recognizer.py プロジェクト: yiliaofan/presidio
 def __init__(self):
     """Register the single generic IBAN pattern for the IBAN_CODE entity."""
     generic_iban = Pattern('IBAN Generic',
                            IBAN_GENERIC_REGEX,
                            IBAN_GENERIC_SCORE)
     super().__init__(
         supported_entity="IBAN_CODE",
         patterns=[generic_iban],
         context=CONTEXT,
     )
コード例 #16
0
    def get_all_recognizers(self):
        """
        Returns a list of PatternRecognizer objects created from the
        recognizers stored in the underlying store.

        :return: one PatternRecognizer per stored recognizer, or an empty
            list when the call to the store raises ``grpc.RpcError``
        """
        req = recognizers_store_pb2.RecognizersGetAllRequest()

        try:
            raw_recognizers = self.rs_stub.ApplyGetAll(req).recognizers
        except grpc.RpcError:
            # Best effort: a failing store is logged and treated as empty.
            # Adjacent literals keep the message free of the indentation
            # that a backslash continuation would embed in it.
            logger.info(
                "Failed getting recognizers from the remote store. "
                "Returning an empty list"
            )
            return []

        custom_recognizers = []
        for new_recognizer in raw_recognizers:
            patterns = [
                Pattern(pat.name, pat.regex, pat.score)
                for pat in new_recognizer.patterns
            ]
            custom_recognizers.append(
                PatternRecognizer(
                    name=new_recognizer.name,
                    supported_entity=new_recognizer.entity,
                    supported_language=new_recognizer.language,
                    black_list=new_recognizer.blacklist,
                    context=new_recognizer.contextPhrases,
                    patterns=patterns,
                )
            )

        return custom_recognizers
class UsPassportRecognizer(PatternRecognizer):
    """
    Detect US passport numbers with a regular expression.
    """

    # pylint: disable=line-too-long,abstract-method
    # Any standalone nine-digit number is only a very weak passport candidate.
    PATTERNS = [
        Pattern(
            "Passport (very weak)",
            r"(\b[0-9]{9}\b)",
            0.05,
        ),
    ]
    CONTEXT = [
        "us",
        "united",
        "states",
        "passport",
        "passport#",
        "travel",
        "document",
    ]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="US_PASSPORT",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
    def from_dict(cls, entity_recognizer_dict):
        """
        Create an instance from a serialized dict.

        Entries under the ``"patterns"`` key are converted with
        ``Pattern.from_dict``; all keys are then passed to ``cls`` as
        keyword arguments.

        :param entity_recognizer_dict: keyword arguments for ``cls``,
            optionally holding serialized patterns under ``"patterns"``
        :return: the instance built by ``cls``
        """
        # Work on a shallow copy so the caller's dict is not modified;
        # mutating it in place made a second call with the same dict feed
        # already-converted patterns back into Pattern.from_dict.
        recognizer_kwargs = dict(entity_recognizer_dict)
        patterns = recognizer_kwargs.get("patterns")
        if patterns:
            recognizer_kwargs["patterns"] = [
                Pattern.from_dict(pat) for pat in patterns
            ]

        return cls(**recognizer_kwargs)
コード例 #19
0
def test_when_context_custom_recognizer_then_succeed(nlp_engine,
                                                     mock_nlp_artifacts):
    """This test checks that a custom recognizer is also enhanced by context.

    However this test also verifies a specific case in which the pattern also
    includes a preceding space (' rocket'). This in turn causes a misalignment
    between the tokens and the regex match (the token will be just 'rocket').
    This misalignment is handled in order to find the correct context window.
    """
    rocket_recognizer = PatternRecognizer(
        supported_entity="ROCKET",
        name="rocketrecognizer",
        context=["cool"],
        # In a raw string a single backslash is needed for the whitespace
        # class; the previous r"\\s+" demanded a literal backslash, so the
        # pattern never matched and the assertions below ran on empty lists.
        patterns=[Pattern("rocketpattern", r"\s+(rocket)", 0.3)],
    )
    text = "hi, this is a cool ROCKET"
    recognizer = rocket_recognizer
    entities = ["ROCKET"]
    nlp_artifacts = nlp_engine.process_text(text, "en")
    results_without_context = recognizer.analyze(text, entities,
                                                 mock_nlp_artifacts)
    results_with_context = recognizer.analyze(text, entities, nlp_artifacts)
    # Guard against a vacuous pass.
    # NOTE(review): assumes the recognizer matches case-insensitively, since
    # the text holds "ROCKET" and the pattern "rocket" - confirm.
    assert len(results_without_context) > 0
    assert len(results_without_context) == len(results_with_context)
    for res_wo, res_w in zip(results_without_context, results_with_context):
        assert res_wo.score < res_w.score
コード例 #20
0
class UsPassportRecognizer(PatternRecognizer):
    """
    Detect US passport numbers with a regular expression.

    :param patterns: List of patterns to be used by this recognizer;
        the class-level ``PATTERNS`` are used when omitted or empty
    :param context: List of context words to increase confidence in detection;
        the class-level ``CONTEXT`` is used when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    # Any standalone nine-digit number is only a very weak passport candidate.
    PATTERNS = [
        Pattern(
            "Passport (very weak)",
            r"(\b[0-9]{9}\b)",
            0.05,
        ),
    ]
    CONTEXT = [
        "us",
        "united",
        "states",
        "passport",
        "passport#",
        "travel",
        "document",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "US_PASSPORT",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
class EmailRecognizer(PatternRecognizer):
    """
    Detect email addresses with a regular expression.

    Every regex match is additionally checked with ``tldextract`` in
    ``validate_result``.
    """

    PATTERNS = [
        Pattern(
            "Email (Medium)",
            r"\b((([!#$%&'*+\-/=?^_`{|}~\w])|([!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~\.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w]))[@]\w+([-.]\w+)*\.\w+([-.]\w+)*)\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = ["email"]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="EMAIL_ADDRESS",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text):
        """Return True when tldextract reports a non-empty fqdn for the text."""
        return tldextract.extract(pattern_text).fqdn != ""
コード例 #22
0
def create_mock_pattern_recognizer(lang, entity, name):
    """Build a PatternRecognizer with one placeholder pattern for tests."""
    placeholder = Pattern("pat", regex="REGEX", score=1.0)
    recognizer = PatternRecognizer(
        supported_entity=entity,
        supported_language=lang,
        name=name,
        patterns=[placeholder],
    )
    return recognizer
コード例 #23
0
ファイル: test_pattern.py プロジェクト: yiliaofan/presidio
    def test_from_dict(self):
        """Pattern.from_dict yields the same name, score and regex as my_pattern."""
        rebuilt = Pattern.from_dict(my_pattern_dict)
        reference = my_pattern

        assert reference.name == rebuilt.name
        assert reference.score == rebuilt.score
        assert reference.regex == rebuilt.regex
コード例 #24
0
    def test_added_pattern_recognizer_works(self):
        """A custom recognizer only yields results once it is in the store."""
        rocket_recognizer = PatternRecognizer(
            "ROCKET",
            name="Rocket recognizer",
            patterns=[Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8)],
        )

        # The store starts empty, so the engine does not know ROCKET yet.
        store_mock = RecognizerStoreApiMock()
        engine = AnalyzerEngine(
            registry=MockRecognizerRegistry(store_mock),
            nlp_engine=MockNlpEngine())
        text = "rocket is my favorite transportation"
        entities = ["CREDIT_CARD", "ROCKET"]

        def run_analysis():
            # Same request before and after the recognizer is added.
            return engine.analyze(self.unit_test_guid,
                                  text=text,
                                  entities=entities,
                                  language='en',
                                  all_fields=False)

        before = run_analysis()
        assert len(before) == 0

        # Add the new recognizer for the word "rocket".
        store_mock.add_custom_pattern_recognizer(rocket_recognizer)

        # The entity is now recognized.
        after = run_analysis()
        assert len(after) == 1
        assert_result(after[0], "ROCKET", 0, 7, 0.8)
コード例 #25
0
ファイル: conftest.py プロジェクト: zhangabner/presidio
def zip_code_deny_list_recognizer():
    """Build a ZIP recognizer with a weak zip-code pattern and a deny list."""
    # Five digits, optionally followed by a dash and four more digits.
    weak_zip = Pattern(
        name="zip code (weak)",
        regex=r"(\b\d{5}(?:\-\d{4})?\b)",
        score=0.01,
    )
    return PatternRecognizer(
        supported_entity="ZIP",
        deny_list=["999"],
        patterns=[weak_zip],
    )
コード例 #26
0
class UsItinRecognizer(PatternRecognizer):
    """
    Detect US ITINs (Individual Taxpayer Identification Numbers) with regex.

    :param patterns: List of patterns to be used by this recognizer;
        the class-level ``PATTERNS`` are used when omitted or empty
    :param context: List of context words to increase confidence in detection;
        the class-level ``CONTEXT`` is used when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        # Exactly one of the two separators is present.
        Pattern(
            "Itin (very weak)",
            r"(\b(9\d{2})[- ]{1}((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))(\d{4})\b)|(\b(9\d{2})((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))[- ]{1}(\d{4})\b)",  # noqa: E501
            0.05,
        ),
        # No separators at all.
        Pattern(
            "Itin (weak)",
            r"\b(9\d{2})((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))(\d{4})\b",  # noqa: E501
            0.3,
        ),
        # Both separators are present.
        Pattern(
            "Itin (medium)",
            r"\b(9\d{2})[- ]{1}((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))[- ]{1}(\d{4})\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = [
        "individual",
        "taxpayer",
        "itin",
        "tax",
        "payer",
        "taxid",
        "tin",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "US_ITIN",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
コード例 #27
0
    def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer":
        """
        Create instance from a serialized dict.

        Entries under the ``"patterns"`` key are converted with
        ``Pattern.from_dict``; all keys are then passed to ``cls`` as
        keyword arguments.

        :param entity_recognizer_dict: keyword arguments for ``cls``,
            optionally holding serialized patterns under ``"patterns"``
        :return: the instance built by ``cls``
        """
        # Work on a shallow copy so the caller's dict is not modified;
        # mutating it in place made a second call with the same dict feed
        # already-converted patterns back into Pattern.from_dict.
        recognizer_kwargs = dict(entity_recognizer_dict)
        patterns = recognizer_kwargs.get("patterns")
        if patterns:
            recognizer_kwargs["patterns"] = [
                Pattern.from_dict(pat) for pat in patterns
            ]

        return cls(**recognizer_kwargs)
    def __black_list_to_regex(black_list):
        """
        Converts a list of words to a matching regex, to be analyzed by the
        regex engine as a part of the analyze logic.

        Each word is escaped, so it is matched literally even when it
        contains regex metacharacters such as ``.``, ``+`` or ``(``.
        A word only matches when it is delimited by a space or by the
        start/end of the text.

        :param black_list: the list of words to detect
        :return: a Pattern named "black_list" with score 1.0 that matches
            any of the words
        """
        import re  # local import keeps this block self-contained

        # Without escaping, a word like "a.b" would also match "axb", and a
        # word like "c++" would produce an invalid or unintended regex.
        escaped_words = [re.escape(word) for word in black_list]
        regex = r"(?:^|(?<= ))(" + '|'.join(escaped_words) + r")(?:(?= )|$)"
        return Pattern(name="black_list", regex=regex, score=1.0)
class UsItinRecognizer(PatternRecognizer):
    """
    Detect US ITINs (Individual Taxpayer Identification Numbers) with regex.
    """

    # pylint: disable=line-too-long,abstract-method
    PATTERNS = [
        # Exactly one of the two separators is present.
        Pattern(
            "Itin (very weak)",
            r"(\b(9\d{2})[- ]{1}((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))(\d{4})\b)|(\b(9\d{2})((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))[- ]{1}(\d{4})\b)",  # noqa: E501
            0.05,
        ),
        # No separators at all.
        Pattern(
            "Itin (weak)",
            r"\b(9\d{2})((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))(\d{4})\b",  # noqa: E501
            0.3,
        ),
        # Both separators are present.
        Pattern(
            "Itin (medium)",
            r"\b(9\d{2})[- ]{1}((7[0-9]{1}|8[0-8]{1})|(9[0-2]{1})|(9[4-9]{1}))[- ]{1}(\d{4})\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = [
        "individual",
        "taxpayer",
        "itin",
        "tax",
        "payer",
        "taxid",
        "tin",
    ]

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="US_ITIN",
    ):
        # A missing or empty argument falls back to the class defaults.
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
コード例 #30
0
class EsNifRecognizer(PatternRecognizer):
    """
    Recognize NIF number using regex and checksum.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes or spaces.
    """

    PATTERNS = [
        Pattern(
            "NIF",
            r"\b[0-9]?[0-9]{7}[-]?[A-Z]\b",
            0.5,
        ),
    ]

    CONTEXT = [
        "documento nacional de identidad", "DNI", "NIF", "identificación"
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "es",
        supported_entity: str = "ES_NIF",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Dashes and spaces are stripped by default before validation.
        self.replacement_pairs = (replacement_pairs
                                  if replacement_pairs else [("-", ""),
                                                             (" ", "")])
        context = context if context else self.CONTEXT
        patterns = patterns if patterns else self.PATTERNS
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate a match by checking its control letter.

        The digits of the sanitized text are read as one number; the last
        character must equal the letter at index ``number % 23`` of the
        control-letter table.

        :param pattern_text: the text matched by one of the patterns
        :return: True if the control letter is correct, False otherwise
            (including when the text contains no digits)
        """
        # Honour the configured replacement pairs instead of a fixed list.
        sanitized = EsNifRecognizer.__sanitize_value(
            pattern_text, self.replacement_pairs
        )
        digits = "".join(filter(str.isdigit, sanitized))
        if not digits:
            # Nothing to compute a checksum from (e.g. a custom pattern
            # matched text without digits); treat it as invalid.
            return False
        letters = "TRWAGMYFPDXBNJZSQVHLCKE"
        return sanitized[-1] == letters[int(digits) % 23]

    @staticmethod
    def __sanitize_value(
        text: str,
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ) -> str:
        """
        Apply each (search, replacement) pair to the text in order.

        When no pairs are given, dashes and spaces are removed.
        """
        if replacement_pairs is None:
            replacement_pairs = [("-", ""), (" ", "")]
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text