Beispiel #1
0
    def test_complete_phone_number_v2(self):
        """ Detection of phone number

        Builds a RegexNer from a ``regexp_config_dict`` holding a single
        "PHONE" entry and checks that ``regex_detection`` reports a "PHONE"
        key for the sample text.
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure message.

        sample = "tlf.: ESP 980000000"

        phone_settings = {
            "left_span_len": 20,
            "right_span_len": 0,
            "word_list": ["tlf."],
        }
        detector = ner_regex.RegexNer(
            regexp_config_dict={"PHONE": phone_settings})

        # The sample doubles as the full text handed to the detector.
        detected = detector.regex_detection(sample, full_text=sample)

        # NOTE(review): MSG_PHONE_DETECTED is reported when "PHONE" is
        # missing; MSG_PHONE_NOT_DETECTED may have been intended - confirm.
        failure_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_PHONE_DETECTED, detected)
        self.assertTrue("PHONE" in detected, failure_msg)
Beispiel #2
0
    def test_3_broad_phone_number_v0(self):
        """ Test the detection of a wrong phone number

        Runs the "broad" rule set over a sentence whose number carries an
        extra leading "45" group and checks the text captured by the rule
        BROAD_REG_PHONE_NUMBER_GEN_V3 once cleaned.
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        test = "Mi teléfono es 45 988 888 888"
        ner = ner_regex.RegexNer()

        result = ner._detect_regexp(test, "broad")

        # This assertion fails when no PHONE entity was produced, so the
        # "not detected" message is the one that describes the failure.
        self.assertTrue(
            "PHONE" in result, "{} {} {}".format(self.shortDescription(),
                                                 MSG_PHONE_NOT_DETECTED,
                                                 result))

        rule_name = "BROAD_REG_PHONE_NUMBER_GEN_V3"
        # Position of the first PHONE match attributed to the rule, or None.
        idx = next((i for i, _regexp in enumerate(result["PHONE"])
                    if _regexp[1] == rule_name), None)

        # Report a missing rule as a test failure; previously ``idx`` was
        # left unbound and the test crashed with UnboundLocalError.
        self.assertIsNotNone(
            idx, "{} rule {} did not match: {}".format(
                self.shortDescription(), rule_name, result["PHONE"]))

        self.assertEqual(
            clean_text(result["PHONE"][idx][0].strip()), "988888888",
            MSG_EXTRACTED.format(self.shortDescription(),
                                 MSG_PHONE_NOT_DETECTED, result["PHONE"][idx]))
Beispiel #3
0
    def test_strict_cif_company(self):
        """ Test the detection of the CIF of the company

        Runs the "strict" rule set and checks that the rule
        STRICT_REG_CIF_V0 captured "A99151276" under "ID_DOCUMENT".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        test = "El CIF de la compañía es A99151276"

        ner = ner_regex.RegexNer()
        result = ner._detect_regexp(test, "strict")

        self.assertTrue(
            "ID_DOCUMENT" in result,
            "{} {} {}".format(self.shortDescription(),
                              MSG_ID_DOCUMENT_NOT_DETECTED, result))

        rule_name = "STRICT_REG_CIF_V0"
        # Keep only the ID_DOCUMENT matches produced by the CIF rule.
        matches = [entry for entry in result["ID_DOCUMENT"]
                   if entry[1] == rule_name]

        # Report a missing rule as a test failure; previously ``idx`` was
        # left unbound and the test crashed with UnboundLocalError.
        self.assertTrue(
            matches, "{} rule {} did not match: {}".format(
                self.shortDescription(), rule_name, result["ID_DOCUMENT"]))

        # The original loop had no ``break``, so the last match is checked.
        self.assertEqual(
            matches[-1][0], "A99151276",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_NIF_WRONG_DETECTED, result["ID_DOCUMENT"]))
Beispiel #4
0
    def test_money_CP_EURO_V0_euros_v3(self):
        """ Detection of euro currency using the word euro

        Runs the "strict" rule set and checks that the rule
        STRICT_REG_EURO_V0 captured "1,000.00" under "MONEY".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        test = "este aparato cuesta 1,000.00 Euros"

        ner = ner_regex.RegexNer()
        result = ner._detect_regexp(test, "strict")

        self.assertTrue(
            "MONEY" in result,
            "{} {} {}".format(self.shortDescription(), MSG_MONEY_NOT_DETECTED,
                              result))

        rule_name = "STRICT_REG_EURO_V0"
        # Keep only the MONEY matches produced by the euro rule.
        matches = [entry for entry in result["MONEY"]
                   if entry[1] == rule_name]

        # Report a missing rule as a test failure; previously ``idx`` was
        # left unbound and the test crashed with UnboundLocalError.
        self.assertTrue(
            matches, "{} rule {} did not match: {}".format(
                self.shortDescription(), rule_name, result["MONEY"]))

        # The original loop had no ``break``, so the last match is checked.
        self.assertEqual(
            matches[-1][0], "1,000.00",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_CURRENCY_WRONG_DETECTED, result["MONEY"]))
Beispiel #5
0
    def test_generic_money_v4(self):
        """ Detection of money quantities

        Negative check: "PROB_CURRENCY" must be reported by the "strict"
        rule set, but none of its matches may come from the rule
        CP_MONEY_V1.
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        sentence = "el total de la factura es 1.000,000,52."

        detector = ner_regex.RegexNer()
        result = detector._detect_regexp(sentence, "strict")

        missing_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_PROB_CURRENCY_NOT_DETECTED,
                                        result)
        self.assertTrue("PROB_CURRENCY" in result, missing_msg)

        # Positions of every match attributed to CP_MONEY_V1; the last one
        # wins, and -1 means the rule did not fire at all.
        positions = [pos for pos, entry in enumerate(result["PROB_CURRENCY"])
                     if entry[1] == "CP_MONEY_V1"]
        idx = positions[-1] if positions else -1

        unexpected_msg = MSG_DETECTED.format(self.shortDescription(),
                                             MSG_QUANTITY_DETECTED,
                                             result["PROB_CURRENCY"])
        self.assertEqual(idx, -1, unexpected_msg)
Beispiel #6
0
    def test_generic_money_v1(self):
        """ Detection of money quantities

        Runs the "strict" rule set and checks that the rule
        STRICT_REG_MONEY_V0 captured "1.000,000" under "PROB_CURRENCY".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        test = "el total de la factura es 1.000,000."

        ner = ner_regex.RegexNer()
        result = ner._detect_regexp(test, "strict")

        self.assertTrue(
            "PROB_CURRENCY" in result,
            "{} {} {}".format(self.shortDescription(), MSG_MONEY_NOT_DETECTED,
                              result))

        rule_name = "STRICT_REG_MONEY_V0"
        # Keep only the PROB_CURRENCY matches produced by the money rule.
        matches = [entry for entry in result["PROB_CURRENCY"]
                   if entry[1] == rule_name]

        # Report a missing rule as a test failure; previously ``idx`` was
        # left unbound and the test crashed with UnboundLocalError.
        self.assertTrue(
            matches, "{} rule {} did not match: {}".format(
                self.shortDescription(), rule_name, result["PROB_CURRENCY"]))

        # The original loop had no ``break``, so the last match is checked.
        self.assertEqual(
            matches[-1][0], "1.000,000",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_CURRENCY_WRONG_DETECTED,
                                result["PROB_CURRENCY"]))
Beispiel #7
0
    def test_0_broad_phone_number_v0(self):
        """ Test the detection of a phone number

        Runs the "broad" rule set and checks that the text captured by the
        rule BROAD_REG_PHONE_NUMBER_APPROX_V3, once stripped and cleaned,
        equals "988888888".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        test = "Mi teléfono es 988 888 888 "
        ner = ner_regex.RegexNer()

        result = ner._detect_regexp(test, "broad")

        self.assertTrue(
            "PHONE" in result,
            "{} {} {}".format(self.shortDescription(), MSG_PHONE_NOT_DETECTED,
                              result))

        # Position of the first match attributed to the rule. When the rule
        # is absent the -1 sentinel selects the last PHONE match, as before.
        idx = next(
            (i for i, _regexp in enumerate(result["PHONE"])
             if _regexp[1] == "BROAD_REG_PHONE_NUMBER_APPROX_V3"),
            -1)

        # The message used to be a plain string holding f-string style
        # placeholders, so it was printed with the braces left unexpanded;
        # it is now actually interpolated.
        self.assertEqual(
            clean_text(result["PHONE"][idx][0].strip()), "988888888",
            "{} {}. Extracted {}".format(self.shortDescription(),
                                         MSG_PHONE_NOT_DETECTED,
                                         result["PHONE"][idx]))
Beispiel #8
0
    def test_strict_email_v0(self):
        """ Detection of email v0 rule

        Runs the "strict" rule set and compares the text captured by the
        rule STRICT_REG_EMAIL_ADDRESS_V0 with the expected literal.
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        # NOTE(review): the address in the sentence and the expected value
        # below look like redaction placeholders rather than real fixture
        # data - confirm against the original test before relying on them.
        sentence = "the email of John is [email protected]"
        detector = ner_regex.RegexNer()
        result = detector._detect_regexp(sentence, "strict")

        missing_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_EMAIL_NOT_DETECTED, result)
        self.assertTrue("EMAIL" in result, missing_msg)

        # First match attributed to the rule; -1 (the last match) when the
        # rule is absent.
        idx = next(
            (pos for pos, entry in enumerate(result["EMAIL"])
             if entry[1] == "STRICT_REG_EMAIL_ADDRESS_V0"),
            -1)
        chosen = result["EMAIL"][idx]

        self.assertEqual(
            chosen[0], "*****@*****.**",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_EMAIL_WRONG_DETECTED,
                                chosen))
Beispiel #9
0
    def test_0_CP_MOBILE_NUMBER_V0(self):
        """ Test the detection of a phone number

        Runs the "broad" rule set and checks that the text captured by the
        rule BROAD_REG_MOBILE_NUMBER_GEN_V3 under "MOBILE", once stripped
        and cleaned, equals "688888888".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        test = "Mi teléfono móvil es 688 888 888 "
        ner = ner_regex.RegexNer()

        result = ner._detect_regexp(test, "broad")

        self.assertTrue(
            "MOBILE" in result,
            "{} {} {}".format(self.shortDescription(), MSG_MOBILE_NOT_DETECTED,
                              result))

        rule_name = "BROAD_REG_MOBILE_NUMBER_GEN_V3"
        # Position of the first MOBILE match attributed to the rule, or None.
        idx = next((i for i, _regexp in enumerate(result["MOBILE"])
                    if _regexp[1] == rule_name), None)

        # Report a missing rule as a test failure; previously ``idx`` was
        # left unbound and the test crashed with UnboundLocalError.
        self.assertIsNotNone(
            idx, "{} rule {} did not match: {}".format(
                self.shortDescription(), rule_name, result["MOBILE"]))

        self.assertEqual(
            clean_text(result["MOBILE"][idx][0].strip()), "688888888",
            MSG_EXTRACTED.format(self.shortDescription(),
                                 MSG_MOBILE_NOT_DETECTED,
                                 result["MOBILE"][idx]))
Beispiel #10
0
    def test_broad_phone_number_v6(self):
        """ Detection of phone number

        Runs the "broad" rule set over a number followed by a letter and
        expects the cleaned capture of BROAD_REG_PHONE_NUMBER_GEN_V3 to be
        "980000001".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        sentence = "teléfono: ESP 980000001A"

        detector = ner_regex.RegexNer()
        result = detector._detect_regexp(sentence, "broad")

        missing_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_PHONE_NOT_DETECTED, result)
        self.assertTrue("PHONE" in result, missing_msg)

        # First match attributed to the rule; -1 (the last match) when the
        # rule is absent.
        idx = next(
            (pos for pos, entry in enumerate(result["PHONE"])
             if entry[1] == "BROAD_REG_PHONE_NUMBER_GEN_V3"),
            -1)
        chosen = result["PHONE"][idx]

        self.assertEqual(
            clean_text(chosen[0]), "980000001",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_PHONE_WRONG_DETECTED,
                                chosen))
Beispiel #11
0
    def test_strict_dni_v2(self):
        """ Detection of DNI

        Runs the "strict" rule set over a bare DNI string and expects the
        cleaned capture of STRICT_REG_DNI_V0 to equal the input.
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        document = "15373458B"

        detector = ner_regex.RegexNer()
        result = detector._detect_regexp(document, "strict")

        missing_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_ID_DOCUMENT_NOT_DETECTED, result)
        self.assertTrue("ID_DOCUMENT" in result, missing_msg)

        # First match attributed to the rule; -1 (the last match) when the
        # rule is absent.
        idx = next(
            (pos for pos, entry in enumerate(result["ID_DOCUMENT"])
             if entry[1] == "STRICT_REG_DNI_V0"),
            -1)
        chosen = result["ID_DOCUMENT"][idx]

        self.assertEqual(
            clean_text(chosen[0]), "15373458B",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_DNI_WRONG_DETECTED,
                                chosen))
Beispiel #12
0
    def test_dni_with_dash(self):
        """ Detection of DNI v0 rule with letter separated by dash

        Runs the "strict" rule set and expects the cleaned capture of
        STRICT_REG_DNI_V0 to be "66666666Y".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        sentence = "el dni de Juan es 66666666-Y."

        detector = ner_regex.RegexNer()
        result = detector._detect_regexp(sentence, "strict")

        missing_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_ID_DOCUMENT_NOT_DETECTED, result)
        self.assertTrue("ID_DOCUMENT" in result, missing_msg)

        # First match attributed to the rule; -1 (the last match) when the
        # rule is absent.
        idx = next(
            (pos for pos, entry in enumerate(result["ID_DOCUMENT"])
             if entry[1] == "STRICT_REG_DNI_V0"),
            -1)
        chosen = result["ID_DOCUMENT"][idx]

        self.assertEqual(
            clean_text(chosen[0]), "66666666Y",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_DNI_WRONG_DETECTED,
                                chosen))
Beispiel #13
0
    def test_strict_credit_card_v0(self):
        """ Detection of card v0 rule

        Runs the "strict" rule set and expects the capture of
        STRICT_REG_CREDIT_CARD_V0 to be "4111111111111111".
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        sentence = "the visa card is 4111111111111111."

        detector = ner_regex.RegexNer()
        result = detector._detect_regexp(sentence, "strict")

        missing_msg = "{} {} {}".format(self.shortDescription(),
                                        MSG_CREDIT_CARD_NOT_DETECTED, result)
        self.assertTrue("CREDIT_CARD" in result, missing_msg)

        # First match attributed to the rule; -1 (the last match) when the
        # rule is absent.
        idx = next(
            (pos for pos, entry in enumerate(result["CREDIT_CARD"])
             if entry[1] == "STRICT_REG_CREDIT_CARD_V0"),
            -1)
        chosen = result["CREDIT_CARD"][idx]

        self.assertEqual(
            chosen[0], "4111111111111111",
            MSG_DETECTED.format(self.shortDescription(),
                                MSG_CREDIT_CARD_DETECTED,
                                chosen))
Beispiel #14
0
    def test_email_hack_regex(self):
        """ Test the detection of mail hacks

        Registers a custom strict rule under the "Email_Hack" key and checks
        that several spellings of an address written with AT / DOT are
        reported, while a sentence ending in "at-dot" is not.
        """
        # The first docstring line is kept verbatim: shortDescription()
        # returns it and it is embedded in the failure messages.

        # Every fragment is a raw string. The last two used to be plain
        # literals, where "\s", "\(" and "\)" are invalid escape sequences;
        # the resulting pattern text is unchanged.
        CP_EMAIL_HACK_V0 = (r"[a-zA-Z0-9_.+-]+\s?(\(|-)?\s?(AT|at)\s?(\)|-)?"
                            r"\s?[a-zA-Z0-9-]+\s?(\(|-)?\s?(DOT|dot)\s"
                            r"?(\)|-)?\s?[a-zA-Z0-9-.]+")

        HACK_REGEX = {"Email_Hack": [(CP_EMAIL_HACK_V0, "CP_EMAIL_HACK_V0")]}

        # Sentences that must trigger the rule, checked in the original
        # order; the first failing sentence stops the test, as before.
        obfuscated_sentences = (
            "Enviar todos vuestros datos a infoAThacktextDOTcom",
            "Enviar todos vuestros datos a info AT hacktext DOT com",
            "Enviar todos vuestros datos a info (AT) hacktext (DOT) com",
            "Enviar todos vuestros datos a info-AT-hacktext-DOT-com",
            "Enviar todos vuestros datos a info-at-hacktext-dot-com",
        )

        for test in obfuscated_sentences:
            # A fresh detector per sentence, exactly as the original did.
            ner = ner_regex.RegexNer(strict_regexp_dict=HACK_REGEX)
            result = ner._detect_regexp(test, "strict")

            self.assertTrue(
                "Email_Hack" in result,
                MSG_TEXT.format(self.shortDescription(),
                                MSG_EMAIL_HACK_NOT_DETECTED, result, test))

        # Negative case: the rule must not fire on this sentence.
        test = "Enviar todos vuestros datos a at-dot"

        ner = ner_regex.RegexNer(strict_regexp_dict=HACK_REGEX)
        result = ner._detect_regexp(test, "strict")

        self.assertTrue(
            "Email_Hack" not in result,
            MSG_TEXT.format(self.shortDescription(), MSG_EMAIL_HACK_DETECTED,
                            result, test))
Beispiel #15
0
 def test_regexinit(self):
     """ Test the initialization of the regex detection class

     Smoke test: builds a RegexNer with its default arguments.
     """
     # No assertion is made; the test fails only if the constructor raises.
     _ = ner_regex.RegexNer()