def test_complete_phone_number_v2(self):
    """Detection of phone number

    Builds a ``RegexNer`` with a ``regexp_config_dict`` whose ``PHONE``
    entry sets ``left_span_len=20``, ``right_span_len=0`` and
    ``word_list=["tlf."]``, runs ``regex_detection`` on the sample text
    (also passed as ``full_text``) and checks that the result contains a
    ``PHONE`` key.
    """
    test = "tlf.: ESP 980000000"
    proximity_dict = {
        "PHONE": {
            "left_span_len": 20,
            "right_span_len": 0,
            "word_list": ["tlf."],
        }
    }
    ner = ner_regex.RegexNer(regexp_config_dict=proximity_dict)
    result = ner.regex_detection(test, full_text=test)
    # This assertion fails when PHONE is *missing*, so report it with the
    # "not detected" message, as the sibling phone tests do.
    self.assertIn(
        "PHONE", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_PHONE_NOT_DETECTED, result))
def test_3_broad_phone_number_v0(self):
    """Test the detection of a wrong phone number

    The sample text has the digits "45" in front of the nine-digit number.
    The broad rule set must report a ``PHONE`` entry, and the text matched
    by ``BROAD_REG_PHONE_NUMBER_GEN_V3`` must equal ``988888888`` after
    ``strip()`` and ``clean_text()``.
    """
    test = "Mi teléfono es 45 988 888 888"
    rule = "BROAD_REG_PHONE_NUMBER_GEN_V3"
    ner = ner_regex.RegexNer()
    result = ner._detect_regexp(test, "broad")
    # Failing here means PHONE is missing, hence the "not detected" message.
    self.assertIn(
        "PHONE", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_PHONE_NOT_DETECTED, result))
    # Each entry is indexed as (matched text, rule name); take the first
    # entry produced by the rule under test.
    match = next((m for m in result["PHONE"] if m[1] == rule), None)
    # Fail with a readable message instead of an unbound-variable error
    # when the rule never fired.
    self.assertIsNotNone(
        match,
        "{} rule {} did not match: {}".format(
            self.shortDescription(), rule, result["PHONE"]))
    self.assertEqual(
        clean_text(match[0].strip()), "988888888",
        MSG_EXTRACTED.format(self.shortDescription(),
                             MSG_PHONE_NOT_DETECTED, match))
def test_strict_cif_company(self):
    """Test the detection of the CIF of the company

    The strict rule set must report an ``ID_DOCUMENT`` entry, and the text
    matched by ``STRICT_REG_CIF_V0`` must be exactly ``A99151276``.
    """
    test = "El CIF de la compañía es A99151276"
    rule = "STRICT_REG_CIF_V0"
    ner = ner_regex.RegexNer()
    result = ner._detect_regexp(test, "strict")
    self.assertIn(
        "ID_DOCUMENT", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_ID_DOCUMENT_NOT_DETECTED, result))
    # Entries are indexed as (matched text, rule name); keep only those
    # produced by the CIF rule.
    matches = [m for m in result["ID_DOCUMENT"] if m[1] == rule]
    # Fail with a readable message instead of an unbound-variable error
    # when the rule never fired.
    self.assertTrue(
        matches,
        "{} rule {} did not match: {}".format(
            self.shortDescription(), rule, result["ID_DOCUMENT"]))
    # The last matching entry is the one checked, as in the original loop
    # that kept overwriting the index.
    self.assertEqual(
        matches[-1][0], "A99151276",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_NIF_WRONG_DETECTED,
                            result["ID_DOCUMENT"]))
def test_money_CP_EURO_V0_euros_v3(self):
    """Detection of euro currency using the word euro

    The strict rule set must report a ``MONEY`` entry, and the text matched
    by ``STRICT_REG_EURO_V0`` must be exactly ``1,000.00``.
    """
    test = "este aparato cuesta 1,000.00 Euros"
    rule = "STRICT_REG_EURO_V0"
    ner = ner_regex.RegexNer()
    result = ner._detect_regexp(test, "strict")
    self.assertIn(
        "MONEY", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_MONEY_NOT_DETECTED, result))
    # Entries are indexed as (matched text, rule name); keep only those
    # produced by the euro rule.
    matches = [m for m in result["MONEY"] if m[1] == rule]
    # Fail with a readable message instead of an unbound-variable error
    # when the rule never fired.
    self.assertTrue(
        matches,
        "{} rule {} did not match: {}".format(
            self.shortDescription(), rule, result["MONEY"]))
    # The last matching entry is the one checked, as in the original loop
    # that kept overwriting the index.
    self.assertEqual(
        matches[-1][0], "1,000.00",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_CURRENCY_WRONG_DETECTED,
                            result["MONEY"]))
def test_generic_money_v4(self):
    """Detection of money quantities

    For the amount ``1.000,000,52`` the strict rule set must still report
    a ``PROB_CURRENCY`` entry, but none of its items may come from the
    rule ``CP_MONEY_V1``.
    """
    sample = "el total de la factura es 1.000,000,52."
    detector = ner_regex.RegexNer()
    found = detector._detect_regexp(sample, "strict")
    self.assertTrue(
        "PROB_CURRENCY" in found,
        "{} {} {}".format(self.shortDescription(),
                          MSG_PROB_CURRENCY_NOT_DETECTED, found))
    # Position of the last CP_MONEY_V1 item, or -1 when there is none.
    idx = max(
        (pos for pos, item in enumerate(found["PROB_CURRENCY"])
         if item[1] == "CP_MONEY_V1"),
        default=-1)
    self.assertEqual(
        idx, -1,
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_QUANTITY_DETECTED,
                            found["PROB_CURRENCY"]))
def test_generic_money_v1(self):
    """Detection of money quantities

    The strict rule set must report a ``PROB_CURRENCY`` entry, and the text
    matched by ``STRICT_REG_MONEY_V0`` must be exactly ``1.000,000`` (the
    trailing full stop of the sentence is not part of the match).
    """
    test = "el total de la factura es 1.000,000."
    rule = "STRICT_REG_MONEY_V0"
    ner = ner_regex.RegexNer()
    result = ner._detect_regexp(test, "strict")
    self.assertIn(
        "PROB_CURRENCY", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_MONEY_NOT_DETECTED, result))
    # Entries are indexed as (matched text, rule name); keep only those
    # produced by STRICT_REG_MONEY_V0.
    matches = [m for m in result["PROB_CURRENCY"] if m[1] == rule]
    # Fail with a readable message instead of an unbound-variable error
    # when the rule never fired.
    self.assertTrue(
        matches,
        "{} rule {} did not match: {}".format(
            self.shortDescription(), rule, result["PROB_CURRENCY"]))
    # The last matching entry is the one checked, as in the original loop
    # that kept overwriting the index.
    self.assertEqual(
        matches[-1][0], "1.000,000",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_CURRENCY_WRONG_DETECTED,
                            result["PROB_CURRENCY"]))
def test_0_broad_phone_number_v0(self):
    """Test the detection of a phone number

    The broad rule set must report a ``PHONE`` entry whose matched text
    equals ``988888888`` after ``strip()`` and ``clean_text()``.
    """
    test = "Mi teléfono es 988 888 888 "
    ner = ner_regex.RegexNer()
    result = ner._detect_regexp(test, "broad")
    self.assertIn(
        "PHONE", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_PHONE_NOT_DETECTED, result))
    # Index of the first entry produced by the rule under test. When the
    # rule is absent the index stays -1, i.e. the last entry is checked.
    # NOTE(review): confirm that this fallback is intended.
    idx = next(
        (i for i, m in enumerate(result["PHONE"])
         if m[1] == "BROAD_REG_PHONE_NUMBER_APPROX_V3"),
        -1)
    # The message used to be a plain string containing "{...}" placeholders
    # that were never interpolated; format it explicitly.
    self.assertEqual(
        clean_text(result["PHONE"][idx][0].strip()), "988888888",
        "{} {}. Extracted {}".format(self.shortDescription(),
                                     MSG_PHONE_NOT_DETECTED,
                                     result["PHONE"][idx]))
def test_strict_email_v0(self):
    """Detection of email v0 rule

    The strict rule set must report an ``EMAIL`` entry; the text of the
    first item produced by ``STRICT_REG_EMAIL_ADDRESS_V0`` (or of the last
    item when that rule is absent) is compared with the expected value.
    """
    # NOTE(review): the sample text and the expected value below look as
    # if an email-obfuscation tool masked the original address -- confirm
    # against the upstream version of this test.
    sample = "the email of John is [email protected]"
    detector = ner_regex.RegexNer()
    found = detector._detect_regexp(sample, "strict")
    self.assertTrue(
        "EMAIL" in found,
        "{} {} {}".format(self.shortDescription(),
                          MSG_EMAIL_NOT_DETECTED, found))
    # First position whose rule name matches; -1 selects the last item.
    pos = next(
        (n for n, item in enumerate(found["EMAIL"])
         if item[1] == "STRICT_REG_EMAIL_ADDRESS_V0"),
        -1)
    self.assertEqual(
        found["EMAIL"][pos][0], "*****@*****.**",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_EMAIL_WRONG_DETECTED,
                            found["EMAIL"][pos]))
def test_0_CP_MOBILE_NUMBER_V0(self):
    """Test the detection of a phone number

    The broad rule set must report a ``MOBILE`` entry, and the text matched
    by ``BROAD_REG_MOBILE_NUMBER_GEN_V3`` must equal ``688888888`` after
    ``strip()`` and ``clean_text()``.
    """
    test = "Mi teléfono móvil es 688 888 888 "
    rule = "BROAD_REG_MOBILE_NUMBER_GEN_V3"
    ner = ner_regex.RegexNer()
    result = ner._detect_regexp(test, "broad")
    self.assertIn(
        "MOBILE", result,
        "{} {} {}".format(self.shortDescription(),
                          MSG_MOBILE_NOT_DETECTED, result))
    # Each entry is indexed as (matched text, rule name); take the first
    # entry produced by the rule under test.
    match = next((m for m in result["MOBILE"] if m[1] == rule), None)
    # Fail with a readable message instead of an unbound-variable error
    # when the rule never fired.
    self.assertIsNotNone(
        match,
        "{} rule {} did not match: {}".format(
            self.shortDescription(), rule, result["MOBILE"]))
    self.assertEqual(
        clean_text(match[0].strip()), "688888888",
        MSG_EXTRACTED.format(self.shortDescription(),
                             MSG_MOBILE_NOT_DETECTED, match))
def test_broad_phone_number_v6(self):
    """Detection of phone number

    The number in the sample is immediately followed by the letter "A".
    The broad rule set must report a ``PHONE`` entry whose matched text,
    passed through ``clean_text()``, equals ``980000001``.
    """
    sample = "teléfono: ESP 980000001A"
    detector = ner_regex.RegexNer()
    found = detector._detect_regexp(sample, "broad")
    self.assertTrue(
        "PHONE" in found,
        "{} {} {}".format(self.shortDescription(),
                          MSG_PHONE_NOT_DETECTED, found))
    # First position produced by the rule under test; -1 (the last item)
    # when the rule is absent.
    pos = next(
        (n for n, item in enumerate(found["PHONE"])
         if item[1] == "BROAD_REG_PHONE_NUMBER_GEN_V3"),
        -1)
    self.assertEqual(
        clean_text(found["PHONE"][pos][0]), "980000001",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_PHONE_WRONG_DETECTED,
                            found["PHONE"][pos]))
def test_strict_dni_v2(self):
    """Detection of DNI

    The input consists of the document number alone. The strict rule set
    must report an ``ID_DOCUMENT`` entry whose matched text, passed through
    ``clean_text()``, equals ``15373458B``.
    """
    sample = "15373458B"
    detector = ner_regex.RegexNer()
    found = detector._detect_regexp(sample, "strict")
    self.assertTrue(
        "ID_DOCUMENT" in found,
        "{} {} {}".format(self.shortDescription(),
                          MSG_ID_DOCUMENT_NOT_DETECTED, found))
    # First position produced by STRICT_REG_DNI_V0; -1 (the last item)
    # when the rule is absent.
    pos = next(
        (n for n, item in enumerate(found["ID_DOCUMENT"])
         if item[1] == "STRICT_REG_DNI_V0"),
        -1)
    self.assertEqual(
        clean_text(found["ID_DOCUMENT"][pos][0]), "15373458B",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_DNI_WRONG_DETECTED,
                            found["ID_DOCUMENT"][pos]))
def test_dni_with_dash(self):
    """Detection of DNI v0 rule with letter separated by dash

    The strict rule set must report an ``ID_DOCUMENT`` entry whose matched
    text, passed through ``clean_text()``, equals ``66666666Y``.
    """
    sample = "el dni de Juan es 66666666-Y."
    detector = ner_regex.RegexNer()
    found = detector._detect_regexp(sample, "strict")
    self.assertTrue(
        "ID_DOCUMENT" in found,
        "{} {} {}".format(self.shortDescription(),
                          MSG_ID_DOCUMENT_NOT_DETECTED, found))
    # First position produced by STRICT_REG_DNI_V0; -1 (the last item)
    # when the rule is absent.
    pos = next(
        (n for n, item in enumerate(found["ID_DOCUMENT"])
         if item[1] == "STRICT_REG_DNI_V0"),
        -1)
    self.assertEqual(
        clean_text(found["ID_DOCUMENT"][pos][0]), "66666666Y",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_DNI_WRONG_DETECTED,
                            found["ID_DOCUMENT"][pos]))
def test_strict_credit_card_v0(self):
    """Detection of card v0 rule

    The strict rule set must report a ``CREDIT_CARD`` entry whose matched
    text is exactly ``4111111111111111`` (without the sentence's trailing
    full stop).
    """
    sample = "the visa card is 4111111111111111."
    detector = ner_regex.RegexNer()
    found = detector._detect_regexp(sample, "strict")
    self.assertTrue(
        "CREDIT_CARD" in found,
        "{} {} {}".format(self.shortDescription(),
                          MSG_CREDIT_CARD_NOT_DETECTED, found))
    # First position produced by STRICT_REG_CREDIT_CARD_V0; -1 (the last
    # item) when the rule is absent.
    pos = next(
        (n for n, item in enumerate(found["CREDIT_CARD"])
         if item[1] == "STRICT_REG_CREDIT_CARD_V0"),
        -1)
    self.assertEqual(
        found["CREDIT_CARD"][pos][0], "4111111111111111",
        MSG_DETECTED.format(self.shortDescription(),
                            MSG_CREDIT_CARD_DETECTED,
                            found["CREDIT_CARD"][pos]))
def test_email_hack_regex(self):
    """Test the detection of mail hacks

    Registers a custom strict rule, ``CP_EMAIL_HACK_V0`` under the entity
    ``Email_Hack``, through ``strict_regexp_dict`` and checks that it fires
    on several spellings of an address written with AT/DOT words, and
    that it does not fire on the bare text "at-dot".
    """
    # Every fragment is a raw string so the regex escapes (\s, \(, \))
    # reach ``re`` untouched without relying on invalid string escapes.
    hack_pattern = (
        r"[a-zA-Z0-9_.+-]+\s?(\(|-)?\s?(AT|at)\s?(\)|-)?"
        r"\s?[a-zA-Z0-9-]+\s?(\(|-)?\s?(DOT|dot)\s"
        r"?(\)|-)?\s?[a-zA-Z0-9-.]+"
    )
    hack_rules = {"Email_Hack": [(hack_pattern, "CP_EMAIL_HACK_V0")]}

    # Sentences in which the obfuscated address must be reported.
    must_match = (
        "Enviar todos vuestros datos a infoAThacktextDOTcom",
        "Enviar todos vuestros datos a info AT hacktext DOT com",
        "Enviar todos vuestros datos a info (AT) hacktext (DOT) com",
        "Enviar todos vuestros datos a info-AT-hacktext-DOT-com",
        "Enviar todos vuestros datos a info-at-hacktext-dot-com",
    )
    for test in must_match:
        # A fresh detector per sentence, as in the original test.
        ner = ner_regex.RegexNer(strict_regexp_dict=hack_rules)
        result = ner._detect_regexp(test, "strict")
        self.assertIn(
            "Email_Hack", result,
            MSG_TEXT.format(self.shortDescription(),
                            MSG_EMAIL_HACK_NOT_DETECTED, result, test))

    # Negative case: the separator words alone must not be reported.
    test = "Enviar todos vuestros datos a at-dot"
    ner = ner_regex.RegexNer(strict_regexp_dict=hack_rules)
    result = ner._detect_regexp(test, "strict")
    self.assertNotIn(
        "Email_Hack", result,
        MSG_TEXT.format(self.shortDescription(),
                        MSG_EMAIL_HACK_DETECTED, result, test))
def test_regexinit(self):
    """Test the initialization of the regex detection class

    Constructs ``RegexNer`` with its default arguments; the test passes as
    long as the constructor does not raise.
    """
    # The instance itself is not needed, only the side effect of building it.
    ner_regex.RegexNer()