def __init__(self, allow_missing_http: bool = False):
    """Build the URL regex and pass it to ``RegexExtractor.__init__``.

    Args:
        allow_missing_http: When true, the leading ``http://`` / ``https://`` /
            ``ftp://`` scheme is optional; otherwise it is required.
    """
    # reference: https://gist.github.com/dperini/729294, slightly modified to
    # match "_" and (optionally) to allow a missing scheme.
    #
    # Raw strings are used so that regex escapes such as \S, \d and \. are not
    # parsed as (invalid) Python string escapes. The \uXXXX escapes are
    # resolved by the ``re`` module, so the compiled pattern matches the same
    # text as before.
    scheme = r"(?:(?:https?|ftp)://)"
    if allow_missing_http:
        # The only difference between the two variants: the scheme is optional.
        scheme += "?"

    url_pattern = scheme + (
        # optional "user:password@" prefix, then the host alternatives
        r"(?:\S+(?::\S*)?@)?(?:"
        # reject 10.x.x.x and 127.x.x.x
        r"(?!(?:10|127)(?:\.\d{1,3}){3})"
        # reject 169.254.x.x and 192.168.x.x
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
        # reject 172.16.x.x - 172.31.x.x
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
        # dotted-quad IPv4 host
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|"
        # ...or a host name: first label, further labels, then the TLD
        r"(?:(?:[a-z\u00a1-\uffff0-9][_-]?)*[a-z\u00a1-\uffff0-9]+)"
        r"(?:\.(?:[a-z\u00a1-\uffff0-9][_-]?)*[a-z\u00a1-\uffff0-9]+)*"
        # TLD, optional port, optional path
        r"(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?"
    )

    RegexExtractor.__init__(self, pattern=url_pattern, extractor_name="url extractor")
def __init__(self, support_Bech32: bool = False):
    """Build the bitcoin-address regex and pass it to ``RegexExtractor.__init__``.

    Args:
        support_Bech32: When true, also match ``bc1``-prefixed addresses with a
            39- or 59-character payload, in addition to the addresses starting
            with ``1`` or ``3``.
    """
    if support_Bech32:
        # a regex supporting the Bech32 type (which is not supported by most
        # applications).
        # The alternation is wrapped in a group so that both \b anchors apply
        # to every alternative. Without the group, the leading \b belonged
        # only to the first alternative and the trailing \b only to the last,
        # so a 59-character payload was cut short by the 39-character
        # alternative, which is tried first.
        bitcoin_address_pattern = (
            r"\b(?:[13][a-km-zA-HJ-NP-Z1-9]{25,34}"
            r"|bc1[a-zA-HJ-NP-Z0-9]{39}"
            r"|bc1[a-zA-HJ-NP-Z0-9]{59})\b"
        )
    else:
        # simple version supporting P2PKH and P2SH
        bitcoin_address_pattern = r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b"

    RegexExtractor.__init__(self, pattern=bitcoin_address_pattern,
                            extractor_name="bitcoin address extractor")
def test_match_mode_with_group(self) -> None:
    """MATCH mode on a pattern with two capture groups.

    Expects one extraction value per capture group. The second positional
    argument to ``extract`` is 0 in the first call and 5 in the second; the
    expected values suggest it selects where matching begins -- confirm
    against ``RegexExtractor.extract``.
    """
    extractor = RegexExtractor('(.)@(.)', 'test_extractor')
    text = 'a@1, b@2, c@3, d@4'

    found_with_0 = extractor.extract(text, 0, MatchMode.MATCH)
    found_with_5 = extractor.extract(text, 5, MatchMode.MATCH)

    self.assertEqual([item.value for item in found_with_0], ['a', '1'])
    self.assertEqual([item.value for item in found_with_5], ['b', '2'])
def run(args):
    """
    Print the value of every extraction found in each line of the input.

    Args:
        args (argparse.Namespace): ``args.pattern`` is passed to
            ``RegexExtractor`` and ``args.input_file`` is iterated line by line.
    """
    extractor = RegexExtractor(pattern=args.pattern)

    # Warnings raised while extracting are silenced for the whole run.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for text in args.input_file:
            for found in extractor.extract(text):
                print(found.value)
def test_search_mode_without_group(self) -> None:
    """SEARCH mode on a pattern without capture groups.

    Expects a single extraction holding the whole matched text. The second
    positional argument to ``extract`` is 0 in the first call and 18 in the
    second; the expected values suggest it selects where searching begins --
    confirm against ``RegexExtractor.extract``.
    """
    extractor = RegexExtractor('.@.', 'test_extractor')
    text = 'testtesttest, a@1, b@2, c@3, d@4'

    found_with_0 = extractor.extract(text, 0, MatchMode.SEARCH)
    found_with_18 = extractor.extract(text, 18, MatchMode.SEARCH)

    self.assertEqual([item.value for item in found_with_0], ['a@1'])
    self.assertEqual([item.value for item in found_with_18], ['b@2'])
def test_split_mode(self) -> None:
    """SPLIT mode with ',' as the pattern.

    With 0 as the second positional argument the text is expected to split at
    every comma; with 2 only the first two commas are expected to split,
    leaving the remainder in one piece.
    """
    extractor = RegexExtractor(',', 'test_extractor')
    text = 'a@1, b@2, c@3, d@4'

    found_with_0 = extractor.extract(text, 0, MatchMode.SPLIT)
    found_with_2 = extractor.extract(text, 2, MatchMode.SPLIT)

    self.assertEqual([item.value for item in found_with_0],
                     ['a@1', ' b@2', ' c@3', ' d@4'])
    self.assertEqual([item.value for item in found_with_2],
                     ['a@1', ' b@2', ' c@3, d@4'])
def test_findall_mode_without_group(self) -> None:
    """FINDALL mode on a pattern without capture groups.

    Expects one extraction per match. The second positional argument to
    ``extract`` is 0 in the first call and 5 in the second; the expected
    values suggest it selects where searching begins -- confirm against
    ``RegexExtractor.extract``.
    """
    extractor = RegexExtractor('.@.', 'test_extractor')
    text = 'a@1, b@2, c@3, d@4'

    found_with_0 = extractor.extract(text, 0, MatchMode.FINDALL)
    found_with_5 = extractor.extract(text, 5, MatchMode.FINDALL)

    self.assertEqual([item.value for item in found_with_0],
                     ['a@1', 'b@2', 'c@3', 'd@4'])
    self.assertEqual([item.value for item in found_with_5],
                     ['b@2', 'c@3', 'd@4'])
def __init__(self):
    """Create one ``RegexExtractor`` per supported hash type (md5, sha1, sha256).

    Each pattern matches a run of hexadecimal digits of the digest's length
    (32, 40 or 64) delimited by word boundaries. The sub-extractors are
    stored in ``self._regex_extractors``.
    """
    e_name = 'cryptographic hash extractor'
    self._regex_extractors = [
        RegexExtractor(r"(\b[a-fA-F\d]{32}\b)", 'md5 ' + e_name, general_tag='md5'),
        # Accept upper- and lower-case hex digits, as the md5 and sha256
        # patterns do; this pattern previously matched lower-case only.
        RegexExtractor(r"(\b[A-Fa-f0-9]{40}\b)", 'sha1 ' + e_name, general_tag='sha1'),
        RegexExtractor(r"(\b[A-Fa-f0-9]{64}\b)", 'sha256 ' + e_name, general_tag='sha256'),
    ]
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="regex",
                       name=e_name)
def __init__(self):
    """Build the hostname regex and pass it to ``RegexExtractor.__init__``."""
    # One or more dot-terminated labels: alphanumeric at both ends, with
    # alphanumerics or hyphens in between.
    dotted_labels = r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{,61}[a-zA-Z0-9])?\.)+"
    # Final label of 2-6 letters that does not start with one of the listed
    # file extensions.
    last_label = r"(?!html|php|jsp|xml|pdf|asp|css|aspx|phtml)[a-zA-Z]{2,6}\b"

    RegexExtractor.__init__(
        self,
        pattern=dotted_labels + last_label,
        extractor_name="hostname extractor",
    )
def __init__(self):
    """Build the CVE-identifier regex and pass it to ``RegexExtractor.__init__``.

    The pattern is ``CVE-``, a 4-digit group, ``-`` and a 4-7 digit group,
    compiled with ``re.IGNORECASE``.
    """
    RegexExtractor.__init__(
        self,
        extractor_name="cve extractor",
        flags=re.IGNORECASE,
        pattern=r"CVE-(?:\d{4})-(?:\d{4,7})",
    )
def __init__(self):
    """Build the CVE-identifier regex and pass it to ``RegexExtractor.__init__``.

    The pattern is ``CVE-``, a 4-digit group, ``-`` and a 4-7 digit sequence
    number.
    """
    # The sequence number of a CVE ID is not limited to four digits. With a
    # fixed \d{4}, an identifier such as CVE-2021-44228 was extracted as
    # CVE-2021-4422, so 4 to 7 digits are accepted here.
    cve_pattern = r"CVE-(?:\d{4})-(?:\d{4,7})"
    RegexExtractor.__init__(self, pattern=cve_pattern, extractor_name="cve extractor")
def __init__(self):
    """Build the IPv4-address regex and pass it to ``RegexExtractor.__init__``.

    The pattern is four octets (0-255) joined by ``.`` or the word ``dot``,
    each separator optionally wrapped in a single delimiter character on
    either side.
    """
    # Most specific alternative first. The regex engine takes the first
    # alternative that lets the overall match succeed, and nothing follows the
    # last octet, so trying "[01]?[0-9]?[0-9]" first cut a final octet of
    # 200-255 down to two digits ("10.0.0.255" -> "10.0.0.25").
    octet = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])"

    # NOTE(review): "[ (?:\[]" is a character class, so it also accepts a
    # literal "?" or ":" before the dot -- possibly "(?:...)" was intended.
    # Left unchanged; confirm before tightening.
    separator = r"[ (?:\[]?(?:\.|dot)[ )\]]?"

    ip_address_pattern = r"(?:" + octet + separator + r"){3}" + octet
    RegexExtractor.__init__(self, pattern=ip_address_pattern,
                            extractor_name="ip address extractor")