def is_domain(domain):
    """Return True when *domain* is a syntactically valid domain name.

    A single label is accepted only as a wildcard ('*') or a bare TLD;
    a dotted name must end in a known TLD and every preceding label must
    be a valid domain tag.
    """
    # RFC 1035 caps a full domain name at 253 characters.
    if len(domain) > 253:
        return False
    labels = domain.split('.')
    if len(labels) == 1:
        # Lone label: wildcard or a recognised top-level domain.
        if labels[0] == '*' or is_tld(labels[0]):
            return True
        return False
    # Multi-label name: the final label must be a known TLD ...
    if is_tld(labels[-1]) is False:
        return False
    # ... and each label before it must pass the tag check.
    for label in labels[:-1]:
        if is_domain_tag(label) is False:
            return False
    return True
def get_valid_urls(string):
    """Extract URL-looking substrings from *string*, keeping those whose
    top-level domain is valid.

    Args:
        string: arbitrary text to scan.

    Returns:
        list[str]: candidate URLs whose final dot-separated label is a
        known TLD according to is_tld().
    """
    # Group 1 is the whole candidate URL; group 2 is everything after the
    # FIRST label, which may itself contain several dot-separated labels
    # (e.g. "example.com" for "www.example.com").
    regex = r"((?:https?://)?[A-Za-z0-9_\-]+\.([A-Za-z0-9_\-\.]+))"
    urls = re.findall(regex, string)  # Find anything that look like URL
    valid_urls = []
    for candidate, tail in urls:
        # BUG FIX: previously is_tld() was called on the whole tail, so any
        # URL with a subdomain ("www.example.com" -> tail "example.com")
        # was rejected. Validate only the last label.
        tld_part = tail.rstrip('.').rsplit('.', 1)[-1]
        if is_tld(tld_part):
            valid_urls.append(candidate)
    return valid_urls
def _check_urls(d):
    """Return entries of *d* that contain at least one URL with a valid TLD.

    Args:
        d: iterable of dicts, each holding a list of URLs under 'urls'.

    Returns:
        list: matching entries; note an entry is appended once per valid
        URL it contains (preserved from the original behavior).
    """
    res = []
    for item in d:
        for u in item['urls']:
            try:
                tld_try = get_tld(u)
            except Exception:
                # get_tld raises for malformed / unknown domains
                # (e.g. TldDomainNotFound); skip those URLs instead of
                # aborting the whole scan. A bare except: previously
                # swallowed even KeyboardInterrupt/SystemExit.
                continue
            if tld_try and is_tld(tld_try):
                res.append(item)
    return res
def _check_tld(d):
    """Return entries of *d* whose '_name' resolves to a valid TLD.

    Args:
        d: iterable of dicts, each with a '_name' key holding a hostname.

    Returns:
        list: the entries whose extracted TLD passes is_tld().
    """
    res = []
    for entry in d:
        try:
            tld_try = get_tld(entry['_name'], fix_protocol=True)
        except Exception:
            # get_tld raises for names without a recognisable TLD; skip
            # them. A bare except: previously hid even fatal exceptions.
            continue
        if tld_try and is_tld(tld_try):
            res.append(entry)
    return res
def _init_top_level_domain(cls, top_level_domain: str = None) -> str: """ Assign or generate a top level domain Returns: str: assign or generate a random top level domain eg: .com, .co.uk """ if top_level_domain and isinstance(top_level_domain, str): if is_tld(top_level_domain[1:]): return top_level_domain raise ValueError("%s is not a valid top level domain" % top_level_domain) if top_level_domain: return TypeError( "top_level_domain kwarg should be an instance of str") return get_top_level_domain()
def validate_domain(self, url) -> str:
    """
    Attempt to clean the provided url, and pull return the
    domain, or ip address
    """
    if tld.is_tld(url):
        # Looks like a domain: let the tld package normalise it and
        # hand back the network location.
        res = tld.get_tld(url, fix_protocol=True, as_object=True)
        return res.parsed_url.netloc
    # Not a domain, try ip address:
    parsed_url = urllib.parse.urlparse(url)
    if parsed_url.netloc:
        return parsed_url.netloc
    # Add the //, so that our url reading code parses it properly.
    return urllib.parse.urlparse(f"//{url}").netloc
def dts_tld_check(data):
    """Return True when *data* has exactly one entry and its 'gtld'
    value is a valid top-level domain."""
    # is_tld() is only consulted for single-entry payloads, mirroring
    # the original short-circuit order.
    return len(data) == 1 and bool(is_tld(data['gtld']))
def processor(self, pdf_doc, output, title):
    """Extract and report IOCs (language scripts, domains, other patterns) from a PDF.

    Args:
        pdf_doc: path to the PDF file to scan.
        output: truthy to also append results to a report file via write_file().
        title: report title; also used as the report identifier.
    """
    print(f"{tc.DOTSEP}\n{tc.GREEN} [ Gathering IOCs ]{tc.RESET}")
    # extractor() yields per-page text; materialise so it can be re-joined
    # both in the try body and in the error message below.
    pages = list(extractor(pdf=pdf_doc))
    try:
        text = "".join(filter(None, pages))
    except TypeError:
        print(f"Broken sentence: {''.join(filter(None, pages))}")
        raise
    else:
        # create output file
        if output:
            write_file(rep=title, results=f"\nTITLE: {title} \nPATH: {pdf_doc}\n", opt="w")

    # Language detection
    def lang_proc(selection):
        # Report any characters of the given script found in the text,
        # bumping self.counter so the "no IOCs" summary stays accurate.
        if helper.lang_patts(text).get(selection):
            self.counter += 1
            spec = "".join(helper.lang_patts(text).get(selection))
            print(
                f"\n{tc.FOUND}{tc.BOLD}{selection}{tc.RESET}\n{tc.SEP}\n{spec}"
            )
            if output:
                write_file(
                    rep=title, results=f"\n{selection}\n{'-' * 15}\n{spec}", opt="a")
            # remove from dict to avoid repeat pattern
            helper.lang_patts(text).pop(selection)

    # Attempt to detect specific language characters
    languages = ["ARABIC", "CYRILLIC", "CHINESE", "FARSI", "HEBREW"]
    list(map(lang_proc, languages))

    # Detect patterns
    # TLD-like suffixes that should never be reported as domains.
    exclude = ("gov", "foo", "bar", "py")
    for key, pattern in helper.patts(text).items():
        if pattern:
            self.counter += 1
            sorted_set = sorted(set(pattern))
            if key == "DOMAIN":
                # Drop candidates whose final label is not a real TLD or is
                # in the exclude list.
                for domain in pattern:
                    tld = domain.split(".")[-1]
                    try:
                        # NOTE(review): the while-condition never changes, so
                        # this relies on list.remove() raising ValueError on
                        # the second pass to drop each bad domain exactly once.
                        while not is_tld(tld) or tld in exclude:
                            sorted_set.remove(domain)
                    except ValueError:
                        pass
            pattern = "\n".join(sorted_set)
            print(
                f"\n{tc.FOUND}{tc.BOLD}{key}{tc.RESET}\n{tc.SEP}\n{pattern}"
            )
            if output:
                write_file(rep=title, results=f"\n{key}\n{'-' * 15}\n{pattern}\n", opt="a")
    if self.counter == 0:
        print(f"{tc.YELLOW}= No IOCs found ={tc.RESET}")
        if output:
            write_file(rep=title, results="= No IOCs found =", opt="w")
def get_iocs(candidate):
    """
    Determine if candidate string DIRECTLY corresponds to indicator regex,
    and return the type and value of the first match. This means long
    paragraphs are not expected to have each candidate examined and
    enumerated. A candidate string is evaluated, in the given order, to
    determine if it contains regex that lends it to an ioc. If it does,
    extract the candidate grouping and return the first match. This is not
    intended to give a full and accurate accounting, it is merely to
    facilitate rapid triage of a binary prior to ADDITIONAL analysis.

    Returns:
        tuple: (ioc_type, value, match_offset); ('unknown', candidate, 0)
        when nothing matched.
    """
    # URL: the scheme is mandatory here, so bare domains fall through to
    # the domain check below.
    m_url = re.search(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', candidate)
    if m_url:
        url = m_url.group(0)
        if validators.url(url):
            return 'url', url, m_url.start(0)
    # must have a tld in fld
    m_domain = re.search(
        r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])\.'
        r'[.a-zA-Z]{2,}', candidate)
    if m_domain:
        domain = m_domain.group(0)
        t = domain.rsplit('.', 1)[-1]
        if tld.is_tld(t) and validators.domain(domain):
            return 'domain', domain, m_domain.start(0)
        else:
            log.info('Failed to validate: %s', domain)
    # ref:
    # https://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb
    m_ipv4 = re.search(
        r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', candidate)  # noqa:E501
    if m_ipv4:
        ipv4 = m_ipv4.group(0)
        if validators.ipv4(ipv4):
            return 'ipv4', ipv4, m_ipv4.start(0)
    # ref:
    # https://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb
    m_ipv6 = re.search(
        r'((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?', candidate)  # noqa:E501
    if m_ipv6:
        ipv6 = m_ipv6.group(0)
        if validators.ipv6(ipv6):
            return 'ipv6', ipv6, m_ipv6.start(0)
    m_email = re.search(
        r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]'
        r'+\.[a-zA-Z0-9-.]+$)', candidate)
    if m_email:
        email = m_email.group(0)
        if validators.email(email):
            return 'email', email, m_email.start(0)
    # BUG FIX: the adjacent literals used to read 'msi|tmp' 'c|cpp|...',
    # which concatenated into a bogus 'tmpc' alternative and silently
    # dropped both the 'tmp' and 'c' extensions. The '|' after 'tmp'
    # restores them.
    m_filename = re.search(
        r'\b([A-Za-z0-9-_.]{1,255}\.'
        r'(exe|dll|bat|sys|htm|html|cfg|gz|tar|rpm|7z|'
        r'js|jar|jpg|png|vb|scr|pif|chm|zip|rar|cab|'
        r'pdf|doc|docx|ppt|pptx|py|pyc|txt|msg|csv|wps|'
        r'xls|xlsx|swf|gif|pps|xml|vcxproj|sln|pl|msi|tmp|'
        r'c|cpp|cs|h|java|lua|class|sh|bak)'
        r')\b', candidate)
    if m_filename:
        fname = m_filename.group(0)
        return 'filename', fname, m_filename.start(0)
    return 'unknown', candidate, 0