Example #1
def is_domain(domain):
    # RFC 1035 limits a full domain name to 253 characters
    if len(domain) > 253:
        return False
    labels = domain.split('.')
    if len(labels) == 1:
        # a single label is only accepted as a wildcard or a bare TLD
        if labels[0] == '*' or is_tld(labels[0]):
            return True
    elif len(labels) > 1:
        # the last label must be a valid TLD ...
        if not is_tld(labels[-1]):
            return False
        # ... and every preceding label must be a valid domain label
        for label in labels[:-1]:
            if not is_domain_tag(label):
                return False
        return True
    return False
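A minimal usage sketch, assuming is_tld comes from the tld package and that is_domain_tag is a hypothetical stand-in for the project's own label validator (both helpers live elsewhere in the original code):

import re

from tld import is_tld


def is_domain_tag(label):
    # hypothetical label check: 1-63 chars, alphanumeric or hyphen,
    # no leading or trailing hyphen
    return bool(re.fullmatch(r"(?!-)[A-Za-z0-9-]{1,63}(?<!-)", label))


print(is_domain("example.com"))   # True
print(is_domain("*"))             # True (wildcard)
print(is_domain("not_a_domain"))  # False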
Example #2
def get_valid_urls(string):
    regex = r"((?:https?://)?[A-Za-z0-9_\-]+\.([A-Za-z0-9_\-\.]+))"
    urls = re.findall(regex, string)  # find anything that looks like a URL
    valid_urls = list()
    for el in urls:
        # el[0] is the full candidate URL; el[1] is the second capture group,
        # which is expected to be the top level domain
        if is_tld(el[1]):
            valid_urls.append(el[0])  # the TLD is valid, so keep the URL
    return valid_urls
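A short usage sketch, assuming the re module and the tld package's is_tld are imported as the snippet expects:

import re

from tld import is_tld

text = "Visit https://example.com or ping host.invalidtld for details."
print(get_valid_urls(text))  # ['https://example.com'] -- the invalid TLD is dropped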
Example #3
def _check_urls(d):
    res = []
    for i in d:
        for u in i['urls']:
            try:
                tld_try = get_tld(u)
                # keep the record only if the URL yields a valid TLD
                if tld_try and is_tld(tld_try):
                    res.append(i)
            except Exception:
                continue

    return res
Example #4
def _check_tld(d):
    res = []
    for v in d:
        try:
            tld_try = get_tld(v['_name'], fix_protocol=True)
            # keep the record only if its name yields a valid TLD
            if tld_try and is_tld(tld_try):
                res.append(v)
        except Exception:
            continue

    return res
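The filter pattern shared by Examples #3 and #4 can be sketched on its own; this assumes get_tld and is_tld come from the tld package and uses hypothetical records carrying the '_name' field from Example #4:

from tld import get_tld, is_tld

records = [{"_name": "example.com"}, {"_name": "localhost"}]
kept = []
for rec in records:
    try:
        # get_tld() extracts the TLD; is_tld() confirms it is a real one
        tld_try = get_tld(rec["_name"], fix_protocol=True)
        if tld_try and is_tld(tld_try):
            kept.append(rec)
    except Exception:
        continue  # get_tld() raises when no TLD is found, e.g. for "localhost"

print(kept)  # [{'_name': 'example.com'}]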
Example #5
    def _init_top_level_domain(cls, top_level_domain: str = None) -> str:
        """
        Assign or generate a top level domain.

        Returns:
            str: the assigned top level domain, or a randomly generated one,
            e.g. .com, .co.uk
        """
        if top_level_domain and isinstance(top_level_domain, str):
            if is_tld(top_level_domain[1:]):
                return top_level_domain
            raise ValueError("%s is not a valid top level domain" %
                             top_level_domain)
        if top_level_domain:
            # a truthy value that is not a str is a usage error
            raise TypeError(
                "top_level_domain kwarg should be an instance of str")
        return get_top_level_domain()
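A minimal sketch of the validation step in isolation, assuming the same dotted-TLD convention (the leading dot is stripped before the value reaches the tld package's is_tld):

from tld import is_tld

for candidate in (".com", ".co.uk", ".notatld"):
    print(candidate, is_tld(candidate[1:]))
# .com True
# .co.uk True
# .notatld False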
Example #6
    def validate_domain(self, url) -> str:
        """
        Attempt to clean the provided url and return
        the domain or IP address.
        """

        is_valid_tld = tld.is_tld(url)

        # looks like a domain
        if is_valid_tld:
            res = tld.get_tld(url, fix_protocol=True, as_object=True)
            return res.parsed_url.netloc

        # not a domain, try ip address:
        if not is_valid_tld:
            parsed_url = urllib.parse.urlparse(url)
            if not parsed_url.netloc:
                # add the //, so that our url reading code
                # parses it properly
                parsed_url = urllib.parse.urlparse(f"//{url}")
            return parsed_url.netloc
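A standalone sketch of the two extraction paths used above (tld lookup versus the urlparse fallback), assuming only the tld and urllib imports the method relies on:

import urllib.parse

import tld

# domain path: get_tld() exposes the parsed URL, whose netloc is the host
res = tld.get_tld("https://sub.example.com/path", fix_protocol=True, as_object=True)
print(res.parsed_url.netloc)  # sub.example.com

# IP path: prefixing // makes urlparse() place the host in netloc
parsed = urllib.parse.urlparse("//203.0.113.7")
print(parsed.netloc)  # 203.0.113.7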
Example #7
def dts_tld_check(data):
    # true only when data holds exactly one entry and its 'gtld' value is a valid TLD
    return len(data) == 1 and is_tld(data['gtld'])
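A quick sketch of what the check accepts, assuming data is a mapping whose 'gtld' entry holds the candidate TLD:

from tld import is_tld

print(dts_tld_check({"gtld": "com"}))                # True
print(dts_tld_check({"gtld": "com", "extra": "x"}))  # False -- more than one entry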
Example #8
    def processor(self, pdf_doc, output, title):
        print(f"{tc.DOTSEP}\n{tc.GREEN} [ Gathering IOCs ]{tc.RESET}")
        pages = list(extractor(pdf=pdf_doc))
        try:
            text = "".join(filter(None, pages))
        except TypeError:
            print(f"Broken sentence: {''.join(filter(None, pages))}")
            raise
        else:
            # create output file
            if output:
                write_file(rep=title,
                           results=f"\nTITLE: {title} \nPATH: {pdf_doc}\n",
                           opt="w")

            # Language detection
            def lang_proc(selection):
                if helper.lang_patts(text).get(selection):
                    self.counter += 1
                    spec = "".join(helper.lang_patts(text).get(selection))
                    print(
                        f"\n{tc.FOUND}{tc.BOLD}{selection}{tc.RESET}\n{tc.SEP}\n{spec}"
                    )
                    if output:
                        write_file(
                            rep=title,
                            results=f"\n{selection}\n{'-' * 15}\n{spec}",
                            opt="a")

                    # remove from dict to avoid repeat pattern
                    helper.lang_patts(text).pop(selection)

            # Attempt to detect specific language characters
            languages = ["ARABIC", "CYRILLIC", "CHINESE", "FARSI", "HEBREW"]
            list(map(lang_proc, languages))

            # Detect patterns
            exclude = ("gov", "foo", "bar", "py")
            for key, pattern in helper.patts(text).items():
                if pattern:
                    self.counter += 1
                    sorted_set = sorted(set(pattern))

                    if key == "DOMAIN":
                        for domain in pattern:
                            suffix = domain.split(".")[-1]
                            # drop domains whose suffix is not a real TLD or is excluded
                            if not is_tld(suffix) or suffix in exclude:
                                try:
                                    sorted_set.remove(domain)
                                except ValueError:
                                    pass  # a duplicate was already removed

                    pattern = "\n".join(sorted_set)
                    print(
                        f"\n{tc.FOUND}{tc.BOLD}{key}{tc.RESET}\n{tc.SEP}\n{pattern}"
                    )
                    if output:
                        write_file(rep=title,
                                   results=f"\n{key}\n{'-' * 15}\n{pattern}\n",
                                   opt="a")

            if self.counter == 0:
                print(f"{tc.YELLOW}= No IOCs found ={tc.RESET}")
                if output:
                    write_file(rep=title, results="= No IOCs found =", opt="w")
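The DOMAIN-filtering step above can be sketched in isolation; this assumes is_tld comes from the tld package and substitutes a hypothetical pattern list for the helper.patts() output:

from tld import is_tld

exclude = ("gov", "foo", "bar", "py")
pattern = ["evil.example.com", "script.py", "portal.gov", "c2.badsite.xyz"]

sorted_set = sorted(set(pattern))
for domain in pattern:
    suffix = domain.split(".")[-1]
    # drop domains whose suffix is not a real TLD or is on the exclude list
    if not is_tld(suffix) or suffix in exclude:
        try:
            sorted_set.remove(domain)
        except ValueError:
            pass  # a duplicate was already removed

print(sorted_set)  # ['c2.badsite.xyz', 'evil.example.com']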
Example #9
def get_iocs(candidate):
    """
    Determine if the candidate string DIRECTLY corresponds to an indicator
    regex, and return the type and value of the first match.

    This means long paragraphs are not expected to have every candidate
    examined and enumerated. A candidate string is evaluated against each
    pattern, in the given order, to determine whether it matches an IOC
    regex. If it does, the matching group is extracted and the first match
    is returned.

    This is not intended to give a full and accurate accounting; it is merely
    to facilitate rapid triage of a binary prior to ADDITIONAL analysis.
    """

    m_url = re.search(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', candidate)
    if m_url:
        url = m_url.group(0)
        if validators.url(url):
            return 'url', url, m_url.start(0)

    # must have a tld in fld
    m_domain = re.search(
        r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])\.'
        r'[.a-zA-Z]{2,}', candidate)
    if m_domain:
        domain = m_domain.group(0)
        t = domain.rsplit('.', 1)[-1]
        if tld.is_tld(t) and validators.domain(domain):
            return 'domain', domain, m_domain.start(0)
        else:
            log.info('Failed to validate: %s', domain)

    # ref:
    # https://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb
    m_ipv4 = re.search(
        r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)',
        candidate)  # noqa:E501
    if m_ipv4:
        ipv4 = m_ipv4.group(0)
        if validators.ipv4(ipv4):
            return 'ipv4', ipv4, m_ipv4.start(0)

    # ref:
    # https://nbviewer.jupyter.org/github/rasbt/python_reference/blob/master/tutorials/useful_regex.ipynb
    m_ipv6 = re.search(
        r'((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?',
        candidate)  # noqa:E501

    if m_ipv6:
        ipv6 = m_ipv6.group(0)
        if validators.ipv6(ipv6):
            return 'ipv6', ipv6, m_ipv6.start(0)

    m_email = re.search(
        r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]'
        r'+\.[a-zA-Z0-9-.]+$)', candidate)
    if m_email:
        email = m_email.group(0)
        if validators.email(email):
            return 'email', email, m_email.start(0)

    m_filename = re.search(
        r'\b([A-Za-z0-9-_.]{1,255}\.'
        r'(exe|dll|bat|sys|htm|html|cfg|gz|tar|rpm|7z|'
        r'js|jar|jpg|png|vb|scr|pif|chm|zip|rar|cab|'
        r'pdf|doc|docx|ppt|pptx|py|pyc|txt|msg|csv|wps|'
        r'xls|xlsx|swf|gif|pps|xml|vcxproj|sln|pl|msi|tmp|'
        r'c|cpp|cs|h|java|lua|class|sh|bak)'
        r')\b', candidate)
    if m_filename:
        fname = m_filename.group(0)
        return 'filename', fname, m_filename.start(0)

    return 'unknown', candidate, 0
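A usage sketch, assuming the imports the function relies on (re, tld, validators and a module-level log) are in place:

import logging
import re

import tld
import validators

log = logging.getLogger(__name__)

for candidate in ("callback to https://evil.example.com/gate.php",
                  "beacon at 203.0.113.7",
                  "drops payload.dll on disk"):
    print(get_iocs(candidate))
# ('url', 'https://evil.example.com/gate.php', 12)
# ('ipv4', '203.0.113.7', 10)
# ('filename', 'payload.dll', 6)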