Python unconfuse Exemples, confusables.unconfuse Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : catch_phishing.py Projet : xenoid/phishing_catcher

def score_domain(domain):
    """Score `domain`.

    The higher the score, the more likely `domain` is a phishing site.
    The score is the sum of several independent heuristics: suspicious
    TLD suffix, entropy, a leading fake TLD label, configured keywords,
    near-miss spellings of strong keywords, hyphen count and dot count.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of `domain`.
    """
    score = 0
    for t in suspicious['tlds']:
        if domain.endswith(t):
            score += 20

    # Remove the initial '*.' carried by wildcard certificates
    if domain.startswith('*.'):
        domain = domain[2:]

    # Removing TLD to catch inner TLD in subdomain (ie. paypal.com.domain.com)
    try:
        res = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
        # With fail_silently=True a failed lookup is expected to come back
        # as None; in that case the domain is scored as given.
        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception:
        # Deliberate best effort: any parsing problem leaves `domain`
        # untouched rather than aborting the scoring.
        pass

    # Higher entropy is kind of suspicious
    score += int(round(entropy(domain) * 10))

    # Remove lookalike characters using list from http://www.unicode.org/reports/tr39
    domain = unconfuse(domain)

    # re.split() yields an empty string when the input starts or ends with
    # a separator (e.g. the leading '.' produced by an empty subdomain in
    # the join above). Drop those so index 0 is the first real label.
    words_in_domain = [w for w in re.split(r"\W+", domain) if w]

    # ie. detect fake .com (ie. *.com-account-management.info)
    if words_in_domain and words_in_domain[0] in ('com', 'net', 'org'):
        score += 10

    # Testing keywords
    for word, weight in suspicious['keywords'].items():
        if word in domain:
            score += weight

    # Testing Levenshtein distance for strong keywords (>= 70 points) (ie. paypol)
    strong_keywords = [k for (k, s) in suspicious['keywords'].items() if s >= 70]
    # Removing too generic keywords (ie. mail.domain.com)
    candidate_words = [w for w in words_in_domain
                       if w not in ('email', 'mail', 'cloud')]
    for key in strong_keywords:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com)
    # Punycode names ('xn--') are skipped because their hyphens are structural.
    hyphens = domain.count('-')
    if 'xn--' not in domain and hyphens >= 4:
        score += hyphens * 3

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    dots = domain.count('.')
    if dots >= 3:
        score += dots * 3

    return score

Exemple #2

0

Afficher le fichier

def score_domain(config, domain, args):
    """Return a suspicion score for `domain`; higher means more suspicious.

    Args:
        config: mapping with a "tlds" iterable of suffixes and a
            "keywords" dict mapping keyword -> points.
        domain (str): the domain name to score.
        args: passed through unchanged to `failed_message` when the TLD
            lookup raises.

    Returns:
        int: the accumulated score.
    """
    score = 0

    # Suspicious TLD suffixes.
    for t in config["tlds"]:
        if domain.endswith(t):
            score += 20

    # Strip the outer TLD so an inner "TLD" hidden in the subdomain stands out.
    try:
        res = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)

        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception as err:
        # Best effort: report the failure and keep scoring the raw domain.
        failed_message(args, err, domain)

    score += int(round(entropy.shannon_entropy(domain) * 50))

    domain = unconfuse(domain)

    # re.split() yields an empty string when the input starts or ends with
    # a separator (a leading '.' or '*.', for instance). Drop those so that
    # index 0 is the first real label; otherwise the check below compares
    # against '' and can never match.
    words_in_domain = [w for w in re.split(r"\W+", domain) if w]

    # Name starts with a label that imitates a TLD.
    if words_in_domain and words_in_domain[0] in ("com", "net", "org"):
        score += 10

    # Configured keywords found anywhere in the name.
    for word, weight in config["keywords"].items():
        if word in domain:
            score += weight

    # Labels one edit away from a strong keyword (>= 70 points); generic
    # labels are excluded from the comparison.
    strong_keywords = [k for (k, s) in config["keywords"].items() if s >= 70]
    candidate_words = [w for w in words_in_domain
                       if w not in ("email", "mail", "cloud")]
    for key in strong_keywords:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Many hyphens; names containing 'xn--' are exempt.
    hyphens = domain.count("-")
    if "xn--" not in domain and hyphens >= 4:
        score += hyphens * 3

    # Deeply nested subdomains.
    dots = domain.count(".")
    if dots >= 3:
        score += dots * 3

    return score

Exemple #3

0

Afficher le fichier

def callback(message, context):
    """Callback handler for certstream events.

    For every domain listed in a "certificate_update" message, the domain
    (lower-cased, both raw and with lookalike characters normalised) is
    matched against `keywords_ignore` and `keywords_alert`. Each alert
    match is printed and appended as one CSV-style line to
    `log_suspicious`.

    Args:
        message (dict): the certstream event.
        context: unused by this handler.
    """
    # Heartbeats and any other message type carry nothing to inspect.
    if message['message_type'] != "certificate_update":
        return

    leaf_cert = message['data']['leaf_cert']
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    for domain in leaf_cert['all_domains']:
        pbar.update(1)

        current_domain = domain.lower()
        uc_current_domain = unconfuse(current_domain)

        # An ignore keyword stops processing of this domain and of every
        # remaining domain in the same certificate.
        if any(ignore in current_domain or ignore in uc_current_domain
               for ignore in keywords_ignore):
            break

        for alert in keywords_alert:
            if alert not in current_domain and alert not in uc_current_domain:
                continue

            # Validity window start: read from 'not_before' (it was
            # previously read from 'not_after', duplicating the expiry).
            not_before = datetime.utcfromtimestamp(
                int(leaf_cert['not_before'])).strftime(timestamp_format)
            not_after = datetime.utcfromtimestamp(
                int(leaf_cert['not_after'])).strftime(timestamp_format)
            serial_number = leaf_cert['serial_number']
            fingerprint = leaf_cert['fingerprint']
            ca = message['data']['chain'][0]['subject']['aggregated']

            tqdm.tqdm.write("[+] Match found {}".format(
                colored(current_domain,
                        'red',
                        attrs=['underline', 'bold'])))
            tqdm.tqdm.write("      Matched on {}".format(alert))
            tqdm.tqdm.write(
                "      [Details] Not before {}, Not after {}, Serial {}"
                .format(not_before, not_after, serial_number))
            tqdm.tqdm.write(
                "      [Details] Fingerprint {}, CA {}".format(
                    fingerprint, ca))

            with open(log_suspicious, 'a') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S")
                f.write("{},{},{},{},{},{},{},{}\n".format(
                    now, alert, current_domain, not_before, not_after,
                    serial_number, fingerprint, ca))

Exemple #4

0

Afficher le fichier

Fichier : commons.py Projet : ooo777/analyst_arsenal

def score_domain(suspicious, domain, args):
    """Return a suspicion score for `domain`; higher means more suspicious.

    Args:
        suspicious: mapping with a "tlds" iterable of suffixes and a
            "keywords" dict mapping keyword -> points.
        domain (str): the domain name to score.
        args: passed through unchanged to `failed_message` when the TLD
            lookup raises.

    Returns:
        int: the accumulated score.
    """
    score = 0

    # Suspicious TLD suffixes.
    for t in suspicious["tlds"]:
        if domain.endswith(t):
            score += 20

    # Strip the outer TLD so an inner "TLD" hidden in the subdomain stands out.
    try:
        res = get_tld(domain,
                      as_object=True,
                      fail_silently=True,
                      fix_protocol=True)

        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception as err:
        # Best effort: report the failure and keep scoring the raw domain.
        failed_message(args, err, domain)

    score += int(round(entropy.shannon_entropy(domain) * 50))

    domain = unconfuse(domain)

    # re.split() returns an empty string for a leading or trailing
    # separator (a leading '.' or '*.', for instance). Discard those so the
    # first entry is the first real label; otherwise the fake-TLD check
    # below compares against '' and can never match.
    words_in_domain = [w for w in re.split(r"\W+", domain) if w]

    # Name starts with a label that imitates a TLD.
    if words_in_domain and words_in_domain[0] in ("com", "net", "org"):
        score += 10

    # Configured keywords found anywhere in the name.
    keywords = suspicious["keywords"]
    for word, weight in keywords.items():
        if word in domain:
            score += weight

    # Labels one edit away from a strong keyword (>= 70 points); generic
    # labels are excluded from the comparison.
    candidate_words = [
        w for w in words_in_domain
        if w not in ("email", "mail", "cloud")
    ]
    for key in [k for (k, s) in keywords.items() if s >= 70]:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Many hyphens; names containing 'xn--' are exempt.
    hyphens = domain.count("-")
    if "xn--" not in domain and hyphens >= 4:
        score += hyphens * 3

    # Deeply nested subdomains.
    dots = domain.count(".")
    if dots >= 3:
        score += dots * 3

    return score

Exemple #5

0

Afficher le fichier

def score_domain(provided_ioc):
    """Return the suspicion score of the provided domain.

    The score is the sum of several heuristics: suspicious TLD suffix,
    Shannon entropy, a leading label imitating a TLD, configured keywords,
    near-miss spellings of strong keywords, hyphen count and dot count.

    Args:
        provided_ioc (str): the domain name to score.

    Returns:
        int: the accumulated score; higher means more suspicious.
    """
    score = 0

    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    # Strip the outer TLD so an inner "TLD" hidden in the subdomain stands
    # out. On any failure the input is scored as given.
    domain = provided_ioc
    try:
        res = tld.get_tld(provided_ioc,
                          as_object=True,
                          fail_silently=True,
                          fix_protocol=True)
        # With fail_silently=True a failed lookup is expected to come back
        # as None rather than raise.
        if res is not None:
            domain = ".".join([res.subdomain, res.domain])
    except Exception:
        domain = provided_ioc

    score += int(round(entropy.shannon_entropy(domain) * 50))
    domain = confusables.unconfuse(domain)

    # Drop the wildcard prefix BEFORE tokenising, so the word list is built
    # from the cleaned name.
    if domain.startswith("*."):
        domain = domain[2:]

    # re.split() returns an empty string for a leading or trailing
    # separator; discard those so index 0 is the first real label.
    words_in_domain = [w for w in re.split(r"\W+", domain) if w]

    # Fake-TLD check. It used to sit inside the wildcard branch and look at
    # a word list whose first entry was always '', so it could never fire.
    if words_in_domain and words_in_domain[0] in ("com", "net", "org"):
        score += 10

    for word, weight in suspicious["keywords"].items():
        if word in domain:
            score += weight

    # Labels one edit away from a strong keyword (>= 70 points); generic
    # labels are excluded from the comparison.
    candidate_words = [
        w for w in words_in_domain
        if w not in ("email", "mail", "cloud")
    ]
    for key in [k for k, v in suspicious["keywords"].items() if v >= 70]:
        for word in candidate_words:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    # Many hyphens; names containing 'xn--' are exempt.
    hyphens = domain.count("-")
    if "xn--" not in domain and hyphens >= 4:
        score += hyphens * 3

    # Deeply nested subdomains.
    dots = domain.count(".")
    if dots >= 3:
        score += dots * 3
    return score

Exemple #6

0

Afficher le fichier

    def enrich(self):
        # each of the internal methods that gets more data. there is a set sequence on these
        # self.__get_dns_data()
        # pprint.pprint((self.dns_records))
        # self.__get_ip2cidr()
        # pprint.pprint((self.asn_records))
        # self.__get_whois()  this is too slow. we need to find a reason to run who_is
        # pprint.pprint(self.whois_records)  # not working ???
        # let's see which score high enough to look into
        score = 0
        for t in TLDS:
            if self.domain.endswith(t):
                score += 20
        # Removing TLD to catch inner TLD in subdomain (ie. passportoffice.gov.uk-otherdomain.com)
        try:
            res = get_tld(self.domain,
                          as_object=True,
                          fail_silently=True,
                          fix_protocol=True)
            domain_without_outer_tld = '.'.join([res.subdomain, res.domain])
        except Exception:
            domain_without_outer_tld = ""
            pass

        self.__get_entropy()
        # TODO this is a lazy calculation. Should we should base it on actual entropy?
        score += int(round(self.shannon_entropy * 10))

        # Remove lookalike characters using list from http://www.unicode.org/reports/tr39
        unconfused_domain_without_tld = unconfuse(domain_without_outer_tld)
        #print(f"domain:{self.domain},innertld:{domain_without_outer_tld},unconfuse:{unconfused_domain_without_tld}")
        # at this point we've stripped off the outer TLD and unconfused the domain.
        words_in_domain = re.split("\W+", unconfused_domain_without_tld)

        # look for fake, hidden .coms ie. govuk.com-account-management.info
        if words_in_domain[0] in ['com', 'net', 'org']:
            score += 10

        # score the word based on triggerwords.yaml
        for word in SCORES:
            if word in unconfused_domain_without_tld:
                score += SCORES[word]
        #================
        # TODO calculate score based on lev
        # do we even need this check, since we're not using the output below....
        self.__get_levenshtein()

        # TODO- figure out
        # compare the domain against the strong match words
        # aka hmrc, taxrefund, dvla
        for key in [k for (k, s) in SCORES.items() if s >= 70]:
            # this isn't going to be effecticve for overly generic words like 'email'
            for word in [
                    w for w in words_in_domain
                    if w not in ['email', 'mail', 'cloud']
            ]:
                #LOG.info(f"word:{word}")
                #LOG.info(f"key:{key}")
                if Levenshtein.distance(str(word), str(key)) == 1:
                    score += 70

        # Lots of hyphens '-'
        if 'xn--' not in self.domain and self.domain.count('-') >= 4:
            score += self.domain.count('-') * 3

        # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
        if self.domain.count('.') >= 3:
            score += self.domain.count('.') * 3

        self.score = score
        #print(
        #    f"domain:{self.domain},score:{score},matched on:{self.match},entropy:{self.shannon_entropy},levenshtein ratio:{self.levenshtein_ratio},Levenshtein distance:{self.levenshtein_distance}")

        # we need to decide how to filter this list.
        # many won't be malicious
        """ Lev
            when we want to analyse these... 
            > 0.8 : Red?
            > 0.4 : Amber?
            <0.4 : green?
        """
        """
            shannon
            >4:     red
            3.5-4:  amber
        """
        # malicious TLD - add 20
        #     # Removing TLD to catch inner TLD in subdomain (ie. paypal.com.domain.com)
        """