Example #1
0
 async def analyze(self, nlpdata: addict.Dict) -> Result:
     """Map each split sentence to its character length."""
     lengths = addict.Dict()
     for sentence in nlpdata["sentences"]["split"]:
         lengths[sentence] = len(sentence)
     return Result(name=self.name, version=self.version, result=lengths)
Example #2
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Tokenize the document content and attach part-of-speech tags."""
        res = addict.Dict()
        res.tokens = nltk.pos_tag(nltk.word_tokenize(nlpdata.content))
        return Result(name=self.name, version=self.version, result=res)
Example #3
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Collect CVE and MSID identifiers found in the content."""
        content = nlpdata.content

        res = addict.Dict()
        res.cve = self.cve.findall(content)
        res.msid = self.msid.findall(content)

        return Result(name=self.name, version=self.version, result=res)
Example #4
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Find MITRE ATT&CK entity references in the document content."""
        content = nlpdata.content
        res = addict.Dict()

        # One (result key, compiled pattern) pair per ATT&CK entity type.
        searches = (
            ("Groups", self.group_re),
            ("Tactics", self.tactic_re),
            ("Techniques", self.technique_re),
            ("SubTechniques", self.subtechnique_re),
            ("Software", self.software_re),
        )
        for key, pattern in searches:
            res[key] = pattern.findall(content)

        return Result(name=self.name, version=self.version, result=res)
Example #5
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Extract industry sectors mentioned in the text.

        Scans the POS-tagged tokens for sector-indicating noun stems
        (e.g. "industry", "sector"); on a hit, scans backwards over the
        preceding nouns / listing separators and collects those nouns as
        candidate sector names, then resolves them against the sector
        vocabulary.
        """

        res = addict.Dict()

        # Porter stems that signal a sector is being discussed.
        sector_stem_postfix = {
            "compani",  # company, companies, [...],
            "industri",  # industry, industries, [...],
            "sector",  # sector, sectors, [...],
            "servic",  # service, services, [...],
            "organ",  # organization, organizations, [...],
            "provid",  # provider, providers, [...],
        }

        possible_tag_types = {"NNP", "NNPS", "NN", "NNS"}
        # Tags that may chain a listing together (commas, colons,
        # coordinating conjunctions) in addition to the noun tags.
        lookbefore_tags = {",", ":", "CC"}
        lookbefore_tags.update(possible_tag_types)

        ps = nltk.stem.PorterStemmer()

        pos_sectors: List[Text] = []
        # Look through all tokens. If any token relating to a sector is found,
        # look-before and collect all nouns while the tokens are nouns or part
        # of a listing.
        for i, (token, tag) in enumerate(nlpdata.pos_tag.tokens):
            if tag in possible_tag_types and ps.stem(
                    token) in sector_stem_postfix:
                n = i - 1
                # The n >= 0 bound stops the scan at the start of the
                # document; without it, negative indices wrap around to the
                # END of the token list (wrong slice, or IndexError once n
                # underruns -len).
                while (n >= 0
                       and nlpdata.pos_tag.tokens[n][1] in lookbefore_tags):
                    n -= 1
                # max(n, 0): if the scan ran off the front, start at 0.
                # When n >= 0, tokens[n] failed the look-before test and is
                # filtered out by the tag check below, so including it in
                # the slice is harmless (matches the original semantics).
                pos_sectors += [
                    token for (token,
                               pos_tag) in nlpdata.pos_tag.tokens[max(n, 0):i]
                    if pos_tag in possible_tag_types
                ]

        # The alias path in the ini is relative to the config directory.
        ini = configparser.ConfigParser()
        ini.read([os.path.join(self.configdir, "sectors.ini")])
        ini["sectors"]["alias"] = os.path.join(self.configdir,
                                               ini["sectors"]["alias"])

        vocab = Vocabulary(ini["sectors"])
        sectors = []
        unknown_sectors = []
        for pos_sector in pos_sectors:
            # Map each candidate to its primary vocabulary entry; keep
            # unresolved candidates separately for inspection.
            primary = vocab.get(pos_sector, primary=True)
            if primary:
                sectors.append(primary)
            else:
                unknown_sectors.append(pos_sector)

        res.sectors = sectors
        res.unknown_sectors = unknown_sectors
        return Result(name=self.name, version=self.version, result=res)
Example #6
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Search the content for known tool names from the vocabulary."""
        config = configparser.ConfigParser()
        config.read([os.path.join(self.configdir, "tools_pattern.ini")])
        # The alias path in the ini is relative to the config directory.
        config['tools']['alias'] = os.path.join(self.configdir,
                                                config['tools']['alias'])

        res = addict.Dict()
        res.Tools = Vocabulary(config['tools']).regex_search(
            nlpdata.content, debug=self.debug)

        return Result(name=self.name, version=self.version, result=res)
Example #7
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Detect cities and countries mentioned in the document."""
        res = addict.Dict()

        config = configparser.ConfigParser()
        config.read([os.path.join(self.configdir, "locations.ini")])
        # City/country data files live under the vendor directory; the
        # alias file is relative to the config directory.
        config["locations"]["cities"] = os.path.join(
            self.configdir, "../../vendor", config["locations"]["cities"]
        )
        config["locations"]["countries"] = os.path.join(
            self.configdir, "../../vendor", config["locations"]["countries"]
        )
        config["vocabulary"]["alias"] = os.path.join(
            self.configdir, config["vocabulary"]["alias"]
        )

        cities = self.cities_from_file(config["locations"]["cities"])
        country_names, country_cc = self.countries_from_file(
            config["locations"]["countries"]
        )

        nouns = self.nouns(nlpdata.pos_tag.tokens)
        vocab = Vocabulary(config["vocabulary"])

        res.cities = []
        res.countries = []
        res.countries_inferred = []
        res.countries_mentioned = []

        for noun in nouns:
            if noun in cities:
                city = cities[noun]
                res.cities.append(city)
                # Infer the country from the city's country code; "UNK"
                # when the code is not in the country table.
                res.countries_inferred.append(
                    country_cc.get(city["country code"], "UNK"))
            if noun in country_names:
                res.countries.append(country_names[noun])
            if vocab.get(noun):
                res.countries_mentioned.append(noun)

        return Result(name=self.name, version=self.version, result=res)
Example #8
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Extract network indicators (hashes, emails, FQDNs, IP
        addresses/networks and URIs) from the document content.
        """

        # Refang to allow matching on defanged indicators such as
        # 127[.]0[.]0[.]1. A parenthesized chain replaces the original
        # dangling backslash continuation, which silently joined the
        # following blank line into the statement.
        text = (nlpdata.content
                .replace("[.]", ".")
                .replace("{.}", ".")
                .replace("(.)", ".")
                .replace("\\.", "."))

        # Replace to make sure URLencoded URLs are supported
        text = re.sub("%2[fF]", "/", text)

        res = addict.Dict()

        res.md5 = self.md5.findall(text)
        res.sha1 = self.sha1.findall(text)
        res.sha256 = self.sha256.findall(text)
        res.email = self.email.findall(text)
        # Keep only domains whose last label is a known TLD, cutting false
        # positives like "foo.bar" in prose.
        res.fqdn = [
            dn for dn in self.fqdn.findall(text) if dn.split(".")[-1] in TLDS
        ]
        # The ipv4 pattern appears to capture octet groups; rejoin each
        # match into a dotted-quad string.
        res.ipv4 = ['.'.join(ip) for ip in self.ipv4.findall(text)]

        # Normalize the common "hxxp"/"hxxps" defang back to http(s).
        res.uri = [
            re.sub("^hxxp", "http", uri, 0, re.I)
            for uri in self.uri.findall(text)
        ]
        res.ipv4net = self.ipv4net.findall(text)

        # The broad IPv6 regex over-matches, so validate each candidate
        # with the ipaddress module and keep only genuine v6 addresses.
        pos_ipv6 = []
        for candidate in self.allposipv6.findall(text):
            try:
                addr = ipaddress.ip_address(candidate)
                if addr.version == 6:
                    pos_ipv6.append(candidate)
            except ValueError:
                pass

        res.ipv6 = pos_ipv6

        return Result(name=self.name, version=self.version, result=res)
Example #9
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Search the content for known threat-actor names."""
        config = configparser.ConfigParser()
        config.read([os.path.join(self.configdir, "threatactor_pattern.ini")])
        # The alias path in the ini is relative to the config directory.
        config["threat_actor"]["alias"] = os.path.join(
            self.configdir, config["threat_actor"]["alias"])

        # Abbreviations that should stay upper-case during normalization.
        uppercase_abbr = abbreviation_list(
            config["threat_actor"].get("uppercase_abbr", ""))

        res = addict.Dict()
        res.ThreatActors = Vocabulary(config["threat_actor"]).regex_search(
            nlpdata.content,
            normalize_result=(lambda x: normalize_ta(x, uppercase_abbr)),
            debug=self.debug,
        )

        return Result(name=self.name, version=self.version, result=res)
Example #10
0
    async def analyze(self, nlpdata: addict.Dict) -> Result:
        """Extract candidate threat-actor names from POS-tagged tokens.

        Two-stage scan over the token stream: a token whose Porter stem
        signals "threat" (e.g. "threat", "hacker") arms the first stage;
        if the token immediately after it signals a group ("group",
        "actor", ...), the tokens just before the pair are scanned
        backwards and collected as candidate actor names, split on
        listing separators (commas, "and", ...).
        """

        res = addict.Dict()

        # Porter stems for the "threat" half of phrases like
        # "threat group" or "hacker unit".
        threat_stem_postfix = {
            "threat",  # threat
            "crimin",  # criminal, criminals
            "crime",  # crime
            "espionage",  # espionage
            "hack",  # hack, hacking,
            "hacker",  # hacker, hackers
            "crack",  # cracking, crack
            "cracker",  # cracker, crackers
            "adversari",  # adversary, adversaries
            "terrorist",  # terrorist, terrorists
        }

        # Porter stems for the "group" half of such phrases.
        group_stem_postfix = {
            "group",  # group, groups
            "actor",  # actor, actors
            "unit",  # unit, units
            "agent",  # agent, agents
            "organ",  # organization, organizations
        }

        # Generic adjectives/nouns that precede the phrase but are not part
        # of an actor name.
        false_positive_filter = [
            "top",
            "unknown",
            "cyber",
        ]  # "top threat groups", "cyber threat actors" etc...

        # Noun tags accepted inside an actor name vs. the (wider) tag set
        # accepted for the threat/group trigger words themselves.
        possible_ta_tag_types = {"NNP", "NNPS", "NN", "NNS"}
        possible_tag_types = {"NNP", "NNPS", "NN", "NNS", "JJ", "JJS"}
        # Tags that chain a listing together (comma, colon, conjunction).
        chain_tags = {",", ":", "CC"}
        lookbefore_tags: Set[Text] = set()
        lookbefore_tags.update(chain_tags)
        lookbefore_tags.update(possible_tag_types)

        ps = nltk.stem.PorterStemmer()

        # True iff the PREVIOUS token matched a threat stem.
        first_stage_found = False

        pos_actors: List[Text] = []
        # Look through all tokens. If any token relating to a threat actors is
        # found, look-before and collect all nouns while the tokens are nouns
        # or part of a listing.
        for i, (token, tag) in enumerate(nlpdata.pos_tag.tokens):
            if first_stage_found:
                # Second stage: current token must be a group word
                # (e.g. "... threat GROUP").
                second_stage_found = bool(
                    tag in possible_tag_types
                    and ps.stem(token) in group_stem_postfix)
                if not second_stage_found:
                    first_stage_found = False
                    continue

                # Require a noun/adjective directly before the two trigger
                # words, otherwise there is no name to collect.
                if nlpdata.pos_tag.tokens[i - 2][1] not in possible_tag_types:
                    first_stage_found = False
                    continue
                # Scan backwards over nouns and listing separators to find
                # the start of the candidate-name span.
                n = i - 1
                while (n > 0 and len(nlpdata.pos_tag.tokens[n]) == 2
                       and nlpdata.pos_tag.tokens[n][1] in lookbefore_tags):
                    n -= 1

                # Walk the span [n, i-1) and split it on chain tags into
                # individual actor names.
                current_actor: List[Text] = []
                for (subtoken,
                     pos_tag) in nlpdata.pos_tag.tokens[n:i - 1  # noqa: E203
                                                        ]:
                    # check if we have reached a separator (comma, 'and' etc)
                    # if so, we need to create a result of what we have found thus
                    # far and look for more.
                    if pos_tag in chain_tags:
                        if current_actor:
                            if valid_actor(current_actor):
                                pos_actors.append(" ".join(current_actor))
                            current_actor = []
                    elif pos_tag in possible_ta_tag_types:
                        if subtoken in false_positive_filter:
                            continue
                        current_actor.append(subtoken)
                # Flush the trailing name (NB: unlike the separator branch,
                # this flush does not go through valid_actor()).
                if current_actor:
                    pos_actors.append(" ".join(current_actor))

            # check whether the current tag is of a type and in the accepted
            # list of threat group postfixes; arms the first stage for the
            # next iteration.
            first_stage_found = bool(tag in possible_tag_types
                                     and ps.stem(token) in threat_stem_postfix)

        res.actors = pos_actors
        return Result(name=self.name, version=self.version, result=res)
Example #11
0
 async def analyze(self, nlpdata: addict.Dict) -> Result:
     """Split the content on '.' into trimmed, non-empty sentences."""
     pieces = nlpdata.content.split(".")
     result = addict.Dict()
     result.split = [piece.strip() for piece in pieces if piece.strip()]
     return Result(name=self.name, version=self.version, result=result)