async def analyze(self, nlpdata: addict.Dict) -> Result:
    return Result(
        name=self.name,
        version=self.version,
        result=addict.Dict(
            {s: len(s) for s in nlpdata["sentences"]["split"]}),
    )
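# A minimal sketch of the data this plugin consumes and produces, assuming
# the sentence-split plugin (last in this section) has already populated
# nlpdata["sentences"]["split"]. Values are character counts, not word
# counts:
#
#   nlpdata = {"sentences": {"split": ["A short sentence"]}}
#   result  = {"A short sentence": 16}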
async def analyze(self, nlpdata: addict.Dict) -> Result:
    tokens = nltk.word_tokenize(nlpdata.content)
    res = addict.Dict()
    res.tokens = nltk.pos_tag(tokens)
    return Result(name=self.name, version=self.version, result=res)
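# nltk.word_tokenize and nltk.pos_tag depend on NLTK data packages that
# must be fetched once; a one-time setup sketch (exact package names vary
# across NLTK releases, e.g. newer versions use "punkt_tab"):
import nltk

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")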
async def analyze(self, nlpdata: addict.Dict) -> Result:
    text = nlpdata.content
    res = addict.Dict()
    res.cve = self.cve.findall(text)
    res.msid = self.msid.findall(text)
    return Result(name=self.name, version=self.version, result=res)
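# self.cve and self.msid are compiled elsewhere in the plugin; a minimal
# sketch of what such patterns could look like (hypothetical — the shipped
# patterns may differ), based on the public CVE and Microsoft security
# bulletin ID formats:
import re

cve = re.compile(r"\bCVE-\d{4}-\d{4,}\b", re.IGNORECASE)  # e.g. CVE-2021-44228
msid = re.compile(r"\bMS\d{2}-\d{3}\b", re.IGNORECASE)    # e.g. MS17-010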
async def analyze(self, nlpdata: addict.Dict) -> Result:
    res = addict.Dict()
    res.Groups = self.group_re.findall(nlpdata.content)
    res.Tactics = self.tactic_re.findall(nlpdata.content)
    res.Techniques = self.technique_re.findall(nlpdata.content)
    res.SubTechniques = self.subtechnique_re.findall(nlpdata.content)
    res.Software = self.software_re.findall(nlpdata.content)
    return Result(name=self.name, version=self.version, result=res)
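# The five regexes are attributes set up elsewhere. MITRE ATT&CK IDs follow
# fixed formats, so plausible definitions (hypothetical — the shipped
# patterns may differ) would be:
import re

group_re = re.compile(r"\bG\d{4}\b")                # e.g. G0016
tactic_re = re.compile(r"\bTA\d{4}\b")              # e.g. TA0001
technique_re = re.compile(r"\bT\d{4}\b")            # e.g. T1059
subtechnique_re = re.compile(r"\bT\d{4}\.\d{3}\b")  # e.g. T1059.001
software_re = re.compile(r"\bS\d{4}\b")             # e.g. S0154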
async def analyze(self, nlpdata: addict.Dict) -> Result:
    res = addict.Dict()

    sector_stem_postfix = {
        "compani",   # company, companies, [...]
        "industri",  # industry, industries, [...]
        "sector",    # sector, sectors, [...]
        "servic",    # service, services, [...]
        "organ",     # organization, organizations, [...]
        "provid",    # provider, providers, [...]
    }

    possible_tag_types = {"NNP", "NNPS", "NN", "NNS"}
    lookbefore_tags = {",", ":", "CC"}
    lookbefore_tags.update(possible_tag_types)

    ps = nltk.stem.PorterStemmer()

    pos_sectors: List[Text] = []

    # Look through all tokens. If any token relating to a sector is found,
    # look-before and collect all nouns while the tokens are nouns or part
    # of a listing.
    for i, (token, tag) in enumerate(nlpdata.pos_tag.tokens):
        if tag in possible_tag_types and ps.stem(token) in sector_stem_postfix:
            n = i - 1
            while nlpdata.pos_tag.tokens[n][1] in lookbefore_tags:
                n -= 1
            pos_sectors += [
                token
                for (token, pos_tag) in nlpdata.pos_tag.tokens[n:i]
                if pos_tag in possible_tag_types
            ]

    ini = configparser.ConfigParser()
    ini.read([os.path.join(self.configdir, "sectors.ini")])
    ini["sectors"]["alias"] = os.path.join(self.configdir, ini["sectors"]["alias"])

    vocab = Vocabulary(ini["sectors"])

    sectors = []
    unknown_sectors = []
    for pos_sector in pos_sectors:
        primary = vocab.get(pos_sector, primary=True)
        if primary:
            sectors.append(primary)
        else:
            unknown_sectors.append(pos_sector)

    res.sectors = sectors
    res.unknown_sectors = unknown_sectors

    return Result(name=self.name, version=self.version, result=res)
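# A worked example of the look-before heuristic (illustrative): given the
# tagged phrase
#
#   [("The", "DT"), ("energy", "NN"), (",", ","), ("telecom", "NN"),
#    ("and", "CC"), ("finance", "NN"), ("sectors", "NNS")]
#
# "sectors" stems to "sector" and triggers the scan, which walks left over
# nouns, commas and the coordinating conjunction "and" until it hits
# "The" (DT), then collects the nouns "energy", "telecom" and "finance"
# as candidate sectors for vocabulary lookup.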
async def analyze(self, nlpdata: addict.Dict) -> Result:
    ini = configparser.ConfigParser()
    ini.read([os.path.join(self.configdir, "tools_pattern.ini")])
    ini['tools']['alias'] = os.path.join(self.configdir, ini['tools']['alias'])

    vocab = Vocabulary(ini['tools'])

    res = addict.Dict()
    res.Tools = vocab.regex_search(nlpdata.content, debug=self.debug)

    return Result(name=self.name, version=self.version, result=res)
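# The section handed to Vocabulary is plain configparser data; a
# hypothetical minimal tools_pattern.ini could look like
#
#   [tools]
#   alias = tools_aliases.cfg
#
# where the alias value is rewritten above to an absolute path under
# self.configdir before the Vocabulary is constructed.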
async def analyze(self, nlpdata: addict.Dict) -> Result:
    res = addict.Dict()

    ini = configparser.ConfigParser()
    ini.read([os.path.join(self.configdir, "locations.ini")])
    ini["locations"]["cities"] = os.path.join(
        self.configdir, "../../vendor", ini["locations"]["cities"]
    )
    ini["locations"]["countries"] = os.path.join(
        self.configdir, "../../vendor", ini["locations"]["countries"]
    )
    ini["vocabulary"]["alias"] = os.path.join(
        self.configdir, ini["vocabulary"]["alias"]
    )

    cities = self.cities_from_file(ini["locations"]["cities"])
    country_names, country_cc = self.countries_from_file(
        ini["locations"]["countries"]
    )
    nouns = self.nouns(nlpdata.pos_tag.tokens)
    vocab = Vocabulary(ini["vocabulary"])

    res.cities = []
    res.countries = []
    res.countries_inferred = []
    res.countries_mentioned = []

    for noun in nouns:
        if noun in cities:
            city = cities[noun]
            res.cities.append(city)
            res.countries_inferred.append(
                country_cc.get(city["country code"], "UNK")
            )
        if noun in country_names:
            res.countries.append(country_names[noun])
        if vocab.get(noun):
            res.countries_mentioned.append(noun)

    return Result(name=self.name, version=self.version, result=res)
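# self.nouns is a helper on the plugin; a minimal sketch of what it
# plausibly does (hypothetical), given that it receives the (token, tag)
# pairs produced by the pos_tag plugin:
from typing import List, Text, Tuple

def nouns(tokens: List[Tuple[Text, Text]]) -> List[Text]:
    """Return the tokens tagged as nouns (NN, NNS, NNP, NNPS)."""
    return [token for token, tag in tokens if tag.startswith("NN")]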
async def analyze(self, nlpdata: addict.Dict) -> Result:
    # refang to allow matching on e.g. 127[.]0[.]0[.]1
    text = nlpdata.content \
        .replace("[.]", ".") \
        .replace("{.}", ".") \
        .replace("(.)", ".") \
        .replace("\\.", ".")

    # Replace to make sure URL-encoded URLs are supported
    text = re.sub("%2[fF]", "/", text)

    res = addict.Dict()
    res.md5 = self.md5.findall(text)
    res.sha1 = self.sha1.findall(text)
    res.sha256 = self.sha256.findall(text)
    res.email = self.email.findall(text)
    res.fqdn = [
        dn for dn in self.fqdn.findall(text)
        if dn.split(".")[-1] in TLDS
    ]
    res.ipv4 = ['.'.join(ip) for ip in self.ipv4.findall(text)]
    res.uri = [
        re.sub("^hxxp", "http", uri, 0, re.I)
        for uri in self.uri.findall(text)
    ]
    res.ipv4net = self.ipv4net.findall(text)

    pos_ipv6 = []
    for candidate in self.allposipv6.findall(text):
        try:
            addr = ipaddress.ip_address(candidate)
            if addr.version == 6:
                pos_ipv6.append(candidate)
        except ValueError:
            pass
    res.ipv6 = pos_ipv6

    return Result(name=self.name, version=self.version, result=res)
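# The hash patterns are compiled elsewhere. Hex-digest lengths are fixed
# (MD5: 32, SHA-1: 40, SHA-256: 64 hex chars), so plausible definitions
# (hypothetical — the shipped patterns may differ) are:
import re

md5 = re.compile(r"\b[0-9a-fA-F]{32}\b")
sha1 = re.compile(r"\b[0-9a-fA-F]{40}\b")
sha256 = re.compile(r"\b[0-9a-fA-F]{64}\b")

# Note also that '.'.join(ip) above rejoins octets, which implies the
# self.ipv4 pattern captures each octet in its own group (re.findall
# returns tuples when a pattern has multiple groups).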
async def analyze(self, nlpdata: addict.Dict) -> Result:
    ini = configparser.ConfigParser()
    ini.read([os.path.join(self.configdir, "threatactor_pattern.ini")])
    ini["threat_actor"]["alias"] = os.path.join(
        self.configdir, ini["threat_actor"]["alias"])

    uppercase_abbr = abbreviation_list(
        ini["threat_actor"].get("uppercase_abbr", ""))

    vocab = Vocabulary(ini["threat_actor"])

    res = addict.Dict()
    res.ThreatActors = vocab.regex_search(
        nlpdata.content,
        normalize_result=(lambda x: normalize_ta(x, uppercase_abbr)),
        debug=self.debug,
    )

    return Result(name=self.name, version=self.version, result=res)
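# normalize_result lets Vocabulary post-process each raw match before it
# is returned. Here normalize_ta (defined alongside the plugin) cleans up
# actor names while keeping the abbreviations configured in
# uppercase_abbr upper-case; illustrative, hypothetical behaviour:
#
#   "fancy bear"  ->  "Fancy Bear"
#   "apt28"       ->  "APT28"      (given "apt" in uppercase_abbr)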
async def analyze(self, nlpdata: addict.Dict) -> Result:
    res = addict.Dict()

    threat_stem_postfix = {
        "threat",     # threat
        "crimin",     # criminal, criminals
        "crime",      # crime
        "espionage",  # espionage
        "hack",       # hack, hacking
        "hacker",     # hacker, hackers
        "crack",      # cracking, crack
        "cracker",    # cracker, crackers
        "adversari",  # adversary, adversaries
        "terrorist",  # terrorist, terrorists
    }

    group_stem_postfix = {
        "group",  # group, groups
        "actor",  # actor, actors
        "unit",   # unit, units
        "agent",  # agent, agents
        "organ",  # organization, organizations
    }

    false_positive_filter = [
        "top",
        "unknown",
        "cyber",
    ]  # "top threat groups", "cyber threat actors" etc...

    possible_ta_tag_types = {"NNP", "NNPS", "NN", "NNS"}
    possible_tag_types = {"NNP", "NNPS", "NN", "NNS", "JJ", "JJS"}
    chain_tags = {",", ":", "CC"}
    lookbefore_tags: Set[Text] = set()
    lookbefore_tags.update(chain_tags)
    lookbefore_tags.update(possible_tag_types)

    ps = nltk.stem.PorterStemmer()

    first_stage_found = False
    pos_actors: List[Text] = []

    # Look through all tokens. If any token relating to a threat actor is
    # found, look-before and collect all nouns while the tokens are nouns
    # or part of a listing.
    for i, (token, tag) in enumerate(nlpdata.pos_tag.tokens):
        if first_stage_found:
            second_stage_found = bool(
                tag in possible_tag_types
                and ps.stem(token) in group_stem_postfix)

            if not second_stage_found:
                first_stage_found = False
                continue

            if nlpdata.pos_tag.tokens[i - 2][1] not in possible_tag_types:
                first_stage_found = False
                continue

            n = i - 1
            while (n > 0
                   and len(nlpdata.pos_tag.tokens[n]) == 2
                   and nlpdata.pos_tag.tokens[n][1] in lookbefore_tags):
                n -= 1

            current_actor: List[Text] = []
            for (subtoken, pos_tag) in nlpdata.pos_tag.tokens[n : i - 1]:  # noqa: E203
                # Check if we have reached a separator (comma, 'and' etc).
                # If so, we need to create a result of what we have found
                # thus far and look for more.
                if pos_tag in chain_tags:
                    if current_actor:
                        if valid_actor(current_actor):
                            pos_actors.append(" ".join(current_actor))
                        current_actor = []
                elif pos_tag in possible_ta_tag_types:
                    if subtoken in false_positive_filter:
                        continue
                    current_actor.append(subtoken)

            if current_actor:
                pos_actors.append(" ".join(current_actor))

        # Check whether the current tag is an accepted type and the token
        # stem is in the list of threat postfixes.
        first_stage_found = bool(tag in possible_tag_types
                                 and ps.stem(token) in threat_stem_postfix)

    res.actors = pos_actors

    return Result(name=self.name, version=self.version, result=res)
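# A worked example of the two-stage scan (illustrative): in a sentence
# tagged as
#
#   [..., ("the", "DT"), ("Sofacy", "NNP"), ("threat", "NN"),
#    ("group", "NN"), ("targeted", "VBD"), ...]
#
# "threat" completes the first stage, the following "group" completes the
# second stage, and the look-before scan collects the proper noun
# "Sofacy" as a candidate actor ("the" is dropped since DT is not an
# accepted tag type).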
async def analyze(self, nlpdata: addict.Dict) -> Result:
    result = addict.Dict()
    result.split = [
        s.strip() for s in nlpdata.content.split(".") if s.strip()
    ]

    return Result(name=self.name, version=self.version, result=result)
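# Splitting on "." is deliberately simple and will also split inside
# abbreviations and decimal numbers ("U.S.", "3.14"). If that matters, a
# drop-in alternative (a different technique, not what this plugin does)
# is NLTK's trained sentence tokenizer:
#
#   import nltk
#   result.split = nltk.sent_tokenize(nlpdata.content)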