Example #1
0
    def __init__(self, raw_text=None, text_title=None):
        '''Store the raw text/title and pre-compute filtered text and blobs.

        raw_text may be a byte string; it is decoded as UTF-8.
        Construction is deliberately best-effort: any failure is printed
        and swallowed so the object is still created (partially built).
        '''
        try:
            # props for internal use
            self._raw_text = raw_text
            self._text_title = text_title

            # props to store data (filled lazily by the accessor properties)
            self._summary = str()
            self._keywords = set()
            self._iocs = dict()
            self._tlp = None
            self._debug = {'iocs': dict(), 'keywords': dict()}

            if self._raw_text is not None:  # fix: identity test, was '!= None'
                # fix: isinstance instead of an exact-type check, so unicode
                # subclasses are not re-decoded
                if not isinstance(self._raw_text, unicode):
                    self._raw_text = self._raw_text.decode('utf8')
                self._tlpfilter = TLPFilter()
                self._clean_text = self._tlpfilter.text(self._raw_text)
                self._blob = TextBlob(self._raw_text)
                self._clean_blob = TextBlob(self._clean_text)

        except Exception:
            # best-effort: report the problem but keep the object usable
            import traceback
            traceback.print_exc()
Example #2
0
    def __init__(self, raw_text=None, text_title=None):
        '''Initialize from an optional raw document text and title.

        Byte-string input is decoded as UTF-8 before processing.
        Errors during setup are printed and suppressed on purpose, so
        callers always get an instance back (possibly partially built).
        '''
        try:
            # props for internal use
            self._raw_text = raw_text
            self._text_title = text_title

            # props to store data (populated lazily elsewhere)
            self._summary = str()
            self._keywords = set()
            self._iocs = dict()
            self._tlp = None
            self._debug = {'iocs': dict(), 'keywords': dict()}

            if self._raw_text is not None:  # fix: was 'self._raw_text != None'
                # fix: prefer isinstance over 'type(...) is unicode' so
                # unicode subclasses pass through without re-decoding
                if not isinstance(self._raw_text, unicode):
                    self._raw_text = self._raw_text.decode('utf8')
                self._tlpfilter = TLPFilter()
                self._clean_text = self._tlpfilter.text(self._raw_text)
                self._blob = TextBlob(self._raw_text)
                self._clean_blob = TextBlob(self._clean_text)

        except Exception:
            # deliberate swallow: log the traceback, do not raise
            import traceback
            traceback.print_exc()
Example #3
0
class TLP:
    '''Lazily derives iocs, keywords, a summary, and the TLP color
    from a threat-report text supplied as raw_text.'''

    def __init__(self, raw_text=None, text_title=None):
        '''Store the inputs and pre-compute the filtered text/blobs.

        raw_text may be a byte string; it is decoded as UTF-8.
        Construction is best-effort: failures are printed, not raised.
        '''
        try:
            # props for internal use
            self._raw_text = raw_text
            self._text_title = text_title

            # props to store data (filled lazily by the properties below)
            self._summary = str()
            self._keywords = set()
            self._iocs = dict()
            self._tlp = None
            self._debug = {'iocs': dict(), 'keywords': dict()}

            if self._raw_text is not None:  # fix: identity test, was '!= None'
                # fix: isinstance instead of an exact-type test
                if not isinstance(self._raw_text, unicode):
                    self._raw_text = self._raw_text.decode('utf8')
                self._tlpfilter = TLPFilter()
                self._clean_text = self._tlpfilter.text(self._raw_text)
                self._blob = TextBlob(self._raw_text)
                self._clean_blob = TextBlob(self._clean_text)

        except Exception:
            # deliberate best-effort: report but keep the object alive
            import traceback
            traceback.print_exc()


    @property
    def iocs(self):
        '''returns a dict of ioc-type -> set of matching tokens (cached)'''
        # NOTE: the original wrapped this in try/except with 'raise e',
        # which discards the original traceback in py2; letting exceptions
        # propagate naturally preserves it
        if len(self._iocs) > 0:
            return self._iocs

        # prime the dict: one empty set per known ioc regex
        self._iocs = dict((k, set()) for k in regexs)

        # collect every pre-filtered token that matches an ioc pattern
        for token in self._tlpfilter.iocs(self._raw_text, mode='pre'):
            for name, pattern in regexs.iteritems():
                if pattern.match(token):
                    self._iocs[name].add(token)

        self._iocs = self._tlpfilter.iocs(self._iocs, mode='post')
        for key in self._iocs:
            self._debug['iocs'][key] = len(self._iocs[key])
        return self._iocs


    @property
    def text(self):
        '''returns the complete filtered text, sentences joined by two spaces'''
        return "  ".join([s.raw for s in self._clean_blob.sentences])


    @property
    def debug(self):
        '''returns debug info -  must run 'keywords' or 'iocs' to populate'''
        return self._debug


    @property
    def summary(self):
        '''returns document summary: first ~6% of sentences, clamped to 2..8'''
        if len(self._summary) > 0:
            return self._summary

        sentences = self._clean_blob.sentences
        six_pct = int(math.floor(len(sentences) * .06))
        summ_len = min(max(six_pct, 2), 8)

        # fix: cache the result so the guard above actually short-circuits;
        # the original recomputed on every access
        self._summary = "  ".join([s.raw for s in sentences[:summ_len]])
        return self._summary


    @property
    def color(self):
        '''returns the set of words that follow a tlp/TLP token, lowercased'''
        colors = set()
        for one, two in ngrams(self._raw_text.split(), 2):
            if re.search('(?:tlp|TLP)', one):
                colors.add(two.lower())
        return colors


    @property
    def keywords(self):
        '''returns (word, count) pairs for outlier noun keywords,
        most frequent first; also populates self._debug['keywords']'''
        if len(self._keywords) > 0:
            return self._keywords

        words = self._tlpfilter.keywords(self._blob.words)
        keywords_counted = Counter(words)

        keywords_dict = dict()
        for word, count in keywords_counted.iteritems():
            if len(word) == 0:
                continue

            # keep nouns longer than 3 chars
            w, pos = nltk.pos_tag(nltk.word_tokenize(word))[0]
            # fix: the original '.*[NN|NP]$' is a character class matching any
            # tag ending in N, '|' or P (so VBN/VBP passed as nouns); test the
            # Penn noun tag prefixes instead (NN, NNS, NNP, NNPS, NP)
            if pos.startswith(('NN', 'NP')) and len(w) > 3:
                keywords_dict[word] = count

        keyword_scores = keywords_dict.values()
        # fix: guard the empty case -- np.mean([]) warns and yields nan
        keywords_mean = np.mean(keyword_scores) if keyword_scores else 0.0
        keywords_std = np.std(keyword_scores) if keyword_scores else 0.0

        self._debug['keywords']['total'] = sum(keyword_scores)
        self._debug['keywords']['mean'] = keywords_mean
        self._debug['keywords']['std'] = keywords_std

        # keep only extreme outliers (> mean + 4 sigma)
        threshold = keywords_mean + (keywords_std * 4)
        outliers = dict((k, v) for (k, v) in keywords_dict.iteritems()
                        if v > threshold)
        self._keywords = sorted(outliers.items(),
                                key=operator.itemgetter(1), reverse=True)
        return self._keywords
Example #4
0
class TLP:
    '''Threat-report wrapper: lazily computes iocs, keywords, a summary,
    and the TLP color from the supplied raw_text.'''

    def __init__(self, raw_text=None, text_title=None):
        '''Store raw_text/text_title and pre-compute filtered text/blobs.

        Byte strings are decoded as UTF-8. Setup errors are printed and
        suppressed so an instance is always returned.
        '''
        try:
            # props for internal use
            self._raw_text = raw_text
            self._text_title = text_title

            # props to store data (populated lazily by the properties below)
            self._summary = str()
            self._keywords = set()
            self._iocs = dict()
            self._tlp = None
            self._debug = {'iocs': dict(), 'keywords': dict()}

            if self._raw_text is not None:  # fix: was '!= None'
                # fix: isinstance rather than exact-type comparison
                if not isinstance(self._raw_text, unicode):
                    self._raw_text = self._raw_text.decode('utf8')
                self._tlpfilter = TLPFilter()
                self._clean_text = self._tlpfilter.text(self._raw_text)
                self._blob = TextBlob(self._raw_text)
                self._clean_blob = TextBlob(self._clean_text)

        except Exception:
            # intentional best-effort construction: report and continue
            import traceback
            traceback.print_exc()

    @property
    def iocs(self):
        '''returns a dict mapping ioc type -> set of matched tokens (cached)'''
        # NOTE: removed the original try/except whose 'raise e' destroyed
        # the original py2 traceback; exceptions now propagate unchanged
        if len(self._iocs) > 0:
            return self._iocs

        # prime the dict with one empty set per known ioc regex
        self._iocs = dict((k, set()) for k in regexs)

        # every pre-filtered token matching a pattern is collected
        for token in self._tlpfilter.iocs(self._raw_text, mode='pre'):
            for name, pattern in regexs.iteritems():
                if pattern.match(token):
                    self._iocs[name].add(token)

        self._iocs = self._tlpfilter.iocs(self._iocs, mode='post')
        for key in self._iocs:
            self._debug['iocs'][key] = len(self._iocs[key])
        return self._iocs

    @property
    def text(self):
        '''returns the complete filtered text (sentences joined by two spaces)'''
        return "  ".join([s.raw for s in self._clean_blob.sentences])

    @property
    def debug(self):
        '''returns debug info -  must run 'keywords' or 'iocs' to populate'''
        return self._debug

    @property
    def summary(self):
        '''returns document summary: the first ~6% of sentences, clamped 2..8'''
        if len(self._summary) > 0:
            return self._summary

        sentences = self._clean_blob.sentences
        six_pct = int(math.floor(len(sentences) * .06))
        summ_len = min(max(six_pct, 2), 8)

        # fix: store the summary so the cache guard above takes effect;
        # the original recomputed it on every access
        self._summary = "  ".join([s.raw for s in sentences[:summ_len]])
        return self._summary

    @property
    def color(self):
        '''returns the lowercased words that directly follow a tlp/TLP token'''
        colors = set()
        for one, two in ngrams(self._raw_text.split(), 2):
            if re.search('(?:tlp|TLP)', one):
                colors.add(two.lower())
        return colors

    @property
    def keywords(self):
        '''returns outlier noun keywords as (word, count) pairs, sorted by
        count descending; populates self._debug['keywords'] as a side effect'''
        if len(self._keywords) > 0:
            return self._keywords

        words = self._tlpfilter.keywords(self._blob.words)
        keywords_counted = Counter(words)

        keywords_dict = dict()
        for word, count in keywords_counted.iteritems():
            if len(word) == 0:
                continue

            # keep nouns longer than 3 chars
            w, pos = nltk.pos_tag(nltk.word_tokenize(word))[0]
            # fix: '.*[NN|NP]$' was a character class matching tags that merely
            # end in N, '|' or P (VBN/VBP slipped through); check Penn noun
            # tag prefixes (NN, NNS, NNP, NNPS, NP) instead
            if pos.startswith(('NN', 'NP')) and len(w) > 3:
                keywords_dict[word] = count

        keyword_scores = keywords_dict.values()
        # fix: np.mean/np.std on an empty list warn and produce nan
        keywords_mean = np.mean(keyword_scores) if keyword_scores else 0.0
        keywords_std = np.std(keyword_scores) if keyword_scores else 0.0

        self._debug['keywords']['total'] = sum(keyword_scores)
        self._debug['keywords']['mean'] = keywords_mean
        self._debug['keywords']['std'] = keywords_std

        # keep only extreme outliers (> mean + 4 sigma)
        threshold = keywords_mean + (keywords_std * 4)
        outliers = dict((k, v) for (k, v) in keywords_dict.iteritems()
                        if v > threshold)
        self._keywords = sorted(outliers.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
        return self._keywords