def get_2_gram_result(input_domain):
    """Score a domain name by the corpus frequency of its character bigrams.

    Looks up every 2-gram of ``input_domain`` in the module-level
    ``dic_2gram`` table (bigram -> corpus count), averages those counts
    over the total number of bigram occurrences in the domain, and
    normalizes by the corpus size so the result is a small ratio.

    :param input_domain: domain string to score; the empty string scores 0.
    :return: normalized average bigram frequency (float), or 0 for ''.
    """
    if input_domain == '':
        return 0

    bi_gram_list = calc_ngram(input_domain, 2)

    # Total number of bigram occurrences in the domain
    # (calc_ngram yields (bigram, count) pairs).
    count = 0
    for each in bi_gram_list:
        count += each[1]

    # Sum the corpus frequency of each distinct bigram found in the
    # domain; bigrams missing from the table contribute 0.
    tmp_sum = 0.0
    for item in bi_gram_list:
        tmp_sum += float(dic_2gram.get(item[0], 0))

    if len(bi_gram_list) == 0:
        tmp_2gram_nor_score = 0
    else:
        # count >= 1 whenever bi_gram_list is non-empty, so this is safe.
        tmp_2gram_nor_score = tmp_sum / count

    # 1024908267229 is the token count of the Google Web Trillion Word
    # Corpus the frequency table was built from; dividing turns the
    # average count into a probability-like ratio.
    return tmp_2gram_nor_score / 1024908267229.0
def define_useragent(useragents, output='score'):
    """Define individual elements of the useragent.

    Useragents should be a list of suspect useragents to evaluate.

    :param useragents: list of user-agent strings to evaluate.
    :param output: 'score' (default) returns a threat score per UA via
        ``_score()``; set output='json' to receive the raw feature dict
        (a JSON-style blob of descriptive information) instead.
    :return: feature dict when output == 'json', else ``_score(...)``.
    """
    response_dict = {'results': {}}
    # Always permit these agents.
    allowlist = ('curl', 'mobileasset', 'microsoft ncsi')
    # Always block these user agents.
    # FIX: the original ('mozilla/4.0') was a bare string (missing
    # trailing comma), so the `in` test below was a substring check
    # rather than exact membership in a tuple.
    blocklist = ('mozilla/4.0',)
    for agent in useragents:
        pua = woothee.parse(agent)
        open_count = len([x for x in list(agent) if x in ['(', '[']])
        close_count = len([x for x in list(agent) if x in [')', ']']])
        response_dict['results'].update({agent: {}})
        allow = block = False
        if agent.split(' ')[0].lower() in allowlist:
            allow = True
        elif agent.split(' ')[0].lower() in blocklist:
            block = True
        response_dict['results'][agent].update(pua)
        # FIX: original referenced the misspelled name 'alllow' -> NameError.
        response_dict['results'][agent].update({'allowlisted': allow})
        response_dict['results'][agent].update({'blocklisted': block})
        response_dict['results'][agent].update(
            {'tokens': len(agent.split(' '))})
        response_dict['results'][agent].update(
            {'ngrams': [x for x in pyngram.calc_ngram(agent, 2) if x[1] > 1]})
        # Parens/brackets should balance in a well-formed UA.
        if open_count != close_count:  # unbalanced
            response_dict['results'][agent].update({'unbalanced': True})
        else:
            response_dict['results'][agent].update({'unbalanced': False})
        # Malformed: should be '; ' between settings.
        if ';' in agent and '; ' not in agent:
            response_dict['results'][agent].update(
                {'malformed_semicolon': True})
        else:
            response_dict['results'][agent].update(
                {'malformed_semicolon': False})
        if '/' in agent and ' ' in agent and '(' not in agent:
            response_dict['results'][agent].update({'malformed_noparen': True})
        else:
            response_dict['results'][agent].update(
                {'malformed_noparen': False})
        # SQLi/XSS tactics sometimes show up embedded in fake UAs.
        if ('==' in agent or '<' in agent
                or '>' in agent or '`' in agent):
            response_dict['results'][agent].update(
                {'malformed_hacklang': True})
        else:
            response_dict['results'][agent].update(
                {'malformed_hacklang': False})
        # Length is kinda interesting on its own.
        response_dict['results'][agent].update(
            {'length': len(agent)})
    if output == 'json':
        return response_dict
    else:
        return _score(response_dict)
def _pattern_bigram(pattern_str, bigram):
    """Return how often ``bigram`` occurs in ``pattern_str`` (0 if absent)."""
    # A single character cannot contain a bigram.
    if len(pattern_str) == 1:
        return 0
    # calc_ngram yields (bigram, count) pairs; turn them into a lookup table.
    counts = dict(calc_ngram(pattern_str, 2))
    return counts.get(bigram, 0)
def _pattern_bigram(pattern_str, bigram):
    """Count the occurrences of ``bigram`` within ``pattern_str``."""
    # No bigrams are possible in a one-character string.
    if len(pattern_str) == 1:
        return 0
    # Scan the (bigram, count) pairs for the one we were asked about.
    for gram, occurrences in calc_ngram(pattern_str, 2):
        if gram == bigram:
            return occurrences
    return 0
def _count_bigram(pattern_str, bigram):
    """Function that counts how many times the given bigram appears."""
    # A one-character pattern holds no bigrams at all.
    if len(pattern_str) == 1:
        return 0
    # Build a bigram -> count mapping from calc_ngram's pair list.
    frequency = {gram: n for gram, n in calc_ngram(pattern_str, 2)}
    if bigram in frequency:
        return frequency[bigram]
    return 0
def ave_trigram_nor_score(self, file):
    '''Average normalized trigram score over all words in ``file``.

    Each word's score is the mean, over trigram occurrences, of the
    normalized frequency of its 3-grams as read from
    ``self.trigram_nor_dic``; a trigram missing from the table falls
    back to weight 1.  Words too short to yield a trigram score 0 but
    still count toward the word total.

    :param file: path of the text file to score.
    :return: float -- mean of the per-word scores (0.0 for an empty file).
    '''
    # FIX: removed the dead nested helper ``nsyl`` (defined but never
    # called) and the unused local ``ret``; the old docstring described
    # behavior ("count how many times number is mentioned") this method
    # never had.
    total = 0
    L = 0  # number of words processed
    # Hoisted out of the per-line loop: the punctuation set is invariant.
    exclude = set(string.punctuation)
    with open(file) as f:
        for line in f:
            # Remove all the punctuation, then tokenize what is left.
            line = ''.join(ch for ch in line if ch not in exclude)
            tokenized_lst = self.tokenize(line)
            for w in tokenized_lst:
                # e.g. calc_ngram('gooogle', 2)
                #      -> [('oo', 2), ('go', 1), ('gl', 1), ...]
                ngram_lst = calc_ngram(w, 3)
                local_total = 0
                local_L = 0
                if ngram_lst:
                    for gram, occurrences in ngram_lst:
                        if gram in self.trigram_nor_dic:
                            local_total += (
                                self.trigram_nor_dic[gram] * occurrences)
                        else:
                            # Unseen trigram: fall back to weight 1.
                            local_total += 1 * occurrences
                        local_L += occurrences
                else:
                    # Word shorter than 3 chars: contributes 0 / 1.
                    local_L = 1
                total += (local_total / float(local_L))
                L += 1
    if L == 0:
        # Empty input: avoid dividing by zero below.
        L = 1
    return total / float(L)
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 26 15:49:21 2015

@author: bryan_000
"""
from pyngram import calc_ngram

import nltk

# Count 2-grams with pyngram: returns a list of (bigram, count) pairs.
txt = 'aaaabbbcccdb'
results = calc_ngram(txt, 2)
print(results)

# Same text through NLTK's bigram generator, for comparison.
# FIX: the original `print nltk` printed the module object instead of
# the computed bigrams; materialize the generator so the pairs show.
a = nltk.bigrams(txt)
print(list(a))