def create_model(modelfunc, fname='', listw=(), outfname=''):
    """Build and persist an IntCompletionDAWG frequency model.

    :modelfunc: a function that takes a word and returns its splits.
        For an ngram model this function returns all the ngrams of a
        word; for PCFG it returns the split of the password.
        (func: string -> [list of strings])
    :fname: optional file of "password count" lines, read via helper.
    :listw: extra (password, count) pairs to fold into the model.
        BUG FIX: default was the mutable ``[]``, which is shared across
        calls; an immutable empty tuple is safe and iterates the same.
    :outfname: where to save the model; defaults to 'tmpmodel.dawg'.
    :returns: the built dawg.IntCompletionDAWG.
    """
    pws = []
    if fname:
        pws = helper.open_get_line(fname)

    def join_iterators(_pws, _listw):
        # Chain the file-based and in-memory password sources.
        for p in _pws:
            yield p
        for p in _listw:
            yield p

    big_dict = defaultdict(int)
    for pw, c in join_iterators(pws, listw):
        for ng in modelfunc(pw):
            # unicode() keeps DAWG keys in a single text type (py2 code).
            big_dict[unicode(ng)] += c
    # Grand total of all frequencies, stored under a reserved key.
    # (Computed before the key is inserted, so it excludes itself.)
    big_dict['__TOTAL__'] = sum(big_dict.values())
    nDawg = dawg.IntCompletionDAWG(big_dict)
    if not outfname:
        outfname = 'tmpmodel.dawg'
    nDawg.save(outfname)
    return nDawg
def load_from_files(pickle_fn, dawg_fn, prefix_dawg_fn):
    """Reconstruct an entity database from its three on-disk parts.

    :pickle_fn: pickled entity_db object (everything but the DAWGs).
    :dawg_fn: saved IntCompletionDAWG for the main entity index.
    :prefix_dawg_fn: saved IntDAWG for the long-entity index.
    :returns: the reassembled entity_db.

    BUG FIX: the pickle file was opened without ever being closed;
    a ``with`` block guarantees the handle is released.
    """
    # SECURITY NOTE: cPickle.load executes arbitrary code if the file is
    # untrusted — only load model files you produced yourself.
    with open(pickle_fn, "rb") as f:
        entity_db = cPickle.load(f)
    # The DAWG attributes are not picklable; rebuild them via load().
    entity_db.dawg = dawg.IntCompletionDAWG()
    entity_db.dawg.load(dawg_fn)
    entity_db.long_entities = dawg.IntDAWG()
    entity_db.long_entities.load(prefix_dawg_fn)
    return entity_db
def build_test_data():
    """Write the small and large DAWG fixture files under dev_data/."""
    # --- small fixtures -------------------------------------------------
    completion_keys = ['f', 'bar', 'foo', 'foobar']
    dawg.CompletionDAWG(completion_keys).save('dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    bytes_data = (
        ('foo', b'data1'),
        ('bar', b'data2'),
        ('foo', b'data3'),
        ('foobar', b'data4'),
    )
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    record_data = (
        ('foo', (3, 2, 256)),
        ('bar', (3, 1, 0)),
        ('foo', (3, 2, 1)),
        ('foobar', (6, 3, 0)),
    )
    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')

    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save(
        'dev_data/small/int_completion_dawg.dawg')

    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    prediction_records = [(k, (len(k), )) for k in TestPrediction.DATA]
    dawg.RecordDAWG(str("=H"), prediction_records).save(
        'dev_data/small/prediction-record.dawg')

    # --- large fixtures -------------------------------------------------
    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
def create_model(modelfunc, fname='', listw=(), outfname='',
                 limit=int(3e6), min_pwlen=6, topk=10000, sep=r'\s+'):
    """Build and persist a gzip-saved IntCompletionDAWG frequency model.

    :modelfunc: a function that takes a word and returns its splits;
        for an ngram model it returns all ngrams of a word, for PCFG it
        returns the splits of the password. (string -> [list of strings])
    :fname: name of the file to read (password, count) lines from.
    :listw: extra (password, count) pairs; both sources are used.
        BUG FIX: default was the mutable ``[]`` (shared across calls);
        an immutable empty tuple iterates identically and is safe.
    :outfname: file to write the model to ('.gz' appended if missing).
    :limit: maximum number of lines read from fname.
    :min_pwlen: passwords shorter than this (after stripping characters
        outside VALID_CHARS) are filtered out.
    :topk: how many of the most frequent passwords to re-add whole
        (wrapped in START/END markers); <= 0 disables the step.
    :sep: regex separating password from count in the input file.
    :returns: the built dawg.IntCompletionDAWG.
    """
    def length_filter(pw):
        # Keep only passwords long enough after removing invalid chars.
        pw = ''.join(c for c in pw if c in VALID_CHARS)
        return len(pw) >= min_pwlen

    pws = []
    if fname:
        pws = helper.open_get_line(fname, limit=limit,
                                   pw_filter=length_filter, sep=sep)

    big_dict = defaultdict(int)
    total_f, total_e = 0, 0
    # Min-heap of the topk most frequent (count, password) pairs seen.
    topk_pws = []
    for pw, c in itertools.chain(pws, listw):
        for ng in modelfunc(pw):
            big_dict[ng] += c
        total_f += c
        total_e += 1
        # Progress report roughly every 100k distinct entries.
        if len(big_dict) % 100000 == 0:
            print("Dictionary size: {} (Total_freq: {}; Total_pws: {}"
                  .format(len(big_dict), total_f, total_e))
        if len(topk_pws) >= topk:
            heapq.heappushpop(topk_pws, (c, pw))
        else:
            heapq.heappush(topk_pws, (c, pw))

    # Adding topk passwords to deal with probability reduction of popular
    # passwords. Mostly effective for n-gram models.
    print("topk={}".format(topk))
    if topk > 0:
        for c, pw in topk_pws:
            tpw = helper.START + pw + helper.END
            big_dict[tpw] += c
            total_f += c
            total_e += 1

    # Reserved bookkeeping keys: number of passwords and total frequency.
    big_dict[NPWS_W] = total_e
    big_dict[TOTALF_W] = total_f
    nDawg = dawg.IntCompletionDAWG(big_dict)
    if not outfname:
        outfname = 'tmpmodel.dawg.gz'
    elif not outfname.endswith('.gz'):
        outfname += '.gz'
    pathlib.Path(outfname).parent.mkdir(parents=True, exist_ok=True)
    helper.save_dawg(nDawg, outfname)
    return nDawg
def finalize(self):
    """Flush all caches and freeze the accumulated data into DAWGs.

    Order matters here: the per-key caches and the value cache must be
    finalized before the value/entity finalization steps that consume
    their output — do not reorder these calls.
    """
    # py2 idiom: iterate cache objects without materializing a list.
    for cache in self.caches.itervalues():
        cache.finalize()
    self.value_cache.finalize()
    logging.info("Finalizing values...")
    self.finalize_values()
    logging.info("Creating main dawg...")
    # self.d is the builder mapping (presumably entity -> int id/count —
    # confirm against the class's add path); compress it into a DAWG.
    self.dawg = dawg.IntCompletionDAWG(self.d)
    # Drop the potentially huge builder dict before the next build step
    # to keep peak memory down.
    del self.d
    self.finalize_long_entities()
    logging.info("finalizing done.")
class RouterView(object):
    """
    Route all HTTP requests to the corresponding view.
    """
    # View classes that must never be auto-registered (base/template views).
    KLASS_EXCLUSIONS = {HTMLTemplateView,
                        VulnerableTemplateView,
                        StaticFileView,
                        FormTemplateView,
                        RawPathTemplateView}
    DIR_EXCLUSIONS = set()
    FILE_EXCLUSIONS = {'__init__.py'}

    def __init__(self):
        self._plugin_families = set(get_plugin_families())

        # Will be generated on autoregister
        self._mapping = None
        self._view_instances = []
        self._view_files = []

        self._autoregister()

    def _autoregister(self):
        """
        We go through the moth/views/ directory, importing all the modules
        and finding subclasses of VulnerableTemplateView. When we find one,
        we get the URL pattern from it, create an instance and call
        _register.

        :return: None, calls _register which stores the info in _mapping.
        """
        data = []

        for fname in self._get_vuln_view_files(
                self._get_vuln_view_directory()):
            for klass in self._get_views_from_file(fname):
                try:
                    view_obj = klass()
                except Exception as e:
                    # BUG FIX: the old handler formatted the message with
                    # view_obj, which is unbound when klass() itself raises
                    # — that turned any view error into a NameError. Report
                    # the class that failed instead.
                    msg = 'An exception occured while trying to register %s: "%s"'
                    raise RuntimeError(msg % (klass, e))
                else:
                    self._view_instances.append(view_obj)
                    view_index = len(self._view_instances) - 1
                    data.append((view_obj.get_unicode_url_path(), view_index))

        # URL path -> index into _view_instances, compressed for fast
        # prefix lookups during routing.
        self._mapping = dawg.IntCompletionDAWG(data)
def create_model(modelfunc, fname='', listw=(), outfname=''):
    """Build and persist an IntCompletionDAWG frequency model.

    :modelfunc: a function that takes a word and returns its splits;
        for an ngram model it returns all ngrams of a word, for PCFG it
        returns the split of the password. (string -> [list of strings])
    :fname: name of the file to read (password, count) lines from.
    :listw: extra (password, count) pairs; both sources are used.
        BUG FIX: default was the mutable ``[]``, shared across calls;
        an immutable empty tuple iterates identically and is safe.
    :outfname: the file to write the model to; defaults to
        'tmpmodel.dawg'.
    :returns: the built dawg.IntCompletionDAWG.
    """
    pws = []
    if fname:
        # int(3e6) rather than the float 3e6: line limits are counts.
        pws = helper.open_get_line(fname, limit=int(3e6))

    def join_iterators(_pws, _listw):
        # Chain the file-based and in-memory password sources.
        for p in _pws:
            yield p
        for p in _listw:
            yield p

    big_dict = defaultdict(int)
    total_f, total_e = 0, 0
    for pw, c in join_iterators(pws, listw):
        for ng in modelfunc(pw):
            big_dict[ng] += c
            # Progress report roughly every 100k distinct entries.
            if len(big_dict) % 100000 == 0:
                print("Dictionary size: {}".format(len(big_dict)))
        total_f += c
        total_e += 1
    # Reserved bookkeeping keys: password count and total frequency.
    big_dict['__TOTAL__'] = total_e
    big_dict['__TOTALF__'] = total_f
    nDawg = dawg.IntCompletionDAWG(big_dict)
    if not outfname:
        outfname = 'tmpmodel.dawg'
    nDawg.save(outfname)
    return nDawg
# Standard library
import csv  # BUG FIX: csv.reader was used below but never imported
import datetime
import re
import time
from heapq import nlargest
from operator import itemgetter

# Third-party
import numpy as np
import pandas as pd  # BUG FIX: pd.read_csv was used below but never imported
import dawg
from nltk import ngrams

# Load the ngram frequency table, dropping rare ngrams (count <= 2),
# and compress it into a prefix-searchable DAWG.
with open('../data/ngram_dict.csv', mode='r') as infile:
    reader = csv.reader(infile)
    # ngram_dict = {rows[0]:rows[1] for rows in reader}
    ngram_data = [
        tuple([line[0], int(line[1])]) for line in reader if int(line[1]) > 2
    ]
ngram_dict = dawg.IntCompletionDAWG(ngram_data)
# prefix_suffix_pairs_background = pd.read_csv("../data/prefix_suffix_pairs.txt")

# Importing all the necessary dictionaries
suffixes = pd.read_csv("../data/Freq_background.csv", index_col='Unnamed: 0')

# Popular-query counts: skip the header row, drop rare queries (<= 2),
# and build a second completion DAWG.
with open('../data/sorted_popular_queries.csv', mode='r') as infile:
    reader = csv.reader(infile)
    skipheader = next(reader)
    data = [
        tuple([line[1], int(line[2])]) for line in reader if int(line[2]) > 2
    ]
sortedpopulardict = dawg.IntCompletionDAWG(data)
# Release the intermediate lists now that the DAWGs own the data.
data = []
ngram_data = []
return list(itertools.chain(*p)) else: return wrap_func((func, data)) def diff(oldG, newG): """ returns the difference of the two grammars. """ if not (isinstance(oldG, dict) and isinstance(newG, dict)): yield (oldG, newG) else: for k in oldG.keys(): if k not in newG: yield k else: vold, vnew = oldG[k], newG[k] if vold != vnew: diff(oldG[k], newG[k]) if __name__=='__main__': import dawg pws = dict(get_line(open_(sys.argv[1])), lim=1e7) new_fname = sys.argv[1].replace('.tar.bz', '.dawg') if new_fname == sys.argv[1]: new_fname = sys.argv[1].split('.', 1)[0] + '.dawg' assert new_fname != sys.argv[1], "Give a better name to your original file." T = dawg.IntCompletionDAWG(pws.items()) T.save('{}'.format(new_fname))
def test_no_segfaults_on_empty_dawg(self):
    """Building an IntCompletionDAWG from no data must not crash."""
    empty = dawg.IntCompletionDAWG([])
    assert empty.keys() == []
def empty_dawg(self):
    """Return a freshly constructed IntCompletionDAWG with no entries."""
    fresh = dawg.IntCompletionDAWG()
    return fresh
def dawg(self):
    """Build an IntCompletionDAWG mapping each key to its own length."""
    pairs = [(key, len(key)) for key in self.keys]
    return dawg.IntCompletionDAWG(pairs)
def read_dawg(fname):
    """Load an IntCompletionDAWG previously saved to *fname*.

    BUG FIX: the old code passed the file name to the constructor —
    which expects (key, int) data, not a path — and relied on the
    return value of load(). Build an empty DAWG, load() into it, and
    return it explicitly.
    """
    print("reading {fname}".format(fname=fname))
    nDawg = dawg.IntCompletionDAWG()
    nDawg.load(fname)
    return nDawg
def read_dawg(fname):
    """Load an IntCompletionDAWG previously saved to *fname*.

    BUG FIX: the constructor takes (key, int) data, not a path;
    feeding it the file name is wrong — load() is what reads the file.
    """
    nDawg = dawg.IntCompletionDAWG()
    nDawg.load(fname)
    return nDawg
def create_int_completion_dawg():
    """Build an IntCompletionDAWG mapping the 100k test words to their lengths."""
    return dawg.IntCompletionDAWG(
        (word, len(word)) for word in words100k())
def create_trie(self):
    """Compress self.keys into an IntCompletionDAWG for fast prefix lookups."""
    trie = dawg.IntCompletionDAWG(self.keys)
    self.dawg = trie