def load_password_database():
    """Load the DAWG of cracked passwords from the predefined filepath.

    Returns:
        dawg.DAWG: the password database loaded from
        PASSWORD_DATABASE_FILENAME.
    """
    database = dawg.DAWG()
    database.load(PASSWORD_DATABASE_FILENAME)
    return database
def build_test_data():
    """Build and save the fixture DAWG files used by the test suite.

    Writes the small hand-crafted fixtures under dev_data/small/ and the
    large generated ones under dev_data/large/.
    """
    dawg.CompletionDAWG(['f', 'bar', 'foo', 'foobar']).save(
        'dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    bytes_data = (('foo', b'data1'), ('bar', b'data2'),
                  ('foo', b'data3'), ('foobar', b'data4'))
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    record_data = (('foo', (3, 2, 256)), ('bar', (3, 1, 0)),
                   ('foo', (3, 2, 1)), ('foobar', (6, 3, 0)))
    # The str(...) wrappers around the struct format strings were a
    # Python 2 relic (formats had to be native str there); plain string
    # literals are already str on Python 3.
    dawg.RecordDAWG(">3H", record_data).save('dev_data/small/record.dawg')

    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save(
        'dev_data/small/int_completion_dawg.dawg')

    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    dawg.RecordDAWG("=H", [
        (k, (len(k),)) for k in TestPrediction.DATA
    ]).save('dev_data/small/prediction-record.dawg')

    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
def load_dawg_caches(self, correct_cache_fn, correction_indeces_fn,
                     corrections_list_fn):
    """Load the spelling-correction caches from disk.

    Args:
        correct_cache_fn: path of the DAWG of known-correct words.
        correction_indeces_fn: path of the IntDAWG mapping words to
            indices into the corrections list.
        corrections_list_fn: path of the pickled corrections list.
    """
    self.correct_cache = dawg.DAWG().load(correct_cache_fn)
    self.correction_indeces_cache = \
        dawg.IntDAWG().load(correction_indeces_fn)
    # The original leaked the file handle (open() result passed straight
    # to cPickle.load); open in binary mode and close deterministically.
    with open(corrections_list_fn, 'rb') as f:
        self.corrections_list = cPickle.load(f)
def test_no_segfaults_after_wrong_stream(self):
    missing_path = tempfile.mktemp()  # file doesn't exists
    d = dawg.DAWG()
    with pytest.raises(IOError):
        d.load(missing_path)
    # Querying after a failed load used to be able to segfault.
    assert 'random-key' not in d
def main():
    """Merge multi-part compound words back into a tokenized corpus.

    argv[1]: lexicon file ("word|pos|... _ _" per line; parts of a compound
        are joined with "_").
    argv[2]: tokenized corpus, one sentence per line, tokens as "word__POS".
    argv[3]: output file with compounds re-joined.
    """
    # Hard-coded corpus size; only used for the ETA estimate below.
    TOTAL_SENTENCES = 59853123

    max_length = 1
    compound_word_pos = defaultdict(set)
    # Pass 1: collect every multi-part word and track the longest compound.
    # (The original leaked both input file handles via bare open() loops.)
    with open(sys.argv[1]) as lexicon:
        for line in lexicon:
            data, _, _ = line.rstrip().split(" ")
            word, pos, _ = data.split("|")
            parts = word.split("_")
            max_length = max(len(parts), max_length)
            if len(parts) <= 1:
                continue
            compound_word_pos[word].add(pos)

    # Iterating the dict yields its keys directly; no throwaway list needed.
    compound_words_dawg = dawg.DAWG(compound_word_pos)

    with open(sys.argv[3], "w") as fout, open(sys.argv[2]) as corpus:
        start = time.time()
        for e, line in enumerate(corpus):
            line = line.strip()
            if line == "":
                continue
            elif line == "<doc>":
                print("<doc>", file=fout)
                continue
            elif line == "</doc>":
                print("</doc>", file=fout)
                continue
            processed_tokens = []
            # Keep the surface form; star proper nouns (NNP/NNPS).
            tokens = [
                x.split("__")[0] +
                ("*" if x.split("__")[1] in ["NNP", "NNPS"] else "")
                for x in line.split(" ")
            ]
            i = 0
            while i < len(tokens):
                # Join up to max_length upcoming tokens and look for the
                # longest known compound prefix.
                chars = "_".join(
                    [x.split("__")[0] for x in tokens[i:i + max_length]])
                common_prefix = compound_words_dawg.prefixes(chars)
                if common_prefix:
                    common_prefix_max = max(common_prefix, key=len)
                    processed_tokens.append(common_prefix_max)
                    # Skip the tokens consumed by the compound.
                    i += len(common_prefix_max.split("_"))
                else:
                    processed_tokens.append(tokens[i])
                    i += 1
            print(" ".join(processed_tokens), file=fout)
            if (e + 1) % 1000 == 0:
                # Single elapsed sample (the original called time.time()
                # twice, making the ETA internally inconsistent).
                elapsed = time.time() - start
                eta = TOTAL_SENTENCES / (e + 1) * elapsed - elapsed
                sys.stdout.write("\rsent: %i/%i\tETA: %f" %
                                 (e + 1, TOTAL_SENTENCES, eta))
                sys.stdout.flush()
def _load_s3_inventory(s3_list):
    """Load an inventory of existing S3 keys from *s3_list*.

    Args:
        s3_list: pathlib.Path of either a pre-built .dawg file or a plain
            text file with one key per line.

    Returns:
        dawg.DAWG when given a .dawg file, otherwise a set of key strings.
    """
    if s3_list.suffix == '.dawg':
        import dawg
        d = dawg.DAWG()
        d.load(str(s3_list))
        return d
    with open(s3_list, "r") as f:
        # Iterate the file directly; readlines() needlessly materialized
        # the whole file as a list first.
        return set(line.strip() for line in f)
def test_unicode_sorting(self):
    key1 = '\U00010345\U0001033f\U00010337\U00010330\U0001033d'
    key2 = '\uff72\uff9c\uff90\uff7b\uff9e\uff9c'
    # How these two compare differs between code-point order and the
    # order of their UTF-8 encodings; this apparently depends on
    # Python version:
    # assert key1 < key2
    # assert key1.encode('utf8') > key2.encode('utf8')
    #
    # The constructor must sort the data by UTF-8 byte values, not by
    # unicode sorting rules — it raises if the input is sorted the
    # unicode way, so a clean build here is the assertion.
    dawg.DAWG([key1, key2])
def build_dawg(file_name, out_fl=None):
    """Build a dawg.DAWG from a word-per-line file and save it to disk.

    Args:
        file_name: path of the input word list (opened via helper.open_,
            so compressed files are supported too).
        out_fl: output path; defaults to *file_name* with its extension
            replaced by ".dawg".
    """
    from helper import open_
    import dawg

    with open_(file_name) as f:
        words = (line.strip() for line in f)
        # The generator must be consumed while the file is still open.
        built = dawg.DAWG(words)
    if not out_fl:
        # The original unpacked into `f, e`, shadowing the file-handle
        # name and leaving `e` unused.
        root, _ext = os.path.splitext(file_name)
        out_fl = root + ".dawg"
    built.save(out_fl)
def test_sorted_iterable(self):
    # A lazily-read, already-sorted stream should build correctly when
    # input_is_sorted=True skips the internal sort.
    sorted_data = ['bar', 'foo', 'foobar']
    contents = "\n".join(sorted_data).encode('utf8')
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(contents)
        tmp.seek(0)
        word_stream = (line.strip() for line in tmp)
        d = dawg.DAWG(word_stream, input_is_sorted=True)
    assert 'bar' in d
    assert 'foo' in d
def test_no_segfaults_on_invalid_file(self):
    """Loading/reading garbage data must raise IOError, not crash."""
    d = dawg.DAWG()
    fd, path = tempfile.mkstemp()
    # open() accepts the raw descriptor and closes it on exit, so the
    # fd returned by mkstemp is no longer leaked.
    with open(fd, 'w') as f:
        f.write('foo')

    with pytest.raises(IOError) as excinfo:
        d.load(path)
    # The asserts used to sit inside the raises-block *after* the raising
    # call, so they never executed; ExceptionInfo also has no .args — the
    # exception's message lives on .value.
    assert 'Invalid' in excinfo.value.args[0]

    with open(path, 'rb') as f:
        with pytest.raises(IOError) as excinfo:
            d.read(f)
        assert 'Invalid' in excinfo.value.args[0]
def main(argv):
    """Read lines, build a DAWG, compress it and emit its binary form.

    argv[0] is the input path and argv[1] the output path; either may be
    "-" (the default) for stdin/stdout.
    """
    fname_in = argv[0] if len(argv) >= 1 else "-"
    fname_out = argv[1] if len(argv) >= 2 else "-"

    dog = dawg.DAWG()
    insert_lines_into_dawg(dog, load_lines(fname_in))
    dog.compress()

    sink = sys.stdout.buffer if fname_out == "-" else open(fname_out, "wb")
    with sink as file_out:
        file_out.write(dog.write())
    return 0
def create_dawgs(self, list_of_filepaths, force_pickle=True):
    """Build a (DAWG, CompletionDAWG) pair for every word file.

    Args:
        list_of_filepaths: paths of files to read words from.
        force_pickle: when True, also pickle each pair to disk as
            base_dawg_<i>.pkl / completion_dawg_<i>.pkl.

    Returns:
        list of (dawg.DAWG, dawg.CompletionDAWG) tuples, one per file.
    """
    all_dawgs = []
    # enumerate() replaces the repeated list.index(file) calls, which
    # were O(n) per lookup and returned the wrong index whenever the
    # same path appeared twice in the list.
    for idx, filepath in enumerate(list_of_filepaths, start=1):
        s_time = time.time()
        words = self.file_handler.get_words(filepath)
        base_dawg = dawg.DAWG(words)
        completion_dawg = dawg.CompletionDAWG(words)
        all_dawgs.append((base_dawg, completion_dawg))
        print(
            f"Created DAWGs {idx}/{len(list_of_filepaths)} "
            f"| TIME: {time.time() - s_time}")
        if force_pickle:
            self.pickle_dawg(f"base_dawg_{idx}.pkl", base_dawg)
            self.pickle_dawg(f"completion_dawg_{idx}.pkl", completion_dawg)
    return all_dawgs
def add_to_ignorelist(self, to_ignore):
    """Add strings to the internally held tuple of strings to ignore in
    processing text.

    Example:
        bagmaker = WordBagMaker()
        bagmaker.add_to_ignorelist(ignore.get_list())
        bagmaker.add_to_ignorelist(nltk.corpus.stopwords.words('english'))
        bagmaker.add_to_ignorelist(list(string.punctuation))

    Args:
        to_ignore: a single string or an iterable of strings to ignore.
            (Docstring previously named this ``list_to_ignore``.)
    """
    # Wrap a bare string so single values are accepted too.
    if isinstance(to_ignore, str):
        to_ignore = [to_ignore]
    # Union with the existing entries, dropping duplicates.  The old
    # list -> append-in-a-comprehension -> set -> tuple dance produced
    # the same result via a throwaway list of Nones.
    self._ignore = tuple(set(self._ignore) | set(to_ignore))
    self._ignore_dawg = dawg.DAWG(self._ignore)
#pip install DAWG-Python #installs ok, but it is a read-only version of a wrapper to DAWG #https://pypi.org/project/DAWG-Python/ import time import timeit import text_example import memory_profiler import dawg # if __name__ == "__main__": print(("RAM at start {:0.1f}MiB".format(memory_profiler.memory_usage()[0]))) # avoid building a temporary list of words in Python, store directly in the # DAWG t1 = time.time() words_dawg = dawg.DAWG(text_example.readers) t2 = time.time() print(("RAM after creating dawg {:0.1f}MiB, took {:0.1f}s".format(memory_profiler.memory_usage()[0], t2 - t1))) assert 'Zwiebel' in words_dawg time_cost = sum(timeit.repeat(stmt="u'Zwiebel' in words_dawg", setup="from __main__ import words_dawg", number=1, repeat=10000)) print(("Summed time to lookup word {:0.4f}s".format(time_cost))) t1 = time.time() words_dawg.save('words_dawg.saved') t2 = time.time() d = dawg.DAWG() with open('words_dawg.saved', 'rb') as f:
# Timing comparison between a GADDAG and DAWG/CompletionDAWG built from the
# Collins word list.
import timeit
import dawg
import gaddag

with open('collins.txt', 'r') as inFile:
    wordy = [x.strip() for x in inFile.readlines()]
words = set(wordy)
normalgaddag = gaddag.GADDAG(words)
normaldawg = dawg.DAWG(wordy)
complete = dawg.CompletionDAWG(wordy)
# print(normalgaddag.root["b"]['a'])
# So we can do single letter word follows GOOD

# Setup code re-executed by each timeit run below.
# NOTE(review): line breaks inside this string were lost in extraction and
# are reconstructed from the semicolons — confirm against the original.
s = '''
import dawg;
import gaddag;
with open('collins.txt','r') as inFile: wordy = [x.strip() for x in inFile.readlines()];
words = set(wordy);
normalgaddag = gaddag.GADDAG(words);
normaldawg = dawg.DAWG(wordy);
complete = dawg.CompletionDAWG(wordy);
'''

# NOTE(review): this definition continues beyond this excerpt.
def timest(stmt):
type=str, default=fugenlaute)  # NOTE(review): tail of a parser.add_argument(...) call that starts before this excerpt
arguments = parser.parse_args(sys.argv[1:])

print(timestamp(), "loading word2ved model")
#word2vec_model =
#gensim.models.Word2Vec.load_word2vec_format(arguments.word2vec_file, binary=True)
word2vec_model = gensim.models.Word2Vec.load(arguments.word2vec_file)

print(timestamp(), "building vocabulary ...")
prefix_vocab, suffix_vocab = build_vocabulary(
    word2vec_model, min_length=arguments.min_word_length)

if arguments.build_dawg_name:
    # Build fresh DAWG models from the vocabulary and save them.
    print(timestamp(), "building dawg models")
    dawg_model = dawg.DAWG(prefix_vocab)
    lower_suffix_dawg_model = dawg.DAWG(
        set(w.lower() for w in suffix_vocab))
    print(timestamp(), "saving dawg models")
    dawg_model.save(arguments.build_dawg_name + ".prefixes")
    lower_suffix_dawg_model.save(arguments.build_dawg_name + ".suffixes")
else:
    # Load previously saved DAWG models instead.
    print(timestamp(), "loading dawg models")
    dawg_model = dawg.DAWG()
    dawg_model.load(arguments.dawg_name + ".prefixes")
    # NOTE(review): the filename is passed to the DAWG *constructor* here
    # instead of being load()-ed like the prefixes model above — this builds
    # a DAWG over the characters of the path string. Looks like a bug;
    # confirm intended behavior.
    lower_suffix_dawg_model = dawg.DAWG(arguments.dawg_name + ".suffixes")

candidates = defaultdict(set)
print(timestamp(), "prefix pass ...")
# NOTE(review): this call continues beyond this excerpt.
add_prefix_combinations(candidates, prefix_vocab,
def build_dawg_from_dumps():
    """Build a directed acyclic word graph (dawg.DAWG) of every password
    (listed one per line) found in the files of the password_dumps
    directory.
    """
    passwords = get_valid_passwords_from_dumps()
    return dawg.DAWG(passwords)
def __init__(self, tokens):
    # Keep the tokens in a DAWG for compact storage and fast
    # membership/prefix queries.
    self.trie = dawg.DAWG(tokens)
def test_contains_with_null_bytes(self):
    # Byte-string membership must respect embedded NUL bytes rather than
    # treating them as terminators.
    built = dawg.DAWG(['foo'])
    assert b'foo' in built
    assert b'foo\x00bar' not in built
def test_build_errors(self):
    # Keys containing NUL bytes cannot be stored; the constructor must
    # raise dawg.Error for them.
    bad_keys = [b'foo\x00bar', b'bar']
    with pytest.raises(dawg.Error):
        dawg.DAWG(bad_keys)
import datetime
start = datetime.datetime.now()

import dawg

# Load the system dictionary once; the original open(...).read() leaked
# the file handle.
with open('/usr/share/dict/american-english', 'r') as dict_file:
    words = dict_file.read().splitlines()
d = dawg.DAWG(words)
cd = dawg.CompletionDAWG(words)

# Read an n x n letter grid from stdin, one space-separated row per line.
matrix = []
visited = []
n = int(input())
for i in range(n):
    matrix.append(input().split(' '))  # split() already returns a list
    visited.append([False] * n)

def do_word_exist(word):
    # NOTE(review): `rand` is undefined in this file — calling this raises
    # NameError. It looks like a placeholder (random.random() intended?);
    # flagged rather than silently replaced.
    return rand() > 0.5

def is_in_range(i, j):
    """Return True when (i, j) lies inside the n x n grid."""
    if i < 0 or j < 0:
        return False
    if i >= n or j >= n:
        return False
    return True

# Memoization table for the search (filled elsewhere).
mem = {}
def save_dawg(output_file, inventory_manifest):
    """Build a DAWG of every S3 key in the inventory and save it to
    *output_file*.
    """
    import dawg
    keys = (record.Key for record in list_inventory(inventory_manifest))
    dawg.DAWG(keys).save(output_file)
def __init__(self, lang):
    """Load the pre-built <lang>.dawg file shipped next to this module."""
    dawg_path = os.path.join(os.path.dirname(__file__), lang + '.dawg')
    self.dawg = dawg.DAWG()
    self.dawg.load(dawg_path)
# pip install DAWG failed
# https://github.com/pytries/DAWG/issues/31
# $ python text_example_dawg.py
# pip install DAWG-Python  # installs ok, but it is a read-only version of a wrapper to DAWG
# https://pypi.org/project/DAWG-Python/

# Benchmark script: measures RAM and time to load a previously saved DAWG
# from disk, then times repeated membership lookups.
import time
import timeit
import text_example
import memory_profiler
import dawg

# if __name__ == "__main__":
print(("RAM at start {:0.1f}MiB".format(memory_profiler.memory_usage()[0])))

load_start = time.time()
words_dawg = dawg.DAWG()
with open('words_dawg.saved', 'rb') as f:
    words_dawg.read(f)
load_end = time.time()
print(load_end - load_start)
print(("RAM after load {:0.1f}MiB".format(memory_profiler.memory_usage()[0])))

# Sanity check that a known word survived the round trip.
assert 'Zwiebel' in words_dawg

# 10k independent single lookups, summed (timeit imports words_dawg from
# __main__, hence the setup string).
time_cost = sum(timeit.repeat(stmt="u'Zwiebel' in words_dawg",
                              setup="from __main__ import words_dawg",
                              number=1,
                              repeat=10000))
print(("Summed time to lookup word {:0.4f}s".format(time_cost)))
def create_dawg():
    """Return a dawg.DAWG built from the 100k-word benchmark list."""
    return dawg.DAWG(words100k())
def load_as_dawg(filepath):
    """Build a dawg.DAWG from a word-per-line text file.

    Args:
        filepath: path of the file to read.

    Returns:
        dawg.DAWG with one key per (rstripped) line.
    """
    with open(filepath, 'r') as f:
        # Feed the file object straight to the constructor; readlines()
        # needlessly materialized every line as a list first.
        return dawg.DAWG(x.rstrip() for x in f)
def create_leet_dawg():
    """Return a dawg.DAWG built from the LEET_50k word list."""
    return dawg.DAWG(LEET_50k)
def test_dawg_prediction(self, word, prediction):
    # similar_keys() with the replacement table must yield exactly the
    # expected prediction list.
    built = dawg.DAWG(self.DATA)
    assert built.similar_keys(word, self.REPLACES) == prediction
'''
String data in a DAWG may take 200x less memory than in a standard Python
dict and the raw lookup speed is comparable; it also provides fast advanced
methods like prefix search.
'''
import dawg

words = [u'foo', u'bar', u'foobar', u'foö', u'bör']
base_dawg = dawg.DAWG(words)
completion_dawg = dawg.CompletionDAWG(words)

print("foo" in base_dawg)
print(completion_dawg.has_keys_with_prefix(u'f'))
print(base_dawg.prefixes(u'foobarz'))
# Unresolved git merge-conflict markers removed; the HEAD side is kept
# because CompletionDAWG exposes has_keys_with_prefix(), not has() — the
# other branch's call would raise AttributeError.
# print(completion_dawg.has(u'f'))
def main(argv):
    """Build a DAWG from a word list, test it, compress it, round-trip it
    through the binary format, and test again.

    Args:
        argv: command-line args; argv[0] is the word-list path
            (default "words.txt").

    Returns:
        0 on success, -1 when any test_dawg() pass reported failures.
    """
    nfailed = 0
    arg1 = "words.txt" if not argv else argv[0]
    dog = dawg.DAWG()

    lines = load_lines(arg1)
    nloaded = len(lines)
    log("{} lines loaded".format(nloaded))
    log("filtering lines")
    lines = filter_lines(lines)
    nlines = len(lines)

    # When True, every suffix of each line is inserted instead of just the
    # line itself.  (The original assigned True and immediately overwrote
    # it with False — dead assignment removed.)
    insert_suffixes = False

    ninserted = 0
    for n, line in enumerate(lines):
        for i in range(len(line) if insert_suffixes else 1):
            dog.insert(line[i:])
            ninserted += 1
        if (n % 100) == 0 or n + 1 == len(lines):
            sys.stderr.write("inserted line {} of {}\r".format(n + 1, nlines))
            sys.stderr.flush()
    sys.stderr.write("\n")
    # The original format string had an unbalanced ")" after the percent.
    log("{} of {} ({}%) lines inserted".format(
        ninserted, nlines, 100.0 * ninserted / nlines))

    # Test the raw (uncompressed) DAWG and dump its contents.
    nfailed += test_dawg(dog, lines)
    dumpf = "dump0.txt"
    log("dump to {}".format(dumpf))
    dog.dump_strings(dumpf)

    # Compress in place and re-test.
    log("compress")
    dog.compress()
    nfailed += test_dawg(dog, lines)
    dumpf = "dump1.txt"
    log("dump to {}".format(dumpf))
    dog.dump_strings(dumpf)

    # Round-trip through the binary on-disk format and test that copy too.
    bin_fname = arg1 + ".dawg"
    log("write to binary {}".format(bin_fname))
    dog.write(bin_fname)
    log("read from binary {}".format(bin_fname))
    dog2 = dawg.DAWG.read(bin_fname)
    dumpf = "dump2.txt"
    log("dump binary to {}".format(dumpf))
    dog2.dump_strings(dumpf)
    log("test binary")
    nfailed += test_dawg(dog2, lines)

    return -1 if nfailed else 0