Exemple #1
0
def load_password_database():
    """Load the DAWG of cracked passwords from its predefined file path.

    Returns:
        The ``dawg.DAWG`` instance after ``PASSWORD_DATABASE_FILENAME`` has
        been loaded into it.
    """
    password_dawg = dawg.DAWG()
    password_dawg.load(PASSWORD_DATABASE_FILENAME)
    return password_dawg
Exemple #2
0
def build_test_data():
    """Build and save the DAWG fixture files under ``dev_data/small`` and
    ``dev_data/large``.
    """
    # Completion DAWGs: one populated, one empty.
    completion_keys = ['f', 'bar', 'foo', 'foobar']
    dawg.CompletionDAWG(completion_keys).save(
        'dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    # Bytes payloads; the key 'foo' appears twice with different values.
    bytes_data = (
        ('foo', b'data1'),
        ('bar', b'data2'),
        ('foo', b'data3'),
        ('foobar', b'data4'),
    )
    bytes_dawg = dawg.BytesDAWG(bytes_data)
    bytes_dawg.save('dev_data/small/bytes.dawg')

    # Records of three values each, packed with the ">3H" struct format.
    record_data = (
        ('foo', (3, 2, 256)),
        ('bar', (3, 1, 0)),
        ('foo', (3, 2, 1)),
        ('foobar', (6, 3, 0)),
    )
    record_dawg = dawg.RecordDAWG(str(">3H"), record_data)
    record_dawg.save('dev_data/small/record.dawg')

    # The same integer mapping is stored in both integer DAWG flavours.
    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save(
        'dev_data/small/int_completion_dawg.dawg')

    # Prediction fixtures: the plain keys, and each key paired with its length.
    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    length_records = [(key, (len(key), )) for key in TestPrediction.DATA]
    dawg.RecordDAWG(str("=H"), length_records).save(
        'dev_data/small/prediction-record.dawg')

    # Large fixtures come from the module's create_* factories.
    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
Exemple #3
0
    def load_dawg_caches(self, correct_cache_fn, correction_indeces_fn,
                        corrections_list_fn):
        """Load the three correction caches from disk onto this instance.

        Args:
            correct_cache_fn: path of a saved ``dawg.DAWG``; the loaded
                object is stored as ``self.correct_cache``.
            correction_indeces_fn: path of a saved ``dawg.IntDAWG``; the
                loaded object is stored as ``self.correction_indeces_cache``.
            corrections_list_fn: path of a pickle file; the unpickled object
                is stored as ``self.corrections_list``.
        """
        self.correct_cache = dawg.DAWG().load(correct_cache_fn)
        self.correction_indeces_cache = \
                dawg.IntDAWG().load(correction_indeces_fn)
        # Use a context manager so the pickle file is closed even if
        # unpickling fails (the handle used to be left to the garbage
        # collector). The open mode is deliberately left unchanged.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(corrections_list_fn) as corrections_file:
            self.corrections_list = cPickle.load(corrections_file)
Exemple #4
0
    def test_no_segfaults_after_wrong_stream(self):
        """A failed ``load`` must leave the DAWG usable for lookups."""
        # mktemp only returns a name, so nothing exists at this path.
        missing_path = tempfile.mktemp()
        empty_dawg = dawg.DAWG()

        with pytest.raises(IOError):
            empty_dawg.load(missing_path)

        # This lookup after the failed load is where a segfault could occur.
        assert 'random-key' not in empty_dawg
Exemple #5
0
def main():
    """Rewrite a tagged corpus so that known compound words become one token.

    Command-line arguments (read from ``sys.argv``):
        1: compound-word list. Each line holds three space-separated fields;
           the first has the form ``word|pos|...``. Words containing ``_``
           are treated as compounds.
        2: corpus, one sentence per line, tokens written as ``word__TAG`` and
           separated by single spaces. ``<doc>`` / ``</doc>`` lines are
           copied through and empty lines are dropped.
        3: output path.

    A token without a ``__TAG`` part raises ``IndexError``.
    """
    # Total number of corpus lines; hard-coded and used only for the ETA.
    total_sentences = 59853123

    max_length = 1
    compound_word_pos = defaultdict(set)
    # Context manager so the word list is closed once it has been read.
    with open(sys.argv[1]) as compound_file:
        for line in compound_file:
            data, _, _ = line.rstrip().split(" ")
            word, pos, _ = data.split("|")
            n_parts = len(word.split("_"))
            max_length = max(n_parts, max_length)

            if n_parts <= 1:
                continue

            compound_word_pos[word].add(pos)

    compound_words_dawg = dawg.DAWG(list(compound_word_pos))

    with open(sys.argv[3], "w") as fout, open(sys.argv[2]) as corpus:
        start = time.time()
        for e, line in enumerate(corpus):
            line = line.strip()
            if line == "":
                continue
            if line in ("<doc>", "</doc>"):
                print(line, file=fout)
                continue

            # Keep the word part of each token; proper nouns get a "*" suffix.
            tokens = []
            for raw_token in line.split(" "):
                parts = raw_token.split("__")
                marker = "*" if parts[1] in ["NNP", "NNPS"] else ""
                tokens.append(parts[0] + marker)

            processed_tokens = []
            i = 0
            while i < len(tokens):
                # Join up to max_length upcoming tokens and look for the
                # longest compound that is a prefix of that string.
                chars = "_".join(
                    [x.split("__")[0] for x in tokens[i:i + max_length]])
                common_prefix = compound_words_dawg.prefixes(chars)
                if common_prefix:
                    common_prefix_max = max(common_prefix, key=len)
                    processed_tokens.append(common_prefix_max)
                    # Skip as many tokens as the matched compound has parts.
                    i += len(common_prefix_max.split("_"))
                else:
                    processed_tokens.append(tokens[i])
                    i += 1

            print(" ".join(processed_tokens), file=fout)

            if (e + 1) % 1000 == 0:
                # Read the clock once so both terms use the same elapsed time.
                elapsed = time.time() - start
                eta = total_sentences / (e + 1) * elapsed - elapsed
                sys.stdout.write("\rsent: %i/%i\tETA: %f" %
                                 (e + 1, total_sentences, eta))
                sys.stdout.flush()
def _load_s3_inventory(s3_list):
    """Load the known S3 keys from ``s3_list`` into a container that supports
    membership tests.

    A path whose suffix is ``.dawg`` is loaded into a ``dawg.DAWG``. Any other
    path is read as text and the whitespace-stripped lines are returned as a
    set.
    """
    if s3_list.suffix != '.dawg':
        with open(s3_list, "r") as inventory:
            return {entry.strip() for entry in inventory}

    # Imported lazily so plain-text inventories work without the dawg package.
    import dawg
    inventory_dawg = dawg.DAWG()
    inventory_dawg.load(str(s3_list))
    return inventory_dawg
Exemple #7
0
    def test_unicode_sorting(self):
        """The constructor must accept keys whose unicode order and utf8 byte
        order may disagree.
        """
        # The first key uses code points above U+FFFF; the second stays
        # within U+FFxx.
        astral_key = '\U00010345\U0001033f\U00010337\U00010330\U0001033d'
        bmp_key = '\uff72\uff9c\uff90\uff7b\uff9e\uff9c'

        # This apparently depends on Python version:
        # assert astral_key < bmp_key
        # assert astral_key.encode('utf8') > bmp_key.encode('utf8')

        # Constructor should sort data according to utf8 values,
        # not according to unicode sorting rules. It will raise an exception
        # if data is sorted according to unicode rules.
        keys = [astral_key, bmp_key]
        dawg.DAWG(keys)
Exemple #8
0
def build_dawg(file_name, out_fl=None):
    """
    Build a dawg.DAWG from the lines of a file and save it to disk.

    Args:
        file_name: path of the input file; each line, stripped of
            surrounding whitespace, becomes one key.
        out_fl: path to save the DAWG to. When falsy, ``file_name`` with its
            extension replaced by ``.dawg`` is used.
    """
    import os

    import dawg
    from helper import open_

    with open_(file_name) as f:
        # The generator is consumed by the constructor while the file is open.
        D = dawg.DAWG(l.strip() for l in f)

    if not out_fl:
        # A separate name is used so the file handle `f` is not rebound.
        base_name = os.path.splitext(file_name)[0]
        out_fl = base_name + ".dawg"
    D.save(out_fl)
Exemple #9
0
    def test_sorted_iterable(self):
        """Build a DAWG from already-sorted lines streamed out of a file."""
        keys = ['bar', 'foo', 'foobar']
        payload = "\n".join(keys).encode('utf8')

        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(payload)
            tmp.seek(0)

            # The generator is consumed by the constructor while the file is
            # still open.
            d = dawg.DAWG((raw.strip() for raw in tmp), input_is_sorted=True)

        for expected in ('bar', 'foo'):
            assert expected in d
Exemple #10
0
    def test_no_segfaults_on_invalid_file(self):
        """Loading or reading a file that is not a DAWG must raise IOError
        with a message mentioning 'Invalid'.
        """
        import os

        d = dawg.DAWG()
        fd, path = tempfile.mkstemp()
        # mkstemp returns an open descriptor; close it because the file is
        # reopened by name below.
        os.close(fd)
        try:
            with open(path, 'w') as f:
                f.write('foo')

            with pytest.raises(IOError) as e:
                d.load(path)
            # Checked after the block: statements placed after the raising
            # call inside it never run. The exception is exposed as e.value.
            assert 'Invalid' in e.value.args[0]

            with open(path, 'rb') as f:
                with pytest.raises(IOError) as e:
                    d.read(f)
            assert 'Invalid' in e.value.args[0]
        finally:
            # Do not leave the temporary file behind.
            os.remove(path)
Exemple #11
0
def main(argv):
    """Build a compressed DAWG from input lines and write its serialized form.

    Args:
        argv: positional arguments. ``argv[0]`` is the input file name and
            ``argv[1]`` the output file name; each defaults to "-" when
            absent. An output name of "-" writes to standard output. How
            ``load_lines`` treats an input name of "-" is not visible here
            (presumably standard input; confirm).

    Returns:
        0 on completion.
    """
    fname_in = argv[0] if len(argv) >= 1 else "-"
    fname_out = argv[1] if len(argv) >= 2 else "-"

    lines = load_lines(fname_in)

    dog = dawg.DAWG()
    insert_lines_into_dawg(dog, lines)
    dog.compress()

    if fname_out == "-":
        # Write to stdout without a `with` block: using the stream as a
        # context manager would close it for the rest of the process.
        sys.stdout.buffer.write(dog.write())
        sys.stdout.buffer.flush()
    else:
        with open(fname_out, "wb") as file_out:
            file_out.write(dog.write())

    return 0
Exemple #12
0
    def create_dawgs(self, list_of_filepaths, force_pickle=True):
        """Build a DAWG and a CompletionDAWG for every file path given.

        Args:
            list_of_filepaths: sequence of paths; the words of each are
                obtained from ``self.file_handler.get_words``.
            force_pickle: when true, each pair is also passed to
                ``self.pickle_dawg`` under the names ``base_dawg_<k>.pkl``
                and ``completion_dawg_<k>.pkl``, where ``k`` is the 1-based
                position of the path in ``list_of_filepaths``.

        Returns:
            A list of ``(base_dawg, completion_dawg)`` tuples in input order.
        """
        all_dawgs = []
        total = len(list_of_filepaths)
        # enumerate gives each path its own position; list.index() was O(n)
        # per use and returned the first match for duplicate paths.
        for position, filepath in enumerate(list_of_filepaths, start=1):
            s_time = time.time()
            words = self.file_handler.get_words(filepath)
            # NOTE(review): `words` is iterated twice; this assumes get_words
            # returns a re-iterable collection, not a one-shot iterator.
            base_dawg = dawg.DAWG(words)
            completion_dawg = dawg.CompletionDAWG(words)
            all_dawgs.append((base_dawg, completion_dawg))

            print(
                f"Created DAWGs {position}/{total} "
                f"| TIME: {time.time() - s_time}")

            if force_pickle:
                self.pickle_dawg(f"base_dawg_{position}.pkl", base_dawg)
                self.pickle_dawg(f"completion_dawg_{position}.pkl", completion_dawg)
        return all_dawgs
Exemple #13
0
    def add_to_ignorelist(self, to_ignore):
        """
        Add strings to the internally held tuple of strings to ignore in processing text
        Example:
            bagmaker = WordBagMaker()
            bagmaker.add_to_ignorelist(ignore.get_list())
            bagmaker.add_to_ignorelist(nltk.corpus.stopwords.words('english'))
            bagmaker.add_to_ignorelist(list(string.punctuation))

        Args:
            to_ignore: A single string, or an iterable of strings, to ignore.
        """
        # A bare string is wrapped in a list so it is added as one entry
        # rather than character by character.
        if isinstance(to_ignore, str):
            to_ignore = [to_ignore]
        combined = list(self._ignore)
        combined.extend(to_ignore)
        # De-duplicate through a set; the order of the resulting tuple is
        # whatever the set yields.
        self._ignore = tuple(set(combined))
        # Rebuild the lookup DAWG so it matches the updated tuple.
        self._ignore_dawg = dawg.DAWG(self._ignore)
Exemple #14
0
#pip install DAWG-Python
#installs ok, but it is a read-only version of a wrapper to DAWG
#https://pypi.org/project/DAWG-Python/

import time
import timeit
import text_example
import memory_profiler
import dawg # 

# Benchmark script: build a DAWG from text_example.readers, report memory use
# and build time, time repeated lookups, then save the DAWG to disk.
if __name__ == "__main__":
    # Baseline memory before the DAWG is built.
    print(("RAM at start {:0.1f}MiB".format(memory_profiler.memory_usage()[0])))
    # avoid building a temporary list of words in Python, store directly in the
    # DAWG
    t1 = time.time()
    words_dawg = dawg.DAWG(text_example.readers)
    t2 = time.time()
    print(("RAM after creating dawg {:0.1f}MiB, took {:0.1f}s".format(memory_profiler.memory_usage()[0], t2 - t1)))

    # Sanity check that a known word made it into the DAWG.
    assert 'Zwiebel' in words_dawg
    # 10000 single-lookup timings, summed; the setup imports words_dawg from
    # this script so the timed statement can see it.
    time_cost = sum(timeit.repeat(stmt="u'Zwiebel' in words_dawg",
                                  setup="from __main__ import words_dawg",
                                  number=1,
                                  repeat=10000))
    print(("Summed time to lookup word {:0.4f}s".format(time_cost)))

    # t1/t2 bracket the save; the code that uses them is not visible here.
    t1 = time.time()
    words_dawg.save('words_dawg.saved')
    t2 = time.time()
    # Fresh, empty DAWG; the statements that populate it are cut off below.
    d = dawg.DAWG()
    with open('words_dawg.saved', 'rb') as f:
Exemple #15
0
import timeit

import dawg
import gaddag

# Read the word list once: `wordy` keeps the stripped lines in file order,
# `words` is the same data as a set.
with open('collins.txt', 'r') as inFile:
    wordy = [entry.strip() for entry in inFile]
words = set(wordy)

# Build one instance of each structure from the same data.
normalgaddag = gaddag.GADDAG(words)
normaldawg = dawg.DAWG(wordy)
complete = dawg.CompletionDAWG(wordy)

# print(normalgaddag.root["b"]['a'])
# So we can do single letter word follows GOOD

# Source text repeating the setup above; how it is used is not visible here
# (presumably passed to timeit as setup code; confirm).
s = '''
import dawg;
import gaddag;

with open('collins.txt','r') as inFile:
    wordy = [x.strip() for x in inFile.readlines()];
    words = set(wordy);

normalgaddag = gaddag.GADDAG(words);
normaldawg = dawg.DAWG(wordy);
complete = dawg.CompletionDAWG(wordy);
'''


def timest(stmt):
                        type=str,
                        default=fugenlaute)

    arguments = parser.parse_args(sys.argv[1:])

    print(timestamp(), "loading word2ved model")
    #word2vec_model = #gensim.models.Word2Vec.load_word2vec_format(arguments.word2vec_file, binary=True)
    word2vec_model = gensim.models.Word2Vec.load(arguments.word2vec_file)

    print(timestamp(), "building vocabulary ...")
    prefix_vocab, suffix_vocab = build_vocabulary(
        word2vec_model, min_length=arguments.min_word_length)

    # Either build both DAWG models from the vocabularies and save them, or
    # load previously saved models from disk.
    if arguments.build_dawg_name:
        print(timestamp(), "building dawg models")
        dawg_model = dawg.DAWG(prefix_vocab)
        lower_suffix_dawg_model = dawg.DAWG(
            set(w.lower() for w in suffix_vocab))
        print(timestamp(), "saving dawg models")
        dawg_model.save(arguments.build_dawg_name + ".prefixes")
        lower_suffix_dawg_model.save(arguments.build_dawg_name + ".suffixes")
    else:
        print(timestamp(), "loading dawg models")
        dawg_model = dawg.DAWG()
        dawg_model.load(arguments.dawg_name + ".prefixes")
        # Load the saved suffix model the same way as the prefix model.
        # Passing the path to the constructor would build a DAWG from the
        # characters of the path string instead of reading the file.
        lower_suffix_dawg_model = dawg.DAWG()
        lower_suffix_dawg_model.load(arguments.dawg_name + ".suffixes")

    candidates = defaultdict(set)
    print(timestamp(), "prefix pass ...")
    add_prefix_combinations(candidates,
                            prefix_vocab,
Exemple #17
0
def build_dawg_from_dumps():
    """Build a directed acyclic word graph (dawg.DAWG) from the passwords
    returned by ``get_valid_passwords_from_dumps``.

    Returns:
        The newly built ``dawg.DAWG``.
    """
    valid_passwords = get_valid_passwords_from_dumps()
    return dawg.DAWG(valid_passwords)
 def __init__(self, tokens):
     """Build a ``dawg.DAWG`` from ``tokens`` and keep it as ``self.trie``."""
     token_dawg = dawg.DAWG(tokens)
     self.trie = token_dawg
Exemple #19
0
 def test_contains_with_null_bytes(self):
     """A bytes key with an embedded null must not match the shorter key
     that precedes the null.
     """
     d = dawg.DAWG(['foo'])
     stored_key, key_with_null = b'foo', b'foo\x00bar'
     assert stored_key in d
     assert key_with_null not in d
Exemple #20
0
 def test_build_errors(self):
     """Building from keys that contain a null byte raises ``dawg.Error``."""
     keys_with_null = [b'foo\x00bar', b'bar']
     with pytest.raises(dawg.Error):
         dawg.DAWG(keys_with_null)
Exemple #21
0
import datetime

start = datetime.datetime.now()
import dawg

# Read the system word list; the context manager closes the file as soon as
# its contents have been read.
with open('/usr/share/dict/american-english', 'r') as dict_file:
    words = dict_file.read().splitlines()
d = dawg.DAWG(words)
cd = dawg.CompletionDAWG(words)

# Board input from stdin: the first line is the side length n, followed by
# n rows of space-separated cells.
matrix = []
visited = []
n = int(input())
for i in range(n):
    matrix.append(input().split(' '))
    visited.append([False] * n)


def do_word_exist(word):
    """Return True when ``word`` is a key of the module-level DAWG ``d``.

    NOTE(review): this replaces a stub that returned ``rand() > 0.5`` using
    a name that is not defined in this script. It assumes a lookup in the
    dictionary DAWG built at module level was the intent.
    """
    return word in d


def is_in_range(i, j):
    """Return True when both ``i`` and ``j`` are at least 0 and below the
    module-level side length ``n``.
    """
    out_of_bounds = i < 0 or j < 0 or i >= n or j >= n
    return not out_of_bounds


mem = {}
def save_dawg(output_file, inventory_manifest):
    """Build a DAWG from the ``Key`` of every record that ``list_inventory``
    yields for ``inventory_manifest`` and save it to ``output_file``.
    """
    import dawg

    # A generator is used so the keys are not first collected into a list.
    key_stream = (record.Key for record in list_inventory(inventory_manifest))
    inventory_dawg = dawg.DAWG(key_stream)
    inventory_dawg.save(output_file)
Exemple #23
0
 def __init__(self, lang):
     """Load ``<lang>.dawg`` from this module's directory into ``self.dawg``."""
     self.dawg = dawg.DAWG()
     module_dir = os.path.dirname(__file__)
     dawg_path = os.path.join(module_dir, lang + '.dawg')
     self.dawg.load(dawg_path)
Exemple #24
0
#pip install DAWG failed
#https://github.com/pytries/DAWG/issues/31
#$ python text_example_dawg.py 
#pip install DAWG-Python
#installs ok, but it is a read-only version of a wrapper to DAWG
#https://pypi.org/project/DAWG-Python/

import time
import timeit
import text_example
import memory_profiler
import dawg # 

# Benchmark script: read a previously saved DAWG back from disk, report the
# load time and memory use, then time repeated lookups.
if __name__ == "__main__":
    ram_at_start = memory_profiler.memory_usage()[0]
    print("RAM at start {:0.1f}MiB".format(ram_at_start))

    # Time how long reading the saved DAWG takes.
    t2 = time.time()
    words_dawg = dawg.DAWG()
    with open('words_dawg.saved', 'rb') as f:
        words_dawg.read(f)
    t3 = time.time()
    print(t3 - t2)
    ram_after_load = memory_profiler.memory_usage()[0]
    print("RAM after load {:0.1f}MiB".format(ram_after_load))

    # Sanity check that a known word is present.
    assert 'Zwiebel' in words_dawg
    # 10000 single-lookup timings; the setup imports words_dawg from this
    # script so the timed statement can see it.
    lookup_timings = timeit.repeat(
        stmt="u'Zwiebel' in words_dawg",
        setup="from __main__ import words_dawg",
        number=1,
        repeat=10000,
    )
    time_cost = sum(lookup_timings)
    print("Summed time to lookup word {:0.4f}s".format(time_cost))
Exemple #25
0
def create_dawg():
    """Build a ``dawg.DAWG`` from the words returned by ``words100k``."""
    return dawg.DAWG(words100k())
Exemple #26
0
def load_as_dawg(filepath):
    """Build a ``dawg.DAWG`` from the lines of the text file at ``filepath``.

    Trailing whitespace is removed from every line before it becomes a key.
    """
    with open(filepath, 'r') as source:
        raw_lines = source.readlines()
    return dawg.DAWG(entry.rstrip() for entry in raw_lines)
Exemple #27
0
def create_leet_dawg():
    """Build a ``dawg.DAWG`` from the module-level ``LEET_50k`` data."""
    leet_dawg = dawg.DAWG(LEET_50k)
    return leet_dawg
Exemple #28
0
 def test_dawg_prediction(self, word, prediction):
     """``similar_keys`` over ``self.DATA`` with ``self.REPLACES`` must
     return ``prediction`` for ``word``.
     """
     data_dawg = dawg.DAWG(self.DATA)
     predicted = data_dawg.similar_keys(word, self.REPLACES)
     assert predicted == prediction
Exemple #29
0
'''
String data in a DAWG may take 200x less memory than in a standard Python dict and the raw lookup speed is comparable;
it also provides fast advanced methods like prefix search.
'''

import dawg
# Demo data, including keys with non-ASCII characters.
words = [u'foo', u'bar', u'foobar', u'foö', u'bör']
base_dawg = dawg.DAWG(words)
completion_dawg = dawg.CompletionDAWG(words)

# Exact membership test on the plain DAWG.
print("foo" in base_dawg)
# Prefix query on the completion DAWG.
print(completion_dawg.has_keys_with_prefix(u'f'))
# Keys of the DAWG that are prefixes of the given string.
print(base_dawg.prefixes(u'foobarz'))
# The unresolved merge-conflict markers made this file a syntax error. The
# conflict is resolved in favour of HEAD, which keeps the call below disabled.
# NOTE(review): CompletionDAWG's documented API appears to have no `has`
# method; confirm before re-enabling.
# print(completion_dawg.has(u'f'))
Exemple #30
0
def main(argv):
    """Build a DAWG from a word file and check it at each processing stage.

    The DAWG is tested and dumped after insertion, after compression, and
    after a write/read round trip through a binary file.

    Args:
        argv: command-line arguments; ``argv[0]`` is the input file name and
            defaults to "words.txt" when ``argv`` is empty.

    Returns:
        0 when all ``test_dawg`` runs report no failures, otherwise -1.
    """
    nfailed = 0

    arg1 = argv[0] if argv else "words.txt"

    dog = dawg.DAWG()
    lines = load_lines(arg1)
    nloaded = len(lines)
    log("{} lines loaded".format(nloaded))

    log("filtering lines")
    lines = filter_lines(lines)
    nlines = len(lines)

    # When True, every suffix of each line is inserted rather than only the
    # line itself. (A dead `insert_suffixes = True` assignment was removed.)
    insert_suffixes = False

    ninserted = 0
    for n, line in enumerate(lines):
        for i in range(len(line) if insert_suffixes else 1):
            dog.insert(line[i:])
            ninserted += 1

        # Progress on every 100th line and on the last one.
        if (n % 100) == 0 or n + 1 == nlines:
            sys.stderr.write("inserted line {} of {}\r".format(n + 1, nlines))
            sys.stderr.flush()

    sys.stderr.write("\n")

    # Guard the percentage: with no lines left after filtering, the division
    # used to raise ZeroDivisionError.
    percent = 100.0 * ninserted / nlines if nlines else 0.0
    log("{} of {} ({}%) lines inserted".format(ninserted, nlines, percent))

    nfailed += test_dawg(dog, lines)

    dumpf = "dump0.txt"
    log("dump to {}".format(dumpf))
    dog.dump_strings(dumpf)

    log("compress")
    dog.compress()

    # Compression must not change which strings the DAWG holds.
    nfailed += test_dawg(dog, lines)

    dumpf = "dump1.txt"
    log("dump to {}".format(dumpf))
    dog.dump_strings(dumpf)

    bin_fname = arg1 + ".dawg"
    log("write to binary {}".format(bin_fname))
    dog.write(bin_fname)

    log("read from binary {}".format(bin_fname))
    dog2 = dawg.DAWG.read(bin_fname)

    dumpf = "dump2.txt"
    log("dump binary to {}".format(dumpf))
    dog2.dump_strings(dumpf)

    log("test binary")
    nfailed += test_dawg(dog2, lines)

    return -1 if nfailed else 0