Python IntCompletionDAWGの例、dawg.IntCompletionDAWG Pythonの例

コード例 #1

0

ファイルを表示

ファイル: buildmodel.py プロジェクト: guanleustc/pwmodel

def create_model(modelfunc, fname='', listw=None, outfname=''):
    """Build an IntCompletionDAWG of split frequencies and save it.

    :modelfunc: function that takes a word and returns its splits.  For
        an n-gram model it returns all the n-grams of the word, for a
        PCFG it returns the split of the password
        (string -> list of strings).
    :fname: optional input file; when non-empty its entries are obtained
        through helper.open_get_line(fname).
    :listw: optional iterable of (password, count) pairs, consumed after
        the entries coming from fname.
    :outfname: path the DAWG is saved to; 'tmpmodel.dawg' when empty.
    :returns: the dawg.IntCompletionDAWG that was built.

    """
    # Function-scope import: the file-level imports are not visible here.
    from itertools import chain

    # None instead of a mutable [] default, so no list object is shared
    # between calls.
    if listw is None:
        listw = []
    pws = helper.open_get_line(fname) if fname else []

    big_dict = defaultdict(int)
    # Each entry is unpacked as (password, count); every split of the
    # password is credited with that count.
    for pw, c in chain(pws, listw):
        for ng in modelfunc(pw):
            big_dict[unicode(ng)] += c
    # Special key holding the sum of all split counts gathered above.
    big_dict['__TOTAL__'] = sum(big_dict.values())
    nDawg = dawg.IntCompletionDAWG(big_dict)
    if not outfname:
        outfname = 'tmpmodel.dawg'
    nDawg.save(outfname)
    return nDawg

コード例 #2

0

ファイルを表示

ファイル: entitydb.py プロジェクト: zseder/hunmisc

 def load_from_files(pickle_fn, dawg_fn, prefix_dawg_fn):
     """Restore an entity database from a pickle plus two DAWG files.

     pickle_fn      -- path of the pickled entity_db object
     dawg_fn        -- path loaded into entity_db.dawg (IntCompletionDAWG)
     prefix_dawg_fn -- path loaded into entity_db.long_entities (IntDAWG)

     Returns the unpickled object with both DAWG attributes attached.
     """
     # NOTE(review): unpickling can execute arbitrary code; only use this
     # with files from a trusted source.
     # The context manager closes the file handle, which the previous
     # bare open() call leaked.
     with open(pickle_fn, "rb") as pickle_file:
         entity_db = cPickle.load(pickle_file)
     entity_db.dawg = dawg.IntCompletionDAWG()
     entity_db.dawg.load(dawg_fn)
     entity_db.long_entities = dawg.IntDAWG()
     entity_db.long_entities.load(prefix_dawg_fn)
     return entity_db

コード例 #3

0

ファイルを表示

def build_test_data():
    """Write the DAWG fixture files under dev_data/small and dev_data/large.

    Every fixture is built and saved immediately, in a fixed order; the
    function returns nothing.
    """
    # Plain completion DAWGs: a short word list and an empty one.
    completion_words = ['f', 'bar', 'foo', 'foobar']
    completion = dawg.CompletionDAWG(completion_words)
    completion.save('dev_data/small/completion.dawg')
    empty_completion = dawg.CompletionDAWG([])
    empty_completion.save('dev_data/small/completion-empty.dawg')

    # Key -> bytes payloads; 'foo' is listed with two different payloads.
    bytes_data = (
        ('foo', b'data1'),
        ('bar', b'data2'),
        ('foo', b'data3'),
        ('foobar', b'data4'),
    )
    bytes_dawg = dawg.BytesDAWG(bytes_data)
    bytes_dawg.save('dev_data/small/bytes.dawg')

    # Key -> 3-tuple records, packed with the ">3H" struct format.
    record_data = (
        ('foo', (3, 2, 256)),
        ('bar', (3, 1, 0)),
        ('foo', (3, 2, 1)),
        ('foobar', (6, 3, 0)),
    )
    record_dawg = dawg.RecordDAWG(str(">3H"), record_data)
    record_dawg.save('dev_data/small/record.dawg')

    # The same key -> int mapping feeds both integer DAWG flavours.
    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    int_dawg = dawg.IntDAWG(int_data)
    int_dawg.save('dev_data/small/int_dawg.dawg')
    int_completion = dawg.IntCompletionDAWG(int_data)
    int_completion.save('dev_data/small/int_completion_dawg.dawg')

    # Prediction fixtures: the keys themselves, then key -> (len(key),).
    prediction = dawg.DAWG(TestPrediction.DATA)
    prediction.save('dev_data/small/prediction.dawg')
    length_records = [(key, (len(key), )) for key in TestPrediction.DATA]
    prediction_record = dawg.RecordDAWG(str("=H"), length_records)
    prediction_record.save('dev_data/small/prediction-record.dawg')

    # Large fixtures come from the create_* factory functions.
    large_dawg = create_dawg()
    large_dawg.save('dev_data/large/dawg.dawg')
    large_bytes = create_bytes_dawg()
    large_bytes.save('dev_data/large/bytes_dawg.dawg')
    large_record = create_record_dawg()
    large_record.save('dev_data/large/record_dawg.dawg')
    large_int = create_int_dawg()
    large_int.save('dev_data/large/int_dawg.dawg')

コード例 #4

0

ファイルを表示

def create_model(modelfunc, fname='', listw=None, outfname='',
                 limit=int(3e6), min_pwlen=6, topk=10000, sep=r'\s+'):
    """Build an IntCompletionDAWG of split frequencies and save it gzipped.

    :modelfunc: is a function that takes a word and returns its
    splits.  for ngram model this function returns all the ngrams of a
    word, for PCFG it will return splits of the password.
    @modelfunc: func: string -> [list of strings]
    @fname: name of the file to read from
    @listw: iterable of (password, count) pairs. Entries from both the
            file and listw are used if provided.
    @outfname: the file to write down the model; '.gz' is appended when
            missing and 'tmpmodel.dawg.gz' is used when empty.
    @limit: forwarded to helper.open_get_line as ``limit``.
    @min_pwlen: minimum password length accepted by the file filter,
            counting only characters found in VALID_CHARS.
    @topk: number of highest-count passwords that are additionally
            stored as whole START + pw + END entries.
    @sep: forwarded to helper.open_get_line as ``sep``.
    @returns: the dawg.IntCompletionDAWG that was built.
    """
    # None instead of a mutable [] default, so no list object is shared
    # between calls.
    if listw is None:
        listw = []

    def length_filter(pw):
        # Length is measured after dropping characters outside VALID_CHARS.
        pw = ''.join(c for c in pw if c in VALID_CHARS)
        return len(pw) >= min_pwlen

    pws = []
    if fname:
        pws = helper.open_get_line(fname, limit=limit, pw_filter=length_filter, sep=sep)

    big_dict = defaultdict(int)
    total_f, total_e = 0, 0
    # Min-heap of (count, password); once it holds topk entries each push
    # evicts the smallest, so the heap ends up with the topk largest.
    topk_pws = []
    for pw, c in itertools.chain(pws, listw):
        for ng in modelfunc(pw):
            big_dict[ng] += c
        total_f += c
        total_e += 1
        if len(big_dict) % 100000 == 0:
            # Progress report; the closing parenthesis was missing before.
            print("Dictionary size: {} (Total_freq: {}; Total_pws: {})"
                  .format(len(big_dict), total_f, total_e))
        if len(topk_pws) >= topk:
            heapq.heappushpop(topk_pws, (c, pw))
        else:
            heapq.heappush(topk_pws, (c, pw))
    # Adding topk password to deal with probability reduction of popular
    # passwords. Mostly effective for n-gram models
    print("topk={}".format(topk))
    if topk > 0:
        for c, pw in topk_pws:
            tpw = helper.START + pw + helper.END
            big_dict[tpw] += c
            # These entries are counted in the totals a second time.
            total_f += c
            total_e += 1

    # Bookkeeping keys: number of entries and total frequency.
    big_dict[NPWS_W] = total_e
    big_dict[TOTALF_W] = total_f

    nDawg = dawg.IntCompletionDAWG(big_dict)
    if not outfname:
        outfname = 'tmpmodel.dawg.gz'
    elif not outfname.endswith('.gz'):
        outfname += '.gz'
    # Make sure the destination directory exists before saving.
    pathlib.Path(outfname).parent.mkdir(parents=True, exist_ok=True)
    helper.save_dawg(nDawg, outfname)
    return nDawg

コード例 #5

0

ファイルを表示

ファイル: entitydb.py プロジェクト: zseder/hunmisc

    def finalize(self):
        """Finalize the caches and values, then build the main DAWG.

        Steps, in order: finalize every cache in ``self.caches`` and
        ``self.value_cache``, run ``finalize_values()``, build
        ``self.dawg`` from ``self.d`` (which is then deleted), and run
        ``finalize_long_entities()``.
        """
        # itervalues() is the Python 2 dict API.
        for cache in self.caches.itervalues():
            cache.finalize()
        self.value_cache.finalize()

        logging.info("Finalizing values...")
        self.finalize_values()

        logging.info("Creating main dawg...")
        self.dawg = dawg.IntCompletionDAWG(self.d)
        # The source mapping is dropped once the DAWG has been built.
        del self.d

        self.finalize_long_entities()

        logging.info("finalizing done.")

コード例 #6

0

ファイルを表示

class RouterView(object):
    """
    Route all HTTP requests to the corresponding view.

    On construction the view modules are scanned, one instance of every
    view class found is created, and an index from each view's URL path
    to its position in ``_view_instances`` is built.
    """

    # NOTE(review): these exclusion sets are not read by the code visible
    # here; presumably the view-discovery helpers use them -- confirm.
    KLASS_EXCLUSIONS = {
        HTMLTemplateView, VulnerableTemplateView, StaticFileView,
        FormTemplateView, RawPathTemplateView
    }
    DIR_EXCLUSIONS = set()
    FILE_EXCLUSIONS = {'__init__.py'}

    def __init__(self):
        self._plugin_families = set(get_plugin_families())

        # Will be generated on autoregister
        self._mapping = None
        self._view_instances = []

        self._view_files = []
        self._autoregister()

    def _autoregister(self):
        """
        Instantiate every view class found in the view files and index it.

        For each file returned by _get_vuln_view_files() and each class
        returned by _get_views_from_file(), an instance is created and
        appended to _view_instances; its URL path (from
        get_unicode_url_path()) is mapped to its index in that list.

        :return: None, the URL-path -> index pairs are stored in _mapping
                 as an IntCompletionDAWG.
        :raises RuntimeError: when a view class cannot be instantiated.
        """
        data = []

        for fname in self._get_vuln_view_files(
                self._get_vuln_view_directory()):
            for klass in self._get_views_from_file(fname):
                try:
                    view_obj = klass()
                except Exception as e:
                    msg = 'An exception occured while trying to register %s: "%s"'
                    # Report the class that failed: view_obj is unbound
                    # here (or still the previous view) because klass()
                    # raised before the assignment.
                    raise RuntimeError(msg % (klass, e))
                else:
                    self._view_instances.append(view_obj)
                    view_index = len(self._view_instances) - 1
                    data.append((view_obj.get_unicode_url_path(), view_index))

        self._mapping = dawg.IntCompletionDAWG(data)

コード例 #7

0

ファイルを表示

ファイル: models.py プロジェクト: guanleustc/pwmodel

def create_model(modelfunc, fname='', listw=None, outfname=''):
    """Build an IntCompletionDAWG of split frequencies and save it.

    :modelfunc: is a function that takes a word and returns its
    splits.  for ngram model this function returns all the ngrams of a
    word, for PCFG it will return the split of the password.
    @modelfunc: func: string -> [list of strings]
    @fname: name of the file to read from (read through
            helper.open_get_line with limit=3e6)
    @listw: iterable of (password, count) pairs. Entries from both the
            file and listw are used if provided.
    @outfname: the file to write down the model; 'tmpmodel.dawg' when
            empty.
    @returns: the dawg.IntCompletionDAWG that was built.
    """
    # Function-scope import: the file-level imports are not visible here.
    from itertools import chain

    # None instead of a mutable [] default, so no list object is shared
    # between calls.
    if listw is None:
        listw = []
    pws = helper.open_get_line(fname, limit=3e6) if fname else []

    big_dict = defaultdict(int)
    total_f, total_e = 0, 0
    # Each entry is unpacked as (password, count).
    for pw, c in chain(pws, listw):
        for ng in modelfunc(pw):
            big_dict[ng] += c
        if len(big_dict) % 100000 == 0:
            print("Dictionary size: {}".format(len(big_dict)))
        total_f += c
        total_e += 1
    # Bookkeeping keys: number of entries and total frequency.
    big_dict['__TOTAL__'] = total_e
    big_dict['__TOTALF__'] = total_f

    nDawg = dawg.IntCompletionDAWG(big_dict)
    if not outfname:
        outfname = 'tmpmodel.dawg'
    nDawg.save(outfname)
    return nDawg

コード例 #8

0

ファイルを表示

ファイル: Feature_generator.py プロジェクト: sebastiaansch/InformationRetrievalQ3

import csv
import datetime
import re
import time
from heapq import nlargest
from operator import itemgetter

import dawg
import numpy as np
import pandas as pd
from nltk import ngrams

# Build the n-gram completion index from CSV rows of (key, count), keeping
# only rows whose count is greater than 2.
with open('../data/ngram_dict.csv', mode='r') as infile:
    reader = csv.reader(infile)
    #     ngram_dict = {rows[0]:rows[1] for rows in reader}
    # The inner generator parses each count once (it used to be parsed
    # twice per row: once for the filter and once for the tuple).
    ngram_data = [
        (key, count)
        for key, count in ((line[0], int(line[1])) for line in reader)
        if count > 2
    ]
ngram_dict = dawg.IntCompletionDAWG(ngram_data)
# prefix_suffix_pairs_background = pd.read_csv("../data/prefix_suffix_pairs.txt")

# Importing all the necessary dictionaries
suffixes = pd.read_csv("../data/Freq_background.csv", index_col='Unnamed: 0')
with open('../data/sorted_popular_queries.csv', mode='r') as infile:
    reader = csv.reader(infile)
    # First row is consumed and not used as data.
    skipheader = next(reader)
    # Columns 1 and 2 hold the key and its count in this file.
    data = [
        (key, count)
        for key, count in ((line[1], int(line[2])) for line in reader)
        if count > 2
    ]
sortedpopulardict = dawg.IntCompletionDAWG(data)

# Rebind the intermediate lists so the raw rows can be garbage-collected.
data = []
ngram_data = []

コード例 #9

0

ファイルを表示

ファイル: helper.py プロジェクト: rchatterjee/Passwords

        return list(itertools.chain(*p))
    else:
        return wrap_func((func, data))
    

def diff(oldG, newG):
    """
    Yield the differences between two grammars.

    If either argument is not a dict, the pair (oldG, newG) itself is
    yielded.  Otherwise every key of oldG is examined: a key missing from
    newG is yielded as-is, and a key whose values differ is compared
    recursively, yielding whatever the nested comparison produces.  Keys
    that exist only in newG are not reported.
    """
    if not (isinstance(oldG, dict) and isinstance(newG, dict)):
        yield (oldG, newG)
        return
    for k in oldG:
        if k not in newG:
            yield k
            continue
        vold, vnew = oldG[k], newG[k]
        if vold != vnew:
            # Re-yield the nested results: calling diff() alone only
            # creates a generator and silently drops its output.
            # (Explicit loop rather than `yield from`, which is
            # Python 3 only.)
            for nested in diff(vold, vnew):
                yield nested
    

if __name__=='__main__':
    # Script entry: convert the password file named by argv[1] into an
    # IntCompletionDAWG saved next to it with a '.dawg' extension.
    import os
    import dawg
    src_fname = sys.argv[1]
    # The previous code passed lim=1e7 to dict(), which only inserted a
    # bogus 'lim' -> 1e7 entry into the data (no limit was ever applied).
    # NOTE(review): the limit was presumably meant for get_line(); its
    # signature is not visible here, so confirm the keyword name before
    # re-adding it there.
    pws = dict(get_line(open_(src_fname)))
    new_fname = src_fname.replace('.tar.bz', '.dawg')
    if new_fname == src_fname:
        # Strip the extensions from the file name only, so a dot in a
        # directory component (e.g. './data/x.txt') cannot truncate the
        # path.
        head, tail = os.path.split(src_fname)
        new_fname = os.path.join(head, tail.split('.', 1)[0] + '.dawg')
    if new_fname == src_fname:
        # Explicit exit instead of assert, which is stripped under -O.
        sys.exit("Give a better name to your original file.")
    T = dawg.IntCompletionDAWG(pws.items())
    T.save(new_fname)

コード例 #10

0

ファイルを表示

ファイル: test_dawg.py プロジェクト: yyht/DAWG

 def test_no_segfaults_on_empty_dawg(self):
     """An IntCompletionDAWG built from an empty list has no keys."""
     empty = dawg.IntCompletionDAWG([])
     listed_keys = empty.keys()
     assert listed_keys == []

コード例 #11

0

ファイルを表示

ファイル: test_dawg.py プロジェクト: yyht/DAWG

 def empty_dawg(self):
     """Return an IntCompletionDAWG constructed without any arguments."""
     blank = dawg.IntCompletionDAWG()
     return blank

コード例 #12

0

ファイルを表示

ファイル: test_dawg.py プロジェクト: yyht/DAWG

 def dawg(self):
     """Return an IntCompletionDAWG mapping each key in self.keys to its length."""
     # Lazy stream of (key, len(key)) pairs, consumed by the constructor.
     length_pairs = ((key, len(key)) for key in self.keys)
     return dawg.IntCompletionDAWG(length_pairs)

コード例 #13

0

ファイルを表示

ファイル: models.py プロジェクト: guanleustc/pwmodel

def read_dawg(fname):
    """Load an IntCompletionDAWG from the file *fname* and return it.

    Prints the file name being read before loading.
    """
    print("reading {fname}".format(fname=fname))
    # Start from an empty DAWG: the constructor argument is build data,
    # not a path, so the file name must only be given to load().
    # Like the previous code, this returns whatever load() returns.
    return dawg.IntCompletionDAWG().load(fname)

コード例 #14

0

ファイルを表示

ファイル: buildmodel.py プロジェクト: guanleustc/pwmodel

def read_dawg(fname):
    """Load an IntCompletionDAWG from the file *fname* and return it."""
    # Start from an empty DAWG: the constructor argument is build data,
    # not a path, so the file name must only be given to load().
    nDawg = dawg.IntCompletionDAWG()
    nDawg.load(fname)
    return nDawg

コード例 #15

0

ファイルを表示

def create_int_completion_dawg():
    """Return an IntCompletionDAWG mapping each word of words100k() to its length."""
    vocabulary = words100k()
    # Lengths are materialised first, then paired with the words in order.
    lengths = list(map(len, vocabulary))
    word_length_pairs = zip(vocabulary, lengths)
    return dawg.IntCompletionDAWG(word_length_pairs)

コード例 #16

0

ファイルを表示

ファイル: ngram_recommend.py プロジェクト: zofuthan/seldon-server

 def create_trie(self):
     """
     Index self.keys in an IntCompletionDAWG (for fast prefix retrieval)
     and keep the result on self.dawg.
     """
     trie = dawg.IntCompletionDAWG(self.keys)
     self.dawg = trie