Code example #1
0
def create_vocabulary(ngram=1, test=False):
    """Build and persist the vocabulary trie for one ngram level.

    :param ngram: ngram size to build the vocabulary for.
    :param test: If true, only runs through the first 10000 documents.
    :return: None; the trie is written to disk as a side effect.

    Steps:
    - Collect the set of all tokens at this ngram level.
    - Keep only the valid ones and store them in a trie.
    """
    add_valid_words()

    print("here")

    # Every token of this ngram level across the (possibly truncated) corpus.
    all_tokens = get_all_tokens_in_docs(ngram, test)
    print("Total tokens before merging: ", len(all_tokens))

    # Lazily filter down to the valid ngrams only.
    kept_tokens = valid_ngram_iterator(all_tokens, ngram)

    vocab_trie = Trie(kept_tokens)
    vocab_trie.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))
    print("Total tokens after merging", len(vocab_trie))
Code example #2
0
File: loadDic.py  Project: edithli/nocrack
def loadTrie(fname):
    """Load the global word trie from ``fname + "_trie.hny"``, rebuilding on a miss.

    :param fname: base path of the serialized trie (suffix "_trie.hny" is appended).

    If the saved trie cannot be read (IOError), the word list is read from
    the bz2-compressed file named by ``sys.argv[1]`` under ``dir_path``, a
    fresh Trie is built, and it is cached to ``fname`` for next time.
    """
    global trie
    fname = fname + "_trie.hny"
    try:
        trie.load(fname)
    except IOError:
        # FIX: the original leaked the BZ2File handle; close it deterministically.
        # NOTE(review): the fallback reads sys.argv[1], not fname — confirm intended.
        with bz2.BZ2File(dir_path + sys.argv[1]) as f:
            words = [w.strip() for w in f.readlines()]
        trie = Trie(words)
        trie.save(fname)
def add_terms():
    """Merge the extra ADDED_TOKENS into the 1- and 2-gram vocabulary tries.

    Rebuilding the trie changes token ids; that is acceptable because the
    doc-term matrices that relied on those ids are no longer used.
    """
    for n in range(1, 3):
        t0 = time.time()

        # Rebuild the vocabulary trie with the extra tokens merged in.
        current_vocab = load_vocabulary_trie(n)
        merged = Trie(current_vocab.keys() + ADDED_TOKENS[n])
        merged.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(n))

        # Re-tokenize the database so the new terms are picked up.
        full_db_to_tokens(n, add_new_terms=set(ADDED_TOKENS[n]))
        print("adding new tokens for {}-gram took {}.".format(n, time.time() - t0))
Code example #4
0
def load_password_blacklist():
    """Populate the global ``password_blackList`` trie from configuration.

    Resolution order: no blacklist configured -> empty trie; a pre-compiled
    binary exists -> load it; otherwise compile the plaintext list and cache
    the compiled form. Aborts the process if the configured file is missing.
    """
    global password_blackList

    # No blacklist configured at all: fall back to an empty trie.
    if conf.password_blackList == 'NOBLACKLIST':
        LOGGER.warning('No password blacklist file defined.')
        password_blackList = Trie()
        return

    if not os.path.isfile('compiledPwdBlacklist.bin'):
        # First run (or cache deleted): compile from the plaintext list.
        try:
            LOGGER.info('Compiling password blacklist...')
            with open(conf.password_blackList, encoding="utf-8") as fh:
                password_blackList = Trie(fh.read().splitlines())
            password_blackList.save('compiledPwdBlacklist.bin')
        except FileNotFoundError:
            LOGGER.error('File ' + conf.password_blackList +
                         ' not found. Aborting.')
            exit(-1)
        return

    # Fast path: reuse the previously compiled blacklist.
    LOGGER.info('Loading pre-compiled password blacklist...')
    password_blackList = Trie()
    password_blackList.load('compiledPwdBlacklist.bin')
Code example #5
0
 def craft_index(wordlist: List[str], output_dir: Path) -> Path:
     """Generate the special file "words": a trie index of all words.

     :param wordlist: words to index.
     :param output_dir: directory that will receive the "words" file.
     :return: path of the file that was written.
     """
     words_path = output_dir / "words"
     Trie(wordlist).save(words_path)
     return words_path
Code example #6
0
# coding: utf-8
import sys
from marisa_trie import Trie

# Read patterns from stdin (one per line), normalize them, and build a
# marisa trie saved as 'triedict'.
patterns = []
for line in iter(sys.stdin.readline, ""):
    # Normalize: trim whitespace, underscores -> spaces, lowercase.
    ptn = line.strip().replace('_', ' ').lower()
    # FIX: the original tested len(line), which is never 0 inside this loop
    # (readline keeps the trailing newline), so blank lines were appended as
    # empty patterns. Test the normalized pattern instead.
    if not ptn:
        continue
    patterns.append(ptn)

trie = Trie(patterns)
trie.save('triedict')