コード例 #1
0
ファイル: suffixes.py プロジェクト: clhuang/BlueMorpho
def genAffixesList(filename, top_n=100):
    """Collect the most frequently observed candidate suffixes and prefixes.

    Every word whose count is at least MIN_WORD_FREQ is split at each interior
    position into a left and a right piece.  The right piece is tallied as a
    suffix when it is no longer than MAX_AFFIX_LEN and the left piece is itself
    listed with a count of at least MIN_WORD_FREQ; the left piece is tallied as
    a prefix under the mirrored condition.

    Args:
        filename: Path passed to fileio.read_wordcounts; the result is used as
            a mapping from word to count.
        top_n: Number of affixes of each kind to keep.  Defaults to 100, the
            value that was previously hard-coded.  Pass None to keep every
            affix that was seen.

    Returns:
        A (suffixes, prefixes) tuple of lists of strings, each ordered from
        the most to the least frequently tallied affix.
    """
    suffixes = Counter()
    prefixes = Counter()
    d = fileio.read_wordcounts(filename)
    for word, count in d.items():
        if count < MIN_WORD_FREQ:
            continue
        # Interior split points only, so neither piece is ever empty.
        for x in range(1, len(word)):
            left, right = word[:x], word[x:]
            # Each tally counts distinct (word, split) pairs, not word
            # frequency: a split contributes 1 regardless of `count`.
            if len(right) <= MAX_AFFIX_LEN and d.get(left, 0) >= MIN_WORD_FREQ:
                suffixes[right] += 1
            if len(left) <= MAX_AFFIX_LEN and d.get(right, 0) >= MIN_WORD_FREQ:
                prefixes[left] += 1

    # most_common(None) returns every entry, so top_n=None disables the cap.
    top_suffixes = [affix for affix, _ in suffixes.most_common(top_n)]
    top_prefixes = [affix for affix, _ in prefixes.most_common(top_n)]
    return top_suffixes, top_prefixes
コード例 #2
0
ファイル: __init__.py プロジェクト: clhuang/BlueMorpho
def get_wordlist(size='small', lang='eng'):
    """Load the 2010 word list for a language at the requested size.

    The 'full' size maps to the file whose name carries only the language
    code; any other size is added to the file name as an extra
    dot-separated component after the language code.

    Args:
        size: 'full' or the size label embedded in the file name.
        lang: Language code embedded in the file name.

    Returns:
        Whatever fileio.read_wordcounts returns for the selected file.
    """
    if size != 'full':
        path = 'data/wordlist-2010.%s.%s.txt' % (lang, size)
    else:
        path = 'data/wordlist-2010.%s.txt' % lang
    return fileio.read_wordcounts(path)
コード例 #3
0
ファイル: suffixes.py プロジェクト: clhuang/BlueMorpho
                    prefixes[tags[0]] += 1
    prefixes = [p[0] for p in prefixes.most_common()]
    suffixes = [s[0] for s in suffixes.most_common()]
    return prefixes, suffixes




# Maps the three-letter language code used in the word-list file name to the
# two-letter code used in the word-vector file name.
entr = {'eng': 'en', 'tur': 'tr'}

lang = 'eng'
size = 'filtered'

# The 'filtered' list carries no size component in its file name.  Any other
# size is inserted as its own dot-separated component
# (wordlist-2010.<lang>.<size>.txt); previously the separating dot was missing,
# which produced names such as wordlist-2010.engsmall.txt.
filename_w = 'data/wordlist-2010.%s%s.txt' % (
    lang, '' if size == 'filtered' else '.' + size)
filename_v = 'data/%s-wordvectors200_%s.bin' % (entr[lang], size)
wordlist = fileio.read_wordcounts(filename_w)
wordvectors = fileio.load_wordvectors(filename_v, binary=True)

suffixes, prefixes, suffixvector, prefixvector = genAffixesListOpt(wordlist, wordvectors)
# Parenthesised single-argument form: prints the same text under Python 2 and
# also parses under Python 3.
print(suffixes)
print(prefixes)

# Persist each result to data/<lang>_<stem>.p, in the same order as before.
for stem, payload in (('suffix_list', suffixes),
                      ('prefix_list', prefixes),
                      ('suffix_wv', suffixvector),
                      ('prefix_wv', prefixvector)):
    with open('data/%s_%s.p' % (lang, stem), 'wb') as f:
        pickle.dump(payload, f)