Esempio n. 1
0
def get_wordvectors(size='small', lang='en'):
    """Load the word vectors stored under ``data/`` for a language and size.

    The language codes 'eng' and 'tur' are treated as 'en' and 'tr', and the
    size 'full' is treated as 'filtered'. When a ``.bin`` file exists next to
    the ``.txt`` path, it is loaded in binary mode; otherwise the ``.txt``
    file is loaded.

    Returns whatever ``fileio.load_wordvectors`` returns.
    """
    # Normalise the accepted aliases to the names used in the file paths.
    lang_code = 'en' if lang == 'eng' else ('tr' if lang == 'tur' else lang)
    size_tag = 'filtered' if size == 'full' else size

    text_path = 'data/%s-wordvectors200_%s.txt' % (lang_code, size_tag)
    # Same path with the trailing 'txt' extension swapped for 'bin'.
    binary_path = text_path[:-3] + 'bin'

    if not os.path.isfile(binary_path):
        return fileio.load_wordvectors(text_path)
    return fileio.load_wordvectors(binary_path, binary=True)
Esempio n. 2
0
    prefixes = [p[0] for p in prefixes.most_common()]
    suffixes = [s[0] for s in suffixes.most_common()]
    return prefixes, suffixes




# Script section: compute the affix lists and affix vectors for one language
# and pickle each result into its own file under data/.

# Maps the language code used in the word-list file name to the code used in
# the word-vector file name.
entr = {'eng': 'en', 'tur': 'tr'}

lang = 'eng'
size = 'filtered'

# The word-list file name carries no size suffix when size is 'filtered'.
filename_w = 'data/wordlist-2010.%s%s.txt' % (lang, '' if size == 'filtered' else size)
filename_v = 'data/%s-wordvectors200_%s.bin' % (entr[lang], size)
wordlist = fileio.read_wordcounts(filename_w)
wordvectors = fileio.load_wordvectors(filename_v, binary=True)

suffixes, prefixes, suffixvector, prefixvector = genAffixesListOpt(wordlist, wordvectors)
# Parenthesised single-argument print behaves the same as the Python 2
# statement and is also valid as the Python 3 function call.
print(suffixes)
print(prefixes)
with open('data/%s_suffix_list.p' % lang, 'wb') as f:
    pickle.dump(suffixes, f)
with open('data/%s_prefix_list.p' % lang, 'wb') as f:
    pickle.dump(prefixes, f)
with open('data/%s_suffix_wv.p' % lang, 'wb') as f:
    pickle.dump(suffixvector, f)
with open('data/%s_prefix_wv.p' % lang, 'wb') as f:
    pickle.dump(prefixvector, f)


#with open('data/%s_suffix_list_gold.p' % lang, 'wb') as f: