def get_wordvectors(size='small', lang='en'):
    """Load word vectors for *lang* at the given *size*.

    Prefers a pre-converted binary vector file next to the text one;
    falls back to loading the plain-text vectors when no .bin exists.
    """
    # Accept 3-letter language codes as aliases for the 2-letter ones.
    aliases = {'eng': 'en', 'tur': 'tr'}
    lang = aliases.get(lang, lang)
    # 'full' is stored on disk under the name 'filtered'.
    if size == 'full':
        size = 'filtered'
    txt_path = 'data/%s-wordvectors200_%s.txt' % (lang, size)
    # Same path with the .txt extension swapped for .bin.
    bin_path = txt_path[:-3] + 'bin'
    if os.path.isfile(bin_path):
        return fileio.load_wordvectors(bin_path, binary=True)
    return fileio.load_wordvectors(txt_path)
    # NOTE(review): the three lines below are the TAIL of a function whose
    # header lies above this chunk -- `prefixes`/`suffixes` are Counter-like
    # objects (they expose .most_common()); the function returns the affix
    # strings ordered by descending frequency.
    prefixes = [p[0] for p in prefixes.most_common()]
    suffixes = [s[0] for s in suffixes.most_common()]
    return prefixes, suffixes

# --- top-level driver script (Python 2: uses the `print` statement) ---

# Map 3-letter language codes to the 2-letter codes used in vector filenames.
entr = {'eng': 'en', 'tur': 'tr'}
lang = 'eng'
size = 'filtered'
# Wordlist filename omits the size suffix when size == 'filtered'.
filename_w = 'data/wordlist-2010.%s%s.txt' % (lang, '' if size == 'filtered' else size)
filename_v = 'data/%s-wordvectors200_%s.bin' % (entr[lang], size)
wordlist = fileio.read_wordcounts(filename_w)
wordvectors = fileio.load_wordvectors(filename_v, binary=True)
# Derive candidate affix lists plus their associated vectors.
# NOTE(review): return order is (suffixes, prefixes, ...) -- verify against
# genAffixesListOpt's definition; a swapped unpacking here would silently
# mislabel the pickled outputs.
suffixes, prefixes, suffixvector, prefixvector = genAffixesListOpt(wordlist, wordvectors)
print suffixes
print prefixes
# Persist all four results for downstream use.
with open('data/%s_suffix_list.p' % lang, 'wb') as f:
    pickle.dump(suffixes, f)
with open('data/%s_prefix_list.p' % lang, 'wb') as f:
    pickle.dump(prefixes, f)
with open('data/%s_suffix_wv.p' % lang, 'wb') as f:
    pickle.dump(suffixvector, f)
with open('data/%s_prefix_wv.p' % lang, 'wb') as f:
    pickle.dump(prefixvector, f)
#with open('data/%s_suffix_list_gold.p' % lang, 'wb') as f: