def genAffixesList(filename, top_n=100):
    """Extract candidate suffixes and prefixes from a word-frequency file.

    A split point x inside a frequent word yields word[:x] + word[x:].
    The right part counts as a suffix when the remaining stem (the left
    part) is itself a frequent word in the list; symmetrically, the left
    part counts as a prefix when the right part is a frequent word.

    Args:
        filename: path to a word-count file readable by
            fileio.read_wordcounts (maps word -> count).
        top_n: how many of the most frequent affixes to keep for each
            list (default 100, matching the original behavior).

    Returns:
        (suffixes, prefixes): two lists of affix strings, each ordered by
        descending frequency and truncated to top_n entries.
    """
    suffixes = Counter()
    prefixes = Counter()
    d = fileio.read_wordcounts(filename)
    for word, count in d.items():
        # Only frequent words are segmented at all.
        if count < MIN_WORD_FREQ:
            continue
        for x in range(1, len(word)):
            left = word[:x]
            right = word[x:]
            # word = left + right; the affix must be short enough and the
            # complementary piece must itself be a frequent word.
            if len(right) <= MAX_AFFIX_LEN and d.get(left, 0) >= MIN_WORD_FREQ:
                suffixes[right] += 1
            if len(left) <= MAX_AFFIX_LEN and d.get(right, 0) >= MIN_WORD_FREQ:
                prefixes[left] += 1
    # Keep only the affix strings, most frequent first.
    suffixes = [s for s, _ in suffixes.most_common(top_n)]
    prefixes = [p for p, _ in prefixes.most_common(top_n)]
    return suffixes, prefixes
def get_wordlist(size='small', lang='eng'):
    """Load a word-count dictionary for the given language and size.

    The 'full' size maps to the plain per-language file; any other size
    is encoded as an extra filename component.
    """
    if size == 'full':
        path = 'data/wordlist-2010.%s.txt' % lang
    else:
        path = 'data/wordlist-2010.%s.%s.txt' % (lang, size)
    return fileio.read_wordcounts(path)
prefixes[tags[0]] += 1 prefixes = [p[0] for p in prefixes.most_common()] suffixes = [s[0] for s in suffixes.most_common()] return prefixes, suffixes entr = {'eng': 'en', 'tur': 'tr'} lang = 'eng' size = 'filtered' filename_w = 'data/wordlist-2010.%s%s.txt' % (lang, '' if size == 'filtered' else size) filename_v = 'data/%s-wordvectors200_%s.bin' % (entr[lang], size) wordlist = fileio.read_wordcounts(filename_w) wordvectors = fileio.load_wordvectors(filename_v, binary=True) suffixes, prefixes, suffixvector, prefixvector = genAffixesListOpt(wordlist, wordvectors) print suffixes print prefixes with open('data/%s_suffix_list.p' % lang, 'wb') as f: pickle.dump(suffixes, f) with open('data/%s_prefix_list.p' % lang, 'wb') as f: pickle.dump(prefixes, f) with open('data/%s_suffix_wv.p' % lang, 'wb') as f: pickle.dump(suffixvector, f) with open('data/%s_prefix_wv.p' % lang, 'wb') as f: pickle.dump(prefixvector, f)