def generate_wikipedia(c):
    """Emit entries built from Wikipedia article titles.

    Reads 'enwiki-titles' (UTF-8, one title per line), uppercases each
    title, drops titles containing any non-ASCII character, strips all
    parenthesized text (e.g. disambiguation suffixes), and collapses
    every run of non-letters into a single space.  Each surviving title
    is passed to emit() with a frequency of 10000 per word.

    c -- context/accumulator object forwarded to emit() (opaque here).
    """
    # 'with' guarantees the file is closed (the original leaked the handle).
    with codecs.open('enwiki-titles', encoding='utf-8') as f:
        for line in f:
            line = line.strip().upper()
            if not line:
                continue
            # Skip any title containing a non-ASCII character.
            if any(ord(ch) > 127 for ch in line):
                continue
            chars = []
            space = True   # True suppresses a leading/duplicate separator
            parens = 0     # current parenthesis nesting depth
            for ch in line:
                if parens:
                    # Inside "(...)": only track nesting, discard the text.
                    if ch == '(':
                        parens += 1
                    if ch == ')':
                        parens -= 1
                else:
                    # string.ascii_uppercase works on both Python 2 and 3;
                    # string.uppercase (original) exists only on Python 2.
                    # Equivalent here: the line is already ASCII-only and
                    # uppercased.
                    if ch in string.ascii_uppercase:
                        chars.append(ch)
                        space = False
                    elif ch == '(':
                        parens += 1
                    elif not space:
                        # Collapse a run of non-letters into one space.
                        chars.append(' ')
                        space = True
            if chars:
                text = ''.join(chars)
                nwords = len(text.split())
                alpha = make_alpha(text)
                emit(c, alpha, text, nwords, 10000 * nwords)
def generate_ngram_data(c):
    """Emit entries from the n-gram frequency files.

    Each non-blank line of ngrams/{1,2,3}grams.txt is a Python literal of
    the form (words, freq), where words is a sequence of strings.  Lines
    whose frequency reaches the module-level min_freq threshold are joined
    into space-separated text and passed to emit().

    c -- context/accumulator object forwarded to emit() (opaque here).
    """
    for filename in ['1grams.txt', '2grams.txt', '3grams.txt']:
        # 'with' guarantees each file is closed (the original leaked handles).
        with codecs.open('ngrams/' + filename, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                # NOTE(review): eval() executes arbitrary code.  Acceptable
                # for a trusted local data file, but switch to
                # ast.literal_eval if these files could ever come from an
                # untrusted source.
                words, freq = eval(line)
                nwords = len(words)
                if freq >= min_freq:
                    text = ' '.join(words)
                    alpha = make_alpha(text)
                    emit(c, alpha, text, nwords, freq)
def generate_dictionary(c):
    """Emit one entry per word of the ENABLE word list.

    Each non-blank line of enable1.txt is uppercased and emitted as a
    single word (nwords=1) with a fixed frequency of 100.

    c -- context/accumulator object forwarded to emit() (opaque here).
    """
    # 'with' guarantees the file is closed (the original leaked the handle);
    # strip once instead of twice per line.
    with open('enable1.txt') as f:
        for line in f:
            text = line.strip().upper()
            if text:
                alpha = make_alpha(text)
                emit(c, alpha, text, 1, 100)