def process_vocab((vocab, word_dicts, max_weight)):
    from greek_tools import is_capitalized
    debug = True
    (dict_words, words_clean, words_freq) = word_dicts
    output_string = ''
    euro_sign = unicode(u"\N{EURO SIGN}")
    for wordIn in vocab:
        wordIn_original = wordIn
        wordIn = preprocess_word(wordIn)
        output_words = getCloseWords(wordIn, word_dicts, teubner_serif_weights,
                                     max_weight, threshold=3)
        # If the word doesn't have an exact match, and it is capitalized,
        # then redo with an uncapitalized version
        isCapitalized = False
        hasBeenLowered = False
        if debug:
            print
            print wordIn.encode('utf-8')
        if is_capitalized(wordIn):
            if debug:
                print wordIn.encode('utf-8'), "is capitalized"
            isCapitalized = True
        min_weight = max_weight + 1
        for output_word in output_words:
            if output_word[3] < min_weight:
                min_weight = output_word[3]
        if debug:
            print "minweight is ", min_weight
        if isCapitalized and (len(output_words) == 0 or min_weight > max_weight):
            if debug:
                for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                    print word, words_clean[word].encode('utf-8'), w, lev_distance, words_freq[word]
                print "not found directly, so using", wordIn.lower().encode('utf-8')
            output_words = getCloseWords(wordIn.lower(), word_dicts, teubner_serif_weights,
                                         max_weight, threshold=3)
            hasBeenLowered = True
        # print
        # print wordIn, ":"
        # If the input word is in the dictionary
        if len(output_words) > 0 and output_words[0][1] == 0:
            if debug:
                print "*"
        else:
            if len(output_words) > 0 and output_words[0][3] < max_weight:
                best_result_word = words_clean[output_words[0][0]]
                if (hasBeenLowered):
                    best_result_word = best_result_word.capitalize()
                if not (best_result_word == wordIn_original or
                        best_result_word == wordIn_original.lower()):
                    output_string += wordIn_original + euro_sign + best_result_word + '\n'
                    if debug:
                        dump(wordIn_original)
                        print
                        dump(wordIn)
                        print
                        dump(best_result_word)
            for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                if (hasBeenLowered):
                    word_to_print = word.capitalize()
                else:
                    word_to_print = word
                if debug:
                    print word_to_print, words_clean[word].encode('utf-8'), w, lev_distance, words_freq[word]
                    # dump(word_to_print)
                    # print
                    # dump(words_clean[word])
                    # dump(word)
                if (lev_distance == 0):
                    break
    return output_string
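
# Note: process_vocab takes its arguments as a single packed tuple so it can be
# handed to multiprocessing.Pool.map, which passes exactly one argument per call
# (see spellcheck_urls below, which builds those tuples with zip/repeat).
# A minimal sketch of calling it directly on one chunk, assuming word_dicts has
# already been built with makeDict; the dictionary path and vocab_chunk here are
# hypothetical, not part of this module:
#
#     word_dicts = makeDict('greek_dict.txt')
#     corrections = process_vocab((vocab_chunk, word_dicts, 9))
#     print corrections.encode('utf-8')
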
def spellcheck_urls(dict_file, urls, output_file_name, max_weight=9, debug=False):
    from urllib import urlopen
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from itertools import repeat
    from multiprocessing import Pool
    import codecs
    all_tokens = []
    output_file = codecs.open(output_file_name, 'w', 'utf-8')
    # print "number of urls: ", len(urls)
    for url in urls:
        raw = urlopen(url).read().decode('utf-8')
        n = 0
        lines = raw.split("\n")
        # if debug:
        #     print 'page:', url
        #     for line in lines:
        #         print line
        tokens = Dehyphenate(lines)
        # if tokens[-1][-1] == '-':
        #     tokens = tokens[:-1]
        # if debug:
        #     for token in tokens:
        #         print token
        all_tokens = all_tokens + delete_non_greek_tokens(tokens)
    if debug:
        for token in all_tokens:
            print token
    vocab = sorted(set(all_tokens))
    print "vocab of ", len(vocab), " words"
    # for word in vocab:
    #     print word
    vocab = [word for word in vocab if not is_uc_word(word)]
    vocab = [word.rstrip() for word in vocab]
    vocab = [word for word in vocab if not word[-1] == '-']
    vocab = [word for word in vocab if not len(word) == 1]
    print "non-capital words: ", len(vocab)
    if debug:
        print "Are they capitalized?"
        from greek_tools import is_capitalized
        for wordIn in vocab:
            # wordIn = preprocess_word(wordIn)
            print wordIn, is_capitalized(wordIn)
    print "making dicts"
    import time
    start_time = time.time()
    word_dicts = makeDict(dict_file)
    dict_time = time.time() - start_time
    minutes = dict_time / 60.0
    print "dict building took", minutes, " minutes."
    vocab_chunks = list(chunks(vocab, len(vocab) / 8))
    print "vocab is ", len(vocab)
    processed_vocab_chunks = zip(vocab_chunks, repeat(word_dicts), repeat(max_weight))
    print "there are ", len(processed_vocab_chunks), "chunks"
    start_time = time.time()
    # print "dictionary of ", len(dict_words), "words"
    # vocab = [preprocess_word(a_word) for a_word in vocab]
    # why doesn't this trim all the ones that pass spellcheck?
    # vocab = sorted(set(vocab).difference(set(dict_words)))
    # print "vocab trimmed of dictionary words to ", len(vocab)
    p = Pool(processes=10)
    output = p.map(process_vocab, processed_vocab_chunks)
    for output_chunk in output:
        output_file.write(output_chunk)
    pool_time = time.time() - start_time
    minutes = pool_time / 60.0
    print "processing took", minutes, " minutes"
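
# A minimal command-line driver sketch, not part of the original module: it
# assumes the first argument is a dictionary file and the remaining arguments
# are page URLs. The output file name and max_weight value are illustrative
# defaults, chosen here only for the example.
if __name__ == '__main__':
    import sys
    dict_file_arg = sys.argv[1]
    url_args = sys.argv[2:]
    spellcheck_urls(dict_file_arg, url_args, 'spellcheck_output.txt',
                    max_weight=9, debug=False)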