def spellcheck_urls(dict_file, urls, output_file_name, max_weight=9, debug=False):
    from urllib import urlopen
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from itertools import repeat
    from multiprocessing import Pool
    import codecs
    all_tokens = []
    output_file = codecs.open(output_file_name, 'w', 'utf-8')
    # print "number of urls: ", len(urls)
    # Fetch each page, rejoin hyphenated line breaks, and keep only Greek tokens
    for url in urls:
        raw = urlopen(url).read().decode('utf-8')
        n = 0
        lines = raw.split("\n")
        # if debug:
        #     print 'page:', url
        #     for line in lines:
        #         print line
        tokens = Dehyphenate(lines)
        # if tokens[-1][-1] == '-':
        #     tokens = tokens[:-1]
        # if debug:
        #     for token in tokens:
        #         print token
        all_tokens = all_tokens + delete_non_greek_tokens(tokens)
    if debug:
        for token in all_tokens:
            print token
    vocab = sorted(set(all_tokens))
    print "vocab of ", len(vocab), " words"
    # for word in vocab:
    #     print word
    # Drop all-uppercase words, trailing whitespace, dangling hyphens, and single letters
    vocab = [word for word in vocab if not is_uc_word(word)]
    vocab = [word.rstrip() for word in vocab]
    vocab = [word for word in vocab if not word[-1] == '-']
    vocab = [word for word in vocab if not len(word) == 1]
    print "non-capital words: ", len(vocab)
    if debug:
        print "Are they capitalized?"
        from greek_tools import is_capitalized
        for wordIn in vocab:
            # wordIn = preprocess_word(wordIn)
            print wordIn, is_capitalized(wordIn)
    print "making dicts"
    import time
    start_time = time.time()
    word_dicts = makeDict(dict_file)
    dict_time = time.time() - start_time
    minutes = dict_time / 60.0
    print "dict building took", minutes, " minutes."
    # Split the vocabulary into roughly eight equal chunks for the worker pool
    vocab_chunks = list(chunks(vocab, len(vocab) / 8))
    print "vocab is ", len(vocab)
    processed_vocab_chunks = zip(vocab_chunks, repeat(word_dicts), repeat(max_weight))
    print "there are ", len(processed_vocab_chunks), "chunks"
    start_time = time.time()
    # print "dictionary of ", len(dict_words), "words"
    # vocab = [preprocess_word(a_word) for a_word in vocab]
    # why doesn't this trim all the ones that pass spellcheck?
    # vocab = sorted(set(vocab).difference(set(dict_words)))
    # print "vocab trimmed of dictionary words to ", len(vocab)
    p = Pool(processes=10)
    output = p.map(process_vocab, processed_vocab_chunks)
    for output_chunk in output:
        output_file.write(output_chunk)
    pool_time = time.time() - start_time
    minutes = pool_time / 60.0
    print "processing took", minutes, " minutes"
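# The helpers chunks() and process_vocab() are not defined in this listing; the
# sketches below are illustrative assumptions inferred from how spellcheck_urls
# calls them, not the repository's actual implementations. chunks() yields
# successive fixed-size slices of a list, and process_vocab() receives one
# (vocab_chunk, word_dicts, max_weight) tuple from Pool.map and returns the text
# that gets appended to the output file.
def chunks(items, size):
    # Yield successive slices of at most `size` elements from `items`.
    step = max(size, 1)
    for i in range(0, len(items), step):
        yield items[i:i + step]


def process_vocab(args):
    # Hypothetical worker: unpack the tuple built with zip()/repeat() above and
    # spellcheck one chunk of the vocabulary, returning one result line per word.
    vocab_chunk, word_dicts, max_weight = args
    output_lines = []
    for word in vocab_chunk:
        # A real implementation would look the word up in word_dicts and keep
        # only candidate corrections within max_weight edit distance.
        output_lines.append(word + u'\n')
    return u''.join(output_lines)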
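# Dehyphenate() and delete_non_greek_tokens() are imported from elsewhere in the
# repository and are not shown here. The versions below are rough sketches of the
# behaviour spellcheck_urls appears to rely on (joining words split by a trailing
# hyphen across lines, and keeping only tokens written in Greek script); the real
# functions may well differ.
import unicodedata


def Dehyphenate(lines):
    # Sketch: split on whitespace and glue a token ending in '-' to the token
    # that follows it, much like the hyphen-matching loop in the fragment below.
    tokens = []
    for line in lines:
        for token in line.split():
            if tokens and tokens[-1].endswith('-'):
                tokens[-1] = tokens[-1][:-1] + token
            else:
                tokens.append(token)
    return tokens


def delete_non_greek_tokens(tokens):
    # Sketch: keep a token only if every alphabetic character is Greek.
    def is_greek(token):
        letters = [c for c in token if c.isalpha()]
        return bool(letters) and all(
            'GREEK' in unicodedata.name(c, '') for c in letters)
    return [token for token in tokens if is_greek(token)]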
# print "lines: ", len(lines) for line in lines: # print n, line.encode('utf-8') line_tokens = tokenizer.tokenize(line) # for token in line_tokens: # print token.encode('utf-8'), " | " # n = n + 1 text_array.append(line_tokens) # now try to match hyphenated lines with their # correpsonding beginning lines n = 0 for line in text_array: if len(line) > 0: if line[-1][-1] == "-": try: line[-1] = line[-1][:-1] + text_array[n + 1][0] text_array[n + 1] = text_array[n + 1][1:] except IndexError as e: print e n = n + 1 # now flatten the 2d array tokens = [item for sublist in text_array for item in sublist] tokens = delete_non_greek_tokens(tokens) for token in tokens: fdist.inc(token) print "most common: ", fdist.max().encode("utf-8") for item in fdist.keys(): print item.encode("utf-8"), fdist.freq(item)
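# The fragment above assumes a tokenizer, an empty text_array, and an NLTK
# FreqDist named fdist were created earlier in the enclosing function; that setup
# is not part of this listing. The lines below are one plausible, assumed setup
# (the token pattern in particular is a guess). Note that FreqDist.inc() exists
# only in NLTK 2.x; under NLTK 3 the equivalent is fdist[token] += 1.
import nltk
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\S+')  # assumed pattern: whitespace-separated tokens
text_array = []                      # one list of tokens per input line
fdist = nltk.FreqDist()              # filled by the counting loop above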