Example no. 1
def process_vocab((vocab, word_dicts, max_weight)):
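    """Spell-check one chunk of the vocabulary.

    Takes a single (vocab, word_dicts, max_weight) tuple so the function can
    be passed straight to multiprocessing.Pool.map, and returns the
    corrections as a string of 'original' + EURO SIGN + 'suggestion' lines.
    """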
    from greek_tools import is_capitalized
    debug = True
    (dict_words, words_clean, words_freq) = word_dicts
    output_string = ''
    euro_sign = u"\N{EURO SIGN}"  # separates original and suggestion in the output
    for wordIn in vocab:
        wordIn_original = wordIn
        wordIn = preprocess_word(wordIn)
        output_words = getCloseWords(wordIn,
                                     word_dicts,
                                     teubner_serif_weights,
                                     max_weight,
                                     threshold=3)
        # If the word doesn't have an exact match and it is capitalized,
        # retry with a lowercased version.
        isCapitalized = False
        hasBeenLowered = False
        if debug:
            print
            print wordIn.encode('utf-8')
        if is_capitalized(wordIn):
            if debug:
                print wordIn.encode('utf-8'), "is capitalized"
            isCapitalized = True
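        # Scan the candidates for the cheapest correction weight; starting
        # above max_weight makes "no usable candidate" easy to detect below.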
        min_weight = max_weight + 1
        for output_word in output_words:
            if output_word[3] < min_weight:
                min_weight = output_word[3]
        if debug:
            print "minweight is ", min_weight
        if isCapitalized and (len(output_words) == 0
                              or min_weight > max_weight):
            if debug:
                for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                    print word, words_clean[word].encode(
                        'utf-8'), w, lev_distance, words_freq[word]
                print "not found directly, so using", wordIn.lower().encode(
                    'utf-8')
            output_words = getCloseWords(wordIn.lower(),
                                         word_dicts,
                                         teubner_serif_weights,
                                         max_weight,
                                         threshold=3)
            hasBeenLowered = True
        # print
        # print wordIn, ":"
        # If the input word is in the dictionary
        if len(output_words) > 0 and output_words[0][1] == 0:
            if debug:
                print "*"
        else:
            if len(output_words) > 0 and output_words[0][3] < max_weight:
                best_result_word = words_clean[output_words[0][0]]
                if hasBeenLowered:
                    best_result_word = best_result_word.capitalize()
                if not (best_result_word == wordIn_original
                        or best_result_word == wordIn_original.lower()):
                    output_string += wordIn_original + euro_sign + best_result_word + '\n'
                if debug:
                    dump(wordIn_original)
                    print
                    dump(wordIn)
                    print
                    dump(best_result_word)
            for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                if hasBeenLowered:
                    word_to_print = word.capitalize()
                else:
                    word_to_print = word
                if debug:
                    print word_to_print, words_clean[word].encode(
                        'utf-8'), w, lev_distance, words_freq[word]
                # dump(word_to_print)
                # print
                # dump(words_clean[word])
                # dump(word)
                if lev_distance == 0:
                    break
    return output_string
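The example above leans on helpers defined elsewhere in the project (preprocess_word, getCloseWords, teubner_serif_weights, dump) that these snippets do not show. The sketch below is only an inference from how the example uses them: the names are real, but the shapes and values are placeholder assumptions.

# Sketch of the shapes process_vocab appears to expect (inferred from the
# example above, not taken from the project itself).

# word_dicts is a 3-tuple of (normalized keys, key -> display form,
# key -> corpus frequency):
dict_words = [u'logos']
words_clean = {u'logos': u'\u03bb\u03cc\u03b3\u03bf\u03c2'}
words_freq = {u'logos': 42}
word_dicts = (dict_words, words_clean, words_freq)

# getCloseWords(...) results are unpacked as 6-tuples:
#   (dict_key, lev_distance, n, weight, junk1, junk2)
# output_words[0] is the best candidate; index 1 holds the edit distance
# and index 3 the correction weight compared against max_weight.

# Pool.map passes exactly one element of its iterable per call, which is
# why each work item is packed as a single tuple:
work_item = ([u'logos'], word_dicts, 9)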
Example no. 2
def spellcheck_urls(dict_file,
                    urls,
                    output_file_name,
                    max_weight=9,
                    debug=False):
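    """Fetch each URL, keep the Greek tokens, and spell-check the vocabulary.

    The vocabulary is split into chunks that a multiprocessing Pool checks
    in parallel; corrections are written to output_file_name.
    """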
    from urllib import urlopen
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from itertools import repeat
    from multiprocessing import Pool
    import codecs
    all_tokens = []
    output_file = codecs.open(output_file_name, 'w', 'utf-8')
    # print "number of urls: ", len(urls)
    for url in urls:
        raw = urlopen(url).read().decode('utf-8')
        n = 0
        lines = raw.split("\n")
        # if debug:
        #     print 'page:', url
        #     for line in lines:
        #         print line
        tokens = Dehyphenate(lines)
        # if tokens[-1][-1] == '-':
        #     tokens = tokens[:-1]

        #if debug:
        #    for token in tokens:
        #        print token
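        # Keep only the Greek tokens from this page and accumulate them.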
        all_tokens = all_tokens + delete_non_greek_tokens(tokens)
    if debug:
        for token in all_tokens:
            print token
    vocab = sorted(set(all_tokens))
    print "vocab of ", len(vocab), " words"
    #for word in vocab:
    #  print word
    vocab = [word for word in vocab if not is_uc_word(word)]
    vocab = [word.rstrip() for word in vocab]
    vocab = [word for word in vocab if word and word[-1] != '-']
    vocab = [word for word in vocab if len(word) > 1]
    print "non-capital words: ", len(vocab)
    if debug:
        print "Are they capitalized?"
        from greek_tools import is_capitalized
        for wordIn in vocab:
            #wordIn = preprocess_word(wordIn)
            print wordIn, is_capitalized(wordIn)
    print "making dicts"
    import time
    start_time = time.time()
    word_dicts = makeDict(dict_file)
    dict_time = time.time() - start_time
    minutes = dict_time / 60.0
    print "dict building took", minutes, " minutes."
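    # Split the vocabulary into roughly 8 chunks for Pool.map; the max(1, ...)
    # guard below keeps the chunk size nonzero for very small vocabularies.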
    vocab_chunks = list(chunks(vocab, max(1, len(vocab) // 8)))
    print "vocab is ", len(vocab)
    processed_vocab_chunks = zip(vocab_chunks, repeat(word_dicts),
                                 repeat(max_weight))
    print "there are ", len(processed_vocab_chunks), "chunks"
    start_time = time.time()
    # print "dictionary of ", len(dict_words), "words"
    # vocab = [preprocess_word(a_word) for a_word in vocab]
    # why doesn't this trim all the ones that pass spellcheck?
    # vocab = sorted(set(vocab).difference(set(dict_words)))
    # print "vocab trimmed of dictionary words to ", len(vocab)
    p = Pool(processes=10)
    output = p.map(process_vocab, processed_vocab_chunks)
    p.close()
    p.join()
    for output_chunk in output:
        output_file.write(output_chunk)
    output_file.close()
    pool_time = time.time() - start_time
    minutes = pool_time / 60.0
    print "processing took", minutes, " minutes"
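spellcheck_urls also depends on a chunks helper that the examples never define. A minimal sketch of a plausible implementation, assuming it simply yields fixed-size slices (the project's real version may differ), in the same Python 2 style as the examples:

def chunks(seq, size):
    # Yield successive slices of at most `size` items from seq.
    # A guess at the missing helper; the project's real version may differ.
    for i in xrange(0, len(seq), size):
        yield seq[i:i + size]

# For example, list(chunks(range(10), 4)) == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]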