Example #1
def getCloseWords(wordIn,
                  word_dicts,
                  rules,
                  max_weight,
                  threshold=3,
                  fast=True,
                  debug=False):
    import Levenshtein
    # out = difflib.get_close_matches('ἐστιν',words)
    (dict_words, words_clean, words_freq) = word_dicts
    # print "word in:"
    # print dump(wordIn)
    # wordIn = preprocess_word(wordIn)
    # print "word in pp:"
    # print dump(wordIn)
    wordInTrans = leven.transIn(wordIn)
    if (debug):
        print
        print "getCloseWords for", wordInTrans.encode(
            'utf-8'), "(", wordIn.encode('utf-8'), ")"
        dump(wordIn)
    output_words = []
    #dict_words_set = set(dict_words)
    n = 0
    # print "Now comparing to..."
    if wordInTrans in dict_words:
        pass
    #    print "short-circuting dictionary word"
    #    output_words.append((wordInTrans,0,0,0,'xxx','yyy'))
    else:
        for word in dict_words:
            # print u"*****" + words_clean[n]
            # print "word into comparison:"
            # print dump(word)
            lev_distance = Levenshtein.distance(
                wordInTrans, word
            )  # difflib.SequenceMatcher(None, word, wordInTrans).ratio()
            # print "distance: ",
            # print ratio
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, word)
                w = weight_for_leven_edits(wordInTrans,
                                           word,
                                           edits,
                                           rules,
                                           max_weight,
                                           debug=False)
                output_words.append(
                    (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if lev_distance == 0 and fast:
                    # In the case of an exact match, cut the search short.
                    # We may have gathered some close matches already, so this
                    # will not be a complete list.
                    output_words = sorted(
                        output_words,
                        key=lambda word: int(words_freq[word[0]]))
                    return sorted(output_words, key=lambda word: int(word[3]))
            n = n + 1
    return sorted(output_words, key=lambda word: word[3])
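
These examples are Python 2 and lean on the python-Levenshtein package. As a reference point, here is a minimal sketch of the two calls getCloseWords() builds on; the words are invented stand-ins for the transliterated forms that leven.transIn() (a helper not shown in these examples) would produce.

import Levenshtein

# Hypothetical transliterated forms standing in for leven.transIn() output.
word_a = u'estin'
word_b = u'estiu'

# distance() returns the plain Levenshtein edit distance as an int.
print Levenshtein.distance(word_a, word_b)   # 1

# editops() returns (op, src_pos, dst_pos) tuples, with op one of
# 'replace', 'insert', or 'delete' -- exactly what
# weight_for_leven_edits() in Example #2 iterates over.
for op, i, j in Levenshtein.editops(word_a, word_b):
    print op, i, j                           # replace 4 4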
Example #2
def weight_for_leven_edits(wordFrom, wordTo, edits, weight_rules, max_weight, debug=False):
    if (debug):
        print
        print
        print "Weight Analysis"
        print "word in: ", wordFrom
        dump(wordFrom)
        print
        print "word to: ", wordTo
        dump(wordTo)
    cumulative_weight = 0
    for edit in edits:
        edit_weight = 0
        if (debug):
            print edit
        (command, char_num_in_word_one, char_num_in_word_two) = edit
        if (char_num_in_word_one > (len(wordFrom) - 1)):
            char_in_word_one = ''
        else:
            char_in_word_one = wordFrom[char_num_in_word_one]
        if (char_num_in_word_two > (len(wordTo) - 1)):
            char_in_word_two = ''
        else:
            char_in_word_two = wordTo[char_num_in_word_two]
        if (debug):
            print '\t', command
            if char_in_word_one:
                print '\t', unicodedata.name(char_in_word_one)
            else:
                print '\tx'
            if char_in_word_two:
                print '\t', unicodedata.name(char_in_word_two)
            else:
                print '\tx'
        if (command == 'replace'):
            edit_weight = 10
        elif (command == 'delete'):
            edit_weight = 15
        elif (command == 'insert'):
            edit_weight = 18
        else:
            raise ValueError('unknown Levenshtein edit operation: ' + command)
        for weight_rule in weight_rules:
            if (weight_rule[0] == command
                    and (weight_rule[1] == '*' or char_in_word_one in weight_rule[1])
                    and (weight_rule[2] == '*' or char_in_word_two in weight_rule[2])):
                if (debug):
                    print '\t weight rule applied:'
                    print '\t', weight_rule
                edit_weight = weight_rule[3]
                break
        if (debug):
            print '\tweight: ', edit_weight
        cumulative_weight += edit_weight
        if cumulative_weight >= max_weight:
            break
    return cumulative_weight
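
The weight_rules format is only ever read positionally above; judging from that, each rule looks like (edit op, source chars or '*', target chars or '*', weight). The rules and words below are invented for illustration; the real tables (such as teubner_serif_weights, referenced in later examples) are defined elsewhere.

import Levenshtein

# Illustrative rules only. Format inferred from the function above:
#   (edit op, source chars or '*', target chars or '*', weight)
weight_rules = [
    ('replace', u'o', u'e', 2),   # o->e confusions are cheap
    ('delete',  u'*', u'*', 20),  # any deletion is expensive
]

word_from, word_to = u'kosmos', u'kesmos'
edits = Levenshtein.editops(word_from, word_to)
w = weight_for_leven_edits(word_from, word_to, edits, weight_rules,
                           max_weight=100)
print w  # 2: the o->e rule overrides the default replace weight of 10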
Example #3
def getCloseWords(wordIn, word_dicts, rules, max_weight, threshold=3, fast=True, debug=False):
    import Levenshtein
    # out = difflib.get_close_matches('ἐστιν',words)
    (dict_words, words_clean, words_freq) = word_dicts
    # print "word in:"
    # print dump(wordIn)
    # wordIn = preprocess_word(wordIn)
    # print "word in pp:"
    # print dump(wordIn)
    wordInTrans = leven.transIn(wordIn)
    if (debug):
        print
        print "getCloseWords for", wordInTrans.encode('utf-8'), "(", wordIn.encode('utf-8'), ")"
        dump(wordIn)
    output_words = []
    #dict_words_set = set(dict_words)
    n = 0
    # print "Now comparing to..."
    if wordInTrans in dict_words:
        pass
    #    print "short-circuiting dictionary word"
    #    output_words.append((wordInTrans,0,0,0,'xxx','yyy'))
    else:
        for word in dict_words:
            # print u"*****" + words_clean[n]
            # print "word into comparison:"
            # print dump(word)
            lev_distance = Levenshtein.distance(
                wordInTrans, word)  # difflib.SequenceMatcher(None, word, wordInTrans).ratio()
            # print "distance: ",
            # print ratio
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, word)
                w = weight_for_leven_edits(wordInTrans, word, edits, rules, max_weight, debug=False)
                output_words.append(
                    (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if lev_distance == 0 and fast:
                    # In the case of an exact match, cut the search short.
                    # We may have gathered some close matches already, so this
                    # will not be a complete list.
                    output_words = sorted(
                        output_words, key=lambda word: int(words_freq[word[0]]))
                    return sorted(output_words, key=lambda word: int(word[3]))
            n = n + 1
    return sorted(output_words, key=lambda word: word[3])
Example #4
def unicode_test(word):
    import unicodedata
    print
    print
    circumflex = u"\N{COMBINING GREEK PERISPOMENI}"
    other_circumflex = u"\N{COMBINING CIRCUMFLEX ACCENT}"
    word = word.replace(other_circumflex, circumflex)
    print "*** ", word, ": "
    print "Input: "
    dump(word)
    nfd = unicodedata.normalize('NFD', word)
    if word != nfd:
        print "The decomposed version is NOT the same: ",
        print nfd
        dump(nfd)
    else:
        print "(NFD is the same)"
    try:
        nfc = unicodedata.normalize('NFC', word)
    except ValueError as err:
        # normalize() raises ValueError for an invalid form; bail out
        # so we never touch an unbound result below.
        print err
        return

    print "NFC: ", nfc
    dump(nfc)
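
dump() is called throughout these examples but never defined in any of them. A guess at a minimal equivalent, printing one line per code point with its number and Unicode name:

import unicodedata

def dump(word):
    # Hypothetical stand-in for the dump() helper used above:
    # one line per code point, with its U+ number and Unicode name.
    for ch in word:
        print u'U+%04X %s' % (ord(ch),
                              unicodedata.name(ch, u'<unnamed>'))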
Example #5
def process_vocab((vocab, word_dicts, max_weight)):  # Python 2 tuple-parameter unpacking
    from greek_tools import is_capitalized
    debug = True
    (dict_words, words_clean, words_freq) = word_dicts
    output_string = ''
    euro_sign = u"\N{EURO SIGN}"
    for wordIn in vocab:
        wordIn_original = wordIn
        wordIn = preprocess_word(wordIn)
        output_words = getCloseWords(wordIn,
                                     word_dicts,
                                     teubner_serif_weights,
                                     max_weight,
                                     threshold=3)
        # If the word doesn't have an exact match and it is capitalized, retry with
        # an uncapitalized version
        isCapitalized = False
        hasBeenLowered = False
        if debug:
            print
            print wordIn.encode('utf-8')
        if is_capitalized(wordIn):
            if debug:
                print wordIn.encode('utf-8'), "is capitalized"
            isCapitalized = True
        min_weight = max_weight + 1
        for output_word in output_words:
            if output_word[3] < min_weight:
                min_weight = output_word[3]
        if debug:
            print "minweight is ", min_weight
        if isCapitalized and (len(output_words) == 0
                              or min_weight > max_weight):
            if debug:
                for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                    print word, words_clean[word].encode(
                        'utf-8'), w, lev_distance, words_freq[word]
                print "not found directly, so using", wordIn.lower().encode(
                    'utf-8')
            output_words = getCloseWords(wordIn.lower(),
                                         word_dicts,
                                         teubner_serif_weights,
                                         max_weight,
                                         threshold=3)
            hasBeenLowered = True
        # print
        # print wordIn, ":"
        # If the input word is in the dictionary
        if len(output_words) > 0 and output_words[0][1] == 0:
            if debug:
                print "*"
        else:
            if len(output_words) > 0 and output_words[0][3] < max_weight:
                best_result_word = words_clean[output_words[0][0]]
                if (hasBeenLowered):
                    best_result_word = best_result_word.capitalize()
                if not (best_result_word == wordIn_original
                        or best_result_word == wordIn_original.lower()):
                    output_string += wordIn_original + euro_sign + best_result_word + '\n'
                if debug:
                    dump(wordIn_original)
                    print
                    dump(wordIn)
                    print
                    dump(best_result_word)
            for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                if (hasBeenLowered):
                    word_to_print = word.capitalize()
                else:
                    word_to_print = word
                if debug:
                    print word_to_print, words_clean[word].encode(
                        'utf-8'), w, lev_distance, words_freq[word]
            #		dump(word_to_print)
            #		print
            #		dump(words_clean[word])
            # dump(word)
                if (lev_distance == 0):
                    break
    return output_string
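
process_vocab() returns one misspelling/correction pair per line, delimited by a euro sign. The hOCR pass in Example #6 consumes a spellcheck_dict containing exactly such pairs; the loader below is an assumed bridge between the two, not code from the original project.

def load_spellcheck_dict(output_string):
    # Assumed inverse of process_vocab()'s output format:
    # one 'misspelling<EURO SIGN>correction' pair per line.
    euro_sign = u"\N{EURO SIGN}"
    spellcheck_dict = {}
    for line in output_string.splitlines():
        if euro_sign in line:
            error_word, correction = line.split(euro_sign, 1)
            spellcheck_dict[error_word] = correction
    return spellcheck_dict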
Example #6
for word_element in hocr_word_elements:
    # print word_element.text
    try:
        word = unicodedata.normalize('NFD', word_element.text)
    except TypeError:
        word = unicodedata.normalize('NFD', unicode(word_element.text))
    # dump(word)
    parts = split_text_token(word)
    # for part in parts:
    #     print '\t', part
    try:
        # print "trying to check", parts[1]
        error_word = parts[1]
        parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
        print "replaced", error_word, "with", parts[1]
        dump(error_word)
        print
        dump(parts[1])
        word_element.set('data-pre-spellcheck', word)
        # dump(parts[1])
    except KeyError:
        # print "no check"
        pass
    # print parts[0]+parts[1]+parts[2]
    word_element.text = parts[0] + parts[1] + parts[2]
fileOut.write(
    html_parser.unescape(
        etree.tostring(treeIn.getroot(),
                       encoding="UTF-8",
                       xml_declaration=True)))
fileOut.close()
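
split_text_token() is another helper that never appears in these examples. From its use above, spellchecking parts[1] and reassembling parts[0] + parts[1] + parts[2], it plausibly splits a token into leading punctuation, core word, and trailing punctuation. A hypothetical sketch:

import re

def split_text_token(token):
    # Hypothetical sketch of split_text_token(), inferred from its use
    # above: (leading punctuation, core word, trailing punctuation),
    # so that parts[0] + parts[1] + parts[2] == token.
    match = re.match(ur'^(\W*)(.*?)(\W*)$', token, re.UNICODE)
    return match.group(1), match.group(2), match.group(3)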
Example #7
def process_vocab((vocab, word_dicts, max_weight)):  # Python 2 tuple-parameter unpacking
    from greek_tools import is_capitalized
    debug = True
    (dict_words, words_clean, words_freq) = word_dicts
    output_string = ''
    euro_sign = u"\N{EURO SIGN}"
    for wordIn in vocab:
        wordIn_original = wordIn
        wordIn = preprocess_word(wordIn)
        output_words = getCloseWords(
            wordIn, word_dicts, teubner_serif_weights, max_weight, threshold=3)
        # If the word doesn't have an exact match and it is capitalized, retry with
        # an uncapitalized version
        isCapitalized = False
        hasBeenLowered = False
        if debug:
            print
            print wordIn.encode('utf-8')
        if is_capitalized(wordIn):
            if debug:
                print wordIn.encode('utf-8'), "is capitalized"
            isCapitalized = True
        min_weight = max_weight + 1
        for output_word in output_words:
            if output_word[3] < min_weight:
                min_weight = output_word[3]
        if debug:
            print "minweight is ", min_weight
        if isCapitalized and (len(output_words) == 0 or min_weight > max_weight):
            if debug:
                for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                    print word, words_clean[word].encode('utf-8'), w, lev_distance, words_freq[word]
                print "not found directly, so using", wordIn.lower().encode('utf-8')
            output_words = getCloseWords(
                wordIn.lower(), word_dicts, teubner_serif_weights, max_weight, threshold=3)
            hasBeenLowered = True
        # print
        # print wordIn, ":"
        # If the input word is in the dictionary
        if len(output_words) > 0 and output_words[0][1] == 0:
            if debug:
                print "*"
        else:
            if len(output_words) > 0 and output_words[0][3] < max_weight:
                best_result_word = words_clean[output_words[0][0]]
                if (hasBeenLowered):
                    best_result_word = best_result_word.capitalize()
                if not (best_result_word == wordIn_original or best_result_word == wordIn_original.lower()):
                    output_string += wordIn_original + euro_sign + best_result_word + '\n'
                if debug:
                    dump(wordIn_original)
                    print
                    dump(wordIn)
                    print
                    dump(best_result_word)
            for word, lev_distance, n, w, junk1, junk2 in output_words[:8]:
                if (hasBeenLowered):
                    word_to_print = word.capitalize()
                else:
                    word_to_print = word
                if debug:
                    print word_to_print, words_clean[word].encode('utf-8'), w, lev_distance, words_freq[word]
            #		dump(word_to_print)
            #		print
            #		dump(words_clean[word])
                # dump(word)
                if (lev_distance == 0):
                    break
    return output_string
root = treeIn.getroot()
hocr_word_elements = treeIn.xpath(
    "//html:span[@class='ocr_word'] | //span[@class='ocr_word']",
    namespaces={'html': "http://www.w3.org/1999/xhtml"})
for word_element in hocr_word_elements:
    # print word_element.text
    try:
        word = unicodedata.normalize('NFD', word_element.text)
    except TypeError:
        word = unicodedata.normalize('NFD', unicode(word_element.text))
    # dump(word)
    parts = split_text_token(word)
    # for part in parts:
    #     print '\t', part
    try:
        # print "trying to check", parts[1]
        error_word = parts[1]
        parts = (parts[0], spellcheck_dict[parts[1]], parts[2])
        print "replaced", error_word, "with", parts[1]
        dump(error_word)
        print
        dump(parts[1])
        word_element.set('data-pre-spellcheck', word)
        # dump(parts[1])
    except KeyError:
        # print "no check"
        pass
    # print parts[0]+parts[1]+parts[2]
    word_element.text = parts[0] + parts[1] + parts[2]
fileOut.write(
    html_parser.unescape(
        etree.tostring(treeIn.getroot(),
                       encoding="UTF-8",
                       xml_declaration=True)))
fileOut.close()
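
The fragment above references treeIn, fileOut, and html_parser without defining them. A guess at the surrounding setup, with file names invented for illustration:

import HTMLParser
from lxml import etree

# None of this appears in the examples themselves; it is only a
# plausible reconstruction of the enclosing script's setup.
html_parser = HTMLParser.HTMLParser()          # for .unescape() at the end
treeIn = etree.parse('page.hocr')              # hypothetical hOCR input file
fileOut = open('page_spellchecked.hocr', 'w')  # hypothetical output file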