Example #1
import codecs


def makeDict(fileName):
    """Read a UTF-8 'word,frequency' file into lookup structures.

    Returns (words_transformed, words_clean, words_freq): the transformed
    words in file order, a map from transformed word back to its original
    spelling, and a map from transformed word to its frequency.
    preprocess_word and leven.transIn are defined elsewhere in this module.
    """
    words_transformed = []
    words_clean = {}
    words_freq = {}
    with codecs.open(fileName, 'r', 'utf-8') as mine:
        for line in mine:
            (word, freq) = line.split(',')
            freq = int(freq.rstrip('\r\n'))
            # Strip line endings and stray 0x11 control characters. The
            # original used r'\x11', a raw string, which stripped the
            # literal characters '\', 'x', and '1' instead.
            word = preprocess_word(word.rstrip('\r\n\x11'))
            thisWord = leven.transIn(word)
            words_transformed.append(thisWord)
            words_clean[thisWord] = word
            words_freq[thisWord] = freq
    return (words_transformed, words_clean, words_freq)
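
A minimal usage sketch, assuming a hypothetical comma-separated frequency file named wordlist.csv and the module's own preprocess_word and leven.transIn helpers:

# wordlist.csv (hypothetical), one "word,frequency" pair per line:
#   logos,42
#   lexis,7
(words_transformed, words_clean, words_freq) = makeDict('wordlist.csv')
print(len(words_transformed))    # number of dictionary entries
first = words_transformed[0]     # transformed key for the first word
print(words_clean[first])        # its original spelling
print(words_freq[first])         # its frequency, e.g. 42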
Example #2
def getCloseWords(wordIn,
                  word_dicts,
                  rules,
                  max_weight,
                  threshold=3,
                  fast=True,
                  debug=False):
    """Return dictionary words within `threshold` Levenshtein edits of
    wordIn, sorted by edit weight (low to high). leven, dump, and
    weight_for_leven_edits are defined elsewhere in this module."""
    import Levenshtein
    (dict_words, words_clean, words_freq) = word_dicts
    wordInTrans = leven.transIn(wordIn)
    if debug:
        print
        print "getCloseWords for", wordInTrans.encode('utf-8'), \
            "(", wordIn.encode('utf-8'), ")"
        dump(wordIn)
    output_words = []
    if wordInTrans in dict_words:
        # Exact dictionary word: return no suggestions.
        pass
    else:
        for word in dict_words:
            lev_distance = Levenshtein.distance(wordInTrans, word)
            if lev_distance <= threshold:
                edits = Levenshtein.editops(wordInTrans, word)
                w = weight_for_leven_edits(wordInTrans,
                                           word,
                                           edits,
                                           rules,
                                           max_weight,
                                           debug=False)
                output_words.append(
                    (word, lev_distance, len(edits), w, 'xxx', 'yyy'))
                if lev_distance == 0 and fast:
                    # On an exact match, cut the search short. Close
                    # matches found so far are kept, so this does not
                    # produce a complete list. (With the membership
                    # check above, this branch cannot actually fire.)
                    # Sorting by frequency first, then stably by weight,
                    # keeps frequency as the tiebreaker.
                    output_words = sorted(
                        output_words,
                        key=lambda entry: int(words_freq[entry[0]]))
                    return sorted(output_words,
                                  key=lambda entry: int(entry[3]))
    return sorted(output_words, key=lambda entry: entry[3])
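
A hedged usage sketch; rules and max_weight stand in for whatever the surrounding module passes to weight_for_leven_edits, and wordlist.csv is the same hypothetical file as above:

word_dicts = makeDict('wordlist.csv')
suggestions = getCloseWords(u'lexsi', word_dicts, rules, max_weight)
# Each entry is (word, lev_distance, n_edits, weight, 'xxx', 'yyy').
for entry in suggestions[:5]:
    print(entry[0] + ' (distance %d, weight %s)' % (entry[1], entry[3]))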