Ejemplo n.º 1
0
def soundex(word1, word2):
    """
    Tell whether two words are equal or sound the same.

    See https://libindic.org/Soundex
    :param word1: first word to compare
    :param word2: second word to compare
    :return: True when ``words_equal`` accepts the pair or the Soundex
        comparison yields 1, otherwise False
    """
    if not words_equal(word1, word2):
        # A comparison result of 1 means the words sound the same.
        return bool(Soundex().compare(word1, word2) == 1)
    return True
Ejemplo n.º 2
0
 def __init__(self):
     """
     Initialize necessary resources.

     Reads ``data/ml_rootwords.txt`` (located next to this module) into a
     ``marisa_trie.Trie`` and creates the stemmer, inflector, soundex,
     syllabifier and n-gram helpers.
     """
     dictionary_path = os.path.join(
         os.path.dirname(__file__), 'data/ml_rootwords.txt')
     # The context manager closes the file even when reading fails.
     with open(dictionary_path) as dictionary_file:
         lines = dictionary_file.readlines()
     # Still exposed (already closed) because the attribute existed before.
     self.dictionary_file = dictionary_file
     try:
         # If the lines support ``decode`` they are decoded as UTF-8 first.
         self.dictionary = marisa_trie.Trie(
             [x.strip().decode('utf-8') for x in lines])
     except (AttributeError, UnicodeError):
         # Otherwise the stripped lines are used as they are.
         self.dictionary = marisa_trie.Trie(
             [x.strip() for x in lines])
     self.stemmer = Stemmer()
     self.inflector = inflector.Inflector(lang='ml')
     self.soundex = Soundex()
     self.syllabalizer = Syllabifier()
     self.ngrammer = Ngram()
Ejemplo n.º 3
0
 def __init__(self, lang='ta'):
     """
     Create the normalizer, language map, soundex encoder,
     transliterators and spelling dictionary used by this object.

     :param lang: language code stored on the instance and handed to
         ``BaseNormalizer`` (default ``'ta'``).
     """
     self.lang = lang
     self.normalizer = BaseNormalizer(lang)

     # The language map was produced with Google's googletrans module; the
     # file is generated by detect_lang_and_store in feature_utils.py.
     parent_dir = os.path.dirname(sys.path[0])
     map_path = os.path.join(parent_dir,
                             '../resources/data/alltextslang.txt')
     self.lmap = self.load_language_maps(map_path)

     self.soundexer = Soundex()

     # One transliterator per target ('tam' and 'mal'), both from 'eng'.
     self.ta_trans = Transliterator(
         source='eng', target='tam', build_lookup=True)
     self.ml_trans = Transliterator(
         source='eng', target='mal', build_lookup=True)

     spell_checker = SymSpell(max_dictionary_edit_distance=2,
                              prefix_length=7)
     self.sym_spell = spell_checker
     spell_checker.load_dictionary(
         '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
         term_index=0,
         count_index=1,
         separator="\t",
     )
     super().__init__()
Ejemplo n.º 4
0
 def __init__(self):
     """
     Initialize necessary resources.

     Reads ``data/ml_rootwords.txt`` (located next to this module) into a
     ``marisa_trie.Trie`` and creates the stemmer, inflector, soundex,
     syllabifier and n-gram helpers.
     """
     dictionary_path = os.path.join(
         os.path.dirname(__file__), 'data/ml_rootwords.txt')
     # The context manager closes the file even when reading fails.
     with open(dictionary_path) as dictionary_file:
         lines = dictionary_file.readlines()
     # Still exposed (already closed) because the attribute existed before.
     self.dictionary_file = dictionary_file
     try:
         # If the lines support ``decode`` they are decoded as UTF-8 first.
         self.dictionary = marisa_trie.Trie(
             [x.strip().decode('utf-8') for x in lines])
     except (AttributeError, UnicodeError):
         # Otherwise the stripped lines are used as they are.
         self.dictionary = marisa_trie.Trie(
             [x.strip() for x in lines])
     self.stemmer = Stemmer()
     self.inflector = inflector.Inflector(lang='ml')
     self.soundex = Soundex()
     self.syllabalizer = Syllabifier()
     self.ngrammer = Ngram()
Ejemplo n.º 5
0
def start_lsh():
    """
    Build (or load) the MinHash LSH index over distinct soundex values.

    Selects the distinct ``soundex`` values of ``dev_table`` rows whose
    ``count`` exceeds 5, hashes the characters of each value into a
    ``MinHash`` and inserts it into a ``MinHashLSH`` index keyed by the row
    position. The index is then pickled to disk. When ``create`` is switched
    off, the data and the index are loaded from their pickle files instead.

    :return: tuple ``(cur, data, lsh, hin_soundex_inst, hin_trans)``
    """
    create = True
    sqlite_file = "/home/hkesavam/new_lid/code/gen_data/db_lid"
    lsh_model_file = "/home/hkesavam/new_lid/code/gen_data/lsh_model"

    # The connection is left open on purpose: the cursor is returned.
    conn = sqlite3.connect(sqlite_file)
    cur = conn.cursor()
    # Each row is a 1-tuple holding one soundex value.
    data = list(cur.execute(
        "select distinct(soundex) from dev_table where count > 5"))

    lsh = MinHashLSH(threshold=0.5, num_perm=32)
    if create:
        for key, row in enumerate(data):
            minhash = MinHash(num_perm=32)
            for char in row[0]:
                try:
                    minhash.update(char.encode("utf-8"))
                except Exception:
                    # Best effort: skip characters that cannot be hashed.
                    print("Enterin continue")
                    continue
            lsh.insert(key, minhash)
        print(len(data))
        print("Dumping")
        with open(lsh_model_file, "wb") as model_file:
            cPickle.dump(lsh, model_file, -1)
        print("Finished dumping")
    else:
        # NOTE(review): unpickling executes arbitrary code; only load
        # files this process (or a trusted one) has written.
        with open("/home/hkesavam/new_lid/code/gen_data/data",
                  "rb") as data_file:
            data = cPickle.load(data_file)
        with open(lsh_model_file, "rb") as model_file:
            lsh = cPickle.load(model_file)

    hin_soundex_inst = Soundex()
    hin_trans = Transliterator(source='eng', target='hin', build_lookup=True)
    return cur, data, lsh, hin_soundex_inst, hin_trans
Ejemplo n.º 6
0
class SoundexTest(unittest.TestCase):
    """Checks the codes and comparison results produced by Soundex."""

    def setUp(self):
        super(SoundexTest, self).setUp()
        self.instance = Soundex()

    def test_soundex(self):
        '''TEST: Soundex calculation'''
        # (input word, expected soundex code)
        expected_codes = (
            ('vasudev', 'v231'),
            ('Rupert', 'R163'),
            (u'ಬೆಂಗಳೂರು', u'ಬDNFQCPC'),
            (u'बॆंगळूरु', u'बDNFQCPC'),
            (u'आम्र् फल्', u'आNPMQ000'),
        )
        for word, code in expected_codes:
            self.assertEqual(self.instance.soundex(word), code)

    def test_compare(self):
        '''TEST: Soundex Comparison'''
        # (first word, second word, expected comparison result)
        expected_results = (
            ('Bangalore', u'ಬೆಂಗಳೂರು', -1),
            (u'ಬೆಂಗಳೂರು', u'बॆंगळूरु', 2),
            (u'बॆंगळूरु', u'बॆंगळूरु', 0),
            (u'बॆंगळूरु', u'आम्र् फल्', -1),
        )
        for first, second, result in expected_results:
            self.assertEqual(self.instance.compare(first, second), result)
Ejemplo n.º 7
0
class BaseMalayalam:
    """
    Malayalam Spell Checker class.
    """

    Suggestion = namedtuple('Suggestion', 'word sound lev jac weight tag_list')

    def __init__(self):
        """
        Initialize necessary resources.

        Reads ``data/ml_rootwords.txt`` (located next to this module) into a
        ``marisa_trie.Trie`` and creates the stemmer, inflector, soundex,
        syllabifier and n-gram helpers.
        """
        dictionary_path = os.path.join(
            os.path.dirname(__file__), 'data/ml_rootwords.txt')
        # The context manager closes the file even when reading fails.
        with open(dictionary_path) as dictionary_file:
            lines = dictionary_file.readlines()
        # Still exposed (already closed) because the attribute existed before.
        self.dictionary_file = dictionary_file
        try:
            # If the lines support ``decode`` they are decoded as UTF-8 first.
            self.dictionary = marisa_trie.Trie(
                [x.strip().decode('utf-8') for x in lines])
        except (AttributeError, UnicodeError):
            # Otherwise the stripped lines are used as they are.
            self.dictionary = marisa_trie.Trie(
                [x.strip() for x in lines])
        self.stemmer = Stemmer()
        self.inflector = inflector.Inflector(lang='ml')
        self.soundex = Soundex()
        self.syllabalizer = Syllabifier()
        self.ngrammer = Ngram()

    def check(self, word):
        """
        Returns if a word is spelled correctly or not.

        A word counts as correct when its stem is found in the dictionary.
        """
        root_word = self.stemmer.stem(word)[word]['stem']
        return root_word in self.dictionary

    def get_best_intermediate(self, word, input_word, intermediate_words,
                              original_tag_list):
        """
        Return the best intermediate form from those generated during stemming.
        Best intermediate term is the one for which maximum similarity is
        found. It is used to handle incorrect words getting unnecessarily
        stemmed as they are not present in dictionary.

        Returns a tuple ``(word_tags_map, highest_weight, selected_word)``.
        ``word_tags_map`` maps every intermediate word to the leading slice
        of ``original_tag_list`` matching its position. When there are no
        intermediate words the weight is 0 and ``input_word`` is selected.
        """
        weights = []
        word_tags_map = {}
        selected_word = input_word
        highest_weight = 0
        for intr_counter, intermediate_word in enumerate(intermediate_words):
            _, _, _, weight_tmp = self.compare(intermediate_word, word)
            weights.append(weight_tmp)
            word_tags_map[intermediate_word] = original_tag_list[:intr_counter]
        if weights:
            highest_weight = max(weights)
            # index() picks the first intermediate word with that weight.
            selected_word = intermediate_words[weights.index(highest_weight)]
        return word_tags_map, highest_weight, selected_word

    def get_unique(self, list_of_items):
        """
        Return the items of ``list_of_items`` without duplicates, keeping
        the order of first appearance.
        """
        result = []
        for item in list_of_items:
            if item not in result:
                result.append(item)
        return result

    def suggest(self, input_word, n=5):
        """
        Returns n suggestions that is similar to word.

        Candidates are the dictionary keys starting with the first character
        of the stemmed word or with its neighbours in ``_characters``. Only
        candidates whose weight exceeds 50 are kept; the ``n`` heaviest are
        inflected back and returned without duplicates.
        """
        stemmer_result = self.stemmer.stem(input_word)[input_word]
        input_word = stemmer_result['stem']
        original_tag_list = stemmer_result['inflection']
        first_char = input_word[0]
        if first_char == _characters[0]:
            prev_char = first_char
        else:
            prev_char = _characters[_characters.index(first_char) - 1]
        if first_char == _characters[-1]:
            next_char = first_char
        else:
            next_char = _characters[_characters.index(first_char) + 1]
        # At either end of the alphabet a neighbour equals first_char;
        # de-duplicate the prefixes so no candidate is scored twice.
        possible_words = []
        for prefix in self.get_unique([first_char, next_char, prev_char]):
            possible_words += self.dictionary.keys(prefix)
        # Intermediate forms: the stem re-inflected with growing tag tails.
        intermediate_words = [input_word]
        for tag_counter in range(len(original_tag_list)):
            new_word = self.inflector.inflect(
                input_word, original_tag_list[-tag_counter - 1:])
            intermediate_words.insert(0, new_word)
        final = []
        for word in possible_words:
            lev, sound, jac, weight1 = self.compare(input_word, word)
            word_tags_map, highest_weight, selected_word = \
                self.get_best_intermediate(
                    word, input_word, intermediate_words, original_tag_list)
            tag_list = original_tag_list
            if highest_weight >= weight1 and selected_word != input_word:
                tag_list = word_tags_map[selected_word]
            weight = max(weight1, highest_weight)
            if weight > 50:
                # Use the class's own namedtuple instead of going through
                # another class name.
                final.append(self.Suggestion(word, sound, lev, jac,
                                             weight, tag_list))
        sorted_list = sorted(final, key=attrgetter('weight'), reverse=True)[:n]
        final_list = []
        for item in sorted_list:
            try:
                final_list.append(
                    self.inflector.inflect(item.word, item.tag_list))
            except Exception:
                # Best effort: fall back to the uninflected word.
                final_list.append(item.word)
        return self.get_unique(final_list)

    def levenshtein_distance(self, tokens1, tokens2):
        """
        Takes two lists containing tokens of one word each and returns the
        levenshtein distance between them.
        """
        if len(tokens1) < len(tokens2):
            return self.levenshtein_distance(tokens2, tokens1)

        if len(tokens2) == 0:
            return len(tokens1)

        previous_row = list(range(len(tokens2) + 1))
        for i, c1 in enumerate(tokens1):
            current_row = [i + 1]
            for j, c2 in enumerate(tokens2):
                # j + 1 instead of j since previous_row and current_row are
                # one element longer than tokens2.
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def compare(self, word1, word2):
        """
        Returns the similarity measure between two words.

        The result is the tuple ``(levenshtein_distance, soundex_comparison,
        jaccards, weight)``. The distance is computed over syllables, the
        Jaccard index over 1-letter n-grams (0.0 when neither word yields
        any n-gram). ``weight`` is 100 for a soundex result of 0 or 1,
        otherwise it is derived from the distance and the Jaccard index, or
        0 when the words are too different.
        """
        soundex_comparison = self.soundex.compare(word1, word2)
        tokens1 = self.syllabalizer.syllabify_ml(word1)
        tokens2 = self.syllabalizer.syllabify_ml(word2)
        levenshtein_distance = self.levenshtein_distance(tokens1, tokens2)
        ngram1 = self.ngrammer.letterNgram(word1, 1)
        ngram2 = self.ngrammer.letterNgram(word2, 1)
        union = self.get_unique(ngram1 + ngram2)
        intersection = self.get_unique([x for x in ngram1 if x in ngram2])
        if union:
            jaccards = float(len(intersection)) / float(len(union))
        else:
            # No n-grams at all (e.g. two empty words): avoid dividing by 0.
            jaccards = 0.0
        if soundex_comparison == 1 or soundex_comparison == 0:
            weight = 100
        elif levenshtein_distance <= 2 and jaccards > 0.5:
            weight = 75 + (1.5 * jaccards)
        elif levenshtein_distance < 5 and jaccards > 0.5:
            weight = 65 + (3 * jaccards)
        else:
            weight = 0
        return levenshtein_distance, soundex_comparison, jaccards, weight

    def check_and_generate(self, word):
        """
        Receives a word as input, checks if it is a valid word and returns
        the suggestions if it is not.
        Returns 0 along with suggestions if an incorrect word.
        Returns 1 along with blank list of suggestions if word in dictionary.
        Returns 2 along with blank list of suggestions if word is unique.
        """
        if self.check(word):
            return {'status': 1, 'suggestions': []}
        suggestions = self.suggest(word)
        if suggestions:
            return {'status': 0, 'suggestions': suggestions}
        # If there were no suggestions, it means the word was not similar to
        # any of the existing root words. So, that was not a mistake, but an
        # intended insertion. Hence, it is deemed as a valid word.
        return {'status': 2, 'suggestions': []}
Ejemplo n.º 8
0
#    even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
#    GNU General Public License for more details.                                                             #
#                                                                                                             #
#    You should have received a copy of the GNU General Public License along with this program.               #
#    If not, see <http://www.gnu.org/licenses/>.                                                              #
#                                                                                                             #
###############################################################################################################

from mutagen.id3 import ID3
from mutagen.mp3 import MP3
from tinytag import TinyTag
from libindic.soundex import Soundex

import src.utils.duplicateUtils as duplicateUtils

# Module-level Soundex instance, created once at import time.
# NOTE(review): its users are not visible in this excerpt - confirm.
phonetic = Soundex()

####################################################################################### checktags #############
def checkTags(musicFile, songFile, logger):
    """  Used to check if the Soundex algorithm has returned a false positive.
         Returns True if the artist and title of the two songs are the same.
         Returns False if there is an error.
    """
    try:  # Tries to read tags from the music file.
        tags = TinyTag.get(musicFile)
    except Exception as e:  # Can't read tags - log as error.
        logger.error(f"ERROR : Can't read tags : {musicFile}")
        return False
    artist1 = duplicateUtils.removeThe(tags.artist)
    title1 = duplicateUtils.removeThe(tags.title)
Ejemplo n.º 9
0
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract per-review features into a numpy record array.

    ``transform`` takes a sequence of strings and fills the fields
    `review`, `emojis`, `emoji_sentiment`, `lang_tag`, `len_range` and
    `soundexes`.
    """
    def __init__(self, lang='ta'):
        """Create the helpers used by ``transform``.

        :param lang: language code stored on the instance and handed to
            ``BaseNormalizer`` (default ``'ta'``).
        """
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module.
        # Create the file alltextslang.txt by calling detect_lang_and_store
        # in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng',
                                       target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng',
                                       target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0,
            count_index=1,
            separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        """Read a tab-separated ``text<TAB>lang<TAB>confidence`` file.

        :param mapfile: path of the map file.
        :return: dict mapping each text to ``(lang, confidence)`` with the
            confidence converted to float.
        :raises ValueError: if a non-blank line does not hold exactly three
            fields or the confidence is not a number.
        """
        lmap = {}
        # Read as UTF-8 explicitly instead of the platform default encoding
        # (the map presumably holds non-ASCII review text).
        with open(mapfile, 'r', encoding='utf-8') as mapf:
            for line in mapf:
                line = line.rstrip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                text, lang, conf = line.split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        """Return ``(lang, confidence)`` for ``text``, or
        ``('unknown', 0.0)`` when the text is not in the language map."""
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, reviews):
        """Build one feature record per review.

        :param reviews: sequence of review strings.
        :return: ``np.recarray`` of length ``len(reviews)`` with the object
            fields `review`, `emojis`, `emoji_sentiment`, `lang_tag`,
            `len_range` and `soundexes`.
        """
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)

            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment

            lang, conf = self.get_language_tag(review.strip())
            if lang in (self.lang, self.lang + 'en'):
                # the language map agrees with the configured language
                agreement = 1
            elif conf < 0.5:
                # the map names another language, but with low confidence
                agreement = 0.5
            else:
                # the map clearly names another language
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                for word in review_trans.split():
                    suggestions = self.sym_spell.lookup(word,
                                                        Verbosity.CLOSEST,
                                                        max_edit_distance=2,
                                                        include_unknown=True)
                    if suggestions and suggestions[0].distance < 3:
                        # The closest suggestion is only printed, not used.
                        # no match with dictionary, we need a more
                        # comprehensive dictionary plus phonetic similarity
                        print(word, suggestions[0].term)
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            # TODO: introduce spell correct here for added normalisation
            features['soundexes'][i] = ' '.join(
                self.soundexer.soundex(word) for word in review_trans.split())
        return features
# Load the test pairs: each line is split once on a tab into a
# (right, wrong) tuple after removing leading whitespace and newlines.
if args.test:
    test = []
    with open(args.test, 'r', encoding="utf8") as f:
        for i, line in enumerate(f):
            (right, wrong) = re.sub('\n', '', line.lstrip()).split('\t', maxsplit=1)
            test.append((right, wrong))

# These options are recognised here but trigger no action.
if args.distance:
    pass
if args.mode:
    pass
if args.verbose:
    pass
#---------------------------global variables
NB_CORR = 5
INSTANCE = Soundex()
# Word frequencies of the vocabulary and the set of its distinct words.
WORDS = Counter(vocab)
WORDS_SET = set(w for w in WORDS)
# Overrides whatever distance option was supplied.
args.distance = "dlv"


# Index: soundex code -> list of vocabulary words having that code.
WORDS_SOUND = {}
for word, value in WORDS.items():
    code = INSTANCE.soundex(word)
    WORDS_SOUND.setdefault(code, []).append(word)


if input_stream:
    INPUT_DATA = data
if args.train:
    # First element of each training pair, limited to the first 100 pairs.
    TRAIN_DATA = [x for x, y in train][0:100]
	#X = [list(x) for x, w in zip(X, y) if len(x) > 0 and len(w) > 0] # list of lists
	#y = [list(w) for x, w in zip(X,y) if len(x) > 0 and len(w) > 0]

	
	return (X, y, z)
	

# Input files handed to load_data / load_transformer below.
data_file = "Attention_after_encoder0.2dropout.txt"
#data_file = "AttentionDecoder_with_dropout8batches0.2dropout.txt"
data_file2 = "Transformer_output.txt"
# X and y being list of lists, each list contains characters of words
X, y, z = load_data(data_file)
a, b = load_transformer(data_file2)
print(data_file)
instance = Soundex()
# Result lists; the loop that would fill them is disabled in the string
# literal that follows.
lis1 = []
lis2 = []
lis3 = []
'''
for i,j,k in zip(X,y,z):
	lis1.append(instance.compare(i,j))
	lis2.append(instance.compare(i,k))
	lis3.append(instance.compare(j,k))
'''


transformer = []
for i in a:
	for j,k in zip(X,y):
		if i == k:
Ejemplo n.º 12
0
    print(
        'Please install indic-trans from git: https://github.com/libindic/indic-trans'
    )

# Transliterators from source 'eng' to the targets 'tam' and 'mal'.
ta_trans = Transliterator(source='eng', target='tam', build_lookup=True)
ml_trans = Transliterator(source='eng', target='mal', build_lookup=True)
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each review.
MAX_SEQUENCE_LENGTH = 150
# This is fixed.
EMBEDDING_DIM = 100
# Raw string: the filter set contains a literal backslash, and '\]' in a
# normal string literal is an invalid escape sequence. The value is unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                      filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True)
soundexer = Soundex()


def load_language_maps(mapfile):
    """Read a tab-separated ``text<TAB>lang<TAB>confidence`` file.

    :param mapfile: path of the map file.
    :return: dict mapping each text to ``(lang, confidence)`` with the
        confidence converted to float.
    :raises ValueError: if a non-blank line does not hold exactly three
        fields or the confidence is not a number.
    """
    lmap = {}
    # Read as UTF-8 explicitly instead of the platform default encoding.
    with open(mapfile, 'r', encoding='utf-8') as mapf:
        for line in mapf:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            text, lang, conf = line.split('\t')
            lmap[text] = (lang, float(conf))
    return lmap


def get_language_tag(text):
    """Look up ``text`` in the module-level ``lmap``.

    Returns the stored value, or ``('unknown', 0.0)`` when the text has no
    entry.
    """
    fallback = ('unknown', 0.0)
    return lmap.get(text, fallback)

Ejemplo n.º 13
0
class BaseMalayalam:
    """
    Malayalam Spell Checker class.
    """

    Suggestion = namedtuple('Suggestion', 'word sound lev jac weight tag_list')

    def __init__(self):
        """
        Initialize necessary resources.

        Reads ``data/ml_rootwords.txt`` (located next to this module) into a
        ``marisa_trie.Trie`` and creates the stemmer, inflector, soundex,
        syllabifier and n-gram helpers.
        """
        dictionary_path = os.path.join(
            os.path.dirname(__file__), 'data/ml_rootwords.txt')
        # The context manager closes the file even when reading fails.
        with open(dictionary_path) as dictionary_file:
            lines = dictionary_file.readlines()
        # Still exposed (already closed) because the attribute existed before.
        self.dictionary_file = dictionary_file
        try:
            # If the lines support ``decode`` they are decoded as UTF-8 first.
            self.dictionary = marisa_trie.Trie(
                [x.strip().decode('utf-8') for x in lines])
        except (AttributeError, UnicodeError):
            # Otherwise the stripped lines are used as they are.
            self.dictionary = marisa_trie.Trie(
                [x.strip() for x in lines])
        self.stemmer = Stemmer()
        self.inflector = inflector.Inflector(lang='ml')
        self.soundex = Soundex()
        self.syllabalizer = Syllabifier()
        self.ngrammer = Ngram()

    def check(self, word):
        """
        Returns if a word is spelled correctly or not.

        A word counts as correct when its stem is found in the dictionary.
        """
        root_word = self.stemmer.stem(word)[word]['stem']
        return root_word in self.dictionary

    def get_best_intermediate(self, word, input_word,
                              intermediate_words, original_tag_list):
        """
        Return the best intermediate form from those generated during stemming.
        Best intermediate term is the one for which maximum similarity is
        found. It is used to handle incorrect words getting unnecessarily
        stemmed as they are not present in dictionary.

        Returns a tuple ``(word_tags_map, highest_weight, selected_word)``.
        ``word_tags_map`` maps every intermediate word to the leading slice
        of ``original_tag_list`` matching its position. When there are no
        intermediate words the weight is 0 and ``input_word`` is selected.
        """
        weights = []
        word_tags_map = {}
        selected_word = input_word
        highest_weight = 0
        for intr_counter, intermediate_word in enumerate(intermediate_words):
            _, _, _, weight_tmp = self.compare(intermediate_word, word)
            weights.append(weight_tmp)
            word_tags_map[intermediate_word] = original_tag_list[:intr_counter]
        if weights:
            highest_weight = max(weights)
            # index() picks the first intermediate word with that weight.
            selected_word = intermediate_words[weights.index(highest_weight)]
        return word_tags_map, highest_weight, selected_word

    def get_unique(self, list_of_items):
        """
        Return the items of ``list_of_items`` without duplicates, keeping
        the order of first appearance.
        """
        result = []
        for item in list_of_items:
            if item not in result:
                result.append(item)
        return result

    def suggest(self, input_word, n=5):
        """
        Returns n suggestions that is similar to word.

        Candidates are the dictionary keys starting with the first character
        of the stemmed word or with its neighbours in ``_characters``. Only
        candidates whose weight exceeds 50 are kept; the ``n`` heaviest are
        inflected back and returned without duplicates.
        """
        stemmer_result = self.stemmer.stem(input_word)[input_word]
        input_word = stemmer_result['stem']
        original_tag_list = stemmer_result['inflection']
        first_char = input_word[0]
        if first_char == _characters[0]:
            prev_char = first_char
        else:
            prev_char = _characters[_characters.index(first_char) - 1]
        if first_char == _characters[-1]:
            next_char = first_char
        else:
            next_char = _characters[_characters.index(first_char) + 1]
        # At either end of the alphabet a neighbour equals first_char;
        # de-duplicate the prefixes so no candidate is scored twice.
        possible_words = []
        for prefix in self.get_unique([first_char, next_char, prev_char]):
            possible_words += self.dictionary.keys(prefix)
        # Intermediate forms: the stem re-inflected with growing tag tails.
        intermediate_words = [input_word]
        for tag_counter in range(len(original_tag_list)):
            new_word = self.inflector.inflect(
                input_word, original_tag_list[-tag_counter - 1:])
            intermediate_words.insert(0, new_word)
        final = []
        for word in possible_words:
            lev, sound, jac, weight1 = self.compare(input_word, word)
            word_tags_map, highest_weight, selected_word = \
                self.get_best_intermediate(
                    word, input_word, intermediate_words, original_tag_list)
            tag_list = original_tag_list
            if highest_weight >= weight1 and selected_word != input_word:
                tag_list = word_tags_map[selected_word]
            weight = max(weight1, highest_weight)
            if weight > 50:
                # Use the class's own namedtuple instead of going through
                # another class name.
                final.append(self.Suggestion(
                    word, sound, lev, jac, weight, tag_list))
        sorted_list = sorted(final, key=attrgetter('weight'), reverse=True)[:n]
        final_list = []
        for item in sorted_list:
            try:
                final_list.append(
                    self.inflector.inflect(item.word, item.tag_list))
            except Exception:
                # Best effort: fall back to the uninflected word.
                final_list.append(item.word)
        return self.get_unique(final_list)

    def levenshtein_distance(self, tokens1, tokens2):
        """
        Takes two lists containing tokens of one word each and returns the
        levenshtein distance between them.
        """
        if len(tokens1) < len(tokens2):
            return self.levenshtein_distance(tokens2, tokens1)

        if len(tokens2) == 0:
            return len(tokens1)

        previous_row = list(range(len(tokens2) + 1))
        for i, c1 in enumerate(tokens1):
            current_row = [i + 1]
            for j, c2 in enumerate(tokens2):
                # j + 1 instead of j since previous_row and current_row are
                # one element longer than tokens2.
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    def compare(self, word1, word2):
        """
        Returns the similarity measure between two words.

        The result is the tuple ``(levenshtein_distance, soundex_comparison,
        jaccards, weight)``. The distance is computed over syllables, the
        Jaccard index over 1-letter n-grams (0.0 when neither word yields
        any n-gram). ``weight`` is 100 for a soundex result of 0 or 1,
        otherwise it is derived from the distance and the Jaccard index, or
        0 when the words are too different.
        """
        soundex_comparison = self.soundex.compare(word1, word2)
        tokens1 = self.syllabalizer.syllabify_ml(word1)
        tokens2 = self.syllabalizer.syllabify_ml(word2)
        levenshtein_distance = self.levenshtein_distance(tokens1, tokens2)
        ngram1 = self.ngrammer.letterNgram(word1, 1)
        ngram2 = self.ngrammer.letterNgram(word2, 1)
        union = self.get_unique(ngram1 + ngram2)
        intersection = self.get_unique([x for x in ngram1 if x in ngram2])
        if union:
            jaccards = float(len(intersection)) / float(len(union))
        else:
            # No n-grams at all (e.g. two empty words): avoid dividing by 0.
            jaccards = 0.0
        if soundex_comparison == 1 or soundex_comparison == 0:
            weight = 100
        elif levenshtein_distance <= 2 and jaccards > 0.5:
            weight = 75 + (1.5 * jaccards)
        elif levenshtein_distance < 5 and jaccards > 0.5:
            weight = 65 + (3 * jaccards)
        else:
            weight = 0
        return levenshtein_distance, soundex_comparison, jaccards, weight

    def check_and_generate(self, word):
        """
        Receives a word as input, checks if it is a valid word and returns
        the suggestions if it is not.
        Returns 0 along with suggestions if an incorrect word.
        Returns 1 along with blank list of suggestions if word in dictionary.
        Returns 2 along with blank list of suggestions if word is unique.
        """
        if self.check(word):
            return {'status': 1, 'suggestions': []}
        suggestions = self.suggest(word)
        if suggestions:
            return {'status': 0, 'suggestions': suggestions}
        # If there were no suggestions, it means the word was not similar to
        # any of the existing root words. So, that was not a mistake, but an
        # intended insertion. Hence, it is deemed as a valid word.
        return {'status': 2, 'suggestions': []}
Ejemplo n.º 14
0
 def __init__(self):
     """Create the Soundex instance stored as ``self.sx``."""
     self.sx = Soundex()
Ejemplo n.º 15
0
class InexactSearch(object):
    """
       This class provides methods for fuzzy searching using word
       distance as well as phonetics.
    """

    def __init__(self):
        """Create the Soundex instance used by :meth:`compare`."""
        self.sx = Soundex()

    def _countCommon(self, shrtBigr, lngBigr, average):
        """Score the bigrams of ``shrtBigr`` that also occur in ``lngBigr``.

        A bigram found at the same index in both lists scores 1.0. A
        displaced one scores 1.0 minus its index distance divided by
        ``average``; the index used is that of its first occurrence in
        ``lngBigr``.
        """
        common = 0.0
        for indexShrt, bigr in enumerate(shrtBigr):
            if bigr in lngBigr:
                dislocation = abs(lngBigr.index(bigr) - indexShrt) / average
                common += 1.0 - dislocation

        return common

    def _createBigram(self, string):
        """Return the list of two-character substrings of ``string``."""
        return [string[i - 1:i + 1] for i in range(1, len(string))]

    def bigram_average(self, str1, str2):
        """Return approximate string comparator measure (between 0.0 and 1.0)
        using bigrams.

        :param str1: string 1 for comparison
        :str1 type : str
        :param str2: string 2 for comparison
        :str2 type : str
        :returns: score between 0.0 and 1.0 (the int 1 for equal strings)

        >>> score = bigram_average(str1, str2)
        0.7


        Bigrams are two-character sub-strings contained in a
        string. For example, 'peter' contains the bigrams:
        pe,et,te,er.

        This routine counts the number of common bigrams and divides
        by the average number of bigrams. The resulting number is
        returned. Two different strings that are both too short to
        contain a bigram score 0.0.
        """

        if str1 == str2:
            return 1

        bigr1 = self._createBigram(str1)
        bigr2 = self._createBigram(str2)

        average = (len(bigr1) + len(bigr2)) / 2.0
        if average == 0:
            # Neither string has a bigram: nothing in common, and dividing
            # by the average would raise ZeroDivisionError.
            return 0.0

        if len(bigr1) < len(bigr2):  # Count using the shorter bigram list
            common = self._countCommon(bigr1, bigr2, average)
        else:
            common = self._countCommon(bigr2, bigr1, average)

        return common / average

    def compare(self, string1, string2):
        ''' Compare strings using soundex; if that gives no match, fall
        back to the bigram average.

        :param string1: string 1 for comparison.
        :type string1: str.
        :param string2: string 2 for comparison
        :type string2: str.
        :returns: score between 0.0 and 1.0: 1.0 for equal strings, 0.9
            for a soundex result of 1, 0.8 for a soundex result of 2,
            otherwise the bigram average.

        '''
        if string1 == string2:
            return 1.0

        soundex_match = self.sx.compare(string1, string2)

        if soundex_match == 1:
            return 0.9

        if soundex_match == 2:
            return 0.8

        return self.bigram_average(string1, string2)

    @servicemethod
    def search(self, text, key):
        '''Searches for the key in the given text. This function uses
        :method: `InexactSearch.compare` for doing approx search.

        :param text: text in which search has to be done.
        :type text: str.
        :param key: key which has to be searched
        :type key: str.
        :returns: A dictionary with words in the string as keys and
        the score against the key as the value
        '''
        key = key.strip()
        search_results = {}
        for word in text.split():
            word = word.strip()
            search_results[word] = self.compare(word, key)

        return search_results
Ejemplo n.º 16
0
 def setUp(self):
     """Run the parent setUp, then create the Soundex instance under test."""
     super(SoundexTest, self).setUp()
     self.instance = Soundex()