class SoundexTest(unittest.TestCase):
    """Unit tests for the indic Soundex implementation."""

    def setUp(self):
        super(SoundexTest, self).setUp()
        # A single shared instance is fine: Soundex keeps no per-call state.
        self.instance = Soundex()

    def test_soundex(self):
        '''TEST: Soundex calculation'''
        # (word, expected code) pairs covering Latin and Indic scripts.
        cases = [
            ('vasudev', 'v231'),
            ('Rupert', 'R163'),
            (u'ಬೆಂಗಳೂರು', u'ಬDNFQCPC'),
            (u'बॆंगळूरु', u'बDNFQCPC'),
            (u'आम्र् फल्', u'आNPMQ000'),
        ]
        for word, expected in cases:
            self.assertEqual(self.instance.soundex(word), expected)

    def test_compare(self):
        '''TEST: Soundex Comparison'''
        # (word1, word2, expected comparison result) triples.
        cases = [
            ('Bangalore', u'ಬೆಂಗಳೂರು', -1),
            (u'ಬೆಂಗಳೂರು', u'बॆंगळूरु', 2),
            (u'बॆंगळूरु', u'बॆंगळूरु', 0),
            (u'बॆंगळूरु', u'आम्र् फल्', -1),
        ]
        for first, second, expected in cases:
            self.assertEqual(self.instance.compare(first, second), expected)
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a numpy record array with
    fields `review`, `emojis`, `emoji_sentiment`, `lang_tag`, `len_range`
    and `soundexes`.
    """

    def __init__(self, lang='ta'):
        # Language code for the reviews being processed ('ta' or 'ml').
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module.
        # Create the file alltextslang.txt by calling detect_lang_and_store
        # in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng', target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng', target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0, count_index=1, separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        """Load `text<TAB>lang<TAB>confidence` lines from *mapfile*.

        Returns a dict mapping each text to a ``(lang, confidence)`` tuple.
        """
        lmap = {}
        # Explicit UTF-8: the map contains Tamil/Malayalam text and must not
        # depend on the platform's default encoding.
        with open(mapfile, 'r', encoding='utf-8') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        """Return the ``(lang, confidence)`` tag for *text*.

        Falls back to ``('unknown', 0.0)`` for texts absent from the map.
        """
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, reviews):
        """Map *reviews* (sequence of str) to a record array of features."""
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)
            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment
            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                # TODO: spell-correct the transliterated words with
                # self.sym_spell (Verbosity.CLOSEST, max_edit_distance=2).
                # The previous per-word lookup only printed candidate
                # corrections to stdout and discarded them — debug leftover,
                # removed. Correction needs a more comprehensive dictionary
                # plus phonetic similarity before it can be applied.
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            features['soundexes'][i] = ' '.join(
                self.soundexer.soundex(word)
                for word in review_trans.split())
        return features
pass if args.mode: pass if args.verbose: pass #---------------------------variables globales NB_CORR = 5 INSTANCE = Soundex() WORDS = Counter(vocab) WORDS_SET = set(w for w in WORDS) args.distance = "dlv" WORDS_SOUND = {} for word, value in WORDS.items(): code = INSTANCE.soundex(word) WORDS_SOUND.setdefault(code, []).append(word) if input_stream: INPUT_DATA = data if args.train: TRAIN_DATA = [x for x, y in train][0:100] if args.test: TEST_DATA = test #-------------------------- Spelling Corrector def measure_distance(word1, word2, distance_type): if distance_type == 'lv': distance = Levenshtein.eval(word1, word2)