class SoundexTest(unittest.TestCase):
    """Pins the codes and comparison results ``Soundex`` gives for sample words."""

    def setUp(self):
        """Create the ``Soundex`` instance the test methods call."""
        super(SoundexTest, self).setUp()
        self.instance = Soundex()

    def test_soundex(self):
        """TEST: Soundex calculation

        Each pair is (input word, code expected from ``soundex``).
        """
        expected_codes = (
            ('vasudev', 'v231'),
            ('Rupert', 'R163'),
            (u'ಬೆಂಗಳೂರು', u'ಬDNFQCPC'),
            (u'बॆंगळूरु', u'बDNFQCPC'),
            (u'आम्र् फल्', u'आNPMQ000'),
        )
        for word, code in expected_codes:
            self.assertEqual(self.instance.soundex(word), code)

    def test_compare(self):
        """TEST: Soundex Comparison

        Each triple is (first word, second word, result expected from
        ``compare``).
        """
        expected_results = (
            ('Bangalore', u'ಬೆಂಗಳೂರು', -1),
            (u'ಬೆಂಗಳೂರು', u'बॆंगळूरु', 2),
            # The same string on both sides is expected to compare as 0.
            (u'बॆंगळूरु', u'बॆंगळूरु', 0),
            (u'बॆंगळूरु', u'आम्र् फल्', -1),
        )
        for first, second, outcome in expected_results:
            self.assertEqual(self.instance.compare(first, second), outcome)
Example #2
0
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Turn raw review strings into a record array of per-review features.

    ``transform`` takes a sized sequence of strings and returns a NumPy
    record array with one row per review and these object-typed fields:

    * ``review`` -- the text returned by ``self.normalizer.normalize``.
    * ``emojis`` -- the emojis found in the review, joined with spaces.
    * ``emoji_sentiment`` -- the sentiment value returned by
      ``get_emojis_from_text``.
    * ``lang_tag`` -- ``{'lang': ..., 'agreement': ...}`` derived from the
      pre-computed language map.
    * ``len_range`` -- the value returned by ``get_doc_len_range``.
    * ``soundexes`` -- Soundex codes of the (possibly transliterated) words,
      joined with spaces.
    """

    def __init__(self, lang='ta'):
        """Build the helpers used by ``transform``.

        :param lang: language code the reviews are expected to be in.  The
            values ``'ta'`` and ``'ml'`` select the matching transliterator
            in ``transform``; any other value leaves the text as-is.
        """
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module.
        # Create the file alltextslang.txt by calling detect_lang_and_store
        # in feature_utils.py.
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng',
                                       target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng',
                                       target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        # NOTE(review): this path is relative to the current working
        # directory, unlike the language map path above.
        dictionary_path = '../../src/extern/data/etymdict.csv.vocab.tsv.gz'
        loaded = self.sym_spell.load_dictionary(dictionary_path,
                                                term_index=0,
                                                count_index=1,
                                                separator="\t")
        if not loaded:
            # A missing dictionary used to go unnoticed; surface it without
            # turning the previous best-effort behaviour into a crash.
            import warnings
            warnings.warn(
                'SymSpell dictionary could not be loaded from %r'
                % dictionary_path,
                RuntimeWarning,
                stacklevel=2)
        super().__init__()

    def load_language_maps(self, mapfile):
        """Read the tab-separated language map into a dict.

        Every non-blank line must hold ``text<TAB>lang<TAB>confidence``.

        :param mapfile: path of the map file.
        :returns: dict mapping ``text`` to ``(lang, confidence)`` where the
            confidence has been converted to ``float``.
        :raises ValueError: if a non-blank line does not split into exactly
            three columns or its confidence is not a number.
        """
        lmap = {}
        # Keys are review texts, which can carry emojis and other non-ASCII
        # characters, so decode explicitly instead of relying on the
        # platform's default encoding.
        with open(mapfile, 'r', encoding='utf-8') as mapf:
            for line in mapf:
                line = line.rstrip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                text, lang, conf = line.split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        """Return ``(lang, confidence)`` for *text*.

        Falls back to ``('unknown', 0.0)`` when *text* is not in the map.
        """
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def _language_agreement(self, lang, conf):
        """Score how well the mapped tag agrees with ``self.lang``."""
        if lang in (self.lang, self.lang + 'en'):
            # google agrees with the expected language
            return 1
        if conf < 0.5:
            # google names another language, but only weakly
            return 0.5
        # google clearly names another language
        return 0

    def _transliterate(self, review):
        """Return *review* transliterated for ``self.lang`` where supported."""
        if self.lang == 'ta':
            review_trans = self.ta_trans.transform(review)
            for word in review_trans.split():
                suggestions = self.sym_spell.lookup(word,
                                                    Verbosity.CLOSEST,
                                                    max_edit_distance=2,
                                                    include_unknown=True)
                # NOTE(review): with include_unknown=True the lookup
                # presumably echoes unmatched words at a distance above
                # max_edit_distance, which `< 3` filters out -- confirm
                # against the symspellpy documentation.
                if suggestions and suggestions[0].distance < 3:
                    # The suggestion is only reported, not applied.
                    print(word, suggestions[0].term)
                # Words without a match need a more comprehensive
                # dictionary plus phonetic similarity.
            return review_trans
        if self.lang == 'ml':
            return self.ml_trans.transform(review)
        return review

    def transform(self, reviews):
        """Extract the feature record for every review.

        :param reviews: sized sequence of review strings.
        :returns: ``np.recarray`` of length ``len(reviews)`` with the fields
            described in the class docstring.
        """
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)

            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment

            # The map is looked up with the whitespace-stripped raw review.
            lang, conf = self.get_language_tag(review.strip())
            features['lang_tag'][i] = {
                'lang': lang,
                'agreement': self._language_agreement(lang, conf),
            }
            features['len_range'][i] = get_doc_len_range(review)

            review_trans = self._transliterate(review)
            # TODO: introduce spell correct here for added normalisation
            features['soundexes'][i] = ' '.join([
                self.soundexer.soundex(word) for word in review_trans.split()
            ])
        return features
if args.mode:
    pass
if args.verbose:
    pass
# --------------------------- global variables
NB_CORR = 5
INSTANCE = Soundex()
WORDS = Counter(vocab)
WORDS_SET = set(WORDS)
args.distance = "dlv"


# Index the vocabulary by Soundex code: each code maps to the list of words
# that produce it.
WORDS_SOUND = {}
for word, value in WORDS.items():
    code = INSTANCE.soundex(word)
    if code not in WORDS_SOUND:
        WORDS_SOUND[code] = []
    WORDS_SOUND[code].append(word)


if input_stream:
    INPUT_DATA = data
if args.train:
    # Keep the first element of every training pair, capped at 100 entries.
    TRAIN_DATA = [sample for sample, _ in train][:100]
if args.test:
    TEST_DATA = test
#-------------------------- Spelling Corrector


def measure_distance(word1, word2, distance_type):
    if distance_type == 'lv':
        distance = Levenshtein.eval(word1, word2)