class SoundexTest(unittest.TestCase):
    """Unit tests for the indic Soundex implementation."""

    def setUp(self):
        super(SoundexTest, self).setUp()
        # A single shared instance is fine: Soundex keeps no per-call state.
        self.instance = Soundex()

    def test_soundex(self):
        '''TEST: Soundex calculation'''
        # (word, expected code) pairs covering Latin and Indic scripts.
        cases = [
            ('vasudev', 'v231'),
            ('Rupert', 'R163'),
            (u'ಬೆಂಗಳೂರು', u'ಬDNFQCPC'),
            (u'बॆंगळूरु', u'बDNFQCPC'),
            (u'आम्र् फल्', u'आNPMQ000'),
        ]
        for word, expected in cases:
            self.assertEqual(self.instance.soundex(word), expected)

    def test_compare(self):
        '''TEST: Soundex Comparison'''
        # (word1, word2, expected comparison result) triples.
        cases = [
            ('Bangalore', u'ಬೆಂಗಳೂರು', -1),
            (u'ಬೆಂಗಳೂರು', u'बॆंगळूरु', 2),
            (u'बॆंगळूरु', u'बॆंगळूरु', 0),
            (u'बॆंगळूरु', u'आम्र् फल्', -1),
        ]
        for first, second, expected in cases:
            self.assertEqual(self.instance.compare(first, second), expected)
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract review text, emojis and emoji sentiment.

    Takes a sequence of strings and produces a numpy record array with
    fields `review`, `emojis`, `emoji_sentiment`, `lang_tag`, `len_range`
    and `soundexes`.
    """

    def __init__(self, lang='ta'):
        # Language code for the reviews being processed ('ta' or 'ml').
        self.lang = lang
        self.normalizer = BaseNormalizer(lang)
        # This language map was created using Google's googletrans module.
        # Create the file alltextslang.txt by calling detect_lang_and_store
        # in feature_utils.py
        self.lmap = self.load_language_maps(
            os.path.join(os.path.dirname(sys.path[0]),
                         '../resources/data/alltextslang.txt'))
        self.soundexer = Soundex()
        self.ta_trans = Transliterator(source='eng', target='tam',
                                       build_lookup=True)
        self.ml_trans = Transliterator(source='eng', target='mal',
                                       build_lookup=True)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell.load_dictionary(
            '../../src/extern/data/etymdict.csv.vocab.tsv.gz',
            term_index=0, count_index=1, separator="\t")
        super().__init__()

    def load_language_maps(self, mapfile):
        """Load `text<TAB>lang<TAB>confidence` lines from *mapfile*.

        Returns a dict mapping each text to a ``(lang, confidence)`` tuple.
        """
        lmap = {}
        # Explicit UTF-8: the map contains Tamil/Malayalam text and must not
        # depend on the platform's default encoding.
        with open(mapfile, 'r', encoding='utf-8') as mapf:
            for line in mapf:
                text, lang, conf = line.rstrip().split('\t')
                lmap[text] = (lang, float(conf))
        return lmap

    def get_language_tag(self, text):
        """Return the ``(lang, confidence)`` tag for *text*.

        Falls back to ``('unknown', 0.0)`` for texts absent from the map.
        """
        return self.lmap.get(text, ('unknown', 0.0))

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, reviews):
        """Map *reviews* (sequence of str) to a record array of features."""
        features = np.recarray(
            shape=(len(reviews), ),
            dtype=[
                ('review', object),
                ('emojis', object),
                ('emoji_sentiment', object),
                ('lang_tag', object),
                ('len_range', object),
                ('soundexes', object),
            ],
        )
        for i, review in enumerate(reviews):
            features['review'][i] = self.normalizer.normalize(text=review)
            emojis, sentiment = get_emojis_from_text(review)
            features['emojis'][i] = ' '.join(emojis)
            features['emoji_sentiment'][i] = sentiment
            lang, conf = self.get_language_tag(review.strip())
            if lang == self.lang or lang == (self.lang + 'en'):
                # google agrees with some confidence
                agreement = 1
            elif conf < 0.5:
                # google says not-tamil, but weakly
                agreement = 0.5
            else:
                # google clearly says not-tamil
                agreement = 0
            features['lang_tag'][i] = {'lang': lang, 'agreement': agreement}
            features['len_range'][i] = get_doc_len_range(review)
            if self.lang == 'ta':
                review_trans = self.ta_trans.transform(review)
                # TODO: spell-correct the transliterated words with
                # self.sym_spell (Verbosity.CLOSEST, max_edit_distance=2).
                # The previous per-word lookup only printed candidate
                # corrections to stdout and discarded them — debug leftover,
                # removed. Correction needs a more comprehensive dictionary
                # plus phonetic similarity before it can be applied.
            elif self.lang == 'ml':
                review_trans = self.ml_trans.transform(review)
            else:
                review_trans = review
            features['soundexes'][i] = ' '.join(
                self.soundexer.soundex(word)
                for word in review_trans.split())
        return features
pass if args.mode: pass if args.verbose: pass #---------------------------variables globales NB_CORR = 5 INSTANCE = Soundex() WORDS = Counter(vocab) WORDS_SET = set(w for w in WORDS) args.distance = "dlv" WORDS_SOUND = {} for word, value in WORDS.items(): code = INSTANCE.soundex(word) WORDS_SOUND.setdefault(code, []).append(word) if input_stream: INPUT_DATA = data if args.train: TRAIN_DATA = [x for x, y in train][0:100] if args.test: TEST_DATA = test #-------------------------- Spelling Corrector def measure_distance(word1, word2, distance_type): if distance_type == 'lv': distance = Levenshtein.eval(word1, word2)