def test_terminology_matcher(store0, terminology0): for store in terminology0.stores.all(): for unit in store.units.all(): terminology.get(unit.__class__)(unit).stem() unit = store0.units.first() matcher = terminology_matcher.get(unit.__class__)(unit) assert matcher.text == unit.source_f assert ( matcher.split(matcher.text) == re.split(u"[\W]+", matcher.text)) assert ( matcher.tokens == [t.lower() for t in matcher.split(matcher.text) if (len(t) > 2 and t not in matcher.stopwords)]) assert matcher.stems == set(matcher.stemmer(t) for t in matcher.tokens) assert ( matcher.matches == matcher.similar( matcher.terminology_units.filter( stems__root__in=matcher.stems).distinct())) unit.source_f = "on the cycle home" unit.save() matches = [] matched = [] results = matcher.terminology_units.filter( stems__root__in=matcher.stems).distinct() for result in results: target_pair = ( result.source_f.lower().strip(), result.target_f.lower().strip()) if target_pair in matched: continue similarity = matcher.comparison.similarity(result.source_f) if similarity > matcher.similarity_threshold: matches.append((similarity, result)) matched.append(target_pair) assert ( matcher.similar(results) == sorted(matches, key=lambda x: -x[0])[:matcher.max_matches]) assert ( matcher.matches == matcher.similar(results))
def get_terminology(self): """get terminology suggestions""" results = terminology_matcher.get(self.__class__)(self).matches return [m[1] for m in results]