import math
from collections import Counter
from typing import Iterable, List, Optional, Tuple


def add_definition(self, term: str, definition: str) -> 'Dictionary':
    # tokenize (collapse whitespace first so tokenization is stable)
    term_words = list(unicode_tokenize(' '.join(term.strip().split())))
    def_words = list(unicode_tokenize(' '.join(definition.strip().split())))
    term_word_indices = tuple(self._resolve_word_index(token) for token in term_words)
    def_word_indices = tuple(self._resolve_word_index(token) for token in def_words)

    # add definition
    _idx = len(self.terms)
    self.terms.append(term_word_indices)
    self.definitions.append(def_word_indices)

    # add to index (only word tokens, i.e. word_index > 0)
    for word_index in term_word_indices:
        if word_index > 0:
            self._term_indices[word_index].add(_idx)
    for word_index in def_word_indices:
        if word_index > 0:
            self._def_indices[word_index].add(_idx)

    # allow operator chaining
    return self
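# Usage sketch (illustrative only): the `Dictionary()` constructor is not shown in
# this excerpt, so the call below is an assumption; returning `self` above is what
# makes the chaining work.
#
#     d = Dictionary()
#     d.add_definition('apple', 'a round red or green fruit') \
#      .add_definition('pear', 'a sweet fruit that narrows near the stem')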
def lookup_terms(self, text):
    # find candidate sub-dictionaries via the casefolded word index
    dict_indices = set()
    for word in unicode_tokenize(' '.join(text.strip().casefold().split())):
        dict_indices.update(self._casefold_index.get(word, set()))

    # delegate the actual lookup to each candidate dictionary
    out = []
    for dict_index in dict_indices:
        out.extend(self.dictionaries[dict_index].lookup_terms(text))

    # best match first: sort by the score in the last tuple element
    return sorted(out, key=lambda x: x[-1], reverse=True)
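# Dispatch sketch (names inferred from the code above, not confirmed elsewhere in
# this excerpt): `self.dictionaries` holds child Dictionary objects, and
# `self._casefold_index` maps each casefolded word to the indices of the
# dictionaries containing it, so only dictionaries sharing at least one word with
# the query are actually searched, e.g.:
#
#     multi._casefold_index['fruit']  # -> {0, 2}: only dicts 0 and 2 get queried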
def lookup_definitions(self, text):
    # count, per stored definition, how many query words hit it
    matches = Counter()
    for word in unicode_tokenize(' '.join(text.strip().casefold().split())):
        for word_index in self._casefold_indices.get(word, set()):
            matches.update(self._def_indices[word_index])

    # rebuild (term, definition, hit_count) tuples, most hits first;
    # ''.join restores spacing because whitespace tokens are kept in the vocabulary
    out = []
    for match_index, count in matches.most_common():
        out.append((''.join(self.vocabulary[idx] for idx in self.terms[match_index]),
                    ''.join(self.vocabulary[idx] for idx in self.definitions[match_index]),
                    count))
    return out
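# Reverse-dictionary usage sketch (hypothetical data; the entry would have to have
# been loaded via `add_definition` first). Each query word that appears in a
# definition adds 1 to that definition's count, so three hits rank this entry first:
#
#     d.add_definition('apple', 'a round red or green fruit')
#     d.lookup_definitions('round green fruit')
#     # -> [('apple', 'a round red or green fruit', 3)]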
def detect_language(text: str,
                    language_codes: Optional[Iterable[str]] = None,
                    ) -> List[Tuple[str, float]]:
    language_codes = set(check_languages(language_codes, SUPPORTED_LANGUAGES))

    # accumulate per-language log2 probabilities, word by word
    scores = dict()
    cumulative = 0
    for word in unicode_tokenize(text):
        unseen = set(scores.keys())
        min_score = 0
        for lang, prob in detect_word(word):
            if lang in scores:
                scores[lang] += math.log2(prob)
                unseen.remove(lang)
            else:
                # new language: start from the running total so it isn't unfairly ahead
                scores[lang] = cumulative + math.log2(prob)
            min_score = min(min_score, math.log2(prob))

        # penalize languages that produced no estimate for this word
        for lang in unseen:
            scores[lang] += min_score - math.log2(len(language_codes))
        cumulative += min_score - math.log2(len(language_codes))

    if not scores:
        return []

    # convert log2 scores to normalized probabilities (shift by max for stability)
    max_score = max(scores.values())
    probs = Counter()
    total = 0
    for lang, score in scores.items():
        prob = 2 ** (score - max_score)
        total += prob
        probs[lang] = prob
    return [(lang, prob / total) for lang, prob in probs.most_common()]
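# Worked example of the final normalization (hypothetical scores, not library
# output): with scores = {'en': -10.0, 'fr': -12.0}, max_score is -10.0, so the
# weights are 2**0 = 1.0 and 2**-2 = 0.25; total = 1.25, giving
# [('en', 0.8), ('fr', 0.2)]. Subtracting max_score before exponentiating keeps
# 2**(...) from underflowing when the raw log2 scores are large and negative.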
    return out


if __name__ == '__main__':
    with open('translate-reference.txt') as f:
        ref_lines = f.readlines()
    with open('translate-google-offline.txt') as f:
        hyp_lines = f.readlines()

    scores_bow = []
    scores_nmd = []
    scores_sim = []
    for ref_line, hyp_line in zip(ref_lines, hyp_lines):
        ref_tokens = list(unicode_tokenize(ref_line.casefold(),
                                           words_only=True,
                                           merge_apostrophe_word=True))
        hyp_tokens = list(unicode_tokenize(hyp_line.casefold(),
                                           words_only=True,
                                           merge_apostrophe_word=True))
        scores_bow.append(bow_ngram_movers_distance(ref_tokens, hyp_tokens, 4)
                          / max(len(ref_tokens), len(hyp_tokens)))
        scores_sim.append(bow_ngram_movers_distance(ref_tokens, hyp_tokens, 4, invert=True)
                          / max(len(ref_tokens), len(hyp_tokens)))
        scores_nmd.append(ngram_movers_distance(' '.join(ref_tokens),
                                                ' '.join(hyp_tokens),
                                                4,
                                                # assumption: the original call is truncated
                                                # here; normalize=True mirrors the
                                                # max-length normalization used above
                                                normalize=True))