def idlangs(self, tokens):
    """
    Score tokens against each language's short and long wordlists and
    decide which languages are present.

    @param tokens sequence of tokens to process
    @returns tuple (lid, langspresent, hits, cs) where
        lid is the value returned by self._pick_lang for the hits,
        langspresent is a list with one presence flag per language,
        hits is a list of (short_score, long_score) pairs per language,
        cs is the value returned by cs_langspresent(langspresent)

    The ordering of return vectors will match the order of languages
    given at initialization.
    """
    # Create scores. The builtin int is used as the dtype: the numpy.int
    # alias meant the same thing but has been removed from NumPy.
    short_scores = numpy.zeros(self.nlangs, dtype=int)
    long_scores = numpy.zeros(self.nlangs, dtype=int)

    # Count each language
    for token in tokens:
        for idx in range(self.nlangs):
            if token in self.shorts[idx]:
                short_scores[idx] += 1
            if token in self.longs[idx]:
                long_scores[idx] += 1

    # Decide whether each language is there: more than one short-list hit,
    # or exactly one short-list hit backed by at least one long-list hit
    langspresent = [
        (short > 1 or (short == 1 and long_ > 0))
        for short, long_ in zip(short_scores, long_scores)
    ]

    # Codeswitching verdict
    cs = cs_langspresent(langspresent)

    # Give the number of hits in the wordlists. Materialize the pairs as a
    # list: a bare zip() is a one-shot iterator on Python 3, so handing it
    # to _pick_lang first would leave the caller with an exhausted iterator.
    hits = list(zip(short_scores, long_scores))
    lid = self._pick_lang(hits)

    return (lid, langspresent, hits, cs)
def idlangs(self, tokens):
    """
    Identify the languages present in a sequence of tokens using
    per-token ratios.

    @param tokens: tokens to identify
    @returns: tuple (lid, langspresent, hits, ratios, langs, unk_rate, cs)
    """
    # Look up a ratio for every token, falling back to the unknown-word ratio
    ratios = []
    for token in tokens:
        ratios.append(self._ratios.get(token, RatioListLID.UNK_WORD_RATIO))

    # Turn each ratio into a language; tokens flagged by non_lid get None
    langs = []
    for ratio, token in zip(ratios, tokens):
        langs.append(None if non_lid(token) else self._ratio_lang(ratio))

    # Tally hits per entry of self.langs. The final entry is split off as the
    # unknown count (assumes self.langs ends with the unknown-language marker).
    hits = list(map(langs.count, self.langs))
    known_counts = hits[:-1]
    unknown_count = hits[-1]
    total = sum(hits)
    if total:
        unk_rate = unknown_count / total
    else:
        unk_rate = 1.0

    # A language is present when its hit count reaches its minimum
    langspresent = []
    for count, minimum in zip(known_counts, self.present_mins):
        langspresent.append(count >= minimum)

    # Withdraw presence for the first two languages when the unknown rate
    # exceeds that language's limit
    if langspresent[0]:
        langspresent[0] = unk_rate <= self.lang1_max_unk_rate
    if langspresent[1]:
        langspresent[1] = unk_rate <= self.lang2_max_unk_rate

    # Codeswitching is only considered under the acceptable unknown rate
    if unk_rate <= self.cs_max_unk_rate:
        cs = cs_langspresent(langspresent)
    else:
        cs = False

    # Pick the LID from hit counts, zeroing languages that failed thresholds
    surviving = [
        count if present else 0
        for count, present in zip(known_counts, langspresent)
    ]
    lid = self._pick_lang(surviving)

    return (lid, langspresent, hits, ratios, langs, unk_rate, cs)
def idlangs(self, tokens, lowmethod, unkmethod, tags=None):
    """
    Identify the languages present in a sequence of tokens using
    per-token ratios, resolving each token's language with choose_lang.

    @param tokens: tokens to identify
    @param lowmethod: passed through to choose_lang
    @param unkmethod: passed through to choose_lang and choose_unk_lang
    @param tags: optional Jerboa tags for the tokens
    @returns: tuple (lid, langspresent, hits, ratios, langs, unk_rate, cs)
    """
    # Look up a ratio for every token, falling back to the unknown-word ratio
    ratios = []
    for token in tokens:
        ratios.append(self._ratios.get(token, RatioListLID.UNK_WORD_RATIO))

    # Turn each ratio into a language; tokens flagged by non_lid get None
    langs = []
    for ratio, token in zip(ratios, tokens):
        langs.append(None if non_lid(token) else self._ratio_lang(ratio))

    # Put in dummy tags if none were supplied
    if not tags:
        tags = [JERBOA_NOTAG] * len(tokens)

    # Choose the language for each token
    chosen = []
    for token, tag, lang, ratio in zip(tokens, tags, langs, ratios):
        chosen.append(
            choose_lang(token, lang, self.langs, tag, ratio,
                        lowmethod, unkmethod, False)
        )
    langs = chosen

    # Clean out any remaining unknowns
    if None in langs:
        langs = choose_unk_lang(langs, unkmethod)

    # Tally hits per entry of self.langs. The final entry is split off as the
    # unknown count (assumes self.langs ends with the unknown-language marker).
    hits = list(map(langs.count, self.langs))
    known_counts = hits[:-1]
    unknown_count = hits[-1]
    total = sum(hits)
    if total:
        unk_rate = unknown_count / total
    else:
        unk_rate = 1.0

    # A language is present when its hit count reaches its minimum
    langspresent = []
    for count, minimum in zip(known_counts, self.present_mins):
        langspresent.append(count >= minimum)

    # Withdraw presence for the first two languages when the unknown rate
    # exceeds that language's limit
    if langspresent[0]:
        langspresent[0] = unk_rate <= self.lang1_max_unk_rate
    if langspresent[1]:
        langspresent[1] = unk_rate <= self.lang2_max_unk_rate

    # Codeswitching is only considered under the acceptable unknown rate
    if unk_rate <= self.cs_max_unk_rate:
        cs = cs_langspresent(langspresent)
    else:
        cs = False

    # Pick the LID from hit counts, zeroing languages that failed thresholds
    surviving = [
        count if present else 0
        for count, present in zip(known_counts, langspresent)
    ]
    lid = self._pick_lang(surviving)

    return (lid, langspresent, hits, ratios, langs, unk_rate, cs)