Beispiel #1
0
    def _scored_analyses(self, word_dd, prediction):
        """Rank the analyzer's candidates for a word against a prediction.

        Args:
            word_dd: The word form handed to ``self._analyzer.analyze``.
            prediction: The BERT prediction that each candidate analysis is
                scored against by ``self._scorer``.

        Returns:
            A list of at most ``self._top`` ``ScoredAnalysis`` items, best
            first. Scores are divided by the highest raw score; when the
            highest raw score is 0 every item gets score 0 instead. If the
            analyzer yields nothing, the list holds only the prediction
            itself with score 0.
        """
        candidates = self._analyzer.analyze(word_dd)

        if not candidates:
            # Word unknown to the analyzer: fall back to the BERT prediction.
            return [ScoredAnalysis(0, prediction)]

        # Highest score first; equal scores are ordered by their 'diac' value.
        ranked = sorted(
            ((self._scorer(candidate,
                           prediction,
                           self._mle,
                           tie_breaker=self._tie_breaker,
                           features=self._features), candidate)
             for candidate in candidates),
            key=lambda pair: (-pair[0], pair[1]['diac']),
        )

        best = max(raw for raw, _ in ranked)

        if best == 0:
            # Nothing to normalise by, so avoid dividing by zero.
            results = [ScoredAnalysis(0, analysis) for _, analysis in ranked]
            return results[:self._top]

        results = [ScoredAnalysis(raw / best, analysis)
                   for raw, analysis in ranked]
        return results[:self._top]
Beispiel #2
0
    def disambiguate_word(self, sentence, word_ndx, top=1):
        """Disambiguate the word at position ``word_ndx`` of ``sentence``.

        The word is first passed through ``dediac_ar``. If the result is a
        key of ``self._mle``, that stored analysis is returned alone with
        score 1.0. Otherwise every analysis produced by ``self._analyzer``
        is weighted by ``10 ** _get_pos_lex_freq(analysis)`` and the weights
        are divided by the largest one.

        Args:
            sentence: Indexable sequence of words.
            word_ndx: Index into ``sentence`` of the word to disambiguate.
            top: Maximum number of analyses to keep. Any value below 1
                keeps all of them. Defaults to 1.

        Returns:
            A ``DisambiguatedWord`` pairing the original word with its
            ranked ``ScoredAnalysis`` list (empty when the analyzer finds
            no analysis).
        """
        word = sentence[word_ndx]
        lookup_form = dediac_ar(word)

        # Direct hit in the MLE table: no need to consult the analyzer.
        if self._mle is not None and lookup_form in self._mle:
            return DisambiguatedWord(
                word, [ScoredAnalysis(1.0, self._mle[lookup_form])])

        candidates = self._analyzer.analyze(lookup_form)
        if len(candidates) == 0:
            return DisambiguatedWord(word, [])

        weights = [10**_get_pos_lex_freq(candidate)
                   for candidate in candidates]
        heaviest = max(weights)

        ranked = [
            ScoredAnalysis(weight / heaviest, candidate)
            for candidate, weight in zip(candidates, weights)
        ]

        # Three stable passes, least significant key first: the final order
        # is by descending score, then shorter 'bw', then 'diac'.
        ranked.sort(key=lambda item: item.analysis['diac'])
        ranked.sort(key=lambda item: len(item.analysis['bw']))
        ranked.sort(key=lambda item: item.score, reverse=True)

        kept = ranked if top < 1 else ranked[0:top]
        return DisambiguatedWord(word, kept)
Beispiel #3
0
    def _scored_analyses(self, word_dd):
        """Return the best-ranked scored analyses for ``word_dd``.

        When ``word_dd`` is a key of ``self._mle``, every analysis from
        ``self._analyzer`` is scored against the stored MLE analysis with
        ``_score_analysis``. Otherwise each analysis is weighted by
        ``10 ** _get_pos_lex_logprob(analysis)``. In both cases the scores
        are divided by the largest one and ordered by descending score,
        then shorter ``'bw'`` value, then ``'diac'`` value.

        Args:
            word_dd: The word form to look up in ``self._mle`` and to pass
                to ``self._analyzer.analyze``.

        Returns:
            A list of at most ``self._top`` ``ScoredAnalysis`` items, best
            first; an empty list when the analyzer produces no analysis.
        """
        # Both paths need the analyzer output, so fetch it once up front.
        analyses = self._analyzer.analyze(word_dd)

        if len(analyses) == 0:
            return []

        if self._mle is not None and word_dd in self._mle:
            mle_analysis = self._mle[word_dd]

            scored = [(_score_analysis(a, mle_analysis), a) for a in analyses]
            scored.sort(key=lambda s: (-s[0], len(s[1]['bw']), s[1]['diac']))

            max_score = max(s[0] for s in scored)

            if max_score != 0:
                scored_analyses = [ScoredAnalysis(s[0] / max_score, s[1])
                                   for s in scored]
            else:
                # Every candidate scored 0 against the MLE analysis, so
                # there is nothing to normalise by; assign score 0 rather
                # than dividing by zero.
                scored_analyses = [ScoredAnalysis(0, s[1]) for s in scored]

            return scored_analyses[0:self._top]

        probabilities = [10 ** _get_pos_lex_logprob(a) for a in analyses]
        max_prob = max(probabilities)

        scored_analyses = [ScoredAnalysis(p / max_prob, a)
                           for a, p in zip(analyses, probabilities)]
        scored_analyses.sort(key=lambda w: (-w.score,
                                            len(w.analysis['bw']),
                                            w.analysis['diac']))

        return scored_analyses[0:self._top]