# Example 1
class ExtractSentences(jsonql.Transformer):
    """Extract globally-deduplicated sentences from documents and score them.

    For each document the chosen text field is split into sentences; sentences
    already seen (across all documents, via a hash set) are dropped, the rest
    are scored with a KenLM model over SentencePiece tokens, and the survivors
    are returned as "perplexity\tsentence" lines.
    """

    def __init__(
            self,
            sp_model: Path,
            lm_model: Path,
            field: str = "raw_content",
            threshold: float = float("+inf"),
    ):
        """
        Args:
            sp_model: path to the SentencePiece model file.
            lm_model: path to the KenLM model file.
            field: document key holding the raw text.
            threshold: keep only sentences with 0 < perplexity < threshold.
        """
        super().__init__()
        self.sp_model = sp_model
        self.lm_model = lm_model
        self.field = field
        self.threshold = threshold
        # Heavy resources are loaded lazily in _prepare(); until then these
        # stay None, so the annotations must be Optional.
        self.sp: Optional[SentencePieceProcessor] = None
        self.lm: Optional[KenlmModel] = None
        self.splitter: Optional[SentenceSplitter] = None
        self.hashes: Set[int] = set()

    def _prepare(self):
        """Load the tokenizer, sentence splitter and language model."""
        self.sp = SentencePieceProcessor()
        self.sp.load(str(self.sp_model))
        self.splitter = SentenceSplitter("en")
        self.lm = KenlmModel(str(self.lm_model))

    def do(self, document: dict) -> Optional[str]:
        """Return the scored unique sentences of `document`, or None."""
        content: Optional[str] = document.get(self.field)
        if not content:
            return None
        all_sentences = [
            sentence
            for line in content.split("\n") if line
            for sentence in self.splitter.split(text=line)
        ]
        # Deduplicate against every sentence seen so far (across documents).
        unique_sentences = []
        for sentence in all_sentences:
            if not sentence:
                continue
            h = dedup.str_hash(sentence)
            if h in self.hashes:
                continue
            self.hashes.add(h)
            unique_sentences.append(sentence)

        scores = []
        for sentence in unique_sentences:
            normalized = text_normalizer.normalize(sentence)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score = self.lm.score(" ".join(pieces))
            # -1 marks sentences that produced no pieces; the positive-value
            # filter below drops them.
            pp = -1
            if len(pieces):
                pp = perplexity.pp(log_score, len(pieces))
            scores.append(pp)

        res = filter(lambda pp_s: self.threshold > pp_s[0] > 0,
                     zip(scores, unique_sentences))
        return "\n".join(f"{pp}\t{s}" for (pp, s) in res) or None
# Example 2
class KenLMFeatureGenerator(LanguageFeatureGenerator):
    '''
    Provide ngram features by querying a language model via the KenLM python wrapper
    @param model: the filename of the compact language model to be loaded
    @type model: str
    @param language: the language that this model has been trained on
    @type language: str
    '''

    # Every statistic is published twice: once under the engine-specific
    # 'kenlm_' prefix and once under the generic 'lm_' prefix (same values).
    # Keeping a single base list avoids the two hand-maintained copies
    # drifting apart.
    _BASE_FEATURE_NAMES = ['unk_pos_abs_avg',
                           'unk_pos_abs_std',
                           'unk_pos_abs_min',
                           'unk_pos_abs_max',
                           'unk_pos_rel_avg',
                           'unk_pos_rel_std',
                           'unk_pos_rel_min',
                           'unk_pos_rel_max',
                           'unk',
                           'unk_len',
                           'length_avg',
                           'length_std',
                           'length_min',
                           'length_max',
                           'probs_avg',
                           'probs_std',
                           'probs_min',
                           'probs_max',
                           'probs_pos_max',
                           'probs_pos_min',
                           'probs_low',
                           'probs_high',
                           'probs_low_pos_avg',
                           'probs_low_pos_std',
                           'prob',
                           ]

    feature_names = (['kenlm_' + name for name in _BASE_FEATURE_NAMES]
                     + ['lm_' + name for name in _BASE_FEATURE_NAMES])

    def __init__(self, language=None, model=None, bos=True, eos=True, **kwargs):
        '''
        Load the language model.

        @param language: the language the model was trained on
        @param model: filename of the KenLM model to load
        @param bos: score with a begin-of-sentence context
        @param eos: score with an end-of-sentence context
        '''
        self.model = Model(model)
        self.language = language
        self.bos = bos
        self.eos = eos

    def get_features_string(self, string):
        '''
        Compute per-token language-model statistics for one sentence string.

        @return: dict mapping every name in feature_names to its value
        '''
        total_score = self.model.score(string, bos=self.bos, eos=self.eos)
        partial_scores = self.model.full_scores(string, bos=self.bos, eos=self.eos)
        ngram_lengths = []
        probs = []
        unk_count = 0
        unk_pos = []
        unk_tokens = []
        tokens = string.split()
        tokens_iter = iter(tokens)
        for pos, (prob, ngram_length, is_unk) in enumerate(partial_scores):
            try:
                token = next(tokens_iter)
            except StopIteration:
                # The end-of-sentence score has no corresponding token.
                token = ""
            if is_unk:
                unk_count += 1
                unk_pos.append(pos)
                unk_tokens.append(token)

            ngram_lengths.append(ngram_length)
            probs.append(prob)
            # NOTE: the original incremented `pos` by hand here; enumerate()
            # already maintains it, so that statement was dead code.

        # Guard against an empty token list (avoids ZeroDivisionError).
        token_count = len(tokens) or 1
        unk_rel_pos = [float(p) / token_count for p in unk_pos]
        unk_len = sum(len(token) for token in unk_tokens)

        if not unk_pos:
            # Sentinel values keep min/max/average below well-defined.
            unk_pos = [0]
            unk_rel_pos = [0]

        low_positions = self._standout_pos(probs, -1)
        base_features = {
            'unk_pos_abs_avg': average(unk_pos),
            'unk_pos_abs_std': std(unk_pos),
            'unk_pos_abs_min': min(unk_pos),
            'unk_pos_abs_max': max(unk_pos),
            'unk_pos_rel_avg': average(unk_rel_pos),
            'unk_pos_rel_std': std(unk_rel_pos),
            'unk_pos_rel_min': min(unk_rel_pos),
            'unk_pos_rel_max': max(unk_rel_pos),
            'unk': unk_count,
            'unk_len': unk_len,
            'length_avg': average(ngram_lengths),
            'length_std': std(ngram_lengths),
            'length_min': min(ngram_lengths),
            'length_max': max(ngram_lengths),
            'probs_avg': average(probs),
            'probs_std': std(probs),
            'probs_min': min(probs),
            'probs_max': max(probs),
            'probs_pos_max': probs.index(max(probs)),
            'probs_pos_min': probs.index(min(probs)),
            'probs_low': self._standouts(probs, -1),
            'probs_high': self._standouts(probs, +1),
            'probs_low_pos_avg': average(low_positions),
            'probs_low_pos_std': std(low_positions),
            'prob': total_score,
        }
        # Emit every statistic under both prefixes instead of duplicating
        # the whole literal by hand.
        features = {}
        for prefix in ('kenlm_', 'lm_'):
            for name, value in base_features.items():
                features[prefix + name] = value
        return features

    def _standouts(self, vector, sign):
        '''
        Count values lying more than one standard deviation away from the
        mean in the given direction (sign=-1: low outliers, sign=+1: high).
        '''
        std_value = std(vector)
        avg_value = average(vector)
        standout = 0
        for value in vector:
            if value * sign > (avg_value + sign * std_value):
                standout += 1
        return standout

    def _standout_pos(self, vector, sign):
        '''
        Return the 1-based positions of values lying more than one standard
        deviation away from the mean in the given direction.
        '''
        std_value = std(vector)
        avg_value = average(vector)
        standout = []
        for pos, value in enumerate(vector, start=1):
            if value * sign > (avg_value + sign * std_value):
                standout.append(pos)
        return standout
# Example 3
class KenLMScorer(Scorer):
    """Score text segments (spaCy Doc/Span/Token or str) with a KenLM model.

    Segments are tokenized with spaCy, optionally lowercased, joined with
    single spaces, and scored by the KenLM model either as a log10
    probability or as a per-word perplexity.
    """

    name = "kenlm"

    def __init__(self, model=None, path=None, nlp=None, lowercase=True):
        """Build the scorer.

        Args:
            model: an already-loaded KenLMModel instance.
            path: path to a KenLM model file (used when `model` is None).
            nlp: a spaCy pipeline; defaults to `en_core_web_sm`.
            lowercase: lowercase tokens before scoring.

        Raises:
            ValueError: if neither `model` nor `path` is given.
        """
        if model:
            self.model = model
        elif path:
            self.model = KenLMModel(path)
        else:
            # Previously this fell through and crashed later with an opaque
            # AttributeError in _check_model; fail fast with a clear message.
            raise ValueError("KenLMScorer needs either `model` or `path`.")

        self._check_model()

        if nlp:
            self.nlp = nlp
        else:
            import spacy

            self.nlp = spacy.load("en_core_web_sm")

        self.lowercase = lowercase

    def _check_model(self):
        # Sanity check: KenLM log10 scores of real text are negative.
        assert isinstance(self.model, KenLMModel)
        assert self.model.score("testing !") < 0

    def preprocess(self, segment):
        """
        SpaCy tokenize + lowercase. Ignore extra whitespaces.
        - if Doc, Span, Token - retrieve .lower_
        - if string - convert to Doc first

        Raises:
            TypeError: for unsupported segment types (previously this
            surfaced as an UnboundLocalError on `tok`).
        """
        if isinstance(segment, str):
            doc = self.nlp(segment, disable=self.nlp.pipe_names)
            return self.preprocess(doc)

        if not isinstance(segment, (Doc, Span, Token)):
            raise TypeError(f"Unsupported segment type: {type(segment).__name__}")

        # spaCy tokenizer, ignore whitespaces
        tok = [token.text for token in segment if not token.is_space]
        if self.lowercase:
            tok = [token.lower() for token in tok]
        return " ".join(tok)

    def __call__(self, segment, score_type="perplexity"):
        """Score `segment`; return log10 prob ("log") or perplexity."""
        text = self.preprocess(segment)
        word_count = len(text.split())

        if word_count < 2:
            warnings.warn(f"Scorer: Received {word_count} tokens, expected >= 2.")
            return float("-inf")

        # The branches are mutually exclusive: make that explicit with elif.
        if isinstance(segment, Doc):
            # if doc - assume bos, eos=True
            bos = True
            eos = True
        elif isinstance(segment, (Span, Token)):
            # if span - assume bos, eos=False
            bos = False
            eos = False
        else:
            # string passed - guess from capitalization / final punctuation
            bos = text.capitalize() == text
            eos = text[-1] in string.punctuation

        # log10 prob
        score = self.model.score(text, bos=bos, eos=eos)

        if score_type == "log":
            return score
        elif score_type == "perplexity":
            prob = 10.0 ** score
            return prob ** (-1 / word_count)
        else:
            raise NotImplementedError

    def score_suggestion(self, doc, span, suggestion):
        """Score the document text with `span` replaced by `suggestion`."""
        text = " ".join([doc[: span.start].text] + suggestion + [doc[span.end :].text])
        return self(text)

    def sort_suggestions(self, spans: List[Span]) -> List[Span]:
        """Sort each span's suggestions ascending by their KenLM score."""
        for span in spans:
            if len(span._.suggestions) > 1:
                span._.suggestions = sorted(
                    span._.suggestions,
                    key=lambda x: self.score_suggestion(
                        span.doc, span, [t.text for t in x]
                    ),
                )
        return spans