def fit(self, X, y):
    """Train the language model on transformation sequences.

    Flattens the (lemma, label) pairs into (lemma, code, values) triples,
    writes the derived training strings to a text file, builds an ARPA
    model with the external KenLM ``lmplz`` tool, and loads the result
    into ``self.lm``.  Returns ``self`` so calls can be chained.
    """
    triples = []
    for lemma, label in zip(X, y):
        for code, values in label:
            triples.append((lemma, code, values))
    training_strings = \
        self.transformations_handler._extract_transformations_for_lm_learning(
            triples)
    self.infile = os.path.join(
        self.tmp_folder, "saved_models_{0}.sav".format(self.filename_count))
    with open(self.infile, "w") as out:
        out.writelines(" ".join(map(str, tokens)) + "\n"
                       for tokens in training_strings)
    self.outfile = os.path.join(
        self.tmp_folder, "saved_models_{0}.arpa".format(self.filename_count))
    # lmplz consumes the token file on stdin and emits an ARPA model on stdout.
    with open(self.infile, "r") as src, open(self.outfile, "w") as dst:
        subprocess.call(
            ["/data/sorokin/tools/kenlm/bin/lmplz",
             "-o", str(self.lm_order), "-S", "4G"],
            stdin=src, stdout=dst)
    if self.lm_type == "pynlpl":
        self.lm = ARPALanguageModel(self.outfile, base_e=False)
    elif self.lm_type == "kenlm":
        self.lm = Model(self.outfile)
    return self
def __init__(self, language=None, model=None, bos=True, eos=True, **kwargs):
    """Load the KenLM model from the *model* filename and store scoring options.

    ``bos``/``eos`` control whether begin/end-of-sentence markers are
    assumed when scoring; ``language`` is kept for bookkeeping only.
    """
    self.language = language
    self.bos = bos
    self.eos = eos
    self.model = Model(model)
class ExtractSentences(jsonql.Transformer):
    """Split documents into sentences, deduplicate them across documents,
    and emit each kept sentence prefixed with its LM perplexity."""

    def __init__(
        self,
        sp_model: Path,
        lm_model: Path,
        field: str = "raw_content",
        threshold: float = float("+inf"),
    ):
        super().__init__()
        self.sp_model = sp_model
        self.lm_model = lm_model
        self.field = field
        self.threshold = threshold
        # Heavy resources are created lazily in _prepare().
        self.sp: SentencePieceProcessor = None
        self.lm: KenlmModel = None
        self.splitter: SentenceSplitter = None
        # Hashes of sentences already emitted (cross-document dedup).
        self.hashes: Set[int] = set()

    def _prepare(self):
        """Instantiate the tokenizer, sentence splitter and language model."""
        self.sp = SentencePieceProcessor()
        self.sp.load(str(self.sp_model))
        self.splitter = SentenceSplitter("en")
        self.lm = KenlmModel(str(self.lm_model))

    def do(self, document: dict) -> Optional[str]:
        """Return newline-joined "perplexity<TAB>sentence" lines for the
        not-yet-seen sentences of *document* whose perplexity is positive
        and below the threshold, or None when nothing qualifies."""
        text: Optional[str] = document.get(self.field)
        if not text:
            return None
        sentences = []
        for line in text.split("\n"):
            if line:
                sentences.extend(self.splitter.split(text=line))
        fresh = []
        for sentence in sentences:
            if not sentence:
                continue
            digest = dedup.str_hash(sentence)
            if digest not in self.hashes:
                self.hashes.add(digest)
                fresh.append(sentence)
        perplexities = []
        for sentence in fresh:
            pieces = self.sp.encode_as_pieces(text_normalizer.normalize(sentence))
            log_score = self.lm.score(" ".join(pieces))
            # Empty tokenizations get the sentinel -1 (filtered out below).
            perplexities.append(
                perplexity.pp(log_score, len(pieces)) if pieces else -1)
        kept = [
            (pp, sentence)
            for pp, sentence in zip(perplexities, fresh)
            if self.threshold > pp > 0
        ]
        return "\n".join(f"{pp}\t{sentence}" for pp, sentence in kept) or None
def fit(self, X, y):
    """Build an n-gram LM over transformation strings derived from (X, y).

    The training sequences go to a ``.sav`` text file, the external KenLM
    ``lmplz`` binary turns it into an ``.arpa`` file, and the ARPA model is
    loaded back as ``self.lm``.  Returns ``self``.
    """
    flat = list(chain.from_iterable(
        [(lemma, code, values) for code, values in label]
        for lemma, label in zip(X, y)))
    lm_strings = \
        self.transformations_handler._extract_transformations_for_lm_learning(
            flat)
    basename = "saved_models_{0}".format(self.filename_count)
    self.infile = os.path.join(self.tmp_folder, basename + ".sav")
    with open(self.infile, "w") as handle:
        for tokens in lm_strings:
            handle.write(" ".join(map(str, tokens)) + "\n")
    self.outfile = os.path.join(self.tmp_folder, basename + ".arpa")
    # NOTE: the lmplz path is hard-coded to a specific machine.
    with open(self.infile, "r") as fin, open(self.outfile, "w") as fout:
        subprocess.call(
            ["/data/sorokin/tools/kenlm/bin/lmplz",
             "-o", str(self.lm_order), "-S", "4G"],
            stdin=fin, stdout=fout)
    if self.lm_type == "pynlpl":
        self.lm = ARPALanguageModel(self.outfile, base_e=False)
    elif self.lm_type == "kenlm":
        self.lm = Model(self.outfile)
    return self
def __init__(self, model=None, path=None, nlp=None, lowercase=True):
    """Initialize the scorer.

    :param model: a pre-loaded KenLM model object (takes precedence)
    :param path: path to a KenLM model file, used when *model* is absent
    :param nlp: spaCy pipeline for tokenization; defaults to a lazily
        loaded ``en_core_web_sm``
    :param lowercase: lowercase tokens during preprocessing
    :raises ValueError: if neither *model* nor *path* is supplied
    """
    if model:
        self.model = model
    elif path:
        self.model = KenLMModel(path)
    else:
        # Fix: previously self.model stayed unset and the subsequent
        # _check_model() call crashed with an opaque AttributeError.
        raise ValueError("Either `model` or `path` must be provided.")
    self._check_model()
    if nlp:
        self.nlp = nlp
    else:
        # Deferred import keeps spaCy optional until a pipeline is needed.
        import spacy
        self.nlp = spacy.load("en_core_web_sm")
    self.lowercase = lowercase
def from_file(filename):
    """Load a KenLM model from *filename* and wrap it as a KenLMLanguageModel."""
    model = Model(filename)
    return KenLMLanguageModel(model)
class LMParadigmClassifier(BaseEstimator, ClassifierMixin):
    """
    Classify morphological paradigms with the help of language models.
    """
    # (Docstrings/comments translated from Russian.)

    def __init__(self, paradigm_codes, paradigm_counts, lm_order=3,
                 lm_type="kenlm", multiclass=False, tmp_folder="saved_models"):
        # paradigm_codes / paradigm_counts are forwarded to TransformationsHandler.
        self.paradigm_codes = paradigm_codes
        self.paradigm_counts = paradigm_counts
        self.lm_order = lm_order      # n-gram order passed to lmplz
        self.lm_type = lm_type        # "kenlm" or "pynlpl"
        self.tmp_folder = tmp_folder  # scratch dir for training/ARPA files
        self.multiclass = multiclass
        self.lm = None                # set by fit()
        self.filename_count = 1       # numeric suffix for scratch file names
        self._initialize()

    def _initialize(self):
        # Build the transformations handler and ensure the scratch folder exists.
        self.transformations_handler = TransformationsHandler(
            self.paradigm_codes, self.paradigm_counts)
        if not os.path.exists(self.tmp_folder):
            os.makedirs(self.tmp_folder)

    def fit(self, X, y):
        """Train the LM on transformation strings extracted from (X, y).

        X holds lemmas; y[i] holds (paradigm_code, variable_values) pairs
        for lemma X[i].  Returns self.
        """
        lemmas_with_codes_and_vars = list(
            chain.from_iterable([(lemma, code, values) for code, values in label]
                                for lemma, label in zip(X, y)))
        strings_for_lm_learning = \
            self.transformations_handler._extract_transformations_for_lm_learning(
                lemmas_with_codes_and_vars)
        self.infile = os.path.join(
            self.tmp_folder, "saved_models_{0}.sav".format(self.filename_count))
        with open(self.infile, "w") as fout:
            for seq in strings_for_lm_learning:
                fout.write(" ".join(map(str, seq)) + "\n")
        self.outfile = os.path.join(
            self.tmp_folder, "saved_models_{0}.arpa".format(self.filename_count))
        # Train an ARPA model with the external KenLM lmplz binary.
        # NOTE(review): the binary path is hard-coded to one machine.
        with open(self.infile, "r") as fin, open(self.outfile, "w") as fout:
            subprocess.call([
                "/data/sorokin/tools/kenlm/bin/lmplz", "-o",
                str(self.lm_order), "-S", "4G"
            ], stdin=fin, stdout=fout)
        if self.lm_type == "pynlpl":
            # base_e=False: keep scores in log-10 (the ARPA convention).
            self.lm = ARPALanguageModel(self.outfile, base_e=False)
        elif self.lm_type == "kenlm":
            self.lm = Model(self.outfile)
        return self

    @property
    def transformation_codes(self):
        # Delegated to the transformations handler.
        return self.transformations_handler.transformation_codes

    @property
    def transformations_by_strings(self):
        # Mapping from transformation strings (word suffixes) to code lists;
        # see its use in get_best_continuous_score.
        return self.transformations_handler.transformations_by_strings

    @property
    def transformations(self):
        return self.transformations_handler.transformations

    def get_best_continuous_score(self, word):
        """Find the LM-best split of ``word`` into a symbol prefix plus a
        transformation code, printing improving candidates along the way.

        For each split point i, the suffix word[i:] is looked up among the
        known transformation strings; a candidate's score is the running LM
        score of the prefix symbols plus the code and the end-of-sentence
        score.  A final pass considers codes with an empty suffix (the whole
        word as prefix).
        """
        total_score = 0
        best_variant, best_score = None, -np.inf
        if self.lm_type == "kenlm":
            state = State()
            self.lm.BeginSentenceWrite(state)
        else:
            history = ('<s>', )
        for i, symbol in enumerate(word):
            prefix, suffix = word[:i], word[i:]
            for code in self.transformations_by_strings.get(suffix, []):
                if self.lm_type == "kenlm":
                    # BaseScore mutates the state passed in, so score a copy.
                    curr_state, out_state = copy.copy(state), State()
                    code_score = self.lm.BaseScore(curr_state, str(code),
                                                   out_state)
                    end_score = self.lm.EndSentenceBaseScore(
                        out_state, State())
                elif self.lm_type == "pynlpl":
                    code_score = self.lm.scoreword(str(code), history)
                    # NOTE(review): here the raw code is appended to history,
                    # while the final loop uses str(code) — verify which is
                    # intended.
                    new_history = history + (code, )
                    end_score = self.lm.scoreword('</s>', new_history)
                score = (total_score + code_score + end_score)  # / (i + 1)
                # print("{3} {0} {1} {2:.2f}".format(
                #     " ".join(prefix), self.transformations[code], score, code))
                if score > best_score:
                    best_variant = list(prefix) + [code]
                    best_score = score
                    print("{3} {0} {1} {2:.2f}".format(
                        " ".join(prefix), self.transformations[code], score,
                        code))
            # Advance the LM state/history by the current symbol.
            if self.lm_type == "kenlm":
                out_state = State()
                score = self.lm.BaseScore(state, symbol, out_state)
                state = out_state
            elif self.lm_type == "pynlpl":
                score = self.lm.scoreword(symbol, history)
                history += (symbol, )
            total_score += score
        # NOTE(review): out_state is only bound on the "kenlm" path (and only
        # for a non-empty word); the "pynlpl" path would raise NameError here.
        curr_state, word_score = out_state, total_score
        for code in sorted(self.transformations_by_strings[""]):
            # Candidates where the whole word is the prefix (empty suffix).
            if self.lm_type == "kenlm":
                state, out_state = copy.copy(curr_state), State()
                code_score = self.lm.BaseScore(curr_state, str(code), out_state)
                end_score = self.lm.EndSentenceBaseScore(out_state, state)
            elif self.lm_type == "pynlpl":
                code_score = self.lm.scoreword(str(code), history)
                new_history = history + (str(code), )
                end_score = self.lm.scoreword('</s>', new_history)
            score = (total_score + code_score + end_score)  # / (len(word) + 1)
            # print("{0} {1} {2:.2f}".format(" ".join(word),
            #     "#".join(self.transformations[code].trans), score))
            if score > best_score:
                best_variant, best_score = list(word) + [code], score
                print("{3} {0} {1} {2:.2f}".format(" ".join(word),
                                                   self.transformations[code],
                                                   score, code))
        answer = []
        for elem in best_variant:
            if isinstance(elem, str):
                # Plain symbols are repeated 13 times — presumably the number
                # of paradigm cells; TODO confirm against TransformationsHandler.
                answer.append([elem] * 13)
            else:
                answer.append(self.transformations[elem].trans)
        # Print the cell-wise concatenation of the best variant.
        print("#".join("".join(elem) for elem in zip(*answer)))

    def test(self):
        """
        Smoke-test the language-model interface.
        """
        word = "лыжня"
        self.get_best_continuous_score(word)
def _prepare(self):
    """Lazily build the SentencePiece tokenizer, sentence splitter and LM."""
    processor = SentencePieceProcessor()
    processor.load(str(self.sp_model))
    self.sp = processor
    self.splitter = SentenceSplitter("en")
    self.lm = KenlmModel(str(self.lm_model))
class KenLMFeatureGenerator(LanguageFeatureGenerator):
    '''
    Provide ngram features by querying language model via KenLM python wrapper
    @param model: the filename of the compact language model to be loaded
    @type model: str
    @param language: the language that this model has been trained on
    @type language: str
    '''
    # Every feature is emitted twice: under a "kenlm_" prefix and under a
    # generic "lm_" prefix, with identical values (see get_features_string).
    feature_names = ['kenlm_unk_pos_abs_avg',
                     'kenlm_unk_pos_abs_std',
                     'kenlm_unk_pos_abs_min',
                     'kenlm_unk_pos_abs_max',
                     'kenlm_unk_pos_rel_avg',
                     'kenlm_unk_pos_rel_std',
                     'kenlm_unk_pos_rel_min',
                     'kenlm_unk_pos_rel_max',
                     'kenlm_unk',
                     'kenlm_unk_len',
                     'kenlm_length_avg',
                     'kenlm_length_std',
                     'kenlm_length_min',
                     'kenlm_length_max',
                     'kenlm_probs_avg',
                     'kenlm_probs_std',
                     'kenlm_probs_min',
                     'kenlm_probs_max',
                     'kenlm_probs_pos_max',
                     'kenlm_probs_pos_min',
                     'kenlm_probs_low',
                     'kenlm_probs_high',
                     'kenlm_probs_low_pos_avg',
                     'kenlm_probs_low_pos_std',
                     'kenlm_prob',
                     'lm_unk_pos_abs_avg',
                     'lm_unk_pos_abs_std',
                     'lm_unk_pos_abs_min',
                     'lm_unk_pos_abs_max',
                     'lm_unk_pos_rel_avg',
                     'lm_unk_pos_rel_std',
                     'lm_unk_pos_rel_min',
                     'lm_unk_pos_rel_max',
                     'lm_unk',
                     'lm_unk_len',
                     'lm_length_avg',
                     'lm_length_std',
                     'lm_length_min',
                     'lm_length_max',
                     'lm_probs_avg',
                     'lm_probs_std',
                     'lm_probs_min',
                     'lm_probs_max',
                     'lm_probs_pos_max',
                     'lm_probs_pos_min',
                     'lm_probs_low',
                     'lm_probs_high',
                     'lm_probs_low_pos_avg',
                     'lm_probs_low_pos_std',
                     'lm_prob',
                     ]

    def __init__(self, language=None, model=None, bos=True, eos=True, **kwargs):
        ''' Load the model '''
        self.model = Model(model)
        self.language = language
        self.bos = bos   # assume begin-of-sentence marker when scoring
        self.eos = eos   # assume end-of-sentence marker when scoring

    def get_features_string(self, string):
        """Compute the per-sentence LM feature dict for *string*.

        Collects per-token log probabilities, matched n-gram lengths and
        OOV statistics from the model, then aggregates them into the
        features listed in ``feature_names``.
        (NOTE(review): the parameter name shadows the stdlib ``string``
        module; kept for interface compatibility.)
        """
        total_score = self.model.score(string, bos=self.bos, eos=self.eos)
        # full_scores yields (logprob, ngram_length, oov_flag) per token,
        # plus one trailing entry for the end-of-sentence score.
        partial_scores = self.model.full_scores(string, bos=self.bos,
                                                eos=self.eos)
        ngram_lengths = []
        probs = []
        unk_count = 0
        unk_pos = []
        unk_tokens = []
        tokens = string.split()
        tokens_iter = iter(tokens)
        pos = 0
        for pos, (prob, ngram_length, wid) in enumerate(partial_scores):
            try:
                token = next(tokens_iter)
            # End-of-sentence score has no corresponding token.
            except StopIteration:
                token = ""
            # Truthy wid marks an out-of-vocabulary token.
            if wid:
                unk_count += 1
                unk_pos.append(pos)
                unk_tokens.append(token)
            ngram_lengths.append(ngram_length)
            probs.append(prob)
        pos += 1
        # OOV positions relative to the sentence length.
        unk_rel_pos = [(unk_pos_item * 1.00) / len(tokens)
                       for unk_pos_item in unk_pos]
        unk_len = sum([len(token) for token in unk_tokens])
        # Avoid min()/max() on empty lists when there are no OOVs.
        if len(unk_pos) == 0:
            unk_pos = [0]
            unk_rel_pos = [0]
        features = {
            'kenlm_unk_pos_abs_avg' : average(unk_pos),
            'kenlm_unk_pos_abs_std' : std(unk_pos),
            'kenlm_unk_pos_abs_min' : min(unk_pos),
            'kenlm_unk_pos_abs_max' : max(unk_pos),
            'kenlm_unk_pos_rel_avg' : average(unk_rel_pos),
            'kenlm_unk_pos_rel_std' : std(unk_rel_pos),
            'kenlm_unk_pos_rel_min' : min(unk_rel_pos),
            'kenlm_unk_pos_rel_max' : max(unk_rel_pos),
            'kenlm_unk' : unk_count,
            'kenlm_unk_len' : unk_len,
            'kenlm_length_avg' : average(ngram_lengths),
            'kenlm_length_std' : std(ngram_lengths),
            'kenlm_length_min' : min(ngram_lengths),
            'kenlm_length_max' : max(ngram_lengths),
            'kenlm_probs_avg' : average(probs),
            'kenlm_probs_std' : std(probs),
            'kenlm_probs_min' : min(probs),
            'kenlm_probs_max' : max(probs),
            'kenlm_probs_pos_max' : probs.index(max(probs)),
            'kenlm_probs_pos_min' : probs.index(min(probs)),
            'kenlm_probs_low' : self._standouts(probs, -1),
            'kenlm_probs_high' : self._standouts(probs, +1),
            'kenlm_probs_low_pos_avg': average(self._standout_pos(probs, -1)),
            'kenlm_probs_low_pos_std': std(self._standout_pos(probs, -1)),
            'kenlm_prob' : total_score,
            'lm_unk_pos_abs_avg' : average(unk_pos),
            'lm_unk_pos_abs_std' : std(unk_pos),
            'lm_unk_pos_abs_min' : min(unk_pos),
            'lm_unk_pos_abs_max' : max(unk_pos),
            'lm_unk_pos_rel_avg' : average(unk_rel_pos),
            'lm_unk_pos_rel_std' : std(unk_rel_pos),
            'lm_unk_pos_rel_min' : min(unk_rel_pos),
            'lm_unk_pos_rel_max' : max(unk_rel_pos),
            'lm_unk' : unk_count,
            'lm_unk_len' : unk_len,
            'lm_length_avg' : average(ngram_lengths),
            'lm_length_std' : std(ngram_lengths),
            'lm_length_min' : min(ngram_lengths),
            'lm_length_max' : max(ngram_lengths),
            'lm_probs_avg' : average(probs),
            'lm_probs_std' : std(probs),
            'lm_probs_min' : min(probs),
            'lm_probs_max' : max(probs),
            'lm_probs_pos_max' : probs.index(max(probs)),
            'lm_probs_pos_min' : probs.index(min(probs)),
            'lm_probs_low' : self._standouts(probs, -1),
            'lm_probs_high' : self._standouts(probs, +1),
            'lm_probs_low_pos_avg': average(self._standout_pos(probs, -1)),
            'lm_probs_low_pos_std': std(self._standout_pos(probs, -1)),
            'lm_prob' : total_score,
        }
        return features

    def _standouts(self, vector, sign):
        """Count values more than one std beyond the mean in the given
        direction (sign=-1: low outliers, sign=+1: high outliers)."""
        std_value = std(vector)
        avg_value = average(vector)
        standout = 0
        for value in vector:
            if value*sign > (avg_value + sign*std_value):
                standout += 1
        return standout

    def _standout_pos(self, vector, sign):
        """Return 1-based positions of values more than one std beyond the
        mean in the given direction.
        (NOTE(review): 1-based here, while the probs_pos_* features above
        use 0-based list.index — confirm whether this offset is intended.)"""
        std_value = std(vector)
        avg_value = average(vector)
        standout = []
        for pos, value in enumerate(vector, start=1):
            if value*sign > (avg_value + sign*std_value):
                standout.append(pos)
        return standout
class KenLMScorer(Scorer):
    """Score text segments (spaCy Doc/Span/Token or plain strings) with a
    KenLM language model, either as raw log10 probability or as per-word
    perplexity."""

    name = "kenlm"

    def __init__(self, model=None, path=None, nlp=None, lowercase=True):
        """
        :param model: a pre-loaded KenLM model object (takes precedence)
        :param path: path to a KenLM model file, used when *model* is absent
        :param nlp: spaCy pipeline for tokenization; defaults to a lazily
            loaded ``en_core_web_sm``
        :param lowercase: lowercase tokens during preprocessing
        :raises ValueError: if neither *model* nor *path* is supplied
        """
        if model:
            self.model = model
        elif path:
            self.model = KenLMModel(path)
        else:
            # Fix: previously self.model stayed unset and _check_model()
            # crashed with an opaque AttributeError.
            raise ValueError("Either `model` or `path` must be provided.")
        self._check_model()
        if nlp:
            self.nlp = nlp
        else:
            # Deferred import keeps spaCy optional until a pipeline is needed.
            import spacy
            self.nlp = spacy.load("en_core_web_sm")
        self.lowercase = lowercase

    def _check_model(self):
        # Sanity check: a real LM returns a negative log10 probability.
        assert isinstance(self.model, KenLMModel)
        assert self.model.score("testing !") < 0

    def preprocess(self, segment):
        """
        SpaCy tokenize + lowercase. Ignore extra whitespaces.

        - if Doc, Span, Token - use the token texts directly
        - if string - convert to Doc first

        :raises TypeError: for unsupported segment types
        """
        if isinstance(segment, (Doc, Span, Token)):
            # spaCy tokenizer, ignore whitespaces
            tok = [token.text for token in segment if not token.is_space]
            if self.lowercase:
                tok = [token.lower() for token in tok]
        elif isinstance(segment, str):
            doc = self.nlp(segment, disable=self.nlp.pipe_names)
            return self.preprocess(doc)
        else:
            # Fix: other types used to fall through to an unbound `tok`
            # and raise a confusing NameError.
            raise TypeError(
                "preprocess() expects Doc, Span, Token or str, got "
                f"{type(segment).__name__}")
        return " ".join(tok)

    def __call__(self, segment, score_type="perplexity"):
        """Score *segment*.

        :param score_type: "log" for the raw log10 probability,
            "perplexity" (default) for per-word perplexity
        :returns: the score, or -inf when fewer than 2 tokens remain
        :raises NotImplementedError: for unknown score_type values
        """
        text = self.preprocess(segment)
        word_count = len(text.split())
        if word_count < 2:
            warnings.warn(f"Scorer: Received {word_count} tokens, expected >= 2.")
            return float("-inf")
        if isinstance(segment, Doc):
            # if doc - assume bos, eos=True
            bos = True
            eos = True
        if isinstance(segment, (Span, Token)):
            # if span - assume bos, eos=False
            bos = False
            eos = False
        if isinstance(segment, str):
            # string passed - guess from capitalization / final punctuation
            bos = text.capitalize() == text
            eos = text[-1] in string.punctuation
        # log10 prob
        score = self.model.score(text, bos=bos, eos=eos)
        if score_type == "log":
            return score
        elif score_type == "perplexity":
            # perplexity = P(text) ** (-1 / word_count)
            prob = 10.0 ** (score)
            return prob ** (-1 / word_count)
        else:
            raise NotImplementedError

    def score_suggestion(self, doc, span, suggestion):
        """Score the document text with *span* replaced by *suggestion*
        (a list of token strings)."""
        text = " ".join([doc[: span.start].text] + suggestion + [doc[span.end :].text])
        return self(text)

    def sort_suggestions(self, spans: List[Span]) -> List[Span]:
        """Sort each span's suggestions (in place) by the score of the
        document with the suggestion substituted, ascending."""
        for span in spans:
            if len(span._.suggestions) > 1:
                span._.suggestions = sorted(
                    span._.suggestions,
                    key=lambda x: self.score_suggestion(
                        span.doc, span, [t.text for t in x]
                    ),
                )
        return spans
class LMParadigmClassifier(BaseEstimator, ClassifierMixin):
    """
    Classify morphological paradigms with the help of language models.
    """
    # (Docstrings/comments translated from Russian.)

    def __init__(self, paradigm_codes, paradigm_counts, lm_order=3,
                 lm_type="kenlm", multiclass=False, tmp_folder="saved_models"):
        # paradigm_codes / paradigm_counts are forwarded to TransformationsHandler.
        self.paradigm_codes = paradigm_codes
        self.paradigm_counts = paradigm_counts
        self.lm_order = lm_order      # n-gram order passed to lmplz
        self.lm_type = lm_type        # "kenlm" or "pynlpl"
        self.tmp_folder = tmp_folder  # scratch dir for training/ARPA files
        self.multiclass = multiclass
        self.lm = None                # set by fit()
        self.filename_count = 1       # numeric suffix for scratch file names
        self._initialize()

    def _initialize(self):
        # Build the transformations handler and ensure the scratch folder exists.
        self.transformations_handler = TransformationsHandler(self.paradigm_codes,
                                                              self.paradigm_counts)
        if not os.path.exists(self.tmp_folder):
            os.makedirs(self.tmp_folder)

    def fit(self, X, y):
        """Train the LM on transformation strings extracted from (X, y).

        X holds lemmas; y[i] holds (paradigm_code, variable_values) pairs
        for lemma X[i].  Returns self.
        """
        lemmas_with_codes_and_vars = list(chain.from_iterable(
            [(lemma, code, values) for code, values in label]
            for lemma, label in zip(X, y)))
        strings_for_lm_learning = \
            self.transformations_handler._extract_transformations_for_lm_learning(
                lemmas_with_codes_and_vars)
        self.infile = os.path.join(self.tmp_folder,
                                   "saved_models_{0}.sav".format(self.filename_count))
        with open(self.infile, "w") as fout:
            for seq in strings_for_lm_learning:
                fout.write(" ".join(map(str, seq)) + "\n")
        self.outfile = os.path.join(self.tmp_folder,
                                    "saved_models_{0}.arpa".format(self.filename_count))
        # Train an ARPA model with the external KenLM lmplz binary.
        # NOTE(review): the binary path is hard-coded to one machine.
        with open(self.infile, "r") as fin, open(self.outfile, "w") as fout:
            subprocess.call(["/data/sorokin/tools/kenlm/bin/lmplz",
                             "-o", str(self.lm_order), "-S", "4G"],
                            stdin=fin, stdout=fout)
        if self.lm_type == "pynlpl":
            # base_e=False: keep scores in log-10 (the ARPA convention).
            self.lm = ARPALanguageModel(self.outfile, base_e=False)
        elif self.lm_type == "kenlm":
            self.lm = Model(self.outfile)
        return self

    @property
    def transformation_codes(self):
        # Delegated to the transformations handler.
        return self.transformations_handler.transformation_codes

    @property
    def transformations_by_strings(self):
        # Mapping from transformation strings (word suffixes) to code lists;
        # see its use in get_best_continuous_score.
        return self.transformations_handler.transformations_by_strings

    @property
    def transformations(self):
        return self.transformations_handler.transformations

    def get_best_continuous_score(self, word):
        """Find the LM-best split of ``word`` into a symbol prefix plus a
        transformation code, printing every candidate's score.

        For each split point i, the suffix word[i:] is looked up among the
        known transformation strings; a candidate's score is the running LM
        score of the prefix symbols plus the code and the end-of-sentence
        score.  A final pass considers codes with an empty suffix (the whole
        word as prefix).
        """
        total_score = 0
        best_variant, best_score = None, -np.inf
        if self.lm_type == "kenlm":
            state = State()
            self.lm.BeginSentenceWrite(state)
        else:
            history = ('<s>',)
        for i, symbol in enumerate(word):
            prefix, suffix = word[:i], word[i:]
            for code in self.transformations_by_strings.get(suffix, []):
                if self.lm_type == "kenlm":
                    # BaseScore mutates the state passed in, so score a copy.
                    curr_state, out_state = copy.copy(state), State()
                    code_score = self.lm.BaseScore(curr_state, str(code),
                                                   out_state)
                    end_score = self.lm.EndSentenceBaseScore(out_state,
                                                             State())
                elif self.lm_type == "pynlpl":
                    code_score = self.lm.scoreword(str(code), history)
                    # NOTE(review): here the raw code is appended to history,
                    # while the final loop uses str(code) — verify which is
                    # intended.
                    new_history = history + (code,)
                    end_score = self.lm.scoreword('</s>', new_history)
                score = (total_score + code_score + end_score)  # / (i + 1)
                print("{3} {0} {1} {2:.2f}".format(
                    " ".join(prefix), self.transformations[code], score, code))
                if score > best_score:
                    best_variant = list(prefix) + [code]
                    best_score = score
                    # print("{3} {0} {1} {2:.2f}".format(
                    #     " ".join(prefix), self.transformations[code], score, code))
            # Advance the LM state/history by the current symbol.
            if self.lm_type == "kenlm":
                out_state = State()
                score = self.lm.BaseScore(state, symbol, out_state)
                state = out_state
            elif self.lm_type == "pynlpl":
                score = self.lm.scoreword(symbol, history)
                history += (symbol,)
            total_score += score
        # NOTE(review): out_state is only bound on the "kenlm" path (and only
        # for a non-empty word); the "pynlpl" path would raise NameError here.
        curr_state, word_score = out_state, total_score
        for code in sorted(self.transformations_by_strings[""]):
            # Candidates where the whole word is the prefix (empty suffix).
            if self.lm_type == "kenlm":
                state, out_state = copy.copy(curr_state), State()
                code_score = self.lm.BaseScore(curr_state, str(code), out_state)
                end_score = self.lm.EndSentenceBaseScore(out_state, state)
            elif self.lm_type == "pynlpl":
                code_score = self.lm.scoreword(str(code), history)
                new_history = history + (str(code), )
                end_score = self.lm.scoreword('</s>', new_history)
            score = (total_score + code_score + end_score)  # / (len(word) + 1)
            print("{0} {1} {2:.2f}".format(" ".join(word),
                                           "#".join(self.transformations[code].trans),
                                           score))
            if score > best_score:
                best_variant, best_score = list(word) + [code], score
                # print("{3} {0} {1} {2:.2f}".format(
                #     " ".join(word), self.transformations[code], score, code))
        answer = []
        for elem in best_variant:
            if isinstance(elem, str):
                # Plain symbols are repeated 13 times — presumably the number
                # of paradigm cells; TODO confirm against TransformationsHandler.
                answer.append([elem] * 13)
            else:
                answer.append(self.transformations[elem].trans)
        # Print the cell-wise concatenation of the best variant.
        print("#".join("".join(elem) for elem in zip(*answer)))

    def test(self):
        """
        Smoke-test the language-model interface.
        """
        word = "лыжня"
        self.get_best_continuous_score(word)