Example #1
    def do(self, document: dict) -> dict:
        # Tokenize the configured field with SentencePiece and store the
        # space-joined pieces in the output field.
        text = document[self.field]
        if self.normalize:
            text = text_normalizer.normalize(text)
        tokenized = self.sp.encode_as_pieces(text)
        document[self.output_field] = " ".join(tokenized)
        return document
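Here, self.sp is presumably a SentencePiece processor. Below is a standalone sketch of the encode_as_pieces call used above; the model path and input sentence are assumptions, not taken from the example.

import sentencepiece as spm

# Hedged sketch: load a trained SentencePiece model and tokenize one sentence.
# "en.sp.model" is a hypothetical path; any trained model file works.
sp = spm.SentencePieceProcessor()
sp.load("en.sp.model")
pieces = sp.encode_as_pieces("This is a small example sentence.")
print(" ".join(pieces))  # e.g. "▁This ▁is ▁a ▁small ▁example ▁sentence ."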
Example #2
    def do(self, document: dict) -> Optional[str]:
        content: Optional[str] = document.get(self.field)
        if not content:
            return None
        # Split the document into sentences, skipping empty lines.
        all_sentences = [
            s
            for l in content.split("\n")
            if l
            for s in self.splitter.split(text=l)
        ]
        # Keep only sentences whose hash has not been seen before.
        unique_sentences = []
        for s in all_sentences:
            if not s:
                continue
            h = dedup.str_hash(s)
            if h in self.hashes:
                continue
            self.hashes.add(h)
            unique_sentences.append(s)

        # Score each remaining sentence with the language model.
        scores = []
        for sentence in unique_sentences:
            normalized = text_normalizer.normalize(sentence)
            pieces = self.sp.encode_as_pieces(normalized)
            log_score = self.lm.score(" ".join(pieces))
            pp = -1
            if len(pieces):
                pp = perplexity.pp(log_score, len(pieces))
            scores.append(pp)

        # Keep sentences whose perplexity is positive and below the threshold,
        # and emit them as "perplexity<TAB>sentence" lines.
        res = filter(
            lambda pp_s: self.threshold > pp_s[0] > 0,
            zip(scores, unique_sentences),
        )
        return "\n".join(f"{pp}\t{s}" for (pp, s) in res) or None
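This example, like the document-level ones later on (#6, #7), relies on a pp helper to turn a summed log score into a perplexity. The sketch below shows what such a helper typically computes, assuming KenLM-style base-10 log scores; it is an assumption about the helper's body, not code taken from the examples.

def pp(log_score: float, length: int) -> float:
    # Perplexity from a base-10 log probability summed over `length` tokens.
    # Lower values indicate more fluent text under the language model.
    return 10.0 ** (-log_score / length)

# Example: 5 tokens with summed log10 probability -7.5 -> 10 ** 1.5 ≈ 31.6
print(pp(-7.5, 5))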
Example #3
    def do(self, text: str):
        # Normalize the text (optionally stripping accents), fix spacing for
        # the target language, then tokenize with Moses without escaping.
        text = text_normalizer.normalize(
            text, accent=self.rm_accent, case=False, numbers=False, punct=True
        )
        text = text_normalizer.normalize_spacing_for_tok(text, language=self.lang)
        return self.moses.tokenize(text, return_str=True, escape=False)
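The moses attribute above is presumably a Moses tokenizer instance. A minimal sketch with sacremoses follows; the English language code and sample sentence are assumptions.

from sacremoses import MosesTokenizer

# Hedged sketch: build a tokenizer comparable to self.moses above.
moses = MosesTokenizer(lang="en")
out = moses.tokenize("Hello, world! It's a test.", return_str=True, escape=False)
print(out)  # roughly: "Hello , world ! It 's a test ."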
Example #4
    def do(self, document: dict) -> Optional[dict]:
        text = document[self.field]
        if self.normalize:
            text = text_normalizer.normalize(text)
        # Pick the SentencePiece model matching the document's language;
        # pass the document through unchanged if no model is available.
        sp = self.get_sp(document.get("language"))
        if sp is None:
            return document
        tokenized = sp.encode_as_pieces(text)
        document[self.output_field] = " ".join(tokenized)
        return document
Example #5
def extract_opening_text(source, n_docs: int = 10_000):
    # Yield the normalized "opening_text" field of up to n_docs JSON documents.
    i = 0
    for doc in jsonql.read_jsons(source):
        if not doc:
            continue

        text = doc.get("opening_text")
        if not text:
            continue

        yield text_normalizer.normalize(text)
        i += 1
        if i >= n_docs:
            break
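A hypothetical call site (the file name is an assumption): the generator streams at most n_docs normalized opening_text values from a JSON-lines input.

# Hypothetical usage; "dumps/wikipedia.ndjson" is an assumed path.
for text in extract_opening_text("dumps/wikipedia.ndjson", n_docs=100):
    print(text[:80])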
Example #6
    def do(self, document: dict) -> Optional[str]:  # type: ignore
        lines = self.get_lines(document)
        model = self.get_lm(document.get("language"))
        if not lines or not model:
            return None

        # Score each line and emit "perplexity<TAB>line" pairs.
        sentences = []
        for line in lines:
            if self.normalize:
                line = text_normalizer.normalize(line)
            log_score = model.score(line)
            length = len(line.split()) + 1
            sentences.append(f"{pp(log_score, length)}\t{line}")

        return "\n".join(sentences)
Example #7
    def do(self, document: dict) -> dict:
        lines = self.get_lines(document)
        model = self.get_lm(document.get("language"))
        if not lines or not model:
            return document

        # Accumulate log score and length over all lines, then store the
        # document-level perplexity rounded to one decimal place.
        doc_log_score, doc_length = 0, 0
        for line in lines:
            if self.normalize:
                line = text_normalizer.normalize(line)
            log_score = model.score(line)
            length = len(line.split()) + 1
            doc_log_score += log_score
            doc_length += length

        document[self.output_field] = round(pp(doc_log_score, doc_length), 1)
        return document
Example #8
def test_numbers():
    # numbers=True maps every digit to "0"; numbers=False leaves them untouched.
    weird = "023456789 | 0123456789"
    normalized = "000000000 | 0000000000"
    assert txt.normalize(weird, numbers=True) == normalized
    assert txt.normalize(weird, numbers=False) == weird
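For reference, a minimal sketch of the digit replacement this test exercises; the assumption, implied by the expected string, is that numbers=True maps every digit to "0".

import re

DIGIT_RE = re.compile(r"\d")

def replace_digits(text: str) -> str:
    # Map every digit to "0", matching the expected output in the test above.
    return DIGIT_RE.sub("0", text)

assert replace_digits("023456789 | 0123456789") == "000000000 | 0000000000"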