Example #1
    def process(self, document):
        assert isinstance(document, str) or isinstance(document, doc.Document) or (self.config.get('pretokenized') or self.config.get('no_ssplit', False)), \
            "If neither 'pretokenized' or 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object."

        if isinstance(document, doc.Document):
            if self.config.get('pretokenized'):
                return document
            document = document.text

        if self.config.get('pretokenized'):
            raw_text, document = self.process_pre_tokenized_text(document)
            return doc.Document(document, raw_text)

        if hasattr(self, '_variant'):
            return self._variant.process(document)

        raw_text = '\n\n'.join(document) if isinstance(document, list) else document
        # set up batches
        batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
        # get dict data
        _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
                                               self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
                                               orig_text=raw_text,
                                               no_ssplit=self.config.get('no_ssplit', False))
        return doc.Document(document, raw_text)
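
The processor above is normally reached through stanza.Pipeline rather than called directly. A minimal usage sketch, assuming the English models have already been downloaded with stanza.download('en'); the sample sentences are made up:

import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize')
tokenized = nlp('This is a test. This is another sentence.')

# tokenize_no_ssplit keeps all tokens in a single sentence;
# tokenize_pretokenized skips tokenization and splits the input on whitespace instead
nlp_no_ssplit = stanza.Pipeline(lang='en', processors='tokenize', tokenize_no_ssplit=True)
nlp_pretok = stanza.Pipeline(lang='en', processors='tokenize', tokenize_pretokenized=True)
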
Example #2
    def bulk_process(self, docs):
        """
        The tokenizer cannot use UDProcessor's sentence-level cross-document batching interface, and requires special handling.
        Essentially, this method concatenates the text of multiple documents with "\n\n", tokenizes it with the neural tokenizer,
        then splits the result into the original Documents and recovers the original character offsets.
        """
        if hasattr(self, '_variant'):
            return self._variant.bulk_process(docs)

        if self.config.get('pretokenized'):
            res = []
            for document in docs:
                raw_text, document = self.process_pre_tokenized_text(document.text)
                res.append(doc.Document(document, raw_text))
            return res

        combined_text = '\n\n'.join([thisdoc.text for thisdoc in docs])
        processed_combined = self.process(doc.Document([], text=combined_text))

        # postprocess sentences and tokens to reset back pointers and char offsets
        charoffset = 0
        sentst = senten = 0
        for thisdoc in docs:
            while senten < len(processed_combined.sentences) and processed_combined.sentences[senten].tokens[-1].end_char - charoffset <= len(thisdoc.text):
                senten += 1

            sentences = processed_combined.sentences[sentst:senten]
            thisdoc.sentences = sentences
            for sent in sentences:
                # fix doc back pointers for sentences
                sent._doc = thisdoc

                # fix char offsets for tokens and words
                for token in sent.tokens:
                    token._start_char -= charoffset
                    token._end_char -= charoffset
                    if token.words:  # not-yet-processed MWT can leave empty tokens
                        for word in token.words:
                            word._start_char -= charoffset
                            word._end_char -= charoffset

            thisdoc.num_tokens = sum(len(sent.tokens) for sent in sentences)
            thisdoc.num_words = sum(len(sent.words) for sent in sentences)
            sentst = senten

            charoffset += len(thisdoc.text) + 2

        return docs
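
The only subtle piece of the bookkeeping above is the final update to charoffset: the documents were joined with '\n\n', so each document's slice of the combined text starts two characters past the end of the previous one. A standalone sketch of that arithmetic with made-up texts:

texts = ["First doc. It has two sentences.", "Second doc."]
combined = '\n\n'.join(texts)

charoffset = 0
for t in texts:
    # tokens whose end_char lands inside this window belong to this document;
    # subtracting charoffset restores document-local offsets
    assert combined[charoffset:charoffset + len(t)] == t
    charoffset += len(t) + 2   # +2 skips the '\n\n' separator
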
Example #3
    def process(self, text):
        """ Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object.
        """
        if not isinstance(text, str):
            raise Exception("Must supply a string to the Jieba tokenizer.")
        tokens = self.nlp.cut(text, cut_all=False)

        sentences = []
        current_sentence = []
        offset = 0
        for token in tokens:
            if re.match(r'\s+', token):
                offset += len(token)
                continue

            token_entry = {
                doc.TEXT: token,
                doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token)}"
            }
            current_sentence.append(token_entry)
            offset += len(token)

            if token in ['。', '!', '?', '!', '?']:
                sentences.append(current_sentence)
                current_sentence = []

        if len(current_sentence) > 0:
            sentences.append(current_sentence)

        return doc.Document(sentences, text)
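
For reference, the underlying call is jieba.cut with cut_all=False, which yields token strings, whitespace included (hence the skip above); in Stanza this wrapper is normally selected with the tokenize_with_jieba pipeline option. A minimal standalone sketch with a made-up Chinese sentence:

import jieba

# accurate mode; cut() returns a generator of token strings
for token in jieba.cut("我爱自然语言处理。今天天气不错。", cut_all=False):
    print(token)
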
Example #4
    def process(self, document):
        """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object.
        """
        if isinstance(document, doc.Document):
            text = document.text
        else:
            text = document
        if not isinstance(text, str):
            raise Exception(
                "Must supply a string or Stanza Document object to the spaCy tokenizer."
            )
        spacy_doc = self.nlp(text)

        sentences = []
        for sent in spacy_doc.sents:
            tokens = []
            for tok in sent:
                token_entry = {
                    doc.TEXT: tok.text,
                    doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx+len(tok.text)}"
                }
                tokens.append(token_entry)
            sentences.append(tokens)

        # if no_ssplit is set, flatten all the sentences into one sentence
        if self.no_ssplit:
            sentences = [[t for s in sentences for t in s]]

        return doc.Document(sentences, text)
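
The character offsets here come straight from spaCy: tok.idx is the token's absolute start position in the original text. A minimal spaCy-only sketch, assuming spaCy 3.x, with the built-in sentencizer standing in for whatever pipeline self.nlp actually wraps:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")   # provides doc.sents for the sentence loop above
spacy_doc = nlp("Offsets come from tok.idx. They are absolute character positions.")
for sent in spacy_doc.sents:
    for tok in sent:
        print(tok.text, tok.idx, tok.idx + len(tok.text))
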
Example #5
    def process(self, document):
        """ Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object.
        """
        if isinstance(document, doc.Document):
            text = document.text
        else:
            text = document
        if not isinstance(text, str):
            raise Exception(
                "Must supply a string or Stanza Document object to the PyThaiNLP tokenizer."
            )

        sentences = []
        current_sentence = []
        offset = 0

        if self.no_ssplit:
            # skip sentence segmentation
            sent_strs = [text]
        else:
            sent_strs = self.pythai_sent_tokenize(text, engine='crfcut')
        for sent_str in sent_strs:
            for token_str in self.pythai_word_tokenize(sent_str,
                                                       engine='newmm'):
                # by default pythainlp will output whitespace as a token
                # we need to skip these tokens to be consistent with other tokenizers
                if token_str.isspace():
                    offset += len(token_str)
                    continue

                # create token entry
                token_entry = {
                    doc.TEXT: token_str,
                    doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token_str)}"
                }
                current_sentence.append(token_entry)
                offset += len(token_str)

            # finish sentence
            sentences.append(current_sentence)
            current_sentence = []

        if len(current_sentence) > 0:
            sentences.append(current_sentence)

        return doc.Document(sentences, text)
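
For reference, the two PyThaiNLP calls used above are sent_tokenize with the crfcut engine and word_tokenize with the newmm engine. A minimal standalone sketch with a made-up Thai input:

from pythainlp.tokenize import sent_tokenize, word_tokenize

text = "สวัสดีครับ วันนี้อากาศดีมาก"   # made-up input
for sent in sent_tokenize(text, engine='crfcut'):
    # the processor above additionally drops whitespace tokens
    print(word_tokenize(sent, engine='newmm'))
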
Example #6
    def process(self, document):
        assert isinstance(document, str) or isinstance(document, doc.Document) or (self.config.get('pretokenized') or self.config.get('no_ssplit', False)), \
            "If neither 'pretokenized' or 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object."

        if isinstance(document, doc.Document):
            document = document.text

        if self.config.get('pretokenized'):
            raw_text, document = self.process_pre_tokenized_text(document)
        elif hasattr(self, '_variant'):
            return self._variant.process(document)
        else:
            raw_text = '\n\n'.join(document) if isinstance(document, list) else document
            # set up batches
            if self.config.get('lang') == 'vi':
                # Vietnamese requires special preprocessing
                text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip()
                dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
                data = paras_to_chunks(text, dummy_labels)
                batches = DataLoader(self.config,
                                     input_data=data,
                                     vocab=self.vocab,
                                     evaluation=True)
            else:
                batches = DataLoader(self.config,
                                     input_text=raw_text,
                                     vocab=self.vocab,
                                     evaluation=True)
            # get dict data
            _, _, _, document = output_predictions(
                None,
                self.trainer,
                batches,
                self.vocab,
                None,
                self.config.get('max_seqlen',
                                TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
                orig_text=raw_text,
                no_ssplit=self.config.get('no_ssplit', False))
        return doc.Document(document, raw_text)
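
The Vietnamese branch hands the data loader paragraph/label pairs instead of raw text: every paragraph is matched with a dummy label string of the same length before paras_to_chunks pairs them up. A standalone sketch of just that string manipulation, with a made-up two-paragraph input:

raw_text = "Đoạn văn một.\n\nĐoạn văn hai."   # made-up input
text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip()
dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])

# each paragraph lines up with an all-zero label string of equal length
for para, labels in zip(text.split('\n\n'), dummy_labels.split('\n\n')):
    assert len(para) == len(labels)
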
Example #7
    def tokenize(self, text):
        """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object.
        """
        if not isinstance(text, str):
            raise Exception("Must supply a string to the spaCy tokenizer.")
        spacy_doc = self.nlp(text)

        sentences = []
        for sent in spacy_doc.sents:
            tokens = []
            for tok in sent:
                token_entry = {
                    doc.TEXT: tok.text,
                    doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx+len(tok.text)}"
                }
                tokens.append(token_entry)
            sentences.append(tokens)

        return doc.Document(sentences, text)
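
All of these external-tokenizer wrappers build the same structure: a list of sentences, each a list of token dicts keyed by the constants in stanza.models.common.doc, which the Document constructor then turns into Sentence and Token objects. A minimal sketch of that hand-off with a made-up input; the offsets are computed by hand:

from stanza.models.common import doc

text = "Hi there."
sentences = [[
    {doc.TEXT: "Hi",    doc.MISC: f"{doc.START_CHAR}=0|{doc.END_CHAR}=2"},
    {doc.TEXT: "there", doc.MISC: f"{doc.START_CHAR}=3|{doc.END_CHAR}=8"},
    {doc.TEXT: ".",     doc.MISC: f"{doc.START_CHAR}=8|{doc.END_CHAR}=9"},
]]
document = doc.Document(sentences, text)
print(document.sentences[0].tokens[1].text)   # there
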
Example #8
    def process(self, document):
        """ Tokenize a document with the SudachiPy tokenizer and wrap the results into a Doc object.
        """
        if isinstance(document, doc.Document):
            text = document.text
        else:
            text = document
        if not isinstance(text, str):
            raise Exception("Must supply a string or Stanza Document object to the SudachiPy tokenizer.")

        # we use the default sudachipy tokenization mode (i.e., mode C)
        # more config needs to be added to support other modes

        tokens = self.tokenizer.tokenize(text)

        sentences = []
        current_sentence = []
        for token in tokens:
            token_text = token.surface()
            # by default sudachipy will output whitespace as a token
            # we need to skip these tokens to be consistent with other tokenizers
            if token_text.isspace():
                continue
            start = token.begin()
            end = token.end()

            token_entry = {
                doc.TEXT: token_text,
                doc.MISC: f"{doc.START_CHAR}={start}|{doc.END_CHAR}={end}"
            }
            current_sentence.append(token_entry)

            if not self.no_ssplit and token_text in ['。', '!', '?', '!', '?']:
                sentences.append(current_sentence)
                current_sentence = []

        if len(current_sentence) > 0:
            sentences.append(current_sentence)

        return doc.Document(sentences, text)
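
For reference, the SudachiPy objects used above come from its dictionary/tokenizer API, where surface(), begin(), and end() give each morpheme's text and character span directly. A minimal standalone sketch, assuming sudachipy plus a dictionary package such as sudachidict_core is installed, with a made-up Japanese sentence:

from sudachipy import dictionary, tokenizer

tok = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C   # the default mode noted above
for m in tok.tokenize("今日はいい天気です。", mode):
    print(m.surface(), m.begin(), m.end())
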