def process(self, document):
    assert isinstance(document, str) or isinstance(document, doc.Document) or (self.config.get('pretokenized') or self.config.get('no_ssplit', False)), \
        "If neither 'pretokenized' nor 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object."

    if isinstance(document, doc.Document):
        # a pretokenized Document is already in its final form
        if self.config.get('pretokenized'):
            return document
        document = document.text

    if self.config.get('pretokenized'):
        raw_text, document = self.process_pre_tokenized_text(document)
        return doc.Document(document, raw_text)

    # external tokenizers (jieba, spacy, sudachipy, ...) are handled by the variant
    if hasattr(self, '_variant'):
        return self._variant.process(document)

    raw_text = '\n\n'.join(document) if isinstance(document, list) else document
    # set up batches
    batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
    # get dict data
    _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
                                           self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
                                           orig_text=raw_text,
                                           no_ssplit=self.config.get('no_ssplit', False))
    return doc.Document(document, raw_text)

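# A minimal usage sketch of the two options this method branches on, via the public
# stanza.Pipeline API; 'tokenize_pretokenized' and 'tokenize_no_ssplit' are the documented
# pipeline-level option names, which the processor sees as 'pretokenized' / 'no_ssplit' in
# self.config. This is an illustrative sketch, not part of the processor itself.
import stanza

# pretokenized: input is already split into sentences and tokens, so the neural model is skipped
nlp_pre = stanza.Pipeline(lang='en', processors='tokenize', tokenize_pretokenized=True)
doc_pre = nlp_pre([['This', 'is', 'a', 'test', '.']])

# no_ssplit: run the tokenizer, but keep each newline-separated paragraph as a single sentence
nlp_nosplit = stanza.Pipeline(lang='en', processors='tokenize', tokenize_no_ssplit=True)
doc_nosplit = nlp_nosplit('One clause, still one sentence. Another clause, same sentence.')
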
def bulk_process(self, docs):
    """
    The tokenizer cannot use UDProcessor's sentence-level cross-document batching interface, and requires special handling.

    Essentially, this method concatenates the text of multiple documents with "\n\n", tokenizes it with the neural tokenizer,
    then splits the result into the original Documents and recovers the original character offsets.
    """
    if hasattr(self, '_variant'):
        return self._variant.bulk_process(docs)

    if self.config.get('pretokenized'):
        res = []
        for document in docs:
            raw_text, document = self.process_pre_tokenized_text(document.text)
            res.append(doc.Document(document, raw_text))
        return res

    combined_text = '\n\n'.join([thisdoc.text for thisdoc in docs])
    processed_combined = self.process(doc.Document([], text=combined_text))

    # postprocess sentences and tokens to reset back pointers and char offsets
    charoffset = 0
    sentst = senten = 0
    for thisdoc in docs:
        # advance senten until the next sentence no longer fits inside this document's text
        while senten < len(processed_combined.sentences) and processed_combined.sentences[senten].tokens[-1].end_char - charoffset <= len(thisdoc.text):
            senten += 1

        sentences = processed_combined.sentences[sentst:senten]
        thisdoc.sentences = sentences
        for sent in sentences:
            # fix doc back pointers for sentences
            sent._doc = thisdoc

            # fix char offsets for tokens and words
            for token in sent.tokens:
                token._start_char -= charoffset
                token._end_char -= charoffset
                if token.words:  # not-yet-processed MWT can leave empty tokens
                    for word in token.words:
                        word._start_char -= charoffset
                        word._end_char -= charoffset

        thisdoc.num_tokens = sum(len(sent.tokens) for sent in sentences)
        thisdoc.num_words = sum(len(sent.words) for sent in sentences)

        sentst = senten
        charoffset += len(thisdoc.text) + 2

    return docs

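# A minimal standalone sketch of the offset-recovery arithmetic used in bulk_process above:
# token spans computed against texts joined with "\n\n" are shifted back into per-document
# coordinates. The function name and the (start, end) span representation are illustrative,
# not Stanza API.
def split_spans_by_doc(doc_texts, spans):
    # spans: sorted (start_char, end_char) pairs indexing into the '\n\n'-joined text
    out = [[] for _ in doc_texts]
    charoffset = 0
    i = 0
    for d, text in enumerate(doc_texts):
        # a span belongs to this document while its end still fits inside the current text
        while i < len(spans) and spans[i][1] - charoffset <= len(text):
            start, end = spans[i]
            out[d].append((start - charoffset, end - charoffset))
            i += 1
        charoffset += len(text) + 2  # +2 accounts for the "\n\n" separator
    return out

# split_spans_by_doc(["Hi.", "Bye."], [(0, 3), (5, 9)]) == [[(0, 3)], [(0, 4)]]
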
def process(self, text):
    """ Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object. """
    if not isinstance(text, str):
        raise Exception("Must supply a string to the Jieba tokenizer.")

    tokens = self.nlp.cut(text, cut_all=False)
    sentences = []
    current_sentence = []
    offset = 0
    for token in tokens:
        # whitespace advances the character offset but is not kept as a token
        if re.match(r'\s+', token):
            offset += len(token)
            continue

        token_entry = {
            doc.TEXT: token,
            doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token)}"
        }
        current_sentence.append(token_entry)
        offset += len(token)

        # end the sentence on common Chinese/ASCII sentence-final punctuation
        if token in ['。', '!', '?', '!', '?']:
            sentences.append(current_sentence)
            current_sentence = []

    if len(current_sentence) > 0:
        sentences.append(current_sentence)

    return doc.Document(sentences, text)

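# A minimal standalone sketch of the same offset bookkeeping using jieba directly (outside the
# wrapper); the plain 'text'/'start_char'/'end_char' dict layout is illustrative, standing in
# for the doc.TEXT / doc.MISC entries built above.
import re
import jieba

def jieba_tokens_with_offsets(text):
    tokens = []
    offset = 0
    for tok in jieba.cut(text, cut_all=False):
        if re.match(r'\s+', tok):
            offset += len(tok)  # skip whitespace, but keep offsets aligned with the raw text
            continue
        tokens.append({'text': tok, 'start_char': offset, 'end_char': offset + len(tok)})
        offset += len(tok)
    return tokens

# jieba_tokens_with_offsets('我爱北京天安门。') yields offsets that index into the original string
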
def process(self, document):
    """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object. """
    if isinstance(document, doc.Document):
        text = document.text
    else:
        text = document
    if not isinstance(text, str):
        raise Exception("Must supply a string or Stanza Document object to the spaCy tokenizer.")

    spacy_doc = self.nlp(text)

    sentences = []
    for sent in spacy_doc.sents:
        tokens = []
        for tok in sent:
            token_entry = {
                doc.TEXT: tok.text,
                doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx+len(tok.text)}"
            }
            tokens.append(token_entry)
        sentences.append(tokens)

    # if no_ssplit is set, flatten all the sentences into one sentence
    if self.no_ssplit:
        sentences = [[t for s in sentences for t in s]]

    return doc.Document(sentences, text)

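# A minimal standalone sketch of the same conversion using spaCy directly; the blank English
# pipeline with a rule-based 'sentencizer' is an assumption about how self.nlp could be built,
# not necessarily the wrapper's actual configuration.
import spacy

nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

def spacy_sentences_with_offsets(text):
    spacy_doc = nlp(text)
    return [
        [{'text': tok.text, 'start_char': tok.idx, 'end_char': tok.idx + len(tok.text)}
         for tok in sent]
        for sent in spacy_doc.sents
    ]

# spacy_sentences_with_offsets('Hello there. General Kenobi.') -> two sentences with char offsets
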
def process(self, document):
    """ Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object. """
    if isinstance(document, doc.Document):
        text = document.text
    else:
        text = document
    if not isinstance(text, str):
        raise Exception("Must supply a string or Stanza Document object to the PyThaiNLP tokenizer.")

    sentences = []
    current_sentence = []
    offset = 0
    if self.no_ssplit:
        # skip sentence segmentation
        sent_strs = [text]
    else:
        sent_strs = self.pythai_sent_tokenize(text, engine='crfcut')
    for sent_str in sent_strs:
        for token_str in self.pythai_word_tokenize(sent_str, engine='newmm'):
            # by default pythainlp will output whitespace as a token
            # we need to skip these tokens to be consistent with other tokenizers
            if token_str.isspace():
                offset += len(token_str)
                continue
            # create token entry
            token_entry = {
                doc.TEXT: token_str,
                doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token_str)}"
            }
            current_sentence.append(token_entry)
            offset += len(token_str)
        # finish sentence
        sentences.append(current_sentence)
        current_sentence = []

    if len(current_sentence) > 0:
        sentences.append(current_sentence)

    return doc.Document(sentences, text)

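# A minimal standalone sketch of the same two-level segmentation using PyThaiNLP directly:
# sentence segmentation with the 'crfcut' engine, then word segmentation with 'newmm'.
# Like the wrapper above, it assumes the sentence strings are contiguous in the original text,
# so the running offset stays aligned with the raw input.
from pythainlp.tokenize import sent_tokenize, word_tokenize

def pythai_sentences_with_offsets(text):
    sentences = []
    offset = 0
    for sent_str in sent_tokenize(text, engine='crfcut'):
        current = []
        for token_str in word_tokenize(sent_str, engine='newmm'):
            if token_str.isspace():
                offset += len(token_str)
                continue
            current.append({'text': token_str,
                            'start_char': offset,
                            'end_char': offset + len(token_str)})
            offset += len(token_str)
        sentences.append(current)
    return sentences
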
def process(self, document):
    assert isinstance(document, str) or isinstance(document, doc.Document) or (self.config.get('pretokenized') or self.config.get('no_ssplit', False)), \
        "If neither 'pretokenized' nor 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object."

    if isinstance(document, doc.Document):
        document = document.text

    if self.config.get('pretokenized'):
        raw_text, document = self.process_pre_tokenized_text(document)
    elif hasattr(self, '_variant'):
        return self._variant.process(document)
    else:
        raw_text = '\n\n'.join(document) if isinstance(document, list) else document
        # set up batches
        if self.config.get('lang') == 'vi':
            # special processing is due for Vietnamese
            text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip()
            dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
            data = paras_to_chunks(text, dummy_labels)
            batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
        else:
            batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
        # get dict data
        _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
                                               self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
                                               orig_text=raw_text,
                                               no_ssplit=self.config.get('no_ssplit', False))
    return doc.Document(document, raw_text)

def tokenize(self, text):
    """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object. """
    if not isinstance(text, str):
        raise Exception("Must supply a string to the spaCy tokenizer.")

    spacy_doc = self.nlp(text)

    sentences = []
    for sent in spacy_doc.sents:
        tokens = []
        for tok in sent:
            token_entry = {
                doc.TEXT: tok.text,
                doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx+len(tok.text)}"
            }
            tokens.append(token_entry)
        sentences.append(tokens)

    return doc.Document(sentences, text)

def process(self, document):
    """ Tokenize a document with the SudachiPy tokenizer and wrap the results into a Doc object. """
    if isinstance(document, doc.Document):
        text = document.text
    else:
        text = document
    if not isinstance(text, str):
        raise Exception("Must supply a string or Stanza Document object to the SudachiPy tokenizer.")

    # we use the default sudachipy tokenization mode (i.e., mode C)
    # more config needs to be added to support other modes
    tokens = self.tokenizer.tokenize(text)
    sentences = []
    current_sentence = []
    for token in tokens:
        token_text = token.surface()
        # by default sudachipy will output whitespace as a token
        # we need to skip these tokens to be consistent with other tokenizers
        if token_text.isspace():
            continue
        start = token.begin()
        end = token.end()

        token_entry = {
            doc.TEXT: token_text,
            doc.MISC: f"{doc.START_CHAR}={start}|{doc.END_CHAR}={end}"
        }
        current_sentence.append(token_entry)

        # end the sentence on common Japanese/ASCII sentence-final punctuation
        if not self.no_ssplit and token_text in ['。', '!', '?', '!', '?']:
            sentences.append(current_sentence)
            current_sentence = []

    if len(current_sentence) > 0:
        sentences.append(current_sentence)

    return doc.Document(sentences, text)

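# A minimal standalone sketch of the same conversion using SudachiPy directly; SplitMode.C
# (the coarsest split) matches the default mode noted above, and an installed dictionary
# such as sudachidict_core is assumed.
from sudachipy import dictionary, tokenizer

sudachi = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C

def sudachi_tokens_with_offsets(text):
    tokens = []
    for m in sudachi.tokenize(text, mode):
        if m.surface().isspace():
            continue
        tokens.append({'text': m.surface(), 'start_char': m.begin(), 'end_char': m.end()})
    return tokens

# sudachi_tokens_with_offsets('国家公務員は働く。') returns surface forms with character offsets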