def _create_tokenizer(self, corpus_type):
    """Creates a tokenizer based on the input type.

    Args:
        corpus_type (str): A type to create the tokenizer. Should be `char` or `word`.

    Returns:
        The created tokenizer.

    """

    # Rejects any unsupported type before building a pipeline
    if corpus_type not in ['char', 'word']:
        # Creates, logs and raises the error so it is both recorded and surfaced
        e = 'Corpus type should be `char` or `word`.'
        logger.error(e)
        raise RuntimeError(e)

    # Both tokenizers share the same normalization steps (lower-casing and
    # character validation); only the final tokenization function differs
    tokenize = p.tokenize_to_char if corpus_type == 'char' else p.tokenize_to_word

    return p.pipeline(p.lower_case, p.valid_char, tokenize)
def _create_tokenizer(self, corpus_type):
    """Creates a tokenizer based on the input type.

    Args:
        corpus_type (str): A type to create the tokenizer. Should be `char` or `word`.

    Returns:
        The created tokenizer.

    Raises:
        RuntimeError: If `corpus_type` is neither `char` nor `word`.

    """

    # Checks if the requested type is supported
    if corpus_type not in ['char', 'word']:
        # Error message unified with the other versions of this method
        e = 'Corpus type should be `char` or `word`.'
        # Logs the error before raising so the failure is recorded
        logger.error(e)
        raise RuntimeError(e)

    # Pipelines now include the `lower_case` and `valid_char` normalization
    # steps, consistent with the sibling implementations of this method
    if corpus_type == 'char':
        return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

    # `corpus_type` is guaranteed to be `word` here, so a plain return
    # removes the previous implicit-`None` fall-through path
    return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)
def _create_tokenizer(self, corpus_type):
    """Creates a tokenizer based on the input type.

    Args:
        corpus_type (str): A type to create the tokenizer. Should be `char` or `word`.

    Returns:
        The created tokenizer.

    """

    # Validates the requested type before building any pipeline
    if corpus_type not in ['char', 'word']:
        e = 'Corpus type should be `char` or `word`.'
        # Logs the error so the failure is recorded, then raises it
        logger.error(e)
        raise RuntimeError(e)

    # Character-level tokenizer: lower-case, validate characters, split to chars
    if corpus_type == 'char':
        return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

    # Otherwise the type is `word`: same normalization, splits into words
    return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)
import nalp.utils.loader as l
import nalp.utils.preprocess as p

# Loads the raw input text from a .txt file
text = l.load_txt('data/text/chapter1_harry.txt')

# Builds one pre-processing pipeline per tokenization granularity:
# both lower-case and validate characters before tokenizing
char_pipeline = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)
word_pipeline = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Runs the same text through both pipelines
tokens_by_char = char_pipeline(text)
tokens_by_word = word_pipeline(text)

# Prints the tokenized characters and words
print(tokens_by_char)
print(tokens_by_word)