    def _create_tokenizer(self, corpus_type):
        """Creates a tokenizer based on the input type.

        Args:
            corpus_type (str): A type to create the tokenizer. Should be `char` or `word`.

        Returns:
            The created tokenizer.

        """

        # Checks if the type is valid
        if corpus_type not in ['char', 'word']:
            # If not, raises an error
            e = 'Corpus type should be `char` or `word`.'

            # Logs the error
            logger.error(e)

            raise RuntimeError(e)

        # Checks if the corpus type is `char`
        if corpus_type == 'char':
            return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

        # Otherwise, returns a `word` tokenizer
        return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)
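Here `p` is the `nalp.utils.preprocess` module (imported as in the last example below), and `logger` is assumed to be the module's logger. The two branches differ only in the final tokenization step, and the object returned by `p.pipeline` is a plain callable. A minimal usage sketch, with a made-up sample sentence:

import nalp.utils.preprocess as p

# Same pipelines as the two return statements above
char_tokenizer = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)
word_tokenizer = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Both map a raw string to a list of tokens
sample = 'Tokenizers are just callables.'
print(char_tokenizer(sample))
print(word_tokenizer(sample))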
Example 2
    def _create_tokenizer(self, corpus_type):
        """Creates a tokenizer based on the input type.

        Args:
            corpus_type (str): A type to create the tokenizer. Should be `char` or `word`.

        Returns:
            The created tokenizer.

        """

        # Checks if the type is valid
        if corpus_type not in ['char', 'word']:
            # If not, raises an error
            e = 'Type argument should be `char` or `word`.'

            # Logs the error
            logger.error(e)

            raise RuntimeError(e)

        # If the type is char
        if corpus_type == 'char':
            return p.pipeline(p.tokenize_to_char)

        # If the type is word
        if corpus_type == 'word':
            return p.pipeline(p.tokenize_to_word)
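Unlike the previous example, this variant composes only the tokenization step, so the text is not lower-cased or character-filtered before being split. A short sketch of the difference, assuming the same `nalp.utils.preprocess` helpers:

import nalp.utils.preprocess as p

# Tokenizer as built in this example: tokenization only
bare_tokenizer = p.pipeline(p.tokenize_to_word)

# Tokenizer from the previous example: lower-casing and character
# filtering are applied before tokenization
full_tokenizer = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

sample = 'Hello, World!'
print(bare_tokenizer(sample))
print(full_tokenizer(sample))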
Example 3
    def _create_tokenizer(self, corpus_type):
        """Creates a tokenizer based on the input type.

        Args:
            corpus_type (str): A type to create the tokenizer. Should be `char` or `word`.

        Returns:
            The created tokenizer.

        """

        if corpus_type not in ['char', 'word']:
            e = 'Corpus type should be `char` or `word`.'

            logger.error(e)

            raise RuntimeError(e)

        if corpus_type == 'char':
            return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

        return p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)
Example 4
import nalp.utils.loader as l
import nalp.utils.preprocess as p

# Loads an input .txt file
text = l.load_txt('data/text/chapter1_harry.txt')

# Creates a character pre-processing pipeline
char_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

# Creates a word pre-processing pipeline
word_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Applies the character pre-processing pipeline to the text
chars_tokens = char_pipe(text)

# Applies the word pre-processing pipeline to the text
words_tokens = word_pipe(text)

# Prints the tokenized characters and words
print(chars_tokens)
print(words_tokens)
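Continuing the script above, a quick sanity check is to compare how many tokens each pipeline produces; character-level tokenization yields many more tokens than word-level tokenization for the same text:

# Compares the number of tokens produced by each pipeline
print(len(chars_tokens), len(words_tokens))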