    def __init__(self,
                 tokens=None,
                 from_file=None,
                 corpus_type='char',
                 min_frequency=1,
                 max_pad_length=None,
                 sos_eos_tokens=True):
        """Initialization method.

        Args:
            tokens (list): A list of tokens.
            from_file (str): An input file to load the sentences.
            corpus_type (str): The desired type of tokenization for the sentences. Should be `char` or `word`.
            min_frequency (int): Minimum frequency of individual tokens.
            max_pad_length (int): Maximum length to pad the tokens.
            sos_eos_tokens (bool): Whether start-of-sentence and end-of-sentence tokens should be used.

        """

        logger.info('Overriding class: Corpus -> SentenceCorpus.')

        # Overrides its parent class with any custom arguments if needed
        super(SentenceCorpus, self).__init__(min_frequency=min_frequency)

        # Checks if there are not pre-loaded tokens
        if not tokens:
            # Loads the sentences from file
            sentences = l.load_txt(from_file).splitlines()

            # Creates a tokenizer based on desired type
            pipe = self._create_tokenizer(corpus_type)

            # Retrieve the tokens
            self.tokens = [pipe(sentence) for sentence in sentences]

        # If there are tokens
        else:
            # Assigns them to the property
            self.tokens = tokens

        # Cuts the tokens based on a minimum frequency
        self._check_token_frequency()

        # Pads the tokens before building the vocabulary
        self._pad_token(max_pad_length, sos_eos_tokens)

        # Builds the vocabulary based on the tokens
        self._build()

        # Debugging some important information
        logger.debug(
            'Sentences: %d | Minimum frequency: %d | Maximum pad length: %s | '
            'Use <SOS> and <EOS>: %s | Vocabulary size: %d.', len(self.tokens),
            self.min_frequency, max_pad_length, sos_eos_tokens,
            len(self.vocab))
        logger.info('SentenceCorpus created.')
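
A minimal usage sketch for the constructor above, assuming SentenceCorpus is importable from nalp.corpus (the exact module path may differ between NALP versions) and reusing the captions file from the later examples; the frequency and padding values are purely illustrative:

# Import path assumed here; it may differ between NALP versions
from nalp.corpus import SentenceCorpus

# Builds a word-level sentence corpus from a file, keeping only tokens that
# appear at least twice, padding every sentence to 10 tokens and adding
# <SOS>/<EOS> markers
corpus = SentenceCorpus(from_file='data/sentence/coco_image_captions.txt',
                        corpus_type='word',
                        min_frequency=2,
                        max_pad_length=10,
                        sos_eos_tokens=True)

# Inspects the first padded sentence and the vocabulary size
print(corpus.tokens[0])
print(len(corpus.vocab))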
Example 2
    def __init__(self,
                 tokens=None,
                 from_file=None,
                 corpus_type='char',
                 min_frequency=1):
        """Initialization method.

        Args:
            tokens (list): A list of tokens.
            from_file (str): An input file to load the text.
            corpus_type (str): The desired type of tokenization for the text. Should be `char` or `word`.
            min_frequency (int): Minimum frequency of individual tokens.

        """

        logger.info('Overriding class: Corpus -> TextCorpus.')

        # Overrides its parent class with any custom arguments if needed
        super(TextCorpus, self).__init__(min_frequency=min_frequency)

        # Checks if there are not pre-loaded tokens
        if not tokens:
            # Loads the text from file
            text = l.load_txt(from_file)

            # Creates a tokenizer based on desired type
            pipe = self._create_tokenizer(corpus_type)

            # Retrieve the tokens
            self.tokens = pipe(text)

        # If there are tokens
        else:
            # Assigns them to the property
            self.tokens = tokens

        # Cuts the tokens based on a minimum frequency
        self._check_token_frequency()

        # Builds the vocabulary based on the tokens
        self._build()

        # Debugging some important information
        logger.debug(
            'Tokens: %d | Minimum frequency: %d | Vocabulary size: %d.',
            len(self.tokens), self.min_frequency, len(self.vocab))
        logger.info('TextCorpus created.')
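
A comparable sketch for this version of TextCorpus, under the same assumption about the import path and reusing the Harry Potter chapter from the later examples; the minimum frequency of 2 is only illustrative:

# Import path assumed here; it may differ between NALP versions
from nalp.corpus import TextCorpus

# Builds a character-level corpus from a raw text file, discarding
# characters that appear only once
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt',
                    corpus_type='char',
                    min_frequency=2)

# Inspects the number of tokens and the resulting vocabulary size
print(len(corpus.tokens))
print(len(corpus.vocab))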
Example 3
    def __init__(self, tokens=None, from_file=None, corpus_type='char'):
        """Initialization method.

        Args:
            tokens (list): A list of tokens.
            from_file (str): An input file to load the text.
            corpus_type (str): The desired type of tokenization for the text. Should be `char` or `word`.

        """

        logger.info('Overriding class: Corpus -> TextCorpus.')

        # Overrides its parent class with any custom arguments if needed
        super(TextCorpus, self).__init__()

        # Checks if there are not pre-loaded tokens
        if not tokens:
            # Loads the text from file
            text = l.load_txt(from_file)

            # Creates a tokenizer based on desired type
            pipe = self._create_tokenizer(corpus_type)

            # Retrieve the tokens
            self.tokens = pipe(text)

        # If there are tokens
        else:
            # Assigns them to the property
            self.tokens = tokens

        # Builds the vocabulary based on the tokens
        self._build(self.tokens)

        # Debugging some important information
        logger.debug('Tokens: %d | Vocabulary Size: %d | Type: %s.',
                     len(self.tokens), len(self.vocab), corpus_type)
        logger.info('TextCorpus created.')
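
This older signature also accepts pre-loaded tokens, skipping file loading and tokenization entirely; a short sketch of that path, where the literal token list is purely illustrative and the import path is assumed as before:

# Import path assumed here; it may differ between NALP versions
from nalp.corpus import TextCorpus

# Builds the vocabulary directly from tokens supplied by the caller
tokens = ['hello', 'world', 'hello', 'nalp']
corpus = TextCorpus(tokens=tokens, corpus_type='word')

print(corpus.tokens)
print(len(corpus.vocab))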
Example 4
import nalp.utils.loader as l
import nalp.utils.preprocess as p

# Loads an input .txt file
text = l.load_txt('data/text/chapter1_harry.txt')

# Creates a character pre-processing pipeline
char_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

# Creates a word pre-processing pipeline
word_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Applying character pre-processing pipeline to text
chars_tokens = char_pipe(text)

# Applying word pre-processing pipeline to text
words_tokens = word_pipe(text)

# Printing tokenized characters and words
print(chars_tokens)
print(words_tokens)
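
The token lists produced above can also be fed straight into TextCorpus through its tokens argument instead of from_file; a brief sketch, under the same assumption about the import path:

# Import path assumed here; it may differ between NALP versions
from nalp.corpus import TextCorpus

# Re-uses the already tokenized characters and words, so no file loading
# or tokenization happens inside the corpus
char_corpus = TextCorpus(tokens=chars_tokens)
word_corpus = TextCorpus(tokens=words_tokens)

print(len(char_corpus.vocab))
print(len(word_corpus.vocab))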
Example 5
import nalp.utils.loader as l
import nalp.utils.preprocess as p

# Loads an input .txt file with sentences
sentences = l.load_txt('data/sentence/coco_image_captions.txt').splitlines()

# Creates character and word pre-processing pipelines
char_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)
word_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Applying character and word pre-processing pipelines to sentences
chars_tokens = [char_pipe(sentence) for sentence in sentences]
words_tokens = [word_pipe(sentence) for sentence in sentences]

# Printing tokenized characters and words
print(chars_tokens)
print(words_tokens)
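
Likewise, these per-sentence token lists match what SentenceCorpus expects in its tokens argument; a short sketch, where the padding length of 10 is purely illustrative and the import path is assumed as before:

# Import path assumed here; it may differ between NALP versions
from nalp.corpus import SentenceCorpus

# Builds sentence corpora directly from the pre-tokenized sentences,
# padding every sentence to 10 tokens and adding <SOS>/<EOS> markers
char_corpus = SentenceCorpus(tokens=chars_tokens, max_pad_length=10)
word_corpus = SentenceCorpus(tokens=words_tokens, max_pad_length=10)

print(len(char_corpus.vocab))
print(len(word_corpus.vocab))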