def __init__(self, tokens=None, from_file=None, corpus_type='char',
             min_frequency=1, max_pad_length=None, sos_eos_tokens=True):
    """Initialization method.

    Args:
        tokens (list): A list of tokens.
        from_file (str): An input file to load the sentences.
        corpus_type (str): The desired type to tokenize the sentences. Should be `char` or `word`.
        min_frequency (int): Minimum frequency of individual tokens.
        max_pad_length (int): Maximum length to pad the tokens.
        sos_eos_tokens (bool): Whether start-of-sentence and end-of-sentence tokens should be used.

    """

    logger.info('Overriding class: Corpus -> SentenceCorpus.')

    # Overrides its parent class with any custom arguments if needed
    super(SentenceCorpus, self).__init__(min_frequency=min_frequency)

    # Checks if there are not pre-loaded tokens
    if not tokens:
        # Loads the sentences from file
        sentences = l.load_txt(from_file).splitlines()

        # Creates a tokenizer based on desired type
        pipe = self._create_tokenizer(corpus_type)

        # Retrieves the tokens
        self.tokens = [pipe(sentence) for sentence in sentences]

    # If there are tokens
    else:
        # Gathers them to the property
        self.tokens = tokens

    # Cuts the tokens based on a minimum frequency
    self._check_token_frequency()

    # Pads the tokens before building the vocabulary
    self._pad_token(max_pad_length, sos_eos_tokens)

    # Builds the vocabulary based on the tokens
    self._build()

    # Debugging some important information
    logger.debug(
        'Sentences: %d | Minimum frequency: %d | Maximum pad length: %s | '
        'Use <SOS> and <EOS>: %s | Vocabulary size: %d.',
        len(self.tokens), self.min_frequency, max_pad_length,
        sos_eos_tokens, len(self.vocab))

    logger.info('SentenceCorpus created.')
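For reference, a minimal usage sketch follows, assuming SentenceCorpus is importable from nalp.corpus (the exact import path may vary across NALP versions); the captions file and the padding values are illustrative, taken from the sentence example later in this section.

from nalp.corpus import SentenceCorpus

# Builds a word-level corpus from a file of sentences, keeping only tokens
# that appear at least twice, padding every sentence to 10 tokens and
# surrounding it with <SOS>/<EOS> markers
corpus = SentenceCorpus(from_file='data/sentence/coco_image_captions.txt',
                        corpus_type='word', min_frequency=2,
                        max_pad_length=10, sos_eos_tokens=True)

# The vocabulary is available right after construction
print(len(corpus.vocab))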
def __init__(self, tokens=None, from_file=None, corpus_type='char', min_frequency=1):
    """Initialization method.

    Args:
        tokens (list): A list of tokens.
        from_file (str): An input file to load the text.
        corpus_type (str): The desired type to tokenize the text. Should be `char` or `word`.
        min_frequency (int): Minimum frequency of individual tokens.

    """

    logger.info('Overriding class: Corpus -> TextCorpus.')

    super(TextCorpus, self).__init__(min_frequency=min_frequency)

    # Checks if there are not pre-loaded tokens
    if not tokens:
        # Loads the text from file
        text = l.load_txt(from_file)

        # Creates a tokenizer based on desired type
        pipe = self._create_tokenizer(corpus_type)

        # Retrieves the tokens
        self.tokens = pipe(text)

    else:
        # Gathers them to the property
        self.tokens = tokens

    # Cuts the tokens based on a minimum frequency
    self._check_token_frequency()

    # Builds the vocabulary based on the tokens
    self._build()

    logger.debug(
        'Tokens: %d | Minimum frequency: %d | Vocabulary size: %d.',
        len(self.tokens), self.min_frequency, len(self.vocab))

    logger.info('TextCorpus created.')
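A corresponding sketch for TextCorpus, under the same import-path assumption; the .txt file is the one used in the pipeline example further below.

from nalp.corpus import TextCorpus

# Builds a character-level corpus straight from a .txt file,
# discarding tokens that occur fewer than 2 times
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt',
                    corpus_type='char', min_frequency=2)

print(len(corpus.tokens), len(corpus.vocab))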
def __init__(self, tokens=None, from_file=None, corpus_type='char'):
    """Initialization method.

    Args:
        tokens (list): A list of tokens.
        from_file (str): An input file to load the text.
        corpus_type (str): The desired type to tokenize the text. Should be `char` or `word`.

    """

    logger.info('Overriding class: Corpus -> TextCorpus.')

    # Overrides its parent class with any custom arguments if needed
    super(TextCorpus, self).__init__()

    # Checks if there are not pre-loaded tokens
    if not tokens:
        # Loads the text from file
        text = l.load_txt(from_file)

        # Creates a tokenizer based on desired type
        pipe = self._create_tokenizer(corpus_type)

        # Retrieves the tokens
        self.tokens = pipe(text)

    # If there are tokens
    else:
        # Gathers them to the property
        self.tokens = tokens

    # Builds the vocabulary based on the tokens
    self._build(self.tokens)

    # Debugging some important information
    logger.debug('Tokens: %d | Vocabulary Size: %d | Type: %s.',
                 len(self.tokens), len(self.vocab), corpus_type)

    logger.info('TextCorpus created.')
import nalp.utils.loader as l
import nalp.utils.preprocess as p

# Loads an input .txt file
text = l.load_txt('data/text/chapter1_harry.txt')

# Creates a character pre-processing pipeline
char_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)

# Creates a word pre-processing pipeline
word_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Applies the character pre-processing pipeline to the text
chars_tokens = char_pipe(text)

# Applies the word pre-processing pipeline to the text
words_tokens = word_pipe(text)

# Prints the tokenized characters and words
print(chars_tokens)
print(words_tokens)
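Since the constructors shown earlier also accept pre-loaded tokens, the output of a pipeline can be passed straight to a corpus; this sketch assumes TextCorpus is importable from nalp.corpus.

from nalp.corpus import TextCorpus

# Reuses the word tokens produced by the pipeline instead of loading from file
corpus = TextCorpus(tokens=words_tokens, min_frequency=1)

print(len(corpus.vocab))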
import nalp.utils.loader as l
import nalp.utils.preprocess as p

# Loads an input .txt file with sentences
sentences = l.load_txt('data/sentence/coco_image_captions.txt').splitlines()

# Creates character and word pre-processing pipelines
char_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_char)
word_pipe = p.pipeline(p.lower_case, p.valid_char, p.tokenize_to_word)

# Applies the character and word pre-processing pipelines to every sentence
chars_tokens = [char_pipe(sentence) for sentence in sentences]
words_tokens = [word_pipe(sentence) for sentence in sentences]

# Prints the tokenized characters and words
print(chars_tokens)
print(words_tokens)
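Likewise, the per-sentence token lists can be handed to SentenceCorpus directly; this sketch assumes the same import path, and the padding length is illustrative.

from nalp.corpus import SentenceCorpus

# Reuses the tokenized sentences instead of loading from file,
# padding each one to 10 tokens with <SOS>/<EOS> markers
corpus = SentenceCorpus(tokens=words_tokens, min_frequency=1,
                        max_pad_length=10, sos_eos_tokens=True)

print(len(corpus.vocab))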