from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char')

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=10, batch_size=1, shuffle=True)

# Iterating over one batch
for input_batch, target_batch in dataset.batches.take(1):
    # For every input and target inside the batch
    for x, y in zip(input_batch, target_batch):
        # Transforms the tensor to numpy and decodes it
        print(encoder.decode(x.numpy()), encoder.decode(y.numpy()))
import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.utils.transformer_utils import CustomSchedule
from nalp.utils.constants import BUFFER_SIZE

if __name__ == "__main__":
    model_name = "ted_hrlr_translate_pt_en_converter"
    tf.keras.utils.get_file(
        f"{model_name}.zip",
        f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
        cache_dir='.', cache_subdir='', extract=True)
    tokenizers = tf.saved_model.load(model_name)

    # Creating a word TextCorpus from file
    corpus = TextCorpus(
        from_file='../../data/text/news_korean_to_english_google_pbmt.txt',
        corpus_type="word")

    sequences = corpus.sequences.copy()
    dataset = tf.data.Dataset.from_tensor_slices(sequences)

    def tokenize(en):
        en = tokenizers.en.tokenize(en)

        # Convert from ragged to dense, padding with zeros
        en = en.to_tensor()

        return en

    BUFFER_SIZE = 20000
    BATCH_SIZE = 64
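    # The script stops right after the buffer and batch sizes are defined. Below is a
    # minimal sketch of how batching might continue, assuming the standard tf.data
    # shuffle/batch idiom and the tokenize() helper above; the make_batches name and
    # AUTOTUNE prefetching are illustrative assumptions, not part of the NALP API.
    def make_batches(ds):
        return (ds
                .shuffle(BUFFER_SIZE)
                .batch(BATCH_SIZE)
                .map(tokenize, num_parallel_calls=tf.data.AUTOTUNE)
                .prefetch(tf.data.AUTOTUNE))

    batches = make_batches(dataset)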
from nalp.corpus import TextCorpus

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char', min_frequency=1)

# Creating a word TextCorpus from file
# corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='word', min_frequency=1)

# Accessing TextCorpus properties
print(corpus.tokens)
print(corpus.vocab, corpus.vocab_size)
print(corpus.vocab_index, corpus.index_vocab)
import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import LSTMGenerator
from opytimizer import Opytimizer
from opytimizer.core import Function
from opytimizer.optimizers.swarm import PSO
from opytimizer.spaces import SearchSpace

# Creates a character TextCorpus from file
corpus = TextCorpus(from_file="examples/integrations/nalp/chapter1_harry.txt", corpus_type="char")

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=10, batch_size=64)


def lstm(opytimizer):
    # Gathers parameters from Opytimizer
    # Pay close attention to their order when declaring, as it must match their bounds
    learning_rate = opytimizer[0][0]
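    # The objective is cut off above. A minimal sketch of how it might continue,
    # assuming the fitness is the final training loss of an LSTMGenerator compiled
    # with the sampled learning rate; model sizes, epoch count, agent count and
    # learning-rate bounds below are illustrative assumptions.
    model = LSTMGenerator(encoder=encoder, vocab_size=corpus.vocab_size,
                          embedding_size=64, hidden_size=128)

    # As in the other examples, the stateful LSTM is built with the dataset's batch size
    model.build((64, None))
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True))
    history = model.fit(dataset.batches, epochs=3)

    # Opytimizer minimizes the objective, so the last training loss works as fitness
    return history.history['loss'][-1]


# Wiring the search space (one variable: the learning rate), PSO and the objective
space = SearchSpace(n_agents=5, n_variables=1, lower_bound=[0.0001], upper_bound=[0.01])
optimizer = PSO()
function = Function(lstm)

Opytimizer(space, optimizer, function).start(n_iterations=3)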
import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import LSTMGenerator

# Creating a word TextCorpus from file
corpus = TextCorpus(from_file='iliad.txt', corpus_type='word')

# Creating an IntegerEncoder
encoder = IntegerEncoder()

# Learns the encoding based on the TextCorpus dictionary and reverse dictionary
encoder.learn(corpus.vocab_index, corpus.index_vocab)

# Applies the encoding on new data
encoded_tokens = encoder.encode(corpus.tokens)

# Creating Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_length=25, batch_size=128)

# Creating the LSTM
lstm = LSTMGenerator(encoder=encoder, vocab_size=corpus.vocab_size, embedding_size=256, hidden_size=1024)

# As NALP's LSTMs are stateful, we need to build it with a fixed batch size
lstm.build((128, None))

# Compiling the LSTM
lstm.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
             loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
             metrics=['accuracy'])
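# A minimal sketch of how training and sampling might follow, assuming the Keras-style
# fit() inherited by NALP generators and a temperature-sampling helper on the generator;
# the epoch count, prompt and sampling parameters below are illustrative assumptions.
lstm.fit(dataset.batches, epochs=100)

# Seeding the generator with a word-level prompt and printing the sampled tokens
tokens = lstm.generate_temperature_sampling(start=['sing', 'o', 'goddess'],
                                            max_length=100, temperature=0.5)
print(' '.join(tokens))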