Example #1
from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt',
                    corpus_type='char')

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating a Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens,
                                  max_contiguous_pad_length=10,
                                  batch_size=1,
                                  shuffle=True)

# Iterating over one batch
for input_batch, target_batch in dataset.batches.take(1):
    # For every input and target inside the batch
    for x, y in zip(input_batch, target_batch):
        # Transforms the tensor to numpy and decodes it
        print(encoder.decode(x.numpy()), encoder.decode(y.numpy()))
Example #2
import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.utils.transformer_utils import CustomSchedule

if __name__ == "__main__":
    model_name = "ted_hrlr_translate_pt_en_converter"
    tf.keras.utils.get_file(
        f"{model_name}.zip",
        f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
        cache_dir='.',
        cache_subdir='',
        extract=True)
    tokenizers = tf.saved_model.load(model_name)

    # Creating a word TextCorpus from file
    corpus = TextCorpus(
        from_file='../../data/text/news_korean_to_english_google_pbmt.txt',
        corpus_type="word")

    sequences = corpus.sequences.copy()

    dataset = tf.data.Dataset.from_tensor_slices(sequences)

    def tokenize(en):
        en = tokenizers.en.tokenize(en)
        # Convert from ragged to dense, padding with zeros.
        en = en.to_tensor()
        return en

    BUFFER_SIZE = 20000
    BATCH_SIZE = 64
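The example is cut off here; a plausible continuation (a sketch, assuming the standard tf.data shuffle/batch pattern and the tokenize helper defined above; the batches name is hypothetical) would turn the raw sequences into padded, batched tensors:

    # Hypothetical continuation: shuffling, batching and tokenizing the pipeline
    batches = (dataset
               .shuffle(BUFFER_SIZE)
               .batch(BATCH_SIZE)
               .map(tokenize, num_parallel_calls=tf.data.AUTOTUNE)
               .prefetch(tf.data.AUTOTUNE))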
Example #3
from nalp.corpus import TextCorpus

# Creating a character TextCorpus from file
corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='char', min_frequency=1)

# Creating a word TextCorpus from file
# corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='word', min_frequency=1)

# Accessing TextCorpus properties
print(corpus.tokens)
print(corpus.vocab, corpus.vocab_size)
print(corpus.vocab_index, corpus.index_vocab)
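A quick way to see the effect of min_frequency (a sketch; the cutoff of 5 is an arbitrary choice) is to rebuild the corpus with a higher threshold and compare vocabulary sizes:

# Tokens appearing fewer than 5 times are dropped from the vocabulary
pruned_corpus = TextCorpus(from_file='data/text/chapter1_harry.txt', corpus_type='word', min_frequency=5)
print(pruned_corpus.vocab_size)  # expected to be smaller than with min_frequency=1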
Example #4
import tensorflow as tf
from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import LSTMGenerator

from opytimizer import Opytimizer
from opytimizer.core import Function
from opytimizer.optimizers.swarm import PSO
from opytimizer.spaces import SearchSpace

# Creates a character TextCorpus from file
corpus = TextCorpus(from_file="examples/integrations/nalp/chapter1_harry.txt",
                    corpus_type="char")

# Creating an IntegerEncoder, learning encoding and encoding tokens
encoder = IntegerEncoder()
encoder.learn(corpus.vocab_index, corpus.index_vocab)
encoded_tokens = encoder.encode(corpus.tokens)

# Creating a Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens,
                                  max_contiguous_pad_length=10,
                                  batch_size=64)


def lstm(opytimizer):
    # Gathers parameters from Opytimizer
    # Pay close attention to their order when declaring, as it must match the search space bounds
    learning_rate = opytimizer[0][0]
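The function is cut off at this point; a minimal completion (a sketch, assuming Opytimizer's SearchSpace/PSO API; the model name, hidden size, bounds, agent count and iteration count are all hypothetical choices) would train the LSTM briefly, return its loss, and then run the swarm search over the learning rate:

    # Hypothetical objective: build, briefly train and score the LSTM
    model = LSTMGenerator(encoder=encoder, vocab_size=corpus.vocab_size,
                          embedding_size=256, hidden_size=512)

    # NALP's LSTMs are stateful, so they are built with a fixed batch size
    model.build((64, None))
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True))
    history = model.fit(dataset.batches, epochs=1)

    # Returns the final training loss as the fitness value
    return history.history['loss'][-1]


# Searching for the learning rate in a hypothetical [0.0001, 0.01] range
space = SearchSpace(n_agents=5, n_variables=1,
                    lower_bound=[0.0001], upper_bound=[0.01])
opt = Opytimizer(space, PSO(), Function(lstm))
opt.start(n_iterations=3)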
Example #5
import tensorflow as tf

from nalp.corpus import TextCorpus
from nalp.datasets import LanguageModelingDataset
from nalp.encoders import IntegerEncoder
from nalp.models.generators import LSTMGenerator

# Creating a word TextCorpus from file
corpus = TextCorpus(from_file='iliad.txt', corpus_type='word')

# Creating an IntegerEncoder
encoder = IntegerEncoder()

# Learns the encoding based on the TextCorpus dictionary and reverse dictionary
encoder.learn(corpus.vocab_index, corpus.index_vocab)

# Applies the encoding on new data
encoded_tokens = encoder.encode(corpus.tokens)

# Creating a Language Modeling Dataset
dataset = LanguageModelingDataset(encoded_tokens, max_contiguous_pad_length=25, batch_size=128)

# Creating the LSTM
lstm = LSTMGenerator(encoder=encoder, vocab_size=corpus.vocab_size, embedding_size=256, hidden_size=1024)

# As NALP's LSTMs are stateful, we need to build it with a fixed batch size
lstm.build((128, None))

# Compiling the LSTM
lstm.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001),
             loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
             metrics=[tf.metrics.SparseCategoricalAccuracy(name='accuracy')])
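From here, training follows the usual Keras flow (a sketch; the epoch count is arbitrary):

# Fitting the LSTM on the language modeling batches
lstm.fit(dataset.batches, epochs=100)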