Example #1
File: main.py  Project: shravanc/tf
"""
#Printing the datasets
for data_batch, target_batch in dataset.take(1):
  print("data-->", data_batch) #.as_numpy_iterator())
  print("Target-->", target_batch) #.as_numpy_iterator())
"""

# Peek at a single batch to confirm the (batch_size, sequence_length) shapes.
example_input_batch, example_target_batch = next(iter(dataset))
print(example_input_batch.shape, example_target_batch.shape)

# Imports needed by the code below (the original excerpt omits them).
import argparse
import json
import os
import time

import tensorflow as tf
from sklearn.model_selection import train_test_split

from models import Encoder, BahdanauAttention, Decoder

# load_dataset and loss_function are referenced in main() below but are assumed to be
# defined elsewhere in this project (not shown in this excerpt).

# vocab_inp_size, embedding_dim, units and BATCH_SIZE come from earlier in this file
# (not shown in this excerpt).
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Run one sample batch through the encoder to check the output shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(
    sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(
    sample_hidden.shape))

attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden,
                                                      sample_output)

print("Attention result shape: (batch size, units) {}".format(
    attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(
    attention_weights.shape))
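
# NOTE: BahdanauAttention above is imported from models, so its definition is not shown in
# this excerpt. As a hedged sketch only (assuming the standard additive-attention formulation
# from the TensorFlow NMT tutorial, not necessarily this project's exact code), a layer that
# produces the shapes printed above could look like this:
class BahdanauAttentionSketch(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)  # projects the decoder hidden state (query)
        self.W2 = tf.keras.layers.Dense(units)  # projects the encoder outputs (values)
        self.V = tf.keras.layers.Dense(1)       # collapses each projected timestep to a scalar score

    def call(self, query, values):
        # query: (batch, units); values: (batch, seq_len, units)
        query_with_time_axis = tf.expand_dims(query, 1)            # (batch, 1, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))      # (batch, seq_len, 1)
        attention_weights = tf.nn.softmax(score, axis=1)           # softmax over the time axis
        context_vector = tf.reduce_sum(attention_weights * values,
                                       axis=1)                     # (batch, units)
        return context_vector, attention_weights
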
def main():
    parser = argparse.ArgumentParser(description='Translator tester')
    parser.add_argument("-b",
                        "--batch_size",
                        dest="batch_size",
                        help="size of the batch",
                        type=int,
                        default=64)
    parser.add_argument("--emb_dim",
                        dest="embedding_dim",
                        help="embedding dimension",
                        type=int,
                        default=256)
    parser.add_argument("--units",
                        dest="units",
                        help="internal size of the recurrent layers",
                        type=int,
                        default=1024)
    parser.add_argument("--dataset_size",
                        dest="num_examples",
                        help="number of examples to train",
                        type=int,
                        default=30000)
    parser.add_argument("--epochs",
                        dest="epochs",
                        help="number of epochs",
                        type=int,
                        default=10)
    parser.add_argument("--seq_max_len",
                        dest="seq_max_len",
                        help="maximun length of the input sequence",
                        type=int,
                        default=500)

    args = parser.parse_args()

    tf.enable_eager_execution()  # TF 1.x API: run ops eagerly rather than building a graph

    path_to_zip = tf.keras.utils.get_file(
        'spa-eng.zip',
        origin='http://download.tensorflow.org/data/spa-eng.zip',
        extract=True)
    path_to_file = os.path.dirname(path_to_zip) + "/spa-eng/spa.txt"

    num_examples = args.num_examples
    input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(
        path_to_file, num_examples)

    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    N_BATCH = BUFFER_SIZE // args.batch_size
    vocab_inp_size = len(inp_lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(args.batch_size, drop_remainder=True)

    encoder = Encoder(vocab_inp_size, args.embedding_dim, args.units,
                      args.batch_size, args.seq_max_len)
    decoder = Decoder(vocab_tar_size, args.embedding_dim, args.units,
                      args.batch_size, args.seq_max_len)

    optimizer = tf.train.AdamOptimizer()

    checkpoint_dir = './training_checkpoints'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    EPOCHS = args.epochs

    for epoch in range(EPOCHS):
        start = time.time()

        hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset):
            loss = 0

            with tf.GradientTape() as tape:
                enc_output, enc_hidden = encoder(inp, hidden)
                dec_hidden = enc_hidden
                dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] *
                                           args.batch_size, 1)

                for t in range(1, targ.shape[1]):
                    predictions, dec_hidden, _ = decoder(
                        dec_input, dec_hidden, enc_output)
                    loss += loss_function(targ[:, t], predictions)
                    dec_input = tf.expand_dims(targ[:, t], 1)

            batch_loss = (loss / int(targ.shape[1]))
            total_loss += batch_loss
            variables = encoder.variables + decoder.variables
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))
        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    encoder.save_weights('encoder', save_format='tf')
    decoder.save_weights('decoder', save_format='tf')

    model_info = {
        'VIS': vocab_inp_size,
        'VTS': vocab_tar_size,
        'ED': args.embedding_dim,
        'UNITS': args.units,
        'BZ': args.batch_size,
        'DATASET': path_to_file,
        'DSS': num_examples,
        'MAX_SEQ_LEN': args.seq_max_len
    }

    with open('model_info.json', 'w') as outfile:
        json.dump(model_info, outfile)
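
# loss_function, used in the training loop above, is not shown in this excerpt. A hedged
# sketch in the style of the TensorFlow NMT tutorial (an assumption, not necessarily the
# project's actual implementation): padding positions (token id 0) are masked out of the
# per-timestep cross-entropy, and pred is assumed to hold logits of shape (batch, vocab).
def loss_function_sketch(real, pred):
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
    mask = tf.cast(tf.not_equal(real, 0), dtype=loss_.dtype)  # 0 wherever the target is padding
    return tf.reduce_mean(loss_ * mask)


# The excerpt does not show an entry point; something like the following is needed to run
# the script directly, e.g. `python main.py --batch_size 64 --emb_dim 256 --units 1024 --epochs 10`
# (flag names follow the argparse definitions in main()).
if __name__ == '__main__':
    main()
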
Example #3
# Imports needed by the code below (the original excerpt omits them); Encoder and Decoder
# are the project's own seq2seq model classes and their import is not shown here.
import pickle
import re
import unicodedata

import tensorflow as tf


def initialize_chatbot():
    #print("initializing chatbot... \n")
    #print("Loading dictionary...")
    with open("model_data/vocab_dict.p", 'rb') as fp:
        vocab = pickle.load(fp)
    #print(f"Loaded {len(vocab)} words")

    #print("making sample embedding matrix...")
    sample_emb = tf.zeros((len(vocab), 100))
    """ ENCODER WORK """
    #print("Initializing Encoder...")
    encoder = Encoder(len(vocab),
                      100,
                      500,
                      128,
                      sample_emb,
                      num_layers=3,
                      drop_prob=0.1)

    #print("Testing Encoder...")
    sample_hidden = encoder.initialize_hidden_state()
    ex_input_bt = tf.zeros((128, 25))
    sample_output, sample_hidden = encoder(ex_input_bt, sample_hidden)
    assert sample_output.shape == (128, 25, 500)
    assert sample_hidden.shape == (128, 500)

    #print("Loading up encoder...")
    encoder.load_weights("model_data/encoder_gpu.h5")
    """ DECODER WORK """
    #print("Initializing Decoder...")
    decoder = Decoder(len(vocab),
                      100,
                      500,
                      128,
                      sample_emb,
                      num_layers=3,
                      drop_prob=0.1)
    #print("Testing Decoder...")
    sample_decoder_output, _, _ = decoder(tf.random.uniform((128, 1)),
                                          sample_hidden, sample_output)
    assert sample_decoder_output.shape == (128, len(vocab))

    #print("Loading up decoder...")
    decoder.load_weights("model_data/decoder_gpu.h5")

    # inverse vocabulary
    inv_vocab = {v: k for k, v in vocab.items()}
    """ Some variables"""
    pad_token = 0
    sos_token = 1
    eos_token = 2
    units = 500
    maxl = 25
    """Processing functions"""

    # Strip accents: convert a Unicode string to its closest plain-ASCII form.
    def unicodeToAscii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    # Lowercase, trim, and remove non-letter characters
    def normalizeString(s):
        s = unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        s = re.sub(r"\s+", r" ", s).strip()
        return s

    return vocab, encoder, decoder, inv_vocab, pad_token, sos_token, eos_token, units, maxl, unicodeToAscii, normalizeString
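
# A hedged usage sketch (hypothetical helper, not part of the original project): greedy
# decoding with the objects returned by initialize_chatbot(). It assumes the encoder and
# decoder accept batch size 1, although the shape checks above use a batch of 128, and it
# maps unknown words to pad_token (the real project may use a dedicated <unk> id).
def reply(sentence, state):
    (vocab, encoder, decoder, inv_vocab, pad_token, sos_token, eos_token,
     units, maxl, unicodeToAscii, normalizeString) = state

    # Normalize, tokenize, and pad the input to the fixed length used above.
    ids = [vocab.get(w, pad_token) for w in normalizeString(sentence).split()][:maxl]
    ids = ids + [pad_token] * (maxl - len(ids))
    enc_input = tf.constant([ids], dtype=tf.float32)

    enc_output, enc_hidden = encoder(enc_input, tf.zeros((1, units)))

    # Decode greedily, one token at a time, until <eos> or maxl tokens are produced.
    dec_input = tf.constant([[sos_token]], dtype=tf.float32)
    dec_hidden = enc_hidden
    words = []
    for _ in range(maxl):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        next_id = int(tf.argmax(predictions[0]).numpy())
        if next_id == eos_token:
            break
        words.append(inv_vocab.get(next_id, ''))
        dec_input = tf.constant([[next_id]], dtype=tf.float32)
    return ' '.join(words)


# Example:
#   state = initialize_chatbot()
#   print(reply("hello there!", state))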