""" #Printing the datasets for data_batch, target_batch in dataset.take(1): print("data-->", data_batch) #.as_numpy_iterator()) print("Target-->", target_batch) #.as_numpy_iterator()) """ example_input_batch, example_target_batch = next(iter(dataset)) example_input_batch.shape, example_target_batch.shape from models import Encoder, BahdanauAttention, Decoder encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE) # sample input sample_hidden = encoder.initialize_hidden_state() sample_output, sample_hidden = encoder(example_input_batch, sample_hidden) print('Encoder output shape: (batch size, sequence length, units) {}'.format( sample_output.shape)) print('Encoder Hidden state shape: (batch size, units) {}'.format( sample_hidden.shape)) attention_layer = BahdanauAttention(10) attention_result, attention_weights = attention_layer(sample_hidden, sample_output) print("Attention result shape: (batch size, units) {}".format( attention_result.shape)) print("Attention weights shape: (batch_size, sequence_length, 1) {}".format( attention_weights.shape))
def main():
    parser = argparse.ArgumentParser(description='Translator tester')
    parser.add_argument("-b", "--batch_size", dest="batch_size",
                        help="size of the batch", type=int, default=64)
    parser.add_argument("--emb_dim", dest="embedding_dim",
                        help="embedding dimension", type=int, default=256)
    parser.add_argument("--units", dest="units",
                        help="internal size of the recurrent layers",
                        type=int, default=1024)
    parser.add_argument("--dataset_size", dest="num_examples",
                        help="number of examples to train",
                        type=int, default=30000)
    parser.add_argument("--epochs", dest="epochs",
                        help="number of epochs", type=int, default=10)
    parser.add_argument("--seq_max_len", dest="seq_max_len",
                        help="maximum length of the input sequence",
                        type=int, default=500)
    args = parser.parse_args()

    tf.enable_eager_execution()

    # Download and unpack the English-Spanish sentence pairs.
    path_to_zip = tf.keras.utils.get_file(
        'spa-eng.zip',
        origin='http://download.tensorflow.org/data/spa-eng.zip',
        extract=True)
    path_to_file = os.path.dirname(path_to_zip) + "/spa-eng/spa.txt"

    num_examples = args.num_examples
    input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(
        path_to_file, num_examples)

    # 80/20 train/validation split.
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    N_BATCH = BUFFER_SIZE // args.batch_size
    vocab_inp_size = len(inp_lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(args.batch_size, drop_remainder=True)

    encoder = Encoder(vocab_inp_size, args.embedding_dim, args.units,
                      args.batch_size, args.seq_max_len)
    decoder = Decoder(vocab_tar_size, args.embedding_dim, args.units,
                      args.batch_size, args.seq_max_len)

    optimizer = tf.train.AdamOptimizer()

    checkpoint_dir = './training_checkpoints'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)

    EPOCHS = args.epochs

    for epoch in range(EPOCHS):
        start = time.time()

        hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset):
            loss = 0

            with tf.GradientTape() as tape:
                enc_output, enc_hidden = encoder(inp, hidden)
                dec_hidden = enc_hidden

                # Teacher forcing: feed the ground-truth token at each step.
                dec_input = tf.expand_dims(
                    [targ_lang.word2idx['<start>']] * args.batch_size, 1)

                for t in range(1, targ.shape[1]):
                    predictions, dec_hidden, _ = decoder(
                        dec_input, dec_hidden, enc_output)
                    loss += loss_function(targ[:, t], predictions)
                    dec_input = tf.expand_dims(targ[:, t], 1)

            batch_loss = (loss / int(targ.shape[1]))
            total_loss += batch_loss

            variables = encoder.variables + decoder.variables
            gradients = tape.gradient(loss, variables)
            optimizer.apply_gradients(zip(gradients, variables))

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))

        # Save a checkpoint every 2 epochs.
        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    encoder.save_weights('encoder', save_format='tf')
    decoder.save_weights('decoder', save_format='tf')

    model_info = {
        'VIS': vocab_inp_size,
        'VTS': vocab_tar_size,
        'ED': args.embedding_dim,
        'UNITS': args.units,
        'BZ': args.batch_size,
        'DATASET': path_to_file,
        'DSS': num_examples,
        'MAX_SEQ_LEN': args.seq_max_len
    }
    with open('model_info.json', 'w') as outfile:
        json.dump(model_info, outfile)
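# Hedged addition, not in the original script: a minimal sketch of rebuilding
# the trained models for inference from model_info.json and the weights saved
# above with save_format='tf'. It assumes the Encoder/Decoder constructors
# take the same arguments as in main().
def load_trained_models(info_path='model_info.json'):
    with open(info_path) as fp:
        info = json.load(fp)
    encoder = Encoder(info['VIS'], info['ED'], info['UNITS'],
                      info['BZ'], info['MAX_SEQ_LEN'])
    decoder = Decoder(info['VTS'], info['ED'], info['UNITS'],
                      info['BZ'], info['MAX_SEQ_LEN'])
    # load_weights on a subclassed model is matched lazily; the variables are
    # restored once the models have been built/called on real inputs.
    encoder.load_weights('encoder')
    decoder.load_weights('decoder')
    return encoder, decoder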
def initialize_chatbot():
    #print("initializing chatbot... \n")

    #print("Loading dictionary...")
    with open("model_data/vocab_dict.p", 'rb') as fp:
        vocab = pickle.load(fp)
    #print(f"Loaded {len(vocab)} words")

    #print("making sample embedding matrix...")
    sample_emb = tf.zeros((len(vocab), 100))

    """ ENCODER WORK """
    #print("Initializing Encoder...")
    encoder = Encoder(len(vocab), 100, 500, 128, sample_emb,
                      num_layers=3, drop_prob=0.1)

    #print("Testing Encoder...")
    sample_hidden = encoder.initialize_hidden_state()
    ex_input_bt = tf.zeros((128, 25))
    sample_output, sample_hidden = encoder(ex_input_bt, sample_hidden)
    assert sample_output.shape == (128, 25, 500)
    assert sample_hidden.shape == (128, 500)

    #print("Loading up encoder...")
    encoder.load_weights("model_data/encoder_gpu.h5")

    """ DECODER WORK """
    #print("Initializing Decoder...")
    decoder = Decoder(len(vocab), 100, 500, 128, sample_emb,
                      num_layers=3, drop_prob=0.1)

    #print("Testing Decoder...")
    sample_decoder_output, _, _ = decoder(tf.random.uniform((128, 1)),
                                          sample_hidden, sample_output)
    assert sample_decoder_output.shape == (128, len(vocab))

    #print("Loading up decoder...")
    decoder.load_weights("model_data/decoder_gpu.h5")

    # inverse vocabulary
    inv_vocab = {v: k for k, v in vocab.items()}

    """ Some variables """
    pad_token = 0
    sos_token = 1
    eos_token = 2
    units = 500
    maxl = 25

    """ Processing functions """

    # Convert (or remove accents) sentence to non_accents sentence
    def unicodeToAscii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    # Lowercase, trim, and remove non-letter characters
    def normalizeString(s):
        s = unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        s = re.sub(r"\s+", r" ", s).strip()
        return s

    return (vocab, encoder, decoder, inv_vocab, pad_token, sos_token,
            eos_token, units, maxl, unicodeToAscii, normalizeString)
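# Hedged addition, not in the original file: a minimal greedy-decoding sketch
# built on the pieces returned by initialize_chatbot(). It assumes vocab maps
# word -> index (inv_vocab is built as its inverse above) and reuses the
# batch size of 128 that the encoder/decoder were smoke-tested with. In a real
# chat loop you would call initialize_chatbot() once and reuse its outputs.
def reply(sentence):
    (vocab, encoder, decoder, inv_vocab, pad_token, sos_token, eos_token,
     units, maxl, unicodeToAscii, normalizeString) = initialize_chatbot()

    # Tokenise on whitespace, map unknown words to the pad token, pad to maxl.
    words = normalizeString(sentence).split()
    ids = [vocab.get(w, pad_token) for w in words][:maxl]
    ids += [pad_token] * (maxl - len(ids))

    # Tile the single sentence across the whole batch and read back row 0.
    inputs = tf.tile(tf.constant([ids], dtype=tf.int32), [128, 1])

    hidden = encoder.initialize_hidden_state()
    enc_output, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.fill((128, 1), sos_token)
    result = []
    for _ in range(maxl):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        predicted_id = int(tf.argmax(predictions[0]).numpy())
        if predicted_id == eos_token:
            break
        result.append(inv_vocab.get(predicted_id, ''))
        dec_input = tf.fill((128, 1), predicted_id)
    return ' '.join(result)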