def main(data_dir):
    x_train, x_val, x_test, y_train, y_val, y_test = load_data(data_dir)

    batch_size = 128
    max_vocab_size = 20000
    max_seq_len = 30
    embedding_dim = 100
    lstm_dim = 128
   

    vectorizer = TextVectorization(max_tokens=max_vocab_size,
                                   output_sequence_length=max_seq_len)
    text_data = tf.data.Dataset.from_tensor_slices(x_train).batch(batch_size)
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    vocab = vectorizer.get_vocabulary()
    # load pre-trained w2v model 
    w2v = Word2Vec.load(os.path.join(data_dir, 'processed/w2v.model'))
    print('Building embedding matrix')
    # This matrix will be used to initialize weights in the embedding layer
    embedding_matrix = build_embedding_mat(data_dir, vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    X_train = vectorizer(np.array([[s] for s in x_train])).numpy()
    X_val = vectorizer(np.array([[s] for s in x_val])).numpy()
    X_test = vectorizer(np.array([[s] for s in x_test])).numpy()
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    y_test = np.array(y_test)
    acc_scores = {}
    dropout = 0.7
    for layer in ['sigmoid', 'relu', 'tanh']:
        print('Building the model with {} activation and dropout {}'.format(layer, dropout))
        model = Sequential()
        model.add(Embedding(input_dim=max_vocab_size + 3,
                            output_dim=embedding_dim,
                            input_length=max_seq_len,
                            weights=[embedding_matrix],
                            trainable=True))
        model.add(Flatten())
        model.add(Dense(lstm_dim,
                        activation=layer,
                        kernel_regularizer=l2(0.01),
                        bias_regularizer=l2(0.01)))
        model.add(Dropout(dropout))
        model.add(Dense(2, activation='softmax', name='output_layer'))

        print(model.summary())

        print("Compiling the model")
        model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"])
        
        print("Fitting the model")
        model.fit(X_train, y_train, batch_size=batch_size, epochs=10, validation_data=(X_val, y_val))
        scores = model.evaluate(X_val, y_val)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        acc_scores[layer+"_val"+str(dropout)] = scores[1]*100
        print("Evaluating model on test data")
        scores = model.evaluate(X_test, y_test)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        acc_scores[layer+"_test"+str(dropout)] = scores[1]*100
        # model.save(os.path.join(data_dir, 'processed/'+layer+str(dropout)) )
        model.save(os.path.join(data_dir, 'processed/'+layer+'.model'))
    print(acc_scores)
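    # Illustrative follow-up (not part of the original script): the keys of
    # acc_scores encode the activation and dropout value, so we can report the
    # best validation configuration directly.
    best_cfg = max((k for k in acc_scores if '_val' in k), key=acc_scores.get)
    print('Best validation config: {} => {:.2f}%'.format(best_cfg, acc_scores[best_cfg]))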
Example #2
def vocab_maker(data, max_dic_size, batch_size):
    # Cap the vocabulary at max_dic_size - 1 tokens, leaving room for the
    # reserved padding and out-of-vocabulary entries
    vectorizer = TextVectorization(max_tokens=max_dic_size-1,
                                   output_mode='int')
    text_data = tf.data.Dataset.from_tensor_slices(data).batch(batch_size)
    vectorizer.adapt(text_data)
    # indices 0 and 1 are reserved for padding and out-of-vocabulary tokens
    vocab = vectorizer.get_vocabulary()
    # older TF versions return the vocabulary as bytes, newer ones as str
    vocab = [x.decode('utf-8') if isinstance(x, bytes) else x for x in vocab]
    return vocab
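

# Minimal usage sketch for vocab_maker (illustrative; the sample sentences and
# sizes below are hypothetical, not part of the original example):
sample_texts = ['the cat sat on the mat', 'the dog barked at the cat']
sample_vocab = vocab_maker(sample_texts, max_dic_size=50, batch_size=2)
print(sample_vocab[:5])  # e.g. ['', '[UNK]', 'the', 'cat', ...]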
Example #3
def main(data_dir):
    print('Loading data')
    x_train_val, x_test = load_data(data_dir)
    # decrease dataset size for quick testing
    # x_train_val = x_train_val[:1000]
    # x_test = x_test[:100]

    # build vocab
    # NOTE: this script only considers tokens in the training set to build the
    # vocabulary object.
    vectorizer = TextVectorization(
        max_tokens=config['max_vocab_size'],
        output_sequence_length=config['max_seq_len'])
    text_data = tf.data.Dataset.from_tensor_slices(x_train_val).batch(
        config['batch_size'])
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    # NOTE: in this vocab, index 0 is reserved for padding and 1 is reserved
    # for out of vocabulary tokens
    vocab = vectorizer.get_vocabulary()

    # load pre-trained w2v model (this model was trained in tut_1)
    w2v = Word2Vec.load(os.path.join(data_dir, 'w2v.model'))
    print('Building embedding matrix')
    # This matrix will be used to initialize weights in the embedding layer
    embedding_matrix, word2token = build_embedding_mat(data_dir, vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))

    print('Building Seq2Seq model')

    # build the embedding layer to convert token sequences into embeddings
    # set trainable to True if you wish to further finetune the embeddings.
    # It will increase train time but may yield better results. Try it out
    # on a more complex task (like neural machine translation)!
    embedding_layer = Embedding(
        input_dim=len(vocab) + 4,
        output_dim=config['embedding_dim'],
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    # build the encoding layers
    # encoder_inputs accepts padded tokenized sequences as input,
    # which would be converted to embeddings by the embedding_layer
    # finally, the embedded sequences are fed to the encoder LSTM to get
    # encodings (or vector representation) of the input sentences
    # you can add dropout to the input/embedding layers to make your model more robust
    encoder_inputs = Input((None, ), name='enc_inp')
    enc_embedding = embedding_layer(encoder_inputs)
    # you could use a GRU (or even a Dense layer) instead to keep things
    # simpler; a GRU variant is sketched right after the encoder below.
    # note that we are not using the encoder_outputs for the given generative
    # task, but you'll need them for classification.
    # Also, the hidden dimension is currently equal to the embedding dimension
    _, state_h, state_c = LSTM(
        config['embedding_dim'],  # try a different value
        return_state=True,
        name='enc_lstm')(enc_embedding)
    encoder_states = [state_h, state_c]
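    # Alternative encoder sketch, as mentioned above (illustrative and left
    # commented out so the LSTM decoder below keeps its (h, c) states; GRU is
    # assumed to be importable from tensorflow.keras.layers):
    # _, gru_state = GRU(config['embedding_dim'], return_state=True,
    #                    name='enc_gru')(enc_embedding)
    # encoder_states = [gru_state]  # a GRU decoder would take this single state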

    # build the decoding layers
    # decoder_inputs and dec_embedding serve similar purposes as in the encoding
    # layers. Note that we are using the same embedding_layer to convert
    # token sequences to embeddings while encoding and decoding.
    # In this case, we initialize the decoder using `encoder_states`
    # as its initial state (i.e. vector representation learned by the encoder).
    decoder_inputs = Input((None, ), name='dec_inp')
    dec_embedding = embedding_layer(decoder_inputs)
    dec_lstm = LSTM(config['embedding_dim'],
                    return_state=True,
                    return_sequences=True,
                    name='dec_lstm')
    dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=encoder_states)
    # finally, we add a final fully connected layer which performs the
    # transformation of decoder outputs to logits vectors
    dec_dense = Dense(len(vocab) + 4, activation='softmax', name='out')
    output = dec_dense(dec_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    print(model.summary())

    # note that decoder_input_data and decoder_target_data will be same
    # as we are training a vanilla autoencoder
    # we are using np.ones as pad tokens are represented by 1 in our vocab
    # TODO: switch to a generator instead of creating such huge matrices; it
    # will reduce memory consumption a lot (an illustrative generator is
    # sketched after the data-building loop below).
    encoder_input_data = np.ones((len(x_train_val), config['max_seq_len']),
                                 dtype='float32')
    decoder_input_data = np.ones((len(x_train_val), config['max_seq_len']),
                                 dtype='float32')
    decoder_target_data = np.zeros(
        (len(x_train_val), config['max_seq_len'], len(vocab) + 4),
        dtype='float32')

    for i, input_text in enumerate(x_train_val):
        tokenized_text = tokenize(input_text, word2token)
        for j in range(len(tokenized_text)):
            encoder_input_data[i, j] = tokenized_text[j]
            decoder_input_data[i, j] = tokenized_text[j]
            decoder_target_data[i, j, tokenized_text[j]] = 1.0
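
    # Illustrative generator for the TODO above (a hypothetical helper, not part
    # of the original script): it builds one batch at a time instead of
    # materialising the full one-hot target tensor in memory.
    def ae_batch_generator(texts, batch_size):
        while True:
            for start in range(0, len(texts), batch_size):
                chunk = texts[start:start + batch_size]
                enc = np.ones((len(chunk), config['max_seq_len']), dtype='float32')
                tgt = np.zeros((len(chunk), config['max_seq_len'], len(vocab) + 4),
                               dtype='float32')
                for i, text in enumerate(chunk):
                    for j, tok in enumerate(tokenize(text, word2token)):
                        enc[i, j] = tok
                        tgt[i, j, tok] = 1.0
                # encoder and decoder inputs are identical for this autoencoder
                yield [enc, enc], tgt
    # usage sketch: model.fit(ae_batch_generator(x_train_val, config['batch_size']),
    #                         steps_per_epoch=len(x_train_val) // config['batch_size'],
    #                         epochs=10)
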
    # Run training (will take some time)
    print('Training model')
    # try different optimizers, learning rates, and analyze different metrics
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=config['batch_size'],
        epochs=10,  # try increasing the number of epochs
        validation_split=0.2)
    # Save model
    # this model is saved inside the tut_3/data folder just to showcase how
    # you can save your models as well inside respective assignment folders
    # and use them later
    model.save('tut_3/data/ae.model')
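    # Quick illustrative check (not in the original script): the saved model can
    # be restored later with keras.models.load_model, as the note above suggests.
    restored = keras.models.load_model('tut_3/data/ae.model')
    print('Reloaded model:', restored.name)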
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

"""
## Decoding test sentences

Finally, let's demonstrate how to translate brand new English sentences.
We simply feed into the model the vectorized English sentence
as well as the target token `"[start]"`, then we repeatedly generate the next token, until
we hit the token `"[end]"`.
"""

spa_vocab = spa_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence
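

# Illustrative usage (the input sentence below is hypothetical, not taken from
# the dataset used in this example):
print(decode_sequence("I love the weather today."))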
Example #5
def custom_standardization(input_data):
    # lowercase the text and strip punctuation
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                    '[%s]' % re.escape(string.punctuation), '')


# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length to pad all samples to the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Create vocabulary
vectorize_layer.adapt(text_ds.batch(1024))
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()

# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

# Make sequences (a list of int-encoded sentences)
sequences = list(text_vector_ds.as_numpy_iterator())

# Embedding dim
embedding_dim = 128

########### Generate skip-grams and training data

# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
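
# Minimal sketch of the positive skip-gram step described above (illustrative;
# `window_size` is an assumed hyperparameter and only the first sequence is
# shown). Negative samples would normally be drawn separately, e.g. with
# tf.random.log_uniform_candidate_sampler.
window_size = 2
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    sequences[0],
    vocabulary_size=vocab_size,
    sampling_table=sampling_table,
    window_size=window_size,
    negative_samples=0)
print(len(positive_skip_grams), 'positive skip-gram pairs in the first sequence')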
Example #6
def main(data_dir):
    print('Loading data')
    x_train, x_val, x_test, y_train, y_val, y_test = load_data(data_dir)
    # build vocabulary
    vectorizer = TextVectorization(
        max_tokens=config['max_vocab_size'],
        output_sequence_length=config['max_seq_len'])
    text_data = tf.data.Dataset.from_tensor_slices(x_train).batch(
        config['batch_size'])
    print('Building vocabulary')
    vectorizer.adapt(text_data)
    vocab = vectorizer.get_vocabulary()
    # load pre-trained w2v model
    w2v = Word2Vec.load(os.path.join(data_dir, 'w2v.model'))
    # build embedding matrix
    print('Building embedding matrix')
    embedding_matrix = build_embedding_matrix(vocab, w2v)
    print('embedding_matrix.shape => {}'.format(embedding_matrix.shape))
    print('Building model')

    model = Sequential()
    model.add(
        Embedding(input_dim=len(vocab) + 2,
                  output_dim=config['embedding_dim'],
                  embeddings_initializer=keras.initializers.Constant(
                      embedding_matrix),
                  trainable=False,
                  name='embedding_layer'))
    # add hidden layer with activation, L2 regularization, and dropout
    model.add(
        LSTM(32,
             activation=sys.argv[2],
             kernel_regularizer=l2(0.0001),
             dropout=0.1,
             return_sequences=False,
             name='hidden_layer'))
    # last layer with activation
    model.add(Dense(2, activation='softmax', name='output_layer'))
    model.summary()

    print('train the model')
    # train the model
    # convert words to indices, put them in arrays
    num_classes = 2
    x_train = vectorizer(np.array([[w] for w in x_train])).numpy()
    x_val = vectorizer(np.array([[w] for w in x_val])).numpy()
    y_train = np.array(y_train)
    y_val = np.array(y_val)
    # convert labels to binary class
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_val = keras.utils.to_categorical(y_val, num_classes)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train,
              y_train,
              batch_size=config['batch_size'],
              epochs=12,
              validation_data=(x_val, y_val))

    model.save(os.path.join(data_dir, 'nn_' + sys.argv[2] + '.model'))

    score = model.evaluate(x_val, y_val)
    print("Accuracy: {0: .2f}%".format((score[1] * 100)))