# encoding: utf8

import os
import datetime
import logging

import tensorflow as tf
import tensorflow_addons as tfa
from gensim.models import Word2Vec
from tensorflow.keras.initializers import Constant

from senti_analysis import config, constants  # `constants` module path assumed
from senti_analysis.preprocess import load_embedding_matrix, load_tokenizer
# NOTE: `train_data` and `corpus_word_embedding` are referenced below but
# their definitions are not shown in the source; the import and the path are
# a best-effort reconstruction.
from senti_analysis.data import train_data  # assumed helper location

_logger = logging.getLogger(__name__)


def get_model(learning_rate=config.LEARNING_RATE, name='model_v1'):
    """
    Build and compile the multi-output classification model.

    :param learning_rate: Adam learning rate.
    :param name: model name.
    :return: compiled tf.keras.Model with one softmax head per column
        in constants.COLS.
    """
    num_class = 4
    embedding_matrix = load_embedding_matrix()

    inputs = tf.keras.layers.Input(shape=(config.MAX_SEQUENCE_LENGTH,), name='input')
    embedding = tf.keras.layers.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                                          embeddings_initializer=Constant(embedding_matrix),
                                          input_length=config.MAX_SEQUENCE_LENGTH,
                                          trainable=False)(inputs)
    share_hidden = tf.keras.layers.GRU(64, activation='relu', return_sequences=True, reset_after=True)(embedding)
    share_hidden = tf.keras.layers.GRU(32, activation='relu', reset_after=True)(share_hidden)

    # one independent softmax classification head per aspect column
    outputs = []
    for col in constants.COLS:
        # outputs.append(fc_nn(share_hidden, name=col))
        outputs.append(tf.keras.layers.Dense(num_class, activation='softmax', name=col)(share_hidden))

    model = tf.keras.Model(inputs=inputs, outputs=outputs, name=name)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  # caution: tfa.metrics.F1Score expects one-hot targets, so it
                  # may need adaptation when labels are sparse integers.
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='acc'),
                           tfa.metrics.F1Score(num_class, average='micro')])

    return model
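

# Usage sketch (not from the original source): the model exposes one softmax
# head per column in constants.COLS, so Keras expects the labels as a dict
# keyed by head name (or a list in the same order). `labels_by_col` is a
# hypothetical mapping of column name -> integer labels in [0, 4).
def _example_fit(x, labels_by_col):
    model = get_model()
    y = {col: labels_by_col[col] for col in constants.COLS}
    return model.fit(x, y, batch_size=config.BATCH_SIZE, epochs=1)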


def train(epochs=config.EPOCHS, learning_rate=config.LEARNING_RATE):
    """Train the service waiters attitude classification model."""
    _logger.info('load data.')
    x_train, y_train, x_val, y_val = train_data()

    _logger.info('get and compile model')
    # the original passed `embedding_matrix` as the first positional argument,
    # which get_model() would have taken as the learning rate; get_model()
    # loads the embedding matrix itself, so only the learning rate is needed.
    model = get_model(learning_rate=learning_rate)
    # re-compile to track plain accuracy only; this overrides the metrics
    # configured inside get_model().
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['acc'])

    log_dir = os.path.join(
        config.LOG_DIR,
        'fit/{}/{}'.format(model.name,
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                          histogram_freq=1,
                                                          update_freq='batch')
    checkpoint_path = os.path.join(config.MODEL_CHECKPOINT_PATH,
                                   '{}'.format(model.name))
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                     save_weights_only=True,
                                                     verbose=1)

    _logger.info('training model')

    history = model.fit(x_train,
                        y_train,
                        batch_size=config.BATCH_SIZE,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        callbacks=[tensorboard_callback, cp_callback],
                        workers=config.WORKER_NUM)

    _logger.info('save model')
    model_path = os.path.join(config.MODEL_PATH, model.name)
    # model.save(model_path)
    model.save_weights(model_path)

    _logger.info('done')

    return history
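

# Inference sketch (an assumption, not shown in the source): weights written
# by the ModelCheckpoint callback above can be restored into a freshly built
# model with the same architecture before prediction.
def _example_restore(name='model_v1'):
    model = get_model(name=name)
    model.load_weights(os.path.join(config.MODEL_CHECKPOINT_PATH, name))
    return model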


def init_embedding_matrix():
    """Build and cache the embedding matrix from a trained word2vec model."""
    # wv_model = KeyedVectors.load(tencent_pretrained_word_embedding)
    wv_model = Word2Vec.load(corpus_word_embedding)
    tokenizer = load_tokenizer()
    load_embedding_matrix(tokenizer.word_index, wv_model)
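

# Illustrative reconstruction (the real load_embedding_matrix lives in
# senti_analysis.preprocess and is not shown here): a typical implementation
# maps each tokenizer index to its word vector, leaving unknown words and the
# padding row as zeros.
def _example_build_matrix(word_index, wv_model):
    import numpy as np

    dim = wv_model.wv.vector_size
    matrix = np.zeros((len(word_index) + 1, dim))  # row 0 is the padding index
    for word, i in word_index.items():
        if word in wv_model.wv:
            matrix[i] = wv_model.wv[word]
    return matrix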
Example #4
# encoding: utf8
# Cache a pretrained word-embedding model and build the tokenizer's
# embedding matrix from it.

import env
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from senti_analysis import config
from senti_analysis.preprocess import load_embedding_matrix, load_tokenizer

tencent_pretrained_word_embedding = '/Users/hotbaby/Datasets/Tencent_AILab_ChineseEmbedding.txt'
corpus_word_embedding = '/Users/hotbaby/code/github/sentiment-analysis/notebooks/w2v.model'

if __name__ == '__main__':
    # load the Tencent pretrained embeddings from word2vec text format
    wv_model = KeyedVectors.load_word2vec_format(
        tencent_pretrained_word_embedding, binary=False)
    # wv_model = Word2Vec.load(corpus_word_embedding)

    # cache the vectors in gensim's native format for faster reloading
    wv_model.save(config.W2V_MODEL_PATH)

    tokenizer = load_tokenizer()
    load_embedding_matrix(tokenizer.word_index, wv_model)
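
    # Sanity check (an assumption, not in the original): the cached vectors
    # can be reloaded with gensim's native loader, mirroring the commented-out
    # KeyedVectors.load(...) usage earlier in this document.
    reloaded = KeyedVectors.load(config.W2V_MODEL_PATH)
    assert reloaded.vector_size == wv_model.vector_size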