Example no. 1
def get_model(embedding_dimension, essay_length):
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(
        glove_directory=GLOVE_DIR, embedding_dimension=embedding_dimension)

    model = Sequential()

    # Frozen GloVe embedding layer; its weights are not updated during training.
    model.add(
        Embedding(vocabulary_size,
                  embedding_dimension,
                  weights=[embedding_matrix],
                  input_length=essay_length,
                  trainable=False,
                  mask_zero=False))
    model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4))
    model.add(Dropout(0.5))
    model.add(Lambda(lambda x: K.mean(x, axis=1, keepdims=True)))
    # Single sigmoid output unit; essay scores are assumed normalized to [0, 1].
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='mean_squared_error',
                  optimizer='rmsprop',
                  metrics=['mae'])
    model.summary()

    return model
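
The get_model variants in this listing all call a load_embedding_matrix helper that is not shown here. A minimal sketch of what such a helper could look like, assuming standard GloVe text files (e.g. glove.6B.100d.txt) under GLOVE_DIR and the same module-level tokenizer that get_model reads:

import os
import numpy as np

def load_embedding_matrix(glove_directory, embedding_dimension):
    # Read the GloVe vectors into a word -> vector dictionary.
    # The file name is an assumption; adjust it to the GloVe release in use.
    glove_file = os.path.join(glove_directory,
                              'glove.6B.%dd.txt' % embedding_dimension)
    embeddings_index = {}
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Build a (vocabulary_size, embedding_dimension) matrix aligned with the
    # module-level tokenizer used above; row 0 stays all zeros for padding.
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocabulary_size, embedding_dimension))
    for word, index in tokenizer.word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[index] = vector
    return embedding_matrix
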
Example no. 2
def get_model(embedding_dimension, essay_length):
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(
        glove_directory=GLOVE_DIR, embedding_dimension=embedding_dimension)

    model = Sequential()

    model.add(
        Embedding(vocabulary_size,
                  embedding_dimension,
                  weights=[embedding_matrix],
                  input_length=essay_length,
                  trainable=False,
                  mask_zero=False))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='mean_squared_error',
                  optimizer='rmsprop',
                  metrics=['mae'])
    model.summary()

    return model
Example no. 3
def get_model(embedding_dimension, essay_length):
    """
    Returns compiled model.
    """
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(GLOVE_DIR, embedding_dimension)

    model = Sequential()
    model.add(
        Embedding(vocabulary_size,
                  embedding_dimension,
                  weights=[embedding_matrix],
                  input_length=essay_length,
                  trainable=False,
                  mask_zero=False))
    # model.add(Conv1D(filters=50, kernel_size=5, padding='same'))
    # model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
    # model.add(Dropout(0.4))
    # model.add(Lambda(lambda x: K.mean(x, axis=1, keepdims=True)))
    model.add(Conv1D(filters=50, kernel_size=5, padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(
        Dense(1,
              activation='sigmoid',
              activity_regularizer=keras.regularizers.l2(0.0)))
    model.compile(loss='mean_squared_error', optimizer='adam')

    return model
Example no. 4
def train(model_name):
    embedding_matrix = utils.load_embedding_matrix()

    model = construct_model(model_name, seq_length, 8, embedding_matrix)

    model.summary()
    plot_model(model,
               to_file=model_path + model_name + '.png',
               show_shapes=False)

    x_train, y_train, x_test, y_test, label_names = utils.load_sina_news()

    tb = keras.callbacks.TensorBoard(log_dir='./logs',
                                     histogram_freq=0,
                                     write_graph=True,
                                     write_images=True)
    ckpt = keras.callbacks.ModelCheckpoint(model_path + model_name + '.h5',
                                           monitor='val_loss',
                                           mode='min',
                                           verbose=1,
                                           save_best_only=True,
                                           period=1)

    history = model.fit(x_train,
                        y_train,
                        batch_size=256,
                        epochs=100,
                        validation_split=0.16,
                        callbacks=[tb, ckpt],
                        verbose=2)
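
train() above keeps the History object returned by fit but never uses it. A minimal sketch of how the per-epoch metrics could be inspected, assuming it is placed right after the fit call inside train() and that matplotlib is available (the output file name is illustrative):

    import matplotlib.pyplot as plt

    # history.history maps metric names ('loss', 'val_loss', ...) to per-epoch values.
    plt.plot(history.history['loss'], label='train loss')
    plt.plot(history.history['val_loss'], label='validation loss')
    plt.xlabel('epoch')
    plt.legend()
    plt.savefig(model_path + model_name + '_loss.png')
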
Example no. 5
def get_model(embedding_dimension, essay_length):
    """
    Returns compiled model.
    """
    vocabulary_size = len(tokenizer.word_index) + 1
    embedding_matrix = load_embedding_matrix(
        glove_directory=GLOVE_DIR, embedding_dimension=embedding_dimension)
    model = Sequential()
    model.add(
        Embedding(vocabulary_size,
                  embedding_dimension,
                  weights=[embedding_matrix],
                  input_length=essay_length,
                  trainable=True,
                  mask_zero=True))
    model.add(
        Conv1DWithMasking(nb_filter=64,
                          filter_length=3,
                          border_mode='same',
                          subsample_length=1))
    model.add(
        LSTM(128, dropout=0.4, recurrent_dropout=0.4, return_sequences=False))
    model.add(Dropout(0.4))
    model.add(Lambda(lambda x: K.mean(x, axis=1, keepdims=True)))
    model.add(
        Dense(1,
              activation='sigmoid',
              activity_regularizer=keras.regularizers.l2(0.0)))
    model.compile(loss='mse', optimizer='rmsprop')

    return model
Example no. 6
    def predict(self, load_best_model=True, evaluate=False):
        embedding_matrix = load_embedding_matrix(self.config.matrix_path)
        char_embedding_matrix = load_embedding_matrix(
            self.config.char_matrix_path)
        _, validationset, testa = load_data(self.config.data_path)
        validationset = [
            validationset[0], validationset[1][self.arrangement_index]
        ]

        model = self.cls(embedding_matrix=embedding_matrix,
                         char_embedding_matrix=char_embedding_matrix,
                         max_len=self.config.max_len,
                         max_char_len=self.config.max_char_len,
                         category_num=self.config.category_num,
                         dropout=self.config.dropout,
                         optimizer=self.config.optimizer,
                         arrangement_index=self.arrangement_index,
                         loss=self.config.loss,
                         metrics=self.config.metrics,
                         need_char_level=self.config.need_char_level,
                         need_summary=self.config.need_summary,
                         vector_trainable=self.config.vector_trainable)

        path = './models/save_model_' + model.name + '/' + self.config.arrangement
        if self.config.model_name is not None:
            model.load_weights(os.path.join(path, self.config.model_name))
        elif load_best_model is True:
            final_name = find_best_model(path, monitor=self.config.monitor)
            print('Full model name: %s' % final_name)
            model_path = os.path.join(path, final_name)
            model.load_weights(model_path)
        else:
            raise ValueError('A model path must be specified, or a trained model must already exist')

        test_data = testa[0] if evaluate is False else validationset[0]
        model.predict_loss = False
        output_array = model.predict(test_data=test_data,
                                     pre_batch_size=self.config.pre_batch_size,
                                     verbose=self.config.verbose)
        if evaluate:
            return output_array, validationset[1]
        return output_array
Example no. 7
    def train(self):
        embedding_matrix = load_embedding_matrix(self.config.matrix_path)
        if self.config.need_char_level:
            char_embedding_matrix = load_embedding_matrix(
                self.config.char_matrix_path)
        else:
            char_embedding_matrix = None
        trainingset, validationset, _ = load_data(self.config.data_path)

        model = self.cls(embedding_matrix=embedding_matrix,
                         char_embedding_matrix=char_embedding_matrix,
                         max_len=self.config.max_len,
                         max_char_len=self.config.max_char_len,
                         category_num=self.config.category_num,
                         dropout=self.config.dropout,
                         optimizer=self.config.optimizer,
                         arrangement_index=self.arrangement_index,
                         loss=self.config.loss,
                         metrics=self.config.metrics,
                         need_char_level=self.config.need_char_level,
                         need_summary=self.config.need_summary,
                         vector_trainable=self.config.vector_trainable)

        validationset = [[
            validationset[0][0], validationset[0][self.arrangement_index + 1]
        ], validationset[1][self.arrangement_index]]
        model.train_model(
            [trainingset[0][0], trainingset[0][self.arrangement_index + 1]],
            trainingset[1][self.arrangement_index],
            arrangement=self.config.arrangement,
            batch_size=self.config.batch_size,
            valid_batch_size=self.config.valid_batch_size,
            epochs=self.config.epochs,
            verbose=self.config.verbose,
            validation_data=validationset,
            monitor=self.config.monitor,
            load_model_name=self.config.model_name)
Example no. 8
BATCH_SIZE = 64
EPOCHS = 10
VERBOSE = True

# Load data
train, valid, test, dic = utils.load_data(DATA_FLD)
x_train, y_train = train
x_valid, y_valid = valid
x_test, y_test = test
word2idx, idx2word, label2idx, idx2label = dic

MAX_LEN = max([len(x) for x in x_train])
NUM_LABEL = len(label2idx)

# Load embedding
embeddings = utils.load_embedding_matrix(GLOVE_PATH, word2idx, EMBEDDING_DIM,
                                         MAX_NUM_WORDS)
# embeddings = load_my_emb(word2idx, EMBEDDING_DIM, MAX_NUM_WORDS)
num_words = min(MAX_NUM_WORDS, len(word2idx) + 1)

# Construct network
word_ids = Input(batch_shape=(None, None), dtype='int32')
lengths = Input(batch_shape=(None, None), dtype='int32')
inputs = [word_ids, lengths]
# inputs = [word_ids]

embedding_layer = Embedding(input_dim=embeddings.shape[0],
                            output_dim=embeddings.shape[1],
                            mask_zero=True,
                            weights=[embeddings])(word_ids)
# embedding_layer = Dropout(DROPOUT)(embedding_layer)
z = Bidirectional(LSTM(units=WORD_LSTM_SIZE,
Example no. 9
DROPOUT = 0.2
HIDDEN_RNN_UNITS = 192
HIDDEN_DENSE_UNITS = 2048
LEARNING_RATE = 0.001
EPOCHS = 100
BATCH_SIZE = 64

## Load Datasets
train_x1, train_x2, train_features, train_y, valid_x1, valid_x2, valid_y, valid_features = loadDataset(
)
print('Dataset Loaded')

start_time = time.time()

## Load Embedding Matrix
(embedding_matrix, vocab_size) = load_embedding_matrix()


## Define Model
def build_model():
    input_1 = Input(shape=(MAX_LENGTH, ))
    input_2 = Input(shape=(MAX_LENGTH, ))

    e = Embedding(vocab_size,
                  300,
                  weights=[embedding_matrix],
                  input_length=MAX_LENGTH,
                  trainable=False)
    encoded1 = e(input_1)
    encoded2 = e(input_2)
Example no. 10
def run():
    df = pd.read_csv(config.INPUT_FILE)

    if config.TRAIN_PROMPT:
        df = df[['prompt', 'essay', config.TRAIN_FOR]]
    else:
        df = df[['essay', config.TRAIN_FOR]]

    df['essay_cleaned'] = df['essay'].apply(utils.replace_label)

    tokenizer = Tokenizer(num_words=config.VOCAB_SIZE)
    if config.TRAIN_PROMPT:
        tokenizer.fit_on_texts(df['prompt'])
    tokenizer.fit_on_texts(df['essay_cleaned'])

    X = utils.preprocess(df['essay_cleaned'], tokenizer, config.MAX_LEN)
    if config.TRAIN_PROMPT:
        X_prompt = utils.preprocess(df['prompt'], tokenizer,
                                    config.MAX_LEN_PROMPT)

    y = df[config.TRAIN_FOR].values

    # Uncomment if getting "DNN implementation Not Found" Error
    # physical_devices = tf.config.list_physical_devices('GPU')
    # tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

    embeddings = utils.load_embedding_matrix(tokenizer, config.GLOVE_PATH)

    if config.TRAIN_PROMPT:
        model = utils.get_model_prompt()
    else:
        model = utils.get_model(embeddings)

    model.compile(loss='mse', optimizer='adam', metrics=['mae'])

    mcp_save = ModelCheckpoint(
        filepath=
        f'../models/model-PROMPT_{config.TRAIN_PROMPT}_{config.TRAIN_FOR}_epochs_{config.EPOCHS}_{datetime.now()}.h5',
        save_best_only=True,
        monitor='val_mae',
        mode='min',
        verbose=1)

    earlyStopping = EarlyStopping(monitor='val_loss',
                                  patience=10,
                                  verbose=1,
                                  mode='min')

    if config.TRAIN_PROMPT:
        history = model.fit([X_prompt, X],
                            y,
                            batch_size=config.BATCH_SIZE,
                            epochs=config.EPOCHS,
                            validation_split=.2,
                            verbose=1,
                            callbacks=[mcp_save, earlyStopping])
    else:
        history = model.fit(X,
                            y,
                            batch_size=config.BATCH_SIZE,
                            epochs=config.EPOCHS,
                            validation_split=.3,
                            verbose=1,
                            shuffle=True,
                            callbacks=[mcp_save, earlyStopping])
    # print(model.summary())
    '''
    For saving pickle model
    with open(f'../models/model-TRAIN_PROMPT-{config.TRAIN_PROMPT}-\
    {config.TRAIN_FOR}-epochs-{config.EPOCHS}-\
    {datetime.now()}.pickle', 'wb') as handle:
        pickle.dump(history.history, handle)

    with open(f'../models/tokenizer_essays.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle)
    '''

    # Saving the model
    if config.TRAIN_PROMPT:
        MODEL_DIR = f"../models/prompt-essay/PROMPT_{config.TRAIN_FOR}"
    else:
        MODEL_DIR = f"../models/{config.TRAIN_FOR}"
    version = "1"
    export_path = os.path.join(MODEL_DIR, version)
    print('export_path = {}\n'.format(export_path))

    tf.keras.models.save_model(model,
                               export_path,
                               overwrite=True,
                               include_optimizer=True,
                               save_format=None,
                               signatures=None,
                               options=None)
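
Because save_format is left as None and export_path has no .h5 extension, the model above is written in the TensorFlow SavedModel format. A minimal sanity check, assuming it is placed right after the save_model call inside run() and that config.TRAIN_PROMPT is False so the model takes a single input:

    # Reload the exported SavedModel and score a few essays as a smoke test.
    reloaded_model = tf.keras.models.load_model(export_path)
    print(reloaded_model.predict(X[:5]))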