def main():
    """Train, evaluate and demonstrate the feed-forward POS tagger.

    The corpus selected by ``CORPUS`` is loaded, context features are
    extracted with ``ContextDictorizer`` and vectorized with
    ``DictVectorizer`` (scaled as well when ``SCALER`` is set). The network
    returned by ``build_model`` is trained with early stopping on the
    development set, evaluated on the test set, used to tag a few sample
    sentences, and its training curves are plotted.

    All settings come from module-level constants; nothing is returned.
    """
    start_time = time.perf_counter()

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    elif VILDE:
        # PTB, read from the hard-coded local directory
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos(
                BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
    else:
        # PTB, loader default location
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos()

    # Convert the corpus in a dictionary
    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    if MINI_CORPUS:
        # Keep only the first tenth of the training sentences
        train_dict = train_dict[:len(train_dict) // 10]
    test_dict = conll_dict.transform(test_sentences)

    # Extract the context and dictorize it
    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    X_dict_train, y_cat_train = context_dictorizer.transform(train_dict)
    X_dict_dev, y_cat_dev = context_dictorizer.transform(dev_dict)

    # Transform the X symbols into numbers
    dict_vectorizer = DictVectorizer()
    X_num_train = dict_vectorizer.fit_transform(X_dict_train)
    X_num_dev = dict_vectorizer.transform(X_dict_dev)

    scaler = None
    if SCALER:
        # Standardize X_num; the scaler is fitted on the training set only
        scaler = StandardScaler(with_mean=False)
        X = scaler.fit_transform(X_num_train)
        X_dev = scaler.transform(X_num_dev)
    else:
        X = X_num_train
        X_dev = X_num_dev

    # Vectorizing y: the POS inventory is taken from the training set
    pos_list = sorted(set(y_cat_train))
    NB_CLASSES = len(pos_list)
    idx2pos = dict(enumerate(pos_list))
    pos2idx = {v: k for k, v in idx2pos.items()}

    # We encode y. Parts of speech unseen in training are mapped to 0.
    # NOTE(review): 0 is also the index of the first real tag in pos_list,
    # so unseen tags are scored as that tag -- confirm this is intended.
    y = [pos2idx[i] for i in y_cat_train]
    y_dev = [pos2idx.get(i, 0) for i in y_cat_dev]

    # The tagger
    np.random.seed(0)
    model = build_model(X.shape[1], NB_CLASSES,
                        num_layers=NUM_LAYERS,
                        dropout=DROPOUT)

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor='val_acc',
                                patience=2,
                                restore_best_weights=True)
    ]

    # Fitting the model
    history = model.fit(X, y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, y_dev))
    if SAVE_MODEL:
        model.save(config + '.h5')

    # Formatting the test set with the vectorizer/scaler fitted on training
    X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)
    X_test_num = dict_vectorizer.transform(X_test_dict)
    if scaler is not None:
        X_test = scaler.transform(X_test_num)
    else:
        X_test = X_test_num
    y_test = [pos2idx.get(i, 0) for i in y_test_cat]

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Configuration', config)
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    # Evaluation on the test set, sentence by sentence
    total = 0
    correct = 0
    print('#Sentences', len(test_dict))
    for sentence in test_dict:
        y_pred = predict_sentence(sentence, model, context_dictorizer,
                                  dict_vectorizer, scaler, idx2pos)
        # `word` (not `y`) so the training labels above are not rebound
        for word in y_pred:
            total += 1
            if word['pos'] == word['ppos']:
                correct += 1
    # An empty test set must not crash the report
    accuracy = correct / total if total else 0.0
    print('total %d, correct %d, accuracy %f' % (total, correct, accuracy))

    # Tag some sentences
    sentences = [
        'That round table might collapse .',
        'The man can learn well .',
        'The man can swim .',
        'The man can simwo .',
        'that round table might collapsex'
    ]
    for sentence in sentences:
        sent_dict = sentence_to_conll(sentence.lower())
        y_test_pred_cat = predict_sentence(sent_dict, model,
                                           context_dictorizer,
                                           dict_vectorizer, scaler, idx2pos)
        print([word['form'] for word in y_test_pred_cat])
        print([word['ppos'] for word in y_test_pred_cat])

    # Show the training curves
    # NOTE(review): assumes build_model compiles a metric reported under the
    # key 'acc' -- confirm against build_model.
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
def main():
    """Train, evaluate and demonstrate the recurrent POS tagger.

    The corpus selected by ``CORPUS`` is loaded and turned into parallel
    word/POS sequences. A vocabulary is built from the training words and
    the GloVe keys, and an embedding matrix is initialised from GloVe. The
    model chosen by ``TYPE`` ('RNN' or 'LSTM') and ``OUTPUT_LAYER``
    ('DENSE' or 'CRF') is trained with early stopping, evaluated on the
    test set in three ways, used to tag sample sentences, and its training
    curves are plotted.

    All settings come from module-level constants; nothing is returned.

    Raises:
        ValueError: if ``TYPE``/``OUTPUT_LAYER`` is not a supported pair.
    """
    start_time = time.perf_counter()
    print('Starting:', config)

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    elif VILDE:
        # PTB, read from the hard-coded local directory
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos(
                BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
    else:
        # PTB, loader default location
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)

    X_train_cat, Y_train_cat = build_sequences(train_dict)
    X_dev_cat, Y_dev_cat = build_sequences(dev_dict)
    print('First sentence, words', X_train_cat[0])
    print('First sentence, POS', Y_train_cat[0])

    # We collect the words, parts of speech and we create the indices
    vocabulary_words = sorted(
        {word for sentence in X_train_cat for word in sentence})
    print('#', len(vocabulary_words), 'words')

    # The embedding matrix, we collect the words in the file
    if VILDE:
        embeddings_dict = datasets.load_glove_vectors(
            BASE_DIR='/home/pierre/Cours/EDAN20/corpus/')
    else:
        embeddings_dict = datasets.load_glove_vectors()
    print('Words in embedding file:', len(embeddings_dict.keys()))
    # Union of corpus and embedding words, sorted so indices are stable
    vocabulary_words = sorted(
        set(vocabulary_words + list(embeddings_dict.keys())))
    print('# unique words in the vocabulary: embeddings and corpus:',
          len(vocabulary_words))

    pos = sorted({tag for sentence in Y_train_cat for tag in sentence})
    NB_CLASSES = len(pos)
    print('#', NB_CLASSES, 'Parts of speech:', pos)

    # We create the indexes
    # Words start at two to make provision for the padding symbol 0
    # and unknown words, 1. POS tags start at one (0 is padding).
    idx2word = dict(enumerate(vocabulary_words, start=2))
    idx2pos = dict(enumerate(pos, start=1))
    word2idx = {v: k for k, v in idx2word.items()}
    pos2idx = {v: k for k, v in idx2pos.items()}
    print('word index:', list(word2idx.items())[:10])
    print('POS index:', list(pos2idx.items())[:10])

    # We create the parallel sequences of indexes
    X_train_idx = to_index(X_train_cat, word2idx)
    Y_train_idx = to_index(Y_train_cat, pos2idx)
    X_dev_idx = to_index(X_dev_cat, word2idx)
    Y_dev_idx = to_index(Y_dev_cat, pos2idx)
    print('First sentences, word indices', X_train_idx[:3])
    print('First sentences, POS indices', Y_train_idx[:3])

    X_train = pad_sequences(X_train_idx)
    Y_train = pad_sequences(Y_train_idx)
    X_dev = pad_sequences(X_dev_idx)
    Y_dev = pad_sequences(Y_dev_idx)
    print('Padded X:', X_train[0])
    print('Padded Y:', Y_train[0])

    # The number of POS classes, and 0 (padding symbol)
    Y_train = to_categorical(Y_train, num_classes=len(pos) + 1)
    Y_dev = to_categorical(Y_dev, num_classes=len(pos) + 1)
    print('Padded categorical Y:', Y_train[0])

    # Random initialisation; the two extra rows are indices 0 and 1
    np.random.seed(1234567)
    embedding_matrix = np.random.uniform(
        -0.05, 0.05,
        (len(vocabulary_words) + 2, EMBEDDING_DIM)).astype(np.float32)
    # Words present in the embedding file get their pretrained vector
    for word in vocabulary_words:
        if word in embeddings_dict:
            embedding_matrix[word2idx[word]] = embeddings_dict[word]
    print('Shape of embedding matrix:', embedding_matrix.shape)
    print('Embedding of table', embedding_matrix[word2idx['table']])
    print('Embedding of the padding symbol, idx 0, random numbers',
          embedding_matrix[0])

    # Select the builder. Only the matching name is evaluated, and an
    # unsupported pair fails here with a clear message instead of leaving
    # `model` unbound.
    if TYPE == 'RNN' and OUTPUT_LAYER == 'DENSE':
        builder = build_model_rnn
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'DENSE':
        builder = build_model_lstm
    elif TYPE == 'RNN' and OUTPUT_LAYER == 'CRF':
        builder = build_model_rnn_crf
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'CRF':
        builder = build_model_lstm_crf
    else:
        raise ValueError(
            'Unsupported model configuration: TYPE=%r, OUTPUT_LAYER=%r'
            % (TYPE, OUTPUT_LAYER))
    # `ouptput_dropout` is the keyword spelling used by the builders
    model = builder(vocabulary_words, embedding_matrix, EMBEDDING_DIM,
                    NB_CLASSES,
                    unit_multiplier=UNIT_MULTIPLIER,
                    input_dropout=INPUT_DROPOUT,
                    dropout=DROPOUT,
                    recurrent_dropout=RECURRENT_DROPOUT,
                    ouptput_dropout=OUTPUT_DROPOUT,
                    optimizer=OPTIMIZER)
    model.summary()

    # One metric name drives both early stopping and the curves below
    if OUTPUT_LAYER == 'DENSE':
        metric = 'acc'
    else:
        metric = 'crf_viterbi_accuracy'
    MONITOR = 'val_' + metric

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor=MONITOR,
                                patience=PATIENCE,
                                restore_best_weights=True)
    ]

    # Fitting the model
    history = model.fit(X_train, Y_train,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, Y_dev))

    # Test set: same indexing, padding and vectorization as for training
    X_test_cat, Y_test_cat = build_sequences(test_dict)
    X_test_idx = to_index(X_test_cat, word2idx)
    Y_test_idx = to_index(Y_test_cat, pos2idx)
    print('X[0] test idx', X_test_idx[0])
    print('Y[0] test idx', Y_test_idx[0])

    X_test_padded = pad_sequences(X_test_idx)
    Y_test_padded = pad_sequences(Y_test_idx)
    print('X[0] test idx padded', X_test_padded[0])
    print('Y[0] test idx padded', Y_test_padded[0])

    # One extra symbol for 0 (padding)
    Y_test_padded_vectorized = to_categorical(Y_test_padded,
                                              num_classes=len(pos) + 1)
    print('Y[0] test idx padded vectorized', Y_test_padded_vectorized[0])
    print(X_test_padded.shape)
    print(Y_test_padded_vectorized.shape)

    # Evaluates the model
    test_loss, test_acc = model.evaluate(X_test_padded,
                                         Y_test_padded_vectorized,
                                         batch_size=BATCH_SIZE)
    print('Configuration', config)
    print('Batch evaluation')
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    def report(counts):
        # Print overall and unknown-word accuracy; empty totals are guarded
        total, correct, total_ukn, correct_ukn = counts
        accuracy = correct / total if total else 0.0
        print('total %d, correct %d, accuracy %f'
              % (total, correct, accuracy))
        if total_ukn != 0:
            print('total unknown %d, correct %d, accuracy %f'
                  % (total_ukn, correct_ukn, correct_ukn / total_ukn))

    # NOTE(review): `eval` is called with four arguments, so it is
    # presumably a project helper shadowing the builtin -- confirm.
    print('Evaluation of padded sentences')
    Y_test_pred = predict_padded_testset(X_test_cat, model,
                                         word2idx, idx2pos)
    report(eval(X_test_cat, Y_test_cat, Y_test_pred, word2idx))

    print('Evaluation of individual sentences')
    Y_test_pred = [predict_wordlist(x, model, word2idx, idx2pos)
                   for x in X_test_cat]
    report(eval(X_test_cat, Y_test_cat, Y_test_pred, word2idx))

    # Tagging a few sentences
    sentences = [
        'That round table might collapse .',
        'The man can learn well .',
        'The man can swim .',
        'The man can simwo .',
        'that round table might collapsex',
    ]
    for sentence in sentences:
        y_test_pred_cat = predict_sentence(sentence.lower(), model,
                                           word2idx, idx2pos)
        print(sentence)
        print(y_test_pred_cat)

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history[metric]
    val_acc = history.history[MONITOR]
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
def main():
    """Train, evaluate and demonstrate the recurrent POS tagger.

    The corpus selected by ``CORPUS`` is loaded and turned into parallel
    word/POS sequences. A vocabulary is built from the training words and
    the GloVe keys, and an embedding matrix is initialised from GloVe. The
    model chosen by ``TYPE`` ('RNN' or 'LSTM') and ``OUTPUT_LAYER``
    ('DENSE' or 'CRF') is trained with early stopping, evaluated on the
    test set in three ways, used to tag sample sentences, and its training
    curves are plotted.

    All settings come from module-level constants; nothing is returned.

    Raises:
        ValueError: if ``TYPE``/``OUTPUT_LAYER`` is not a supported pair.
    """
    start_time = time.perf_counter()
    print('Starting:', config)

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    elif VILDE:
        # PTB, read from the hard-coded local directory
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos(
                BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
    else:
        # PTB, loader default location
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)

    X_train_cat, Y_train_cat = build_sequences(train_dict)
    X_dev_cat, Y_dev_cat = build_sequences(dev_dict)
    print('First sentence, words', X_train_cat[0])
    print('First sentence, POS', Y_train_cat[0])

    # We collect the words, parts of speech and we create the indices
    vocabulary_words = sorted(
        {word for sentence in X_train_cat for word in sentence})
    print('#', len(vocabulary_words), 'words')

    # The embedding matrix, we collect the words in the file
    if VILDE:
        embeddings_dict = datasets.load_glove_vectors(
            BASE_DIR='/home/pierre/Cours/EDAN20/corpus/')
    else:
        embeddings_dict = datasets.load_glove_vectors()
    print('Words in embedding file:', len(embeddings_dict.keys()))
    # Union of corpus and embedding words, sorted so indices are stable
    vocabulary_words = sorted(
        set(vocabulary_words + list(embeddings_dict.keys())))
    print('# unique words in the vocabulary: embeddings and corpus:',
          len(vocabulary_words))

    pos = sorted({tag for sentence in Y_train_cat for tag in sentence})
    NB_CLASSES = len(pos)
    print('#', NB_CLASSES, 'Parts of speech:', pos)

    # We create the indexes
    # Words start at two to make provision for the padding symbol 0
    # and unknown words, 1. POS tags start at one (0 is padding).
    idx2word = dict(enumerate(vocabulary_words, start=2))
    idx2pos = dict(enumerate(pos, start=1))
    word2idx = {v: k for k, v in idx2word.items()}
    pos2idx = {v: k for k, v in idx2pos.items()}
    print('word index:', list(word2idx.items())[:10])
    print('POS index:', list(pos2idx.items())[:10])

    # We create the parallel sequences of indexes
    X_train_idx = to_index(X_train_cat, word2idx)
    Y_train_idx = to_index(Y_train_cat, pos2idx)
    X_dev_idx = to_index(X_dev_cat, word2idx)
    Y_dev_idx = to_index(Y_dev_cat, pos2idx)
    print('First sentences, word indices', X_train_idx[:3])
    print('First sentences, POS indices', Y_train_idx[:3])

    X_train = pad_sequences(X_train_idx)
    Y_train = pad_sequences(Y_train_idx)
    X_dev = pad_sequences(X_dev_idx)
    Y_dev = pad_sequences(Y_dev_idx)
    print('Padded X:', X_train[0])
    print('Padded Y:', Y_train[0])

    # The number of POS classes, and 0 (padding symbol)
    Y_train = to_categorical(Y_train, num_classes=len(pos) + 1)
    Y_dev = to_categorical(Y_dev, num_classes=len(pos) + 1)
    print('Padded categorical Y:', Y_train[0])

    # Random initialisation; the two extra rows are indices 0 and 1
    np.random.seed(1234567)
    embedding_matrix = np.random.uniform(
        -0.05, 0.05,
        (len(vocabulary_words) + 2, EMBEDDING_DIM)).astype(np.float32)
    # Words present in the embedding file get their pretrained vector
    for word in vocabulary_words:
        if word in embeddings_dict:
            embedding_matrix[word2idx[word]] = embeddings_dict[word]
    print('Shape of embedding matrix:', embedding_matrix.shape)
    print('Embedding of table', embedding_matrix[word2idx['table']])
    print('Embedding of the padding symbol, idx 0, random numbers',
          embedding_matrix[0])

    # Select the builder. Only the matching name is evaluated, and an
    # unsupported pair fails here with a clear message instead of leaving
    # `model` unbound.
    if TYPE == 'RNN' and OUTPUT_LAYER == 'DENSE':
        builder = build_model_rnn
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'DENSE':
        builder = build_model_lstm
    elif TYPE == 'RNN' and OUTPUT_LAYER == 'CRF':
        builder = build_model_rnn_crf
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'CRF':
        builder = build_model_lstm_crf
    else:
        raise ValueError(
            'Unsupported model configuration: TYPE=%r, OUTPUT_LAYER=%r'
            % (TYPE, OUTPUT_LAYER))
    # `ouptput_dropout` is the keyword spelling used by the builders
    model = builder(vocabulary_words, embedding_matrix, EMBEDDING_DIM,
                    NB_CLASSES,
                    unit_multiplier=UNIT_MULTIPLIER,
                    input_dropout=INPUT_DROPOUT,
                    dropout=DROPOUT,
                    recurrent_dropout=RECURRENT_DROPOUT,
                    ouptput_dropout=OUTPUT_DROPOUT,
                    optimizer=OPTIMIZER)
    model.summary()

    # One metric name drives both early stopping and the curves below
    if OUTPUT_LAYER == 'DENSE':
        metric = 'acc'
    else:
        metric = 'crf_viterbi_accuracy'
    MONITOR = 'val_' + metric

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor=MONITOR,
                                patience=PATIENCE,
                                restore_best_weights=True)
    ]

    # Fitting the model
    history = model.fit(X_train, Y_train,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, Y_dev))

    # Test set: same indexing, padding and vectorization as for training
    X_test_cat, Y_test_cat = build_sequences(test_dict)
    X_test_idx = to_index(X_test_cat, word2idx)
    Y_test_idx = to_index(Y_test_cat, pos2idx)
    print('X[0] test idx', X_test_idx[0])
    print('Y[0] test idx', Y_test_idx[0])

    X_test_padded = pad_sequences(X_test_idx)
    Y_test_padded = pad_sequences(Y_test_idx)
    print('X[0] test idx padded', X_test_padded[0])
    print('Y[0] test idx padded', Y_test_padded[0])

    # One extra symbol for 0 (padding)
    Y_test_padded_vectorized = to_categorical(Y_test_padded,
                                              num_classes=len(pos) + 1)
    print('Y[0] test idx padded vectorized', Y_test_padded_vectorized[0])
    print(X_test_padded.shape)
    print(Y_test_padded_vectorized.shape)

    # Evaluates the model
    test_loss, test_acc = model.evaluate(X_test_padded,
                                         Y_test_padded_vectorized,
                                         batch_size=BATCH_SIZE)
    print('Configuration', config)
    print('Batch evaluation')
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    def report(counts):
        # Print overall and unknown-word accuracy; empty totals are guarded
        total, correct, total_ukn, correct_ukn = counts
        accuracy = correct / total if total else 0.0
        print('total %d, correct %d, accuracy %f'
              % (total, correct, accuracy))
        if total_ukn != 0:
            print('total unknown %d, correct %d, accuracy %f'
                  % (total_ukn, correct_ukn, correct_ukn / total_ukn))

    # NOTE(review): `eval` is called with four arguments, so it is
    # presumably a project helper shadowing the builtin -- confirm.
    print('Evaluation of padded sentences')
    Y_test_pred = predict_padded_testset(X_test_cat, model,
                                         word2idx, idx2pos)
    report(eval(X_test_cat, Y_test_cat, Y_test_pred, word2idx))

    print('Evaluation of individual sentences')
    Y_test_pred = [predict_wordlist(x, model, word2idx, idx2pos)
                   for x in X_test_cat]
    report(eval(X_test_cat, Y_test_cat, Y_test_pred, word2idx))

    # Tagging a few sentences
    sentences = [
        'That round table might collapse .',
        'The man can learn well .',
        'The man can swim .',
        'The man can simwo .',
        'that round table might collapsex',
    ]
    for sentence in sentences:
        y_test_pred_cat = predict_sentence(sentence.lower(), model,
                                           word2idx, idx2pos)
        print(sentence)
        print(y_test_pred_cat)

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history[metric]
    val_acc = history.history[MONITOR]
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
conll_cols = '' for tuple in sentence: conll_cols += str(tuple[0]) + '\t' + tuple[1] + '\n' conll_dict = CoNLLDictorizer(column_names) sent_dict = conll_dict.transform(conll_cols) return sent_dict[0] if __name__ == '__main__': start_time = time.clock() if CORPUS == 'EWT': train_sentences, dev_sentences, test_sentences, column_names = datasets.load_ud_en_ewt( ) else: train_sentences, dev_sentences, test_sentences, column_names = datasets.load_conll2009_pos( ) conll_dict = CoNLLDictorizer(column_names) train_dict = conll_dict.transform(train_sentences) print(train_dict[0]) context_dictorizer = ContextDictorizer() context_dictorizer.fit(train_dict) # Feature and response extraction X_dict, y = context_dictorizer.transform(train_dict) print(X_dict[0]) print(y[0]) # We print the features to check they match Table 8.1 in my book (second edition) # We use the training step extraction with the dynamic features context_dictorizer.print_example(train_dict)
def main():
    """Train, evaluate and demonstrate the embedding-based POS tagger.

    GloVe vectors and the corpus selected by ``CORPUS`` are loaded, and
    context features are extracted with ``ContextDictorizer``. Every
    feature value is replaced by a word index (0 for words unknown at
    training time). An Embedding/Flatten/Dense softmax model initialised
    from GloVe is trained with early stopping, evaluated on the test set,
    used to tag sample sentences, and its training curves are plotted.

    All settings come from module-level constants; nothing is returned.
    """
    # Loading the embeddings
    if VILDE:
        embeddings_dict = datasets.load_glove_vectors(
            BASE_DIR='/home/pierre/Cours/EDAN20/corpus/')
    else:
        embeddings_dict = datasets.load_glove_vectors()
    print('Embeddings table:', embeddings_dict['table'])

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    elif VILDE:
        # PTB, read from the hard-coded local directory
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos(
                BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
    else:
        # PTB, loader default location
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    if MINI_CORPUS:
        # Keep only the first fifteenth of the training sentences
        train_dict = train_dict[:len(train_dict) // 15]
    test_dict = conll_dict.transform(test_sentences)
    print('First sentence, train:', train_dict[0])

    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    X_dict_train, y_cat_train = context_dictorizer.transform(train_dict)
    X_dict_dev, y_cat_dev = context_dictorizer.transform(dev_dict)

    # Every feature value seen in training is a vocabulary entry
    corpus_words = sorted(
        {value for x in X_dict_train for value in x.values()})
    print('# unique words seen in training corpus:', len(corpus_words))
    print('Words in GloVe:', len(embeddings_dict.keys()))
    # Sorted, not a bare set: set order varies between runs, which would
    # make the word indices irreproducible despite the fixed seed.
    vocabulary_words = sorted(
        set(corpus_words + list(embeddings_dict.keys())))

    # We start at 1, 0 is the unknown word
    idx2word = dict(enumerate(vocabulary_words, start=1))
    word2idx = {v: k for k, v in idx2word.items()}
    cnt_uniq = len(vocabulary_words) + 1
    print('# unique words in the vocabulary: embeddings and corpus:',
          len(vocabulary_words) + 1)

    # Replace each feature value by its word index, in place.
    # Training values are all in the vocabulary; dev values may not be.
    for x_dict in X_dict_train:
        for feature in x_dict:
            x_dict[feature] = word2idx[x_dict[feature]]
    for x_dict in X_dict_dev:
        for feature in x_dict:
            x_dict[feature] = word2idx.get(x_dict[feature], 0)

    np.random.seed(0)
    dict_vectorizer = DictVectorizer(sparse=False)
    X = dict_vectorizer.fit_transform(X_dict_train)
    X_dev = dict_vectorizer.transform(X_dict_dev)
    print('X shape', X.shape)
    print('First line of X:', X[0])

    # The POS and the number of different POS
    pos_list = sorted(set(y_cat_train))
    NB_CLASSES = len(pos_list)
    # We build a part-of-speech index.
    idx2pos = dict(enumerate(pos_list))
    pos2idx = {v: k for k, v in idx2pos.items()}

    # We encode y
    y = [pos2idx[i] for i in y_cat_train]
    print(y_cat_train[:10])
    # Tags unseen in training map to 0, as for the test set below; a plain
    # lookup would raise KeyError (likely with MINI_CORPUS).
    # NOTE(review): 0 is also the index of the first real tag -- confirm
    # this is the intended fallback.
    y_dev = [pos2idx.get(i, 0) for i in y_cat_dev]

    # Random init in [0, 1); rows of words found in GloVe are overwritten.
    # (Keras' own default init would be in (-0.05, 0.05).)
    embedding_matrix = np.random.random((cnt_uniq, EMBEDDING_DIM))
    for word in vocabulary_words:
        if word in embeddings_dict:
            embedding_matrix[word2idx[word]] = embeddings_dict[word]

    model = models.Sequential()
    model.add(
        layers.Embedding(cnt_uniq,
                         EMBEDDING_DIM,
                         weights=[embedding_matrix],
                         trainable=True,
                         input_length=2 * W_SIZE + 1))
    model.add(layers.Flatten())
    model.add(layers.Dense(NB_CLASSES, activation='softmax'))
    # 'acc', not 'accuracy': since Keras 2.3 metrics are reported under
    # the name given here, and the callback and plots below use
    # 'acc' / 'val_acc'.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=OPTIMIZER,
                  metrics=['acc'])
    model.summary()

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor='val_acc',
                                patience=2,
                                restore_best_weights=True)
    ]
    # Fitting the model
    history = model.fit(X, y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, y_dev))

    # Test set: same in-place indexing, then the fitted vectorizer
    X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)
    for x_dict_test in X_test_dict:
        for feature in x_dict_test:
            x_dict_test[feature] = word2idx.get(x_dict_test[feature], 0)
    X_test = dict_vectorizer.transform(X_test_dict)
    y_test = [pos2idx.get(i, 0) for i in y_test_cat]

    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Configuration:', config)
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)

    # Evaluation on the test set, sentence by sentence
    total = 0
    correct = 0
    print('#Sentences', len(test_dict))
    for sentence in test_dict:
        y_pred = predict_sentence(sentence, model, context_dictorizer,
                                  dict_vectorizer, word2idx, idx2pos)
        # `word` (not `y`) so the training labels above are not rebound
        for word in y_pred:
            total += 1
            if word['pos'] == word['ppos']:
                correct += 1
    # An empty test set must not crash the report
    accuracy = correct / total if total else 0.0
    print('total %d, correct %d, accuracy %f' % (total, correct, accuracy))

    # Tag some sentences
    sentences = [
        'That round table might collapse .',
        'The man can learn well .',
        'The man can swim .',
        'The man can simwo .',
        'that round table might collapsex'
    ]
    for sentence in sentences:
        sent_dict = sentence_to_conll(sentence.lower())
        sent_dict = predict_sentence(sent_dict, model, context_dictorizer,
                                     dict_vectorizer, word2idx, idx2pos)
        print([word['form'] for word in sent_dict])
        print([word['ppos'] for word in sent_dict])

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
sentence = list(enumerate(sentence.split(), start=1)) conll_cols = '' for tuple in sentence: conll_cols += str(tuple[0]) + '\t' + tuple[1] + '\n' conll_dict = CoNLLDictorizer(column_names) sent_dict = conll_dict.transform(conll_cols) return sent_dict[0] if __name__ == '__main__': start_time = time.clock() if CORPUS == 'EWT': train_sentences, dev_sentences, test_sentences, column_names = datasets.load_ud_en_ewt() else: train_sentences, dev_sentences, test_sentences, column_names = datasets.load_conll2009_pos() conll_dict = CoNLLDictorizer(column_names) train_dict = conll_dict.transform(train_sentences) print(train_dict[0]) context_dictorizer = ContextDictorizer() context_dictorizer.fit(train_dict) # Feature and response extraction X_dict, y = context_dictorizer.transform(train_dict) print(X_dict[0]) print(y[0]) # We print the features to check they match Table 8.1 in my book (second edition) # We use the training step extraction with the dynamic features context_dictorizer.print_example(train_dict)
def main():
    """Train, evaluate and demonstrate the feed-forward POS tagger.

    The corpus selected by ``CORPUS`` is loaded, context features are
    extracted with ``ContextDictorizer`` and vectorized with
    ``DictVectorizer`` (scaled as well when ``SCALER`` is set). The network
    returned by ``build_model`` is trained with early stopping on the
    development set, evaluated on the test set, used to tag a few sample
    sentences, and its training curves are plotted.

    All settings come from module-level constants; nothing is returned.
    """
    start_time = time.perf_counter()

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    elif VILDE:
        # PTB, read from the hard-coded local directory
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos(
                BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
    else:
        # PTB, loader default location
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos()

    # Convert the corpus in a dictionary
    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    if MINI_CORPUS:
        # Keep only the first tenth of the training sentences
        train_dict = train_dict[:len(train_dict) // 10]
    test_dict = conll_dict.transform(test_sentences)

    # Extract the context and dictorize it
    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    X_dict_train, y_cat_train = context_dictorizer.transform(train_dict)
    X_dict_dev, y_cat_dev = context_dictorizer.transform(dev_dict)

    # Transform the X symbols into numbers
    dict_vectorizer = DictVectorizer()
    X_num_train = dict_vectorizer.fit_transform(X_dict_train)
    X_num_dev = dict_vectorizer.transform(X_dict_dev)

    scaler = None
    if SCALER:
        # Standardize X_num; the scaler is fitted on the training set only
        scaler = StandardScaler(with_mean=False)
        X = scaler.fit_transform(X_num_train)
        X_dev = scaler.transform(X_num_dev)
    else:
        X = X_num_train
        X_dev = X_num_dev

    # Vectorizing y: the POS inventory is taken from the training set
    pos_list = sorted(set(y_cat_train))
    NB_CLASSES = len(pos_list)
    idx2pos = dict(enumerate(pos_list))
    pos2idx = {v: k for k, v in idx2pos.items()}

    # We encode y. Parts of speech unseen in training are mapped to 0.
    # NOTE(review): 0 is also the index of the first real tag in pos_list,
    # so unseen tags are scored as that tag -- confirm this is intended.
    y = [pos2idx[i] for i in y_cat_train]
    y_dev = [pos2idx.get(i, 0) for i in y_cat_dev]

    # The tagger
    np.random.seed(0)
    model = build_model(X.shape[1], NB_CLASSES,
                        num_layers=NUM_LAYERS,
                        dropout=DROPOUT)

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor='val_acc',
                                patience=2,
                                restore_best_weights=True)
    ]

    # Fitting the model
    history = model.fit(X, y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, y_dev))
    if SAVE_MODEL:
        model.save(config + '.h5')

    # Formatting the test set with the vectorizer/scaler fitted on training
    X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)
    X_test_num = dict_vectorizer.transform(X_test_dict)
    if scaler is not None:
        X_test = scaler.transform(X_test_num)
    else:
        X_test = X_test_num
    y_test = [pos2idx.get(i, 0) for i in y_test_cat]

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Configuration', config)
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    # Evaluation on the test set, sentence by sentence
    total = 0
    correct = 0
    print('#Sentences', len(test_dict))
    for sentence in test_dict:
        y_pred = predict_sentence(sentence, model, context_dictorizer,
                                  dict_vectorizer, scaler, idx2pos)
        # `word` (not `y`) so the training labels above are not rebound
        for word in y_pred:
            total += 1
            if word['pos'] == word['ppos']:
                correct += 1
    # An empty test set must not crash the report
    accuracy = correct / total if total else 0.0
    print('total %d, correct %d, accuracy %f' % (total, correct, accuracy))

    # Tag some sentences
    sentences = ['That round table might collapse .',
                 'The man can learn well .',
                 'The man can swim .',
                 'The man can simwo .',
                 'that round table might collapsex']
    for sentence in sentences:
        sent_dict = sentence_to_conll(sentence.lower())
        y_test_pred_cat = predict_sentence(sent_dict, model,
                                           context_dictorizer,
                                           dict_vectorizer, scaler, idx2pos)
        print([word['form'] for word in y_test_pred_cat])
        print([word['ppos'] for word in y_test_pred_cat])

    # Show the training curves
    # NOTE(review): assumes build_model compiles a metric reported under the
    # key 'acc' -- confirm against build_model.
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()