Example 1
def main():
    start_time = time.perf_counter()

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    else:  # PTB
        if VILDE:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos(
                    BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
        else:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos()

    # Convert the corpus into a dictionary
    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    if MINI_CORPUS:
        train_dict = train_dict[:len(train_dict) // 10]
    test_dict = conll_dict.transform(test_sentences)

    # Extract the context and dictorize it
    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    X_dict_train, y_cat_train = context_dictorizer.transform(train_dict)
    X_dict_dev, y_cat_dev = context_dictorizer.transform(dev_dict)

    # Transform the X symbols into numbers
    dict_vectorizer = DictVectorizer()
    X_num_train = dict_vectorizer.fit_transform(X_dict_train)
    X_num_dev = dict_vectorizer.transform(X_dict_dev)
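    # DictVectorizer one-hot encodes each (feature, string value) pair,
    # so X_num_train is a large sparse matrix with one column per pair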

    scaler = None
    if SCALER:
        # Standardize X_num
        scaler = StandardScaler(with_mean=False)
        X = scaler.fit_transform(X_num_train)
        X_dev = scaler.transform(X_num_dev)
    else:
        X = X_num_train
        X_dev = X_num_dev

    # Vectorizing y
    # The POS and the number of different POS
    pos_list = sorted(set(y_cat_train))
    NB_CLASSES = len(pos_list)

    # We build a part-of-speech index.
    idx2pos = dict(enumerate(pos_list))
    pos2idx = {v: k for k, v in idx2pos.items()}

    # We encode y. We assign unknown parts of speech to 0 in the dev and test sets
    y = [pos2idx[i] for i in y_cat_train]
    y_dev = [pos2idx.get(i, 0) for i in y_cat_dev]

    # The tagger
    np.random.seed(0)
    model = build_model(X.shape[1],
                        NB_CLASSES,
                        num_layers=NUM_LAYERS,
                        dropout=DROPOUT)

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor='val_acc',
                                patience=2,
                                restore_best_weights=True)
    ]
    # Fitting the model
    history = model.fit(X,
                        y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, y_dev))
    if SAVE_MODEL:
        model.save(config + '.h5')

    # Formatting the test set
    X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)

    # We transform the symbols into numbers
    X_test_num = dict_vectorizer.transform(X_test_dict)
    if scaler:
        X_test = scaler.transform(X_test_num)
    else:
        X_test = X_test_num
    y_test = [pos2idx.get(i, 0) for i in y_test_cat]

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)

    print('Configuration', config)
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    # Evaluation on the test set
    total = 0
    correct = 0
    print('#Sentences', len(test_dict))
    for sentence in test_dict:
        y_pred = predict_sentence(sentence, model, context_dictorizer,
                                  dict_vectorizer, scaler, idx2pos)
        for y in y_pred:
            total += 1
            if y['pos'] == y['ppos']:
                correct += 1
    print('total %d, correct %d, accuracy %f' %
          (total, correct, correct / total))

    # Tag some sentences
    sentences = [
        'That round table might collapse .', 'The man can learn well .',
        'The man can swim .', 'The man can simwo .',
        'that round table might collapsex'
    ]
    for sentence in sentences:
        sent_dict = sentence_to_conll(sentence.lower())
        y_test_pred_cat = predict_sentence(sent_dict, model,
                                           context_dictorizer, dict_vectorizer,
                                           scaler, idx2pos)
        print([y['form'] for y in y_test_pred_cat])
        print([y['ppos'] for y in y_test_pred_cat])

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['acc']
    val_acc = history.history['val_acc']

    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
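
The example above calls a build_model() helper whose definition is not part of the snippet. A minimal sketch of what such a helper could look like, assuming a plain stack of Dense layers and the Adam optimizer (the layer width, the optimizer, and the import are assumptions, not the author's code):

from keras import layers, models


def build_model(input_dim, nb_classes, num_layers=1, dropout=0.0):
    """Feed-forward classifier over the vectorized context window."""
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(input_dim,)))
    if dropout:
        model.add(layers.Dropout(dropout))
    for _ in range(num_layers - 1):
        model.add(layers.Dense(128, activation='relu'))
        if dropout:
            model.add(layers.Dropout(dropout))
    model.add(layers.Dense(nb_classes, activation='softmax'))
    # The labels y are integer class indices, hence the sparse loss
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model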
Example 2
def main():
    start_time = time.perf_counter()
    print('Starting:', config)

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    else:  # PTB
        if VILDE:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos(
                    BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
        else:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)

    X_train_cat, Y_train_cat = build_sequences(train_dict)
    X_dev_cat, Y_dev_cat = build_sequences(dev_dict)
    print('First sentence, words', X_train_cat[0])
    print('First sentence, POS', Y_train_cat[0])

    # We collect the words and parts of speech, and we create the indices
    vocabulary_words = sorted(set([word
                                   for sentence in X_train_cat
                                   for word in sentence]))
    print('#', len(vocabulary_words), 'words')

    # The embedding matrix: we collect the words from the embedding file
    if VILDE:
        embeddings_dict = datasets.load_glove_vectors(
            BASE_DIR='/home/pierre/Cours/EDAN20/corpus/')
    else:
        embeddings_dict = datasets.load_glove_vectors()
    embeddings_words = embeddings_dict.keys()
    print('Words in embedding file:', len(embeddings_dict.keys()))
    vocabulary_words = sorted(set(vocabulary_words +
                                  list(embeddings_words)))
    print('# unique words in the vocabulary: embeddings and corpus:',
          len(vocabulary_words))

    pos = sorted(set([pos
                      for sentence in Y_train_cat
                      for pos in sentence]))
    NB_CLASSES = len(pos)
    print('#', NB_CLASSES, 'Parts of speech:', pos)

    # We create the indexes
    # We start at two to reserve index 0 for the padding symbol
    # used by RNNs and LSTMs, and index 1 for unknown words
    idx2word = dict(enumerate(vocabulary_words, start=2))
    idx2pos = dict(enumerate(pos, start=1))
    word2idx = {v: k for k, v in idx2word.items()}
    pos2idx = {v: k for k, v in idx2pos.items()}
    print('word index:', list(word2idx.items())[:10])
    print('POS index:', list(pos2idx.items())[:10])

    # We create the parallel sequences of indexes
    X_train_idx = to_index(X_train_cat, word2idx)
    Y_train_idx = to_index(Y_train_cat, pos2idx)
    X_dev_idx = to_index(X_dev_cat, word2idx)
    Y_dev_idx = to_index(Y_dev_cat, pos2idx)
    print('First sentences, word indices', X_train_idx[:3])
    print('First sentences, POS indices', Y_train_idx[:3])

    X_train = pad_sequences(X_train_idx)
    Y_train = pad_sequences(Y_train_idx)
    X_dev = pad_sequences(X_dev_idx)
    Y_dev = pad_sequences(Y_dev_idx)
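    # pad_sequences pads with zeros at the start ('pre' padding by default)
    # up to the length of the longest sequence, so index 0 must stay
    # reserved for the padding symbol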
    print('Padded X:', X_train[0])
    print('Padded Y:', Y_train[0])

    # The number of POS classes, and 0 (padding symbol)
    Y_train = to_categorical(Y_train, num_classes=len(pos) + 1)
    Y_dev = to_categorical(Y_dev, num_classes=len(pos) + 1)
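    # to_categorical turns each POS index into a one-hot vector of length
    # len(pos) + 1, the extra class being the padding index 0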
    print('Padded categorical Y:', Y_train[0])

    np.random.seed(1234567)
    embedding_matrix = np.random.uniform(-0.05, 0.05,
                                         (len(vocabulary_words) + 2,
                                          EMBEDDING_DIM)
                                         ).astype(np.float32)
    # We initialize the matrix with embeddings
    for word in vocabulary_words:
        if word in embeddings_dict:
            # If a word has a pretrained embedding,
            # we use that vector as its row
            embedding_matrix[word2idx[word]] = embeddings_dict[word]
    # print('Embedding:', embedding_matrix)
    print('Shape of embedding matrix:', embedding_matrix.shape)
    print('Embedding of table', embedding_matrix[word2idx['table']])
    print('Embedding of the padding symbol, idx 0, random numbers', embedding_matrix[0])

    if TYPE == 'RNN' and OUTPUT_LAYER == 'DENSE':
        model = build_model_rnn(vocabulary_words,
                                embedding_matrix,
                                EMBEDDING_DIM,
                                NB_CLASSES,
                                unit_multiplier=UNIT_MULTIPLIER,
                                input_dropout=INPUT_DROPOUT,
                                dropout=DROPOUT,
                                recurrent_dropout=RECURRENT_DROPOUT,
                                ouptput_dropout=OUTPUT_DROPOUT,
                                optimizer=OPTIMIZER)
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'DENSE':
        model = build_model_lstm(vocabulary_words,
                                 embedding_matrix,
                                 EMBEDDING_DIM,
                                 NB_CLASSES,
                                 unit_multiplier=UNIT_MULTIPLIER,
                                 input_dropout=INPUT_DROPOUT,
                                 dropout=DROPOUT,
                                 recurrent_dropout=RECURRENT_DROPOUT,
                                 ouptput_dropout=OUTPUT_DROPOUT,
                                 optimizer=OPTIMIZER)
    elif TYPE == 'RNN' and OUTPUT_LAYER == 'CRF':
        model = build_model_rnn_crf(vocabulary_words,
                                    embedding_matrix,
                                    EMBEDDING_DIM,
                                    NB_CLASSES,
                                    unit_multiplier=UNIT_MULTIPLIER,
                                    input_dropout=INPUT_DROPOUT,
                                    dropout=DROPOUT,
                                    recurrent_dropout=RECURRENT_DROPOUT,
                                    ouptput_dropout=OUTPUT_DROPOUT,
                                    optimizer=OPTIMIZER)
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'CRF':
        model = build_model_lstm_crf(vocabulary_words,
                                     embedding_matrix,
                                     EMBEDDING_DIM,
                                     NB_CLASSES,
                                     unit_multiplier=UNIT_MULTIPLIER,
                                     input_dropout=INPUT_DROPOUT,
                                     dropout=DROPOUT,
                                     recurrent_dropout=RECURRENT_DROPOUT,
                                     ouptput_dropout=OUTPUT_DROPOUT,
                                     optimizer=OPTIMIZER)
    model.summary()

    if OUTPUT_LAYER == 'DENSE':
        MONITOR = 'val_acc'
    else:
        MONITOR = 'val_crf_viterbi_accuracy'

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(
            monitor=MONITOR,
            patience=PATIENCE,
            restore_best_weights=True
        )
    ]
    # Fitting the model
    history = model.fit(X_train, Y_train,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, Y_dev))

    # We build the test sequences; words and POS are then replaced with their indices
    X_test_cat, Y_test_cat = build_sequences(test_dict)

    # We create the parallel sequences of indexes
    X_test_idx = to_index(X_test_cat, word2idx)
    Y_test_idx = to_index(Y_test_cat, pos2idx)
    print('X[0] test idx', X_test_idx[0])
    print('Y[0] test idx', Y_test_idx[0])

    X_test_padded = pad_sequences(X_test_idx)
    Y_test_padded = pad_sequences(Y_test_idx)
    print('X[0] test idx padded', X_test_padded[0])
    print('Y[0] test idx padded', Y_test_padded[0])

    # One extra symbol for 0 (padding)
    Y_test_padded_vectorized = to_categorical(Y_test_padded,
                                              num_classes=len(pos) + 1)
    print('Y[0] test idx padded vectorized', Y_test_padded_vectorized[0])

    print(X_test_padded.shape)
    print(Y_test_padded_vectorized.shape)

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test_padded,
                                         Y_test_padded_vectorized,
                                         batch_size=BATCH_SIZE)
    print('Configuration', config)
    print('Batch evaluation')
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    print('Evaluation of padded sentences')
    Y_test_pred = predict_padded_testset(X_test_cat, model, word2idx, idx2pos)
    total, correct, total_ukn, correct_ukn = eval(X_test_cat, Y_test_cat, Y_test_pred, word2idx)
    print('total %d, correct %d, accuracy %f' % (total, correct, correct / total))
    if total_ukn != 0:
        print('total unknown %d, correct %d, accuracy %f' % (total_ukn, correct_ukn, correct_ukn / total_ukn))

    print('Evaluation of individual sentences')
    Y_test_pred = [predict_wordlist(x, model, word2idx, idx2pos)
                   for x in X_test_cat]
    total, correct, total_ukn, correct_ukn = eval(X_test_cat, Y_test_cat, Y_test_pred, word2idx)
    print('total %d, correct %d, accuracy %f' % (total, correct, correct / total))
    if total_ukn != 0:
        print('total unknown %d, correct %d, accuracy %f' % (total_ukn, correct_ukn, correct_ukn / total_ukn))

    # Tagging a few sentences
    sentences = ['That round table might collapse .',
                 'The man can learn well .',
                 'The man can swim .',
                 'The man can simwo .',
                 'that round table might collapsex', ]
    for sentence in sentences:
        y_test_pred_cat = predict_sentence(sentence.lower(), model, word2idx, idx2pos)
        print(sentence)
        print(y_test_pred_cat)

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    if OUTPUT_LAYER == 'DENSE':
        acc = history.history['acc']
        val_acc = history.history['val_acc']
    elif OUTPUT_LAYER == 'CRF':
        acc = history.history['crf_viterbi_accuracy']
        val_acc = history.history['val_crf_viterbi_accuracy']

    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
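
The example above dispatches to build_model_rnn(), build_model_lstm(), and their CRF variants, none of which are shown. A rough sketch of the LSTM variant with a Dense softmax output, assuming a single LSTM layer sized from the embedding dimension (the unit count, mask_zero, and the optimizer default are guesses; the ouptput_dropout spelling is kept only to match the calls above):

from keras import layers, models


def build_model_lstm(vocabulary_words, embedding_matrix, embedding_dim,
                     nb_classes, unit_multiplier=1, input_dropout=0.0,
                     dropout=0.0, recurrent_dropout=0.0, ouptput_dropout=0.0,
                     optimizer='rmsprop'):
    """Sequence tagger: embeddings -> LSTM -> softmax over POS tags."""
    model = models.Sequential()
    # Two extra rows in the embedding table: padding (index 0)
    # and unknown words (index 1)
    model.add(layers.Embedding(len(vocabulary_words) + 2,
                               embedding_dim,
                               weights=[embedding_matrix],
                               mask_zero=True))
    if input_dropout:
        model.add(layers.Dropout(input_dropout))
    model.add(layers.LSTM(int(embedding_dim * unit_multiplier),
                          return_sequences=True,
                          dropout=dropout,
                          recurrent_dropout=recurrent_dropout))
    if ouptput_dropout:
        model.add(layers.Dropout(ouptput_dropout))
    # nb_classes POS tags plus one slot for the padding symbol, matching
    # to_categorical(..., num_classes=len(pos) + 1)
    model.add(layers.Dense(nb_classes + 1, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

The CRF variants presumably swap the final softmax for a keras-contrib CRF layer, which is why training monitors val_crf_viterbi_accuracy in that case.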
Example 3
def main():
    start_time = time.perf_counter()
    print('Starting:', config)

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    else:  # PTB
        if VILDE:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos(
                    BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
        else:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    test_dict = conll_dict.transform(test_sentences)

    X_train_cat, Y_train_cat = build_sequences(train_dict)
    X_dev_cat, Y_dev_cat = build_sequences(dev_dict)
    print('First sentence, words', X_train_cat[0])
    print('First sentence, POS', Y_train_cat[0])

    # We collect the words and parts of speech, and we create the indices
    vocabulary_words = sorted(
        set([word for sentence in X_train_cat for word in sentence]))
    print('#', len(vocabulary_words), 'words')

    # The embedding matrix: we collect the words from the embedding file
    if VILDE:
        embeddings_dict = datasets.load_glove_vectors(
            BASE_DIR='/home/pierre/Cours/EDAN20/corpus/')
    else:
        embeddings_dict = datasets.load_glove_vectors()
    embeddings_words = embeddings_dict.keys()
    print('Words in embedding file:', len(embeddings_dict.keys()))
    vocabulary_words = sorted(set(vocabulary_words + list(embeddings_words)))
    print('# unique words in the vocabulary: embeddings and corpus:',
          len(vocabulary_words))

    pos = sorted(set([pos for sentence in Y_train_cat for pos in sentence]))
    NB_CLASSES = len(pos)
    print('#', NB_CLASSES, 'Parts of speech:', pos)

    # We create the indexes
    # We start at two to reserve index 0 for the padding symbol
    # used by RNNs and LSTMs, and index 1 for unknown words
    idx2word = dict(enumerate(vocabulary_words, start=2))
    idx2pos = dict(enumerate(pos, start=1))
    word2idx = {v: k for k, v in idx2word.items()}
    pos2idx = {v: k for k, v in idx2pos.items()}
    print('word index:', list(word2idx.items())[:10])
    print('POS index:', list(pos2idx.items())[:10])

    # We create the parallel sequences of indexes
    X_train_idx = to_index(X_train_cat, word2idx)
    Y_train_idx = to_index(Y_train_cat, pos2idx)
    X_dev_idx = to_index(X_dev_cat, word2idx)
    Y_dev_idx = to_index(Y_dev_cat, pos2idx)
    print('First sentences, word indices', X_train_idx[:3])
    print('First sentences, POS indices', Y_train_idx[:3])

    X_train = pad_sequences(X_train_idx)
    Y_train = pad_sequences(Y_train_idx)
    X_dev = pad_sequences(X_dev_idx)
    Y_dev = pad_sequences(Y_dev_idx)
    print('Padded X:', X_train[0])
    print('Padded Y:', Y_train[0])

    # The number of POS classes, and 0 (padding symbol)
    Y_train = to_categorical(Y_train, num_classes=len(pos) + 1)
    Y_dev = to_categorical(Y_dev, num_classes=len(pos) + 1)
    print('Padded categorical Y:', Y_train[0])

    np.random.seed(1234567)
    embedding_matrix = np.random.uniform(
        -0.05, 0.05,
        (len(vocabulary_words) + 2, EMBEDDING_DIM)).astype(np.float32)
    # We initialize the matrix with embeddings
    for word in vocabulary_words:
        if word in embeddings_dict:
            # If a word has a pretrained embedding,
            # we use that vector as its row
            embedding_matrix[word2idx[word]] = embeddings_dict[word]
    # print('Embedding:', embedding_matrix)
    print('Shape of embedding matrix:', embedding_matrix.shape)
    print('Embedding of table', embedding_matrix[word2idx['table']])
    print('Embedding of the padding symbol, idx 0, random numbers',
          embedding_matrix[0])

    if TYPE == 'RNN' and OUTPUT_LAYER == 'DENSE':
        model = build_model_rnn(vocabulary_words,
                                embedding_matrix,
                                EMBEDDING_DIM,
                                NB_CLASSES,
                                unit_multiplier=UNIT_MULTIPLIER,
                                input_dropout=INPUT_DROPOUT,
                                dropout=DROPOUT,
                                recurrent_dropout=RECURRENT_DROPOUT,
                                ouptput_dropout=OUTPUT_DROPOUT,
                                optimizer=OPTIMIZER)
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'DENSE':
        model = build_model_lstm(vocabulary_words,
                                 embedding_matrix,
                                 EMBEDDING_DIM,
                                 NB_CLASSES,
                                 unit_multiplier=UNIT_MULTIPLIER,
                                 input_dropout=INPUT_DROPOUT,
                                 dropout=DROPOUT,
                                 recurrent_dropout=RECURRENT_DROPOUT,
                                 ouptput_dropout=OUTPUT_DROPOUT,
                                 optimizer=OPTIMIZER)
    elif TYPE == 'RNN' and OUTPUT_LAYER == 'CRF':
        model = build_model_rnn_crf(vocabulary_words,
                                    embedding_matrix,
                                    EMBEDDING_DIM,
                                    NB_CLASSES,
                                    unit_multiplier=UNIT_MULTIPLIER,
                                    input_dropout=INPUT_DROPOUT,
                                    dropout=DROPOUT,
                                    recurrent_dropout=RECURRENT_DROPOUT,
                                    ouptput_dropout=OUTPUT_DROPOUT,
                                    optimizer=OPTIMIZER)
    elif TYPE == 'LSTM' and OUTPUT_LAYER == 'CRF':
        model = build_model_lstm_crf(vocabulary_words,
                                     embedding_matrix,
                                     EMBEDDING_DIM,
                                     NB_CLASSES,
                                     unit_multiplier=UNIT_MULTIPLIER,
                                     input_dropout=INPUT_DROPOUT,
                                     dropout=DROPOUT,
                                     recurrent_dropout=RECURRENT_DROPOUT,
                                     ouptput_dropout=OUTPUT_DROPOUT,
                                     optimizer=OPTIMIZER)
    model.summary()

    if OUTPUT_LAYER == 'DENSE':
        MONITOR = 'val_acc'
    else:
        MONITOR = 'val_crf_viterbi_accuracy'

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor=MONITOR,
                                patience=PATIENCE,
                                restore_best_weights=True)
    ]
    # Fitting the model
    history = model.fit(X_train,
                        Y_train,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, Y_dev))

    # We build the test sequences; words and POS are then replaced with their indices
    X_test_cat, Y_test_cat = build_sequences(test_dict)

    # We create the parallel sequences of indexes
    X_test_idx = to_index(X_test_cat, word2idx)
    Y_test_idx = to_index(Y_test_cat, pos2idx)
    print('X[0] test idx', X_test_idx[0])
    print('Y[0] test idx', Y_test_idx[0])

    X_test_padded = pad_sequences(X_test_idx)
    Y_test_padded = pad_sequences(Y_test_idx)
    print('X[0] test idx padded', X_test_padded[0])
    print('Y[0] test idx padded', Y_test_padded[0])

    # One extra symbol for 0 (padding)
    Y_test_padded_vectorized = to_categorical(Y_test_padded,
                                              num_classes=len(pos) + 1)
    print('Y[0] test idx padded vectorized', Y_test_padded_vectorized[0])

    print(X_test_padded.shape)
    print(Y_test_padded_vectorized.shape)

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test_padded,
                                         Y_test_padded_vectorized,
                                         batch_size=BATCH_SIZE)
    print('Configuration', config)
    print('Batch evaluation')
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    print('Evaluation of padded sentences')
    Y_test_pred = predict_padded_testset(X_test_cat, model, word2idx, idx2pos)
    total, correct, total_ukn, correct_ukn = eval(X_test_cat, Y_test_cat,
                                                  Y_test_pred, word2idx)
    print('total %d, correct %d, accuracy %f' %
          (total, correct, correct / total))
    if total_ukn != 0:
        print('total unknown %d, correct %d, accuracy %f' %
              (total_ukn, correct_ukn, correct_ukn / total_ukn))

    print('Evaluation of individual sentences')
    Y_test_pred = [
        predict_wordlist(x, model, word2idx, idx2pos) for x in X_test_cat
    ]
    total, correct, total_ukn, correct_ukn = eval(X_test_cat, Y_test_cat,
                                                  Y_test_pred, word2idx)
    print('total %d, correct %d, accuracy %f' %
          (total, correct, correct / total))
    if total_ukn != 0:
        print('total unknown %d, correct %d, accuracy %f' %
              (total_ukn, correct_ukn, correct_ukn / total_ukn))

    # Tagging a few sentences
    sentences = [
        'That round table might collapse .',
        'The man can learn well .',
        'The man can swim .',
        'The man can simwo .',
        'that round table might collapsex',
    ]
    for sentence in sentences:
        y_test_pred_cat = predict_sentence(sentence.lower(), model, word2idx,
                                           idx2pos)
        print(sentence)
        print(y_test_pred_cat)

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    if OUTPUT_LAYER == 'DENSE':
        acc = history.history['acc']
        val_acc = history.history['val_acc']
    elif OUTPUT_LAYER == 'CRF':
        acc = history.history['crf_viterbi_accuracy']
        val_acc = history.history['val_crf_viterbi_accuracy']

    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
Example 4
    """
    column_names = ['id', 'form']
    sentence = list(enumerate(sentence.split(), start=1))
    conll_cols = ''
    for tuple in sentence:
        conll_cols += str(tuple[0]) + '\t' + tuple[1] + '\n'

    conll_dict = CoNLLDictorizer(column_names)
    sent_dict = conll_dict.transform(conll_cols)
    return sent_dict[0]


if __name__ == '__main__':
    start_time = time.perf_counter()
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    else:
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    print(train_dict[0])

    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    # Feature and response extraction
    X_dict, y = context_dictorizer.transform(train_dict)
    print(X_dict[0])
    print(y[0])
def main():
    # Loading the embeddings
    if VILDE:
        embeddings_dict = datasets.load_glove_vectors(
            BASE_DIR='/home/pierre/Cours/EDAN20/corpus/')
    else:
        embeddings_dict = datasets.load_glove_vectors()
    print('Embeddings table:', embeddings_dict['table'])

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    else:  # PTB
        if VILDE:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos(
                    BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
        else:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    if MINI_CORPUS:
        train_dict = train_dict[:len(train_dict) // 15]
    test_dict = conll_dict.transform(test_sentences)
    print('First sentence, train:', train_dict[0])

    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    X_dict_train, y_cat_train = context_dictorizer.transform(train_dict)
    X_dict_dev, y_cat_dev = context_dictorizer.transform(dev_dict)

    corpus_words = [value for x in X_dict_train for value in x.values()]
    corpus_words = sorted(set(corpus_words))
    print('# unique words seen in training corpus:', len(corpus_words))

    embeddings_words = embeddings_dict.keys()
    print('Words in GloVe:', len(embeddings_dict.keys()))
    vocabulary_words = set(corpus_words + list(embeddings_words))

    # We start at 1, 0 is the unknown word
    idx2word = dict(enumerate(vocabulary_words, start=1))
    word2idx = {v: k for k, v in idx2word.items()}
    cnt_uniq = len(vocabulary_words) + 1
    print('# unique words in the vocabulary: embeddings and corpus:',
          len(vocabulary_words) + 1)

    for x_dict in X_dict_train:
        for word in x_dict:
            x_dict[word] = word2idx[x_dict[word]]
    for x_dict in X_dict_dev:
        for word in x_dict:
            x_dict[word] = word2idx.get(x_dict[word], 0)

    np.random.seed(0)
    dict_vectorizer = DictVectorizer(sparse=False)
    X = dict_vectorizer.fit_transform(X_dict_train)
    X_dev = dict_vectorizer.transform(X_dict_dev)

    print('X shape', X.shape)
    print('First line of X:', X[0])

    # The POS and the number of different POS
    pos_list = sorted(set(y_cat_train))
    NB_CLASSES = len(pos_list)

    # We build a part-of-speech index.
    idx2pos = dict(enumerate(pos_list))
    pos2idx = {v: k for k, v in idx2pos.items()}

    # We encode y
    y = [pos2idx[i] for i in y_cat_train]
    print(y_cat_train[:10])
    y_dev = [pos2idx[i] for i in y_cat_dev]

    embedding_matrix = np.random.random((cnt_uniq, EMBEDDING_DIM))
    # Same init as with Keras (-0.05, 0.05)
    # embedding_matrix = (embedding_matrix - 0.5) / 10.0

    for word in vocabulary_words:
        if word in embeddings_dict:
            # If a word has a pretrained embedding,
            # we use that vector as its row
            embedding_matrix[word2idx[word]] = embeddings_dict[word]

    model = models.Sequential()
    model.add(
        layers.Embedding(cnt_uniq,
                         EMBEDDING_DIM,
                         weights=[embedding_matrix],
                         trainable=True,
                         input_length=2 * W_SIZE + 1))
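    # input_length = 2 * W_SIZE + 1: presumably the focus word plus W_SIZE
    # context words on each side, each position getting its own embedding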
    model.add(layers.Flatten())
    model.add(layers.Dense(NB_CLASSES, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=OPTIMIZER,
                  metrics=['accuracy'])
    model.summary()

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(monitor='val_acc',
                                patience=2,
                                restore_best_weights=True)
    ]
    # Fitting the model
    history = model.fit(X,
                        y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, y_dev))

    X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)
    for x_dict_test in X_test_dict:
        for word in x_dict_test:
            x_dict_test[word] = word2idx.get(x_dict_test[word], 0)

    # We transform the symbols into numbers
    X_test = dict_vectorizer.transform(X_test_dict)
    y_test = [pos2idx.get(i, 0) for i in y_test_cat]

    test_loss, test_acc = model.evaluate(X_test, y_test)
    print('Configuration:', config)
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)

    # Evaluation on the test set
    total = 0
    correct = 0
    print('#Sentences', len(test_dict))
    for sentence in test_dict:
        y_pred = predict_sentence(sentence, model, context_dictorizer,
                                  dict_vectorizer, word2idx, idx2pos)
        for y in y_pred:
            total += 1
            if y['pos'] == y['ppos']:
                correct += 1
    print('total %d, correct %d, accuracy %f' %
          (total, correct, correct / total))

    # Tag some sentences
    sentences = [
        'That round table might collapse .', 'The man can learn well .',
        'The man can swim .', 'The man can simwo .',
        'that round table might collapsex'
    ]

    for sentence in sentences:
        sent_dict = sentence_to_conll(sentence.lower())
        sent_dict = predict_sentence(sent_dict, model, context_dictorizer,
                                     dict_vectorizer, word2idx, idx2pos)
        print([word['form'] for word in sent_dict])
        print([word['ppos'] for word in sent_dict])

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['acc']
    val_acc = history.history['val_acc']

    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
Example 6
    """
    column_names = ['id', 'form']
    sentence = list(enumerate(sentence.split(), start=1))
    conll_cols = ''
    for tuple in sentence:
        conll_cols += str(tuple[0]) + '\t' + tuple[1] + '\n'

    conll_dict = CoNLLDictorizer(column_names)
    sent_dict = conll_dict.transform(conll_cols)
    return sent_dict[0]


if __name__ == '__main__':
    start_time = time.perf_counter()
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = datasets.load_ud_en_ewt()
    else:
        train_sentences, dev_sentences, test_sentences, column_names = datasets.load_conll2009_pos()

    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    print(train_dict[0])

    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    # Feature and response extraction
    X_dict, y = context_dictorizer.transform(train_dict)
    print(X_dict[0])
    print(y[0])

    # We print the features to check they match Table 8.1 in my book (second edition)
Example 7
def main():
    start_time = time.perf_counter()

    # Loading the corpus
    if CORPUS == 'EWT':
        train_sentences, dev_sentences, test_sentences, column_names = \
            datasets.load_ud_en_ewt()
    else:  # PTB
        if VILDE:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos(
                    BASE_DIR='/home/pierre/Cours/EDAN20/corpus/conll2009/')
        else:
            train_sentences, dev_sentences, test_sentences, column_names = \
                datasets.load_conll2009_pos()

    # Convert the corpus into a dictionary
    conll_dict = CoNLLDictorizer(column_names)
    train_dict = conll_dict.transform(train_sentences)
    dev_dict = conll_dict.transform(dev_sentences)
    if MINI_CORPUS:
        train_dict = train_dict[:len(train_dict) // 10]
    test_dict = conll_dict.transform(test_sentences)

    # Extract the context and dictorize it
    context_dictorizer = ContextDictorizer()
    context_dictorizer.fit(train_dict)
    X_dict_train, y_cat_train = context_dictorizer.transform(train_dict)
    X_dict_dev, y_cat_dev = context_dictorizer.transform(dev_dict)

    # Transform the X symbols into numbers
    dict_vectorizer = DictVectorizer()
    X_num_train = dict_vectorizer.fit_transform(X_dict_train)
    X_num_dev = dict_vectorizer.transform(X_dict_dev)

    scaler = None
    if SCALER:
        # Standardize X_num
        scaler = StandardScaler(with_mean=False)
        X = scaler.fit_transform(X_num_train)
        X_dev = scaler.transform(X_num_dev)
    else:
        X = X_num_train
        X_dev = X_num_dev

    # Vectorizing y
    # The POS and the number of different POS
    pos_list = sorted(set(y_cat_train))
    NB_CLASSES = len(pos_list)

    # We build a part-of-speech index.
    idx2pos = dict(enumerate(pos_list))
    pos2idx = {v: k for k, v in idx2pos.items()}

    # We encode y. We assign unknown parts of speech to 0 in the dev and test sets
    y = [pos2idx[i] for i in y_cat_train]
    y_dev = [pos2idx.get(i, 0) for i in y_cat_dev]

    # The tagger
    np.random.seed(0)
    model = build_model(X.shape[1],
                        NB_CLASSES,
                        num_layers=NUM_LAYERS,
                        dropout=DROPOUT)

    # Callback to stop when the validation score does not increase
    # and keep the best model
    callback_lists = [
        callbacks.EarlyStopping(
            monitor='val_acc',
            patience=2,
            restore_best_weights=True
        )
    ]
    # Fitting the model
    history = model.fit(X, y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        callbacks=callback_lists,
                        validation_data=(X_dev, y_dev))
    if SAVE_MODEL:
        model.save(config + '.h5')

    # Formatting the test set
    X_test_dict, y_test_cat = context_dictorizer.transform(test_dict)

    # We transform the symbols into numbers
    X_test_num = dict_vectorizer.transform(X_test_dict)
    if scaler:
        X_test = scaler.transform(X_test_num)
    else:
        X_test = X_test_num
    y_test = [pos2idx.get(i, 0) for i in y_test_cat]

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test)

    print('Configuration', config)
    print('Loss:', test_loss)
    print('Accuracy:', test_acc)
    print('Time:', (time.perf_counter() - start_time) / 60)

    # Evaluation on the test set
    total = 0
    correct = 0
    print('#Sentences', len(test_dict))
    for sentence in test_dict:
        y_pred = predict_sentence(sentence,
                                  model,
                                  context_dictorizer,
                                  dict_vectorizer,
                                  scaler,
                                  idx2pos)
        for y in y_pred:
            total += 1
            if y['pos'] == y['ppos']:
                correct += 1
    print('total %d, correct %d, accuracy %f' % (total, correct, correct / total))

    # Tag some sentences
    sentences = ['That round table might collapse .',
                 'The man can learn well .',
                 'The man can swim .',
                 'The man can simwo .',
                 'that round table might collapsex']
    for sentence in sentences:
        sent_dict = sentence_to_conll(sentence.lower())
        y_test_pred_cat = predict_sentence(sent_dict,
                                           model,
                                           context_dictorizer,
                                           dict_vectorizer,
                                           scaler,
                                           idx2pos)
        print([y['form'] for y in y_test_pred_cat])
        print([y['ppos'] for y in y_test_pred_cat])

    # Show the training curves
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    acc = history.history['acc']
    val_acc = history.history['val_acc']

    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.figure()
    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()