word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)

wordEmbeddings = np.array(wordEmbeddings)

# second, prepare CoNLL format training and validation dataset
print('Processing the training dataset')

train_set = createMatrices(trainSentences, word2Idx, label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}
print(idx2Label)
train_batch, train_batch_len = createBatches(train_set)

np.save("outputs/idx2Label.npy", idx2Label)
np.save("outputs/word2Idx.npy", word2Idx)

## Build the LSTM model

### Word input
words_input_int = Input(shape=(None, ), dtype='int32', name='words_input1')

### Word Embedding layer
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
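# The listing cuts off in the middle of the Embedding(...) call above. A minimal
# sketch of how such an Embedding + BiLSTM tagging head is commonly completed
# (layer choices, hyperparameters, and imports below are assumptions, not taken
# from this listing):
#   from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, TimeDistributed, Dense
#   from tensorflow.keras.models import Model
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings],          # initialise with the pre-trained vectors
                  trainable=False)(words_input_int)  # keep the embeddings frozen
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.5))(words)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
model = Model(inputs=[words_input_int], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')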
Example #2
def main():

    logging.basicConfig(format='%(asctime)s - %(levelname)s -   %(message)s',
                        datefmt='%m/%d/%Y ',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--data",
                        default=None,
                        type=str,
                        required=True,
                        help="Directory which has the data files for the task")
    parser.add_argument(
        "--output",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--overwrite",
                        default=False,
                        type=bool,
                        help="Set it to True to overwrite output directory")

    args = parser.parse_args()
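    # Example invocation (the script name is an assumption, not from this listing):
    #   python train.py --data data/conll2003 --output outputs/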

    if os.path.exists(args.output) and os.listdir(
            args.output) and not args.overwrite:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Set the overwrite flag to overwrite"
            .format(args.output))
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    train_batch_size = 32
    valid_batch_size = 64
    test_batch_size = 64

    # padding sentences and labels to max_length of 128
    max_seq_len = 128
    EMBEDDING_DIM = 100
    epochs = 10

    split_train = split_text_label(os.path.join(args.data, "train.txt"))
    split_valid = split_text_label(os.path.join(args.data, "valid.txt"))
    split_test = split_text_label(os.path.join(args.data, "test.txt"))

    labelSet = set()
    wordSet = set()
    # words and labels
    for data in [split_train, split_valid, split_test]:
        for labeled_text in data:
            for word, label in labeled_text:
                labelSet.add(label)
                wordSet.add(word.lower())

    # Sort the labels by length so that 'O', the shortest label, is assigned index 0
    sorted_labels = sorted(list(labelSet), key=len)

    # Create mapping for labels
    label2Idx = {}
    for label in sorted_labels:
        label2Idx[label] = len(label2Idx)

    num_labels = len(label2Idx)
    idx2Label = {v: k for k, v in label2Idx.items()}
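    # For illustration, with the standard CoNLL-2003 label set the length sort puts
    # the one-character 'O' label first, e.g.:
    #   label2Idx = {'O': 0, 'B-PER': 1, 'I-ORG': 2, ..., 'B-MISC': 8}
    # (ties among equally long labels follow the arbitrary set iteration order;
    # only 'O' -> 0 is guaranteed)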

    pickle.dump(idx2Label,
                open(os.path.join(args.output, "idx2Label.pkl"), 'wb'))
    logger.info("Saved idx2Label pickle file")

    # Create mapping for words
    word2Idx = {}
    word2Idx["PADDING_TOKEN"] = len(word2Idx)
    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
    for word in wordSet:
        word2Idx[word] = len(word2Idx)
    logger.info("Total number of words is: %d", len(word2Idx))

    pickle.dump(word2Idx, open(os.path.join(args.output, "word2Idx.pkl"),
                               'wb'))
    logger.info("Saved word2Idx pickle file")

    # Loading glove embeddings
    embeddings_index = {}
    with open('embeddings/glove.6B.100d.txt', encoding="utf-8") as f:
        for line in f:
            values = line.strip().split(' ')
            word = values[0]  # the first entry is the word
            # the remaining entries are the 100-d vector representing the word
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    logger.info("Glove data loaded")

    #print(str(dict(itertools.islice(embeddings_index.items(), 2))))

    embedding_matrix = np.zeros((len(word2Idx), EMBEDDING_DIM))

    # Word embeddings for the tokens
    for word, i in word2Idx.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    pickle.dump(embedding_matrix,
                open(os.path.join(args.output, "embedding.pkl"), 'wb'))
    logger.info("Saved Embedding matrix pickle")

    # Interesting - to check how many words were not covered by the GloVe embeddings
    # indices = np.where(np.all(np.isclose(embedding_matrix, 0), axis=1))
    # print(len(indices[0]))

    train_sentences, train_labels = createMatrices(split_train, word2Idx,
                                                   label2Idx)
    valid_sentences, valid_labels = createMatrices(split_valid, word2Idx,
                                                   label2Idx)
    test_sentences, test_labels = createMatrices(split_test, word2Idx,
                                                 label2Idx)

    train_features, train_labels = padding(train_sentences,
                                           train_labels,
                                           max_seq_len,
                                           padding='post')
    valid_features, valid_labels = padding(valid_sentences,
                                           valid_labels,
                                           max_seq_len,
                                           padding='post')
    test_features, test_labels = padding(test_sentences,
                                         test_labels,
                                         max_seq_len,
                                         padding='post')

    logger.info(
        f"Train features shape is {train_features.shape} and labels shape is {train_labels.shape}"
    )
    logger.info(
        f"Valid features shape is {valid_features.shape} and labels shape is {valid_labels.shape}"
    )
    logger.info(
        f"Test features shape is {test_features.shape} and labels shape is {test_labels.shape}"
    )

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_features, train_labels))
    valid_dataset = tf.data.Dataset.from_tensor_slices(
        (valid_features, valid_labels))
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (test_features, test_labels))

    shuffled_train_dataset = train_dataset.shuffle(
        buffer_size=train_features.shape[0], reshuffle_each_iteration=True)

    batched_train_dataset = shuffled_train_dataset.batch(train_batch_size,
                                                         drop_remainder=True)
    batched_valid_dataset = valid_dataset.batch(valid_batch_size,
                                                drop_remainder=True)
    batched_test_dataset = test_dataset.batch(test_batch_size,
                                              drop_remainder=True)

    epoch_bar = master_bar(range(epochs))
    train_pb_max_len = math.ceil(
        float(len(train_features)) / float(train_batch_size))
    valid_pb_max_len = math.ceil(
        float(len(valid_features)) / float(valid_batch_size))
    test_pb_max_len = math.ceil(
        float(len(test_features)) / float(test_batch_size))

    model = TFNer(max_seq_len=max_seq_len,
                  embed_input_dim=len(word2Idx),
                  embed_output_dim=EMBEDDING_DIM,
                  weights=[embedding_matrix],
                  num_labels=num_labels)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
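    # from_logits=True because TFNer is assumed to return raw, un-normalised scores;
    # the softmax is only applied explicitly during evaluation below.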

    train_log_dir = f"{args.output}/logs/train"
    valid_log_dir = f"{args.output}/logs/valid"
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    valid_summary_writer = tf.summary.create_file_writer(valid_log_dir)

    train_loss_metric = tf.keras.metrics.Mean('training_loss',
                                              dtype=tf.float32)
    valid_loss_metric = tf.keras.metrics.Mean('valid_loss', dtype=tf.float32)

    def train_step_fn(sentences_batch, labels_batch):
        with tf.GradientTape() as tape:
            logits = model(
                sentences_batch)  # (batch_size, max_seq_len, num_labels)
            loss = scce(labels_batch, logits)  # scalar loss, averaged over the batch
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(list(zip(grads, model.trainable_variables)))
        return loss, logits

    def valid_step_fn(sentences_batch, labels_batch):
        logits = model(sentences_batch)
        loss = scce(labels_batch, logits)
        return loss, logits

    for epoch in epoch_bar:
        with train_summary_writer.as_default():
            for sentences_batch, labels_batch in progress_bar(
                    batched_train_dataset,
                    total=train_pb_max_len,
                    parent=epoch_bar):

                loss, logits = train_step_fn(sentences_batch, labels_batch)
                train_loss_metric(loss)
                epoch_bar.child.comment = f'training loss : {train_loss_metric.result()}'
            tf.summary.scalar('training loss',
                              train_loss_metric.result(),
                              step=epoch)
            train_loss_metric.reset_states()

        with valid_summary_writer.as_default():
            for sentences_batch, labels_batch in progress_bar(
                    batched_valid_dataset,
                    total=valid_pb_max_len,
                    parent=epoch_bar):
                loss, logits = valid_step_fn(sentences_batch, labels_batch)
                valid_loss_metric.update_state(loss)

                epoch_bar.child.comment = f'validation loss : {valid_loss_metric.result()}'

            # Logging after each Epoch !
            tf.summary.scalar('valid loss',
                              valid_loss_metric.result(),
                              step=epoch)
            valid_loss_metric.reset_states()

    model.save_weights(f"{args.output}/model_weights", save_format='tf')
    logger.info(f"Model weights saved")

    #Evaluating on test dataset

    test_model = TFNer(max_seq_len=max_seq_len,
                       embed_input_dim=len(word2Idx),
                       embed_output_dim=EMBEDDING_DIM,
                       weights=[embedding_matrix],
                       num_labels=num_labels)
    test_model.load_weights(f"{args.output}/model_weights")
    logger.info(f"Model weights restored")

    true_labels = []
    pred_labels = []

    for sentences_batch, labels_batch in progress_bar(batched_test_dataset,
                                                      total=test_pb_max_len):

        logits = test_model(sentences_batch)
        temp1 = tf.nn.softmax(logits)
        preds = tf.argmax(temp1, axis=2)
        true_labels.append(np.asarray(labels_batch))
        pred_labels.append(np.asarray(preds))

    label_correct, label_pred = idx_to_label(pred_labels, true_labels,
                                             idx2Label)
    report = classification_report(label_correct, label_pred, digits=4)
    logger.info(f"Results for the test dataset")
    logger.info(f"\n{report}")
Example #3
        vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)

wordEmbeddings = np.array(wordEmbeddings)

char2Idx = {"PADDING": 0, "UNKNOWN": 1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|£’£—»‘–”“…ü®°¬Ã¢∝é🔊ÁãÉfi閏™\xad":  # Handle stopwords
    char2Idx[c] = len(char2Idx)
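# A minimal sketch of how a token would typically be mapped onto these character
# indices (the helper name is illustrative, not part of this listing):
def chars_to_idx(word, char2Idx):
    """Map each character to its index, falling back to UNKNOWN for unseen chars."""
    return [char2Idx.get(c, char2Idx["UNKNOWN"]) for c in word]

# e.g. chars_to_idx("U.N.", char2Idx) -> [char2Idx['U'], char2Idx['.'], char2Idx['N'], char2Idx['.']]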

train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))
dev_set = padding(
    createMatrices(devSentences, word2Idx, label2Idx, case2Idx, char2Idx))
test_set = padding(
    createMatrices(testSentences, word2Idx, label2Idx, case2Idx, char2Idx))

idx2Label = {v: k for k, v in label2Idx.items()}
np.save("model/idx2Label.npy", idx2Label)  # output weight path
np.save("model/word2Idx.npy", word2Idx)

train_batch, train_batch_len = createBatches(train_set)
dev_batch, dev_batch_len = createBatches(dev_set)
test_batch, test_batch_len = createBatches(test_set)

words_input = Input(shape=(None, ), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0],
Example #4
def main():

    logging.basicConfig(format='%(asctime)s - %(levelname)s -  %(message)s',
                        datefmt='%m/%d/%Y ',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--data",
                        default=None,
                        type=str,
                        required=True,
                        help="Directory which has the data files for the task")
    parser.add_argument(
        "--model_dir",
        default=None,
        type=str,
        required=True,
        help="Directory which has the model files for the task")
    parser.add_argument(
        "--predsingle",
        action='store_true',
        help=
        "Pass this flag to predict on the single sentence assigned to the test_sentence variable."
    )

    args = parser.parse_args()
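    # Example invocations (the script name is an assumption, not from this listing):
    #   python predict.py --data data/conll2003 --model_dir outputs/
    #   python predict.py --data data/conll2003 --model_dir outputs/ --predsingle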

    test_batch_size = 64

    # padding sentences and labels to max_length of 128
    max_seq_len = 128
    EMBEDDING_DIM = 100
    test_sentence = "Steve went to Paris"
    idx2Label = pickle.load(
        open(os.path.join(args.model_dir, "idx2Label.pkl"), 'rb'))
    label2Idx = {v: k for k, v in idx2Label.items()}
    num_labels = len(label2Idx)
    word2Idx = pickle.load(
        open(os.path.join(args.model_dir, "word2Idx.pkl"), 'rb'))
    embedding_matrix = pickle.load(
        open(os.path.join(args.model_dir, "embedding.pkl"), 'rb'))
    logger.info("Loaded idx2Label, word2Idx and Embedding matrix pickle files")

    #Loading the model
    testmodel = TFNer(max_seq_len=max_seq_len,
                      embed_input_dim=len(word2Idx),
                      embed_output_dim=EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      num_labels=num_labels)
    testmodel.load_weights(f"{args.model_dir}/model_weights")
    logger.info("Model weights restored")

    if not args.predsingle:
        #Evaluating on test dataset
        split_test = split_text_label(os.path.join(args.data, "test.txt"))
        test_sentences, test_labels = createMatrices(split_test, word2Idx,
                                                     label2Idx)
        test_features, test_labels = padding(test_sentences,
                                             test_labels,
                                             max_seq_len,
                                             padding='post')
        logger.info(
            f"Test features shape is {test_features.shape} and labels shape is {test_labels.shape}"
        )
        test_dataset = tf.data.Dataset.from_tensor_slices(
            (test_features, test_labels))
        batched_test_dataset = test_dataset.batch(test_batch_size,
                                                  drop_remainder=True)

        #epoch_bar = master_bar(range(epochs))
        test_pb_max_len = math.ceil(
            float(len(test_features)) / float(test_batch_size))

        true_labels = []
        pred_labels = []

        for sentences_batch, labels_batch in progress_bar(
                batched_test_dataset, total=test_pb_max_len):

            logits = testmodel(sentences_batch)
            temp1 = tf.nn.softmax(logits)
            preds = tf.argmax(temp1, axis=2)
            true_labels.append(np.asarray(labels_batch))
            pred_labels.append(np.asarray(preds))

        label_correct, label_pred = idx_to_label(pred_labels, true_labels,
                                                 idx2Label)
        report = classification_report(label_correct, label_pred, digits=4)
        logger.info(f"\nResults for the test dataset")
        logger.info(f"\n{report}")

    else:
        length, masks, padded_inputs = predict_single_sentence(
            test_sentence, word2Idx, max_seq_len)
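        # predict_single_sentence is not shown in this listing; it is assumed to
        # tokenize the sentence, map each token through word2Idx (falling back to
        # UNKNOWN_TOKEN), pad the ids to max_seq_len, and return the original
        # length, a 0/1 padding mask, and the padded id sequence.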
        padded_inputs = tf.expand_dims(padded_inputs, 0)

        true_labels = None
        pred_labels = []
        pred_logits = []

        for sentence in padded_inputs:
            logits = testmodel(sentence)
            temp1 = tf.nn.softmax(logits)
            max_values = tf.reduce_max(temp1, axis=-1)

            masked_max_values = max_values * masks
            preds = tf.argmax(temp1, axis=2)
            pred_labels.append(np.asarray(preds))
            pred_logits.extend(np.asarray(masked_max_values))
        _, label_pred = idx_to_label(pred_labels, true_labels, idx2Label)

        logger.info(f"Results for - \"{test_sentence}\"")

        label_pred = label_pred[0][:length]
        pred_logits = pred_logits[0][:length]
        logger.info(f"Labels predicted are {label_pred}")
        logger.info(f"with a confidence of {pred_logits}")
    #======================================
    #
    # Set up the Bi-LSTM:
    # each word will include, in addition to its word embedding, (1) case indices and (2) char indices
    #
    epochs = 50
    case2Idx, char2Idx, maxlen = {}, {}, 0
    #case2Idx,maxlen = {}, 52

    #caseEmbeddings = None
    if len(case2Idx) > 0:
        caseEmbeddings = np.identity(len(case2Idx), dtype='float32')
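    # For illustration, a typical case-feature vocabulary would look like the
    # following (an assumption; in this run case2Idx is left empty, so no case
    # embeddings are built):
    #   case2Idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3,
    #               'other': 4, 'mainly_numeric': 5, 'contains_digit': 6,
    #               'PADDING_TOKEN': 7}
    # which would make caseEmbeddings an 8x8 identity matrix.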

    # create input matrices: each word is a vector
    train_set = createMatrices(trainSentences, word2Idx, label2Idx, case2Idx,
                               char2Idx)

    train_set = padding(train_set, maxlen=maxlen)
    dev_set = padding(createMatrices(devSentences, word2Idx, label2Idx,
                                     case2Idx, char2Idx),
                      maxlen=maxlen)
    test_set = padding(createMatrices(testSentences, word2Idx, label2Idx,
                                      case2Idx, char2Idx),
                       maxlen=maxlen)

    # batches of sentences of equal lengths
    train_batch, train_batch_len, train_batch_sentrences = createBatches(
        train_set, trainSentences)
    dev_batch, dev_batch_len, dev_batch_sentrences = createBatches(
        dev_set, devSentences)
    test_batch, test_batch_len, test_batch_sentences = createBatches(